
Commit a5670b0

Use UMAP for node embedding visualization
1 parent: 56beb01


5 files changed (+394 -103 lines)


domains/anomaly-detection/anomalyDetectionPipeline.sh

Lines changed: 31 additions & 28 deletions
@@ -59,24 +59,6 @@ source "${SCRIPTS_DIR}/executeQueryFunctions.sh"
# Define functions to create and delete Graph Projections like "createUndirectedDependencyProjection"
source "${SCRIPTS_DIR}/projectionFunctions.sh"

-# Create report directory
-REPORT_NAME="anomaly-detection"
-FULL_REPORT_DIRECTORY="${REPORTS_DIRECTORY}/${REPORT_NAME}"
-mkdir -p "${FULL_REPORT_DIRECTORY}"
-
-# Query Parameter key pairs for projection and algorithm side
-PROJECTION_NAME="dependencies_projection"
-ALGORITHM_PROJECTION="projection_name"
-
-PROJECTION_NODE="dependencies_projection_node"
-ALGORITHM_NODE="projection_node_label"
-
-PROJECTION_WEIGHT="dependencies_projection_weight_property"
-ALGORITHM_WEIGHT="projection_weight_property"
-
-# Code independent algorithm parameters
-COMMUNITY_PROPERTY="community_property=communityLeidenIdTuned"
-
# Query (or recalculate) features.
#
# Required Parameters:
@@ -87,16 +69,16 @@ COMMUNITY_PROPERTY="community_property=communityLeidenIdTuned"
# - projection_weight_property=...
#   Name of the node property that contains the dependency weight. Example: "weight"
anomaly_detection_features() {
-    # Query Feature: Determine the Betweenness centrality (with the directed graph projection) if not already done
+    # Determine the Betweenness centrality (with the directed graph projection) if not already done
    execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Betweenness-Exists.cypher" \
        "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Betweenness-Write.cypher" "${@}"
-    # Query Feature: Determine the local clustering coefficient if not already done
+    # Determine the local clustering coefficient if not already done
    execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-LocalClusteringCoefficient-Exists.cypher" \
        "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-LocalClusteringCoefficient-Write.cypher" "${@}"
-    # Query Feature: Determine the page rank if not already done
+    # Determine the page rank if not already done
    execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageRank-Exists.cypher" \
        "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageRank-Write.cypher" "${@}"
-    # Query Feature: Determine the article rank if not already done
+    # Determine the article rank if not already done
    execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-ArticleRank-Exists.cypher" \
        "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-ArticleRank-Write.cypher" "${@}"
}
@@ -112,44 +94,65 @@ anomaly_detection_features() {
#   Name of the node property that contains the dependency weight. Example: "weight"
anomaly_detection_pipeline() {
    time anomaly_detection_features "${@}"
-    # Run Python: Get tuned Leiden communities as a reference to tune clustering
+    # Get tuned Leiden communities as a reference to tune clustering
    time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedLeidenCommunityDetection.py" "${@}" ${verboseMode}
-    # Run Python: Tuned Fast Random Projection and tuned HDBSCAN clustering
+    # Tuned Fast Random Projection and tuned HDBSCAN clustering
    time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedNodeEmbeddingClustering.py" "${@}" ${verboseMode}
+    # Reduce the dimensionality of the node embeddings down to 2D for visualization using UMAP
+    time "${ANOMALY_DETECTION_SCRIPT_DIR}/umap2dNodeEmbeddings.py" "${@}" ${verboseMode}

    # Query Results: Output all collected features into a CSV file.
    local nodeLabel
    nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )
    execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeatures.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${nodeLabel}AnomalyDetection.csv"
}

+# Create report directory
+REPORT_NAME="anomaly-detection"
+FULL_REPORT_DIRECTORY="${REPORTS_DIRECTORY}/${REPORT_NAME}"
+mkdir -p "${FULL_REPORT_DIRECTORY}"
+
+# Query Parameter key pairs for projection and algorithm side
+PROJECTION_NAME="dependencies_projection"
+ALGORITHM_PROJECTION="projection_name"
+
+PROJECTION_NODE="dependencies_projection_node"
+ALGORITHM_NODE="projection_node_label"
+
+PROJECTION_WEIGHT="dependencies_projection_weight_property"
+ALGORITHM_WEIGHT="projection_weight_property"
+
+# Code independent algorithm parameters
+COMMUNITY_PROPERTY="community_property=communityLeidenIdTuned"
+EMBEDDING_PROPERTY="embedding_property=embeddingsFastRandomProjectionTunedForClustering"
+
# -- Java Artifact Node Embeddings -------------------------------

if createUndirectedDependencyProjection "${PROJECTION_NAME}=artifact-anomaly-detection" "${PROJECTION_NODE}=Artifact" "${PROJECTION_WEIGHT}=weight"; then
    createDirectedDependencyProjection "${PROJECTION_NAME}=artifact-anomaly-detection-directed" "${PROJECTION_NODE}=Artifact" "${PROJECTION_WEIGHT}=weight"
-    anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=artifact-anomaly-detection" "${ALGORITHM_NODE}=Artifact" "${ALGORITHM_WEIGHT}=weight" "${COMMUNITY_PROPERTY}"
+    anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=artifact-anomaly-detection" "${ALGORITHM_NODE}=Artifact" "${ALGORITHM_WEIGHT}=weight" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}"
fi

# -- Java Package Node Embeddings --------------------------------

if createUndirectedDependencyProjection "${PROJECTION_NAME}=package-anomaly-detection" "${PROJECTION_NODE}=Package" "${PROJECTION_WEIGHT}=weight25PercentInterfaces"; then
    createDirectedDependencyProjection "${PROJECTION_NAME}=package-anomaly-detection-directed" "${PROJECTION_NODE}=Package" "${PROJECTION_WEIGHT}=weight25PercentInterfaces"
-    anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=package-anomaly-detection" "${ALGORITHM_NODE}=Package" "${ALGORITHM_WEIGHT}=weight25PercentInterfaces" "${COMMUNITY_PROPERTY}"
+    anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=package-anomaly-detection" "${ALGORITHM_NODE}=Package" "${ALGORITHM_WEIGHT}=weight25PercentInterfaces" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}"
fi

# TODO reactivate
# # -- Java Type Node Embeddings -----------------------------------

# if createUndirectedJavaTypeDependencyProjection "${PROJECTION_NAME}=type-anomaly-detection"; then
#     createDirectedJavaTypeDependencyProjection "${PROJECTION_NAME}=type-anomaly-detection-directed"
-#     anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=type-anomaly-detection" "${ALGORITHM_NODE}=Type" "${ALGORITHM_WEIGHT}=weight" "${COMMUNITY_PROPERTY}"
+#     anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=type-anomaly-detection" "${ALGORITHM_NODE}=Type" "${ALGORITHM_WEIGHT}=weight" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}"
# fi

# # -- Typescript Module Node Embeddings ---------------------------

# if createUndirectedDependencyProjection "${PROJECTION_NAME}=typescript-module-embedding" "${PROJECTION_NODE}=Module" "${PROJECTION_WEIGHT}=lowCouplingElement25PercentWeight"; then
#     createDirectedDependencyProjection "${PROJECTION_NAME}=typescript-module-embedding-directed" "${PROJECTION_NODE}=Module" "${PROJECTION_WEIGHT}=lowCouplingElement25PercentWeight"
-#     anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=typescript-module-embedding" "${ALGORITHM_NODE}=Module" "${ALGORITHM_WEIGHT}=lowCouplingElement25PercentWeight" "${COMMUNITY_PROPERTY}"
+#     anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=typescript-module-embedding" "${ALGORITHM_NODE}=Module" "${ALGORITHM_WEIGHT}=lowCouplingElement25PercentWeight" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}"
# fi

# ---------------------------------------------------------------
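Note: the new pipeline step calls umap2dNodeEmbeddings.py, which is one of the five changed files but is not included in this excerpt. The following is only a minimal sketch of what a UMAP-based 2D reduction of the stored node embeddings could look like, assuming the umap-learn package and a pandas DataFrame with an "embedding" list column as used by the t-SNE helper removed from tunedNodeEmbeddingClustering.py below; parameter values such as random_state=47 are illustrative, not taken from the actual script.

# Hedged sketch only - not the actual content of umap2dNodeEmbeddings.py.
import numpy as np
import pandas as pd
import umap  # provided by the umap-learn package (assumed dependency)


def reduce_embeddings_to_2d(embeddings: pd.DataFrame) -> pd.DataFrame:
    if embeddings.empty:
        print("No projected data for node embeddings dimensionality reduction available")
        return embeddings

    # Convert the list column to a numpy array so that fit_transform gets a shaped input.
    embeddings_as_numpy_array = np.array(embeddings.embedding.to_list())

    # Reduce the high dimensional node embeddings to 2 dimensions for visualization.
    two_dimensional = umap.UMAP(n_components=2, random_state=47).fit_transform(embeddings_as_numpy_array)

    embeddings['embeddingVisualizationX'] = two_dimensional[:, 0]
    embeddings['embeddingVisualizationY'] = two_dimensional[:, 1]
    return embeddings

Compared to the removed t-SNE step, UMAP is typically faster on larger graphs and tends to preserve more global structure, which suits a visualization step that runs as its own pipeline stage after clustering.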

domains/anomaly-detection/tunedNodeEmbeddingClustering.py

Lines changed: 28 additions & 66 deletions
@@ -4,7 +4,6 @@
# This is useful for understanding code structure, detecting modular boundaries, and identifying anomalies or outliers in large software systems without requiring manual labeling.
# It takes the code structure as a graph in Neo4j and generates node embeddings using Fast Random Projection (FastRP).
# These embeddings capture structural similarity and are clustered using HDBSCAN to assign labels or detect noise.
-# For visualization, the embeddings are reduced to 2D using t-SNE.
# All results - including embeddings, cluster labels, and 2D coordinates — are written back to Neo4j for further use.

# Prerequisite:
@@ -25,9 +24,7 @@

from neo4j import GraphDatabase, Driver

-from openTSNE.sklearn import TSNE
-
-from sklearn.base import BaseEstimator
+# from sklearn.base import BaseEstimator # Extend from sklearn BaseEstimator to use e.g. GridSearchCV for hyperparameter tuning.
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, normalized_mutual_info_score
from sklearn.cluster import HDBSCAN # type: ignore

@@ -38,7 +35,7 @@


class Parameters:
-    required_parameters_ = ["projection_name", "projection_node_label", "projection_weight_property", "community_property"]
+    required_parameters_ = ["projection_name", "projection_node_label", "projection_weight_property", "community_property", "embedding_property"]

    def __init__(self, input_parameters: typing.Dict[str, str], verbose: bool = False):
        self.query_parameters_ = input_parameters.copy() # copy enforces immutability
@@ -63,9 +60,6 @@ def log_dependency_versions_() -> None:
    from sklearn import __version__ as sklearn_version
    print('scikit-learn version: {}'.format(sklearn_version))

-    from openTSNE import __version__ as openTSNE_version
-    print('openTSNE version: {}'.format(openTSNE_version))
-
    from neo4j import __version__ as neo4j_version
    print('neo4j version: {}'.format(neo4j_version))

@@ -116,6 +110,9 @@ def get_projection_name(self) -> str:
    def get_projection_node_label(self) -> str:
        return self.query_parameters_["projection_node_label"]

+    def get_embedding_property(self) -> str:
+        return self.query_parameters_["embedding_property"]
+
    def is_verbose(self) -> bool:
        return self.verbose_

@@ -513,7 +510,8 @@ def __init__(self,
                 forth_iteration_weight: float = 1.0,
                 ):
        self.parameters_ = parameters
-        self.verbose = parameters.is_verbose()
+        self.verbose_ = parameters.is_verbose()
+        self.write_property_ = parameters.get_embedding_property()

        self.embedding_dimension = embedding_dimension
        self.random_seed = random_seed
@@ -526,15 +524,15 @@ def __to_algorithm_parameters(self) -> typing.Dict['str', 'str']:
            "normalization_strength": str(self.normalization_strength),
            "forth_iteration_weight": str(self.forth_iteration_weight),
            "embedding_random_seed": str(self.random_seed),
-            "write_property": "embeddingsFastRandomProjectionForClustering",
+            "write_property": str(self.write_property_),
            **self.parameters_.get_query_parameters()
        }

    def __run_algorithm(self) -> pd.DataFrame:
        algorithm_parameters = self.__to_algorithm_parameters()
        # For Debugging:
        # print("Generating embeddings using Neo4j Graph Data Science with the following parameters: " + str(algorithm_parameters))
-        if self.verbose:
+        if self.verbose_:
            return query_cypher_to_data_frame(self.cypher_query_for_generating_embeddings_, parameters=algorithm_parameters)

        return query_cypher_to_data_frame_suppress_warnings(self.cypher_query_for_generating_embeddings_, parameters=algorithm_parameters)
@@ -568,12 +566,12 @@ def write_embeddings(self) -> typing.Self:
        This is useful for further processing or analysis of the embeddings.
        """
        algorithm_parameters = self.__to_algorithm_parameters()
-        if self.verbose:
+        if self.verbose_:
            print("")
            print("Writing embeddings to Neo4j with the following parameters: " + str(algorithm_parameters))
            print("")

-        if self.verbose:
+        if self.verbose_:
            query_cypher_to_data_frame(self.cypher_query_for_writing_embeddings_, parameters=algorithm_parameters)
        else:
            query_cypher_to_data_frame_suppress_warnings(self.cypher_query_for_writing_embeddings_, parameters=algorithm_parameters)
@@ -633,63 +631,27 @@ def objective(trial):
    return TuneableFastRandomProjectionNodeEmbeddings(parameters, **study.best_params).fit()


-def prepare_node_embeddings_for_2d_visualization(embeddings: pd.DataFrame) -> pd.DataFrame:
-    """
-    Reduces the dimensionality of the node embeddings (e.g. 64 floating point numbers in an array)
-    to two dimensions for 2D visualization.
-    see https://opentsne.readthedocs.io
-    """
-
-    if embeddings.empty:
-        print("No projected data for node embeddings dimensionality reduction available")
-        return embeddings
-
-    # Calling the fit_transform method just with a list doesn't work.
-    # It leads to an error with the following message: 'list' object has no attribute 'shape'
-    # This can be solved by converting the list to a numpy array using np.array(..).
-    # See https://bobbyhadz.com/blog/python-attributeerror-list-object-has-no-attribute-shape
-    embeddings_as_numpy_array = np.array(embeddings.embedding.to_list())
-
-    # Use t-distributed Stochastic Neighbor Embedding (t-SNE) to reduce the dimensionality
-    # of the previously calculated node embeddings to 2 dimensions for visualization
-    t_distributed_stochastic_neighbor_embedding = TSNE(n_components=2, verbose=False, random_state=47)
-    two_dimension_node_embeddings = t_distributed_stochastic_neighbor_embedding.fit_transform(embeddings_as_numpy_array)
-    # display(two_dimension_node_embeddings.shape) # Display the shape of the t-SNE result
-
-    # Create a new DataFrame with the results of the 2 dimensional node embeddings
-    # and the code unit and artifact name of the query above as preparation for the plot
-    embeddings['embeddingVisualizationX'] = [value[0] for value in two_dimension_node_embeddings]
-    embeddings['embeddingVisualizationY'] = [value[1] for value in two_dimension_node_embeddings]
-
-    return embeddings
-
-
-def execute_tuned_node_embeddings_clustering(parameters: Parameters) -> None:
-    tuned_fast_random_projection = get_tuned_fast_random_projection_node_embeddings(parameters)
-    embeddings = tuned_fast_random_projection.get_embeddings()
-    clustering_results = coordinate_tuned_hierarchical_density_based_spatial_clustering(embeddings)
-    if parameters.is_verbose():
-        print("HDBSCAN clustered labels by their size descending (top 10):", clustering_results.clustering_results_distribution.head(10))
-        print("HDBSCAN clustered labels by their probability descending (top 10):", clustering_results.clustering_results_distribution.sort_values(by='probability', ascending=False).head(10))
-
-    embeddings = prepare_node_embeddings_for_2d_visualization(clustering_results.embeddings)
-
-    tuned_fast_random_projection.write_embeddings()
-    data_to_write = pd.DataFrame(data={
-        'nodeElementId': embeddings["nodeElementId"],
-        'clusteringHDBSCANLabel': embeddings['clusteringTunedHDBSCANLabel'],
-        'clusteringHDBSCANProbability': embeddings['clusteringTunedHDBSCANProbability'],
-        'clusteringHDBSCANNoise': (embeddings['clusteringTunedHDBSCANLabel'] == -1).astype(int),
-        'embeddingFastRandomProjectionVisualizationX': embeddings["embeddingVisualizationX"],
-        'embeddingFastRandomProjectionVisualizationY': embeddings["embeddingVisualizationY"],
-    })
-    write_batch_data_into_database(data_to_write, parameters.get_projection_node_label())
-
# ------------------------------------------------------------------------------------------------------------
# MAIN
# ------------------------------------------------------------------------------------------------------------


parameters = parse_input_parameters()
driver = get_graph_database_driver()
-execute_tuned_node_embeddings_clustering(parameters)
+
+tuned_fast_random_projection = get_tuned_fast_random_projection_node_embeddings(parameters)
+embeddings = tuned_fast_random_projection.get_embeddings()
+
+clustering_results = coordinate_tuned_hierarchical_density_based_spatial_clustering(embeddings)
+if parameters.is_verbose():
+    print("HDBSCAN clustered labels by their size descending (top 10):", clustering_results.clustering_results_distribution.head(10))
+    print("HDBSCAN clustered labels by their probability descending (top 10):", clustering_results.clustering_results_distribution.sort_values(by='probability', ascending=False).head(10))
+
+tuned_fast_random_projection.write_embeddings()
+data_to_write = pd.DataFrame(data={
+    'nodeElementId': embeddings["nodeElementId"],
+    'clusteringHDBSCANLabel': embeddings['clusteringTunedHDBSCANLabel'],
+    'clusteringHDBSCANProbability': embeddings['clusteringTunedHDBSCANProbability'],
+    'clusteringHDBSCANNoise': (embeddings['clusteringTunedHDBSCANLabel'] == -1).astype(int),
+})
+write_batch_data_into_database(data_to_write, parameters.get_projection_node_label())
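For reference, the 'clusteringHDBSCANNoise' flag written above relies on HDBSCAN's convention of labelling points that belong to no cluster with -1. The following standalone sketch uses a hypothetical embedding array and scikit-learn's HDBSCAN (the same import used in this script); min_cluster_size=5 is an illustrative value, not the tuned one.

import numpy as np
from sklearn.cluster import HDBSCAN

# Hypothetical input: one 64-dimensional embedding vector per node.
example_embeddings = np.random.default_rng(47).normal(size=(100, 64))

clusterer = HDBSCAN(min_cluster_size=5)
labels = clusterer.fit_predict(example_embeddings)  # label -1 marks noise points
probabilities = clusterer.probabilities_            # cluster membership strength per point

noise_flags = (labels == -1).astype(int)            # same derivation as 'clusteringHDBSCANNoise'
print("clusters:", labels.max() + 1, "noise points:", noise_flags.sum())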
