Skip to content

Commit 7c3ba9d

Browse files
committed
Use UMAP for node embedding visualization
1 parent 8e9d7f5 commit 7c3ba9d

File tree

5 files changed

+366
-123
lines changed

5 files changed

+366
-123
lines changed

domains/anomaly-detection/anomalyDetectionPipeline.sh

Lines changed: 3 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -137,65 +137,20 @@ ALGORITHM_WEIGHT="projection_weight_property"
137137

138138
# Code independent algorithm parameters
139139
COMMUNITY_PROPERTY="community_property=communityLeidenIdTuned"
140-
141-
# Query (or recalculate) features.
142-
#
143-
# Required Parameters:
144-
# - projection_name=...
145-
# Name prefix for the in-memory projection name. Example: "package-anomaly-detection"
146-
# - projection_node_label=...
147-
# Label of the nodes that will be used for the projection. Example: "Package"
148-
# - projection_weight_property=...
149-
# Name of the node property that contains the dependency weight. Example: "weight"
150-
anomaly_detection_features() {
151-
# Query Feature: Determine the Betweenness centrality (with the directed graph projection) if not already done
152-
execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Betweenness-Exists.cypher" \
153-
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Betweenness-Write.cypher" "${@}"
154-
# Query Feature: Determine the local clustering coefficient if not already done
155-
execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-LocalClusteringCoefficient-Exists.cypher" \
156-
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-LocalClusteringCoefficient-Write.cypher" "${@}"
157-
# Query Feature: Determine the page rank if not already done
158-
execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageRank-Exists.cypher" \
159-
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageRank-Write.cypher" "${@}"
160-
# Query Feature: Determine the article rank if not already done
161-
execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-ArticleRank-Exists.cypher" \
162-
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-ArticleRank-Write.cypher" "${@}"
163-
}
164-
165-
# Run the anomaly detection pipeline.
166-
#
167-
# Required Parameters:
168-
# - projection_name=...
169-
# Name prefix for the in-memory projection name. Example: "package-anomaly-detection"
170-
# - projection_node_label=...
171-
# Label of the nodes that will be used for the projection. Example: "Package"
172-
# - projection_weight_property=...
173-
# Name of the node property that contains the dependency weight. Example: "weight"
174-
anomaly_detection_pipeline() {
175-
time anomaly_detection_features "${@}"
176-
# Run Python: Get tuned Leiden communities as a reference to tune clustering
177-
time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedLeidenCommunityDetection.py" "${@}" ${verboseMode}
178-
# Run Python: Tuned Fast Random Projection and tuned HDBSCAN clustering
179-
time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedNodeEmbeddingClustering.py" "${@}" ${verboseMode}
180-
181-
# Query Results: Output all collected features into a CSV file.
182-
local nodeLabel
183-
nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )
184-
execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeatures.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${nodeLabel}AnomalyDetection.csv"
185-
}
140+
EMBEDDING_PROPERTY="embedding_property=embeddingsFastRandomProjectionTunedForClustering"
186141

187142
# -- Java Artifact Node Embeddings -------------------------------
188143

189144
if createUndirectedDependencyProjection "${PROJECTION_NAME}=artifact-anomaly-detection" "${PROJECTION_NODE}=Artifact" "${PROJECTION_WEIGHT}=weight"; then
190145
createDirectedDependencyProjection "${PROJECTION_NAME}=artifact-anomaly-detection-directed" "${PROJECTION_NODE}=Artifact" "${PROJECTION_WEIGHT}=weight"
191-
anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=artifact-anomaly-detection" "${ALGORITHM_NODE}=Artifact" "${ALGORITHM_WEIGHT}=weight" "${COMMUNITY_PROPERTY}"
146+
anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=artifact-anomaly-detection" "${ALGORITHM_NODE}=Artifact" "${ALGORITHM_WEIGHT}=weight" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}"
192147
fi
193148

194149
# -- Java Package Node Embeddings --------------------------------
195150

196151
if createUndirectedDependencyProjection "${PROJECTION_NAME}=package-anomaly-detection" "${PROJECTION_NODE}=Package" "${PROJECTION_WEIGHT}=weight25PercentInterfaces"; then
197152
createDirectedDependencyProjection "${PROJECTION_NAME}=package-anomaly-detection-directed" "${PROJECTION_NODE}=Package" "${PROJECTION_WEIGHT}=weight25PercentInterfaces"
198-
anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=package-anomaly-detection" "${ALGORITHM_NODE}=Package" "${ALGORITHM_WEIGHT}=weight25PercentInterfaces" "${COMMUNITY_PROPERTY}"
153+
anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=package-anomaly-detection" "${ALGORITHM_NODE}=Package" "${ALGORITHM_WEIGHT}=weight25PercentInterfaces" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}"
199154
fi
200155

201156
# -- Java Type Node Embeddings -----------------------------------

domains/anomaly-detection/tunedNodeEmbeddingClustering.py

Lines changed: 28 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
# This is useful for understanding code structure, detecting modular boundaries, and identifying anomalies or outliers in large software systems without requiring manual labeling.
55
# It takes the code structure as a graph in Neo4j and generates node embeddings using Fast Random Projection (FastRP).
66
# These embeddings capture structural similarity and are clustered using HDBSCAN to assign labels or detect noise.
7-
# For visualization, the embeddings are reduced to 2D using t-SNE.
87
# All results — including embeddings and cluster labels (with probabilities and noise flags) — are written back to Neo4j for further use.
98

109
# Prerequisite:
@@ -25,9 +24,7 @@
2524

2625
from neo4j import GraphDatabase, Driver
2726

28-
from openTSNE.sklearn import TSNE
29-
30-
from sklearn.base import BaseEstimator
27+
# from sklearn.base import BaseEstimator # Extend from sklearn BaseEstimator to use e.g. GridSearchCV for hyperparameter tuning.
3128
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, normalized_mutual_info_score
3229
from sklearn.cluster import HDBSCAN # type: ignore
3330

@@ -38,7 +35,7 @@
3835

3936

4037
class Parameters:
41-
required_parameters_ = ["projection_name", "projection_node_label", "projection_weight_property", "community_property"]
38+
required_parameters_ = ["projection_name", "projection_node_label", "projection_weight_property", "community_property", "embedding_property"]
4239

4340
def __init__(self, input_parameters: typing.Dict[str, str], verbose: bool = False):
4441
self.query_parameters_ = input_parameters.copy() # copy enforces immutability
@@ -63,9 +60,6 @@ def log_dependency_versions_() -> None:
6360
from sklearn import __version__ as sklearn_version
6461
print('scikit-learn version: {}'.format(sklearn_version))
6562

66-
from openTSNE import __version__ as openTSNE_version
67-
print('openTSNE version: {}'.format(openTSNE_version))
68-
6963
from neo4j import __version__ as neo4j_version
7064
print('neo4j version: {}'.format(neo4j_version))
7165

@@ -116,6 +110,9 @@ def get_projection_name(self) -> str:
116110
def get_projection_node_label(self) -> str:
117111
return self.query_parameters_["projection_node_label"]
118112

113+
def get_embedding_property(self) -> str:
114+
return self.query_parameters_["embedding_property"]
115+
119116
def is_verbose(self) -> bool:
120117
return self.verbose_
121118

@@ -513,7 +510,8 @@ def __init__(self,
513510
forth_iteration_weight: float = 1.0,
514511
):
515512
self.parameters_ = parameters
516-
self.verbose = parameters.is_verbose()
513+
self.verbose_ = parameters.is_verbose()
514+
self.write_property_ = parameters.get_embedding_property()
517515

518516
self.embedding_dimension = embedding_dimension
519517
self.random_seed = random_seed
@@ -526,15 +524,15 @@ def __to_algorithm_parameters(self) -> typing.Dict['str', 'str']:
526524
"normalization_strength": str(self.normalization_strength),
527525
"forth_iteration_weight": str(self.forth_iteration_weight),
528526
"embedding_random_seed": str(self.random_seed),
529-
"write_property": "embeddingsFastRandomProjectionForClustering",
527+
"write_property": str(self.write_property_),
530528
**self.parameters_.get_query_parameters()
531529
}
532530

533531
def __run_algorithm(self) -> pd.DataFrame:
534532
algorithm_parameters = self.__to_algorithm_parameters()
535533
# For Debugging:
536534
# print("Generating embeddings using Neo4j Graph Data Science with the following parameters: " + str(algorithm_parameters))
537-
if self.verbose:
535+
if self.verbose_:
538536
return query_cypher_to_data_frame(self.cypher_query_for_generating_embeddings_, parameters=algorithm_parameters)
539537

540538
return query_cypher_to_data_frame_suppress_warnings(self.cypher_query_for_generating_embeddings_, parameters=algorithm_parameters)
@@ -568,12 +566,12 @@ def write_embeddings(self) -> typing.Self:
568566
This is useful for further processing or analysis of the embeddings.
569567
"""
570568
algorithm_parameters = self.__to_algorithm_parameters()
571-
if self.verbose:
569+
if self.verbose_:
572570
print("")
573571
print("Writing embeddings to Neo4j with the following parameters: " + str(algorithm_parameters))
574572
print("")
575573

576-
if self.verbose:
574+
if self.verbose_:
577575
query_cypher_to_data_frame(self.cypher_query_for_writing_embeddings_, parameters=algorithm_parameters)
578576
else:
579577
query_cypher_to_data_frame_suppress_warnings(self.cypher_query_for_writing_embeddings_, parameters=algorithm_parameters)
@@ -633,63 +631,27 @@ def objective(trial):
633631
return TuneableFastRandomProjectionNodeEmbeddings(parameters, **study.best_params).fit()
634632

635633

636-
def prepare_node_embeddings_for_2d_visualization(embeddings: pd.DataFrame) -> pd.DataFrame:
637-
"""
638-
Reduces the dimensionality of the node embeddings (e.g. 64 floating point numbers in an array)
639-
to two dimensions for 2D visualization.
640-
see https://opentsne.readthedocs.io
641-
"""
642-
643-
if embeddings.empty:
644-
print("No projected data for node embeddings dimensionality reduction available")
645-
return embeddings
646-
647-
# Calling the fit_transform method just with a list doesn't work.
648-
# It leads to an error with the following message: 'list' object has no attribute 'shape'
649-
# This can be solved by converting the list to a numpy array using np.array(..).
650-
# See https://bobbyhadz.com/blog/python-attributeerror-list-object-has-no-attribute-shape
651-
embeddings_as_numpy_array = np.array(embeddings.embedding.to_list())
652-
653-
# Use t-distributed Stochastic Neighbor Embedding (t-SNE) to reduce the dimensionality
654-
# of the previously calculated node embeddings to 2 dimensions for visualization
655-
t_distributed_stochastic_neighbor_embedding = TSNE(n_components=2, verbose=False, random_state=47)
656-
two_dimension_node_embeddings = t_distributed_stochastic_neighbor_embedding.fit_transform(embeddings_as_numpy_array)
657-
# display(two_dimension_node_embeddings.shape) # Display the shape of the t-SNE result
658-
659-
# Create a new DataFrame with the results of the 2 dimensional node embeddings
660-
# and the code unit and artifact name of the query above as preparation for the plot
661-
embeddings['embeddingVisualizationX'] = [value[0] for value in two_dimension_node_embeddings]
662-
embeddings['embeddingVisualizationY'] = [value[1] for value in two_dimension_node_embeddings]
663-
664-
return embeddings
665-
666-
667-
def execute_tuned_node_embeddings_clustering(parameters: Parameters) -> None:
668-
tuned_fast_random_projection = get_tuned_fast_random_projection_node_embeddings(parameters)
669-
embeddings = tuned_fast_random_projection.get_embeddings()
670-
clustering_results = coordinate_tuned_hierarchical_density_based_spatial_clustering(embeddings)
671-
if parameters.is_verbose():
672-
print("HDBSCAN clustered labels by their size descending (top 10):", clustering_results.clustering_results_distribution.head(10))
673-
print("HDBSCAN clustered labels by their probability descending (top 10):", clustering_results.clustering_results_distribution.sort_values(by='probability', ascending=False).head(10))
674-
675-
embeddings = prepare_node_embeddings_for_2d_visualization(clustering_results.embeddings)
676-
677-
tuned_fast_random_projection.write_embeddings()
678-
data_to_write = pd.DataFrame(data={
679-
'nodeElementId': embeddings["nodeElementId"],
680-
'clusteringHDBSCANLabel': embeddings['clusteringTunedHDBSCANLabel'],
681-
'clusteringHDBSCANProbability': embeddings['clusteringTunedHDBSCANProbability'],
682-
'clusteringHDBSCANNoise': (embeddings['clusteringTunedHDBSCANLabel'] == -1).astype(int),
683-
'embeddingFastRandomProjectionVisualizationX': embeddings["embeddingVisualizationX"],
684-
'embeddingFastRandomProjectionVisualizationY': embeddings["embeddingVisualizationY"],
685-
})
686-
write_batch_data_into_database(data_to_write, parameters.get_projection_node_label())
687-
688634
# ------------------------------------------------------------------------------------------------------------
689635
# MAIN
690636
# ------------------------------------------------------------------------------------------------------------
691637

692638

693639
parameters = parse_input_parameters()
694640
driver = get_graph_database_driver()
695-
execute_tuned_node_embeddings_clustering(parameters)
641+
642+
tuned_fast_random_projection = get_tuned_fast_random_projection_node_embeddings(parameters)
643+
embeddings = tuned_fast_random_projection.get_embeddings()
644+
645+
clustering_results = coordinate_tuned_hierarchical_density_based_spatial_clustering(embeddings)
646+
if parameters.is_verbose():
647+
print("HDBSCAN clustered labels by their size descending (top 10):", clustering_results.clustering_results_distribution.head(10))
648+
print("HDBSCAN clustered labels by their probability descending (top 10):", clustering_results.clustering_results_distribution.sort_values(by='probability', ascending=False).head(10))
649+
650+
tuned_fast_random_projection.write_embeddings()
651+
data_to_write = pd.DataFrame(data={
652+
'nodeElementId': embeddings["nodeElementId"],
653+
'clusteringHDBSCANLabel': embeddings['clusteringTunedHDBSCANLabel'],
654+
'clusteringHDBSCANProbability': embeddings['clusteringTunedHDBSCANProbability'],
655+
'clusteringHDBSCANNoise': (embeddings['clusteringTunedHDBSCANLabel'] == -1).astype(int),
656+
})
657+
write_batch_data_into_database(data_to_write, parameters.get_projection_node_label())

0 commit comments

Comments
 (0)