Skip to content

Commit 6e8c37e

Browse files
committed
Mark center (=medoid) of embedding clusters
1 parent 26863ea commit 6e8c37e

File tree

3 files changed

+48
-5
lines changed

3 files changed

+48
-5
lines changed

domains/anomaly-detection/explore/AnomalyDetectionExploration.ipynb

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -556,6 +556,7 @@
556556
" AND codeUnit.clusteringHDBSCANLabel IS NOT NULL\n",
557557
" AND codeUnit.clusteringHDBSCANProbability IS NOT NULL\n",
558558
" AND codeUnit.clusteringHDBSCANNoise IS NOT NULL\n",
559+
" AND codeUnit.clusteringHDBSCANMedoid IS NOT NULL\n",
559560
" AND codeUnit.embeddingFastRandomProjectionVisualizationX IS NOT NULL\n",
560561
" AND codeUnit.embeddingFastRandomProjectionVisualizationY IS NOT NULL\n",
561562
" RETURN DISTINCT \n",
@@ -568,6 +569,7 @@
568569
" ,codeUnit.clusteringHDBSCANLabel AS clusteringHDBSCANLabel\n",
569570
" ,codeUnit.clusteringHDBSCANProbability AS clusteringHDBSCANProbability\n",
570571
" ,codeUnit.clusteringHDBSCANNoise AS clusteringHDBSCANNoise\n",
572+
" ,codeUnit.clusteringHDBSCANMedoid AS clusteringHDBSCANMedoid\n",
571573
" ,codeUnit.embeddingFastRandomProjectionVisualizationX AS embeddingVisualizationX\n",
572574
" ,codeUnit.embeddingFastRandomProjectionVisualizationY AS embeddingVisualizationY\n",
573575
"\"\"\"\n",
@@ -605,6 +607,7 @@
605607
" title: str,\n",
606608
" main_color_map: str = \"tab20\",\n",
607609
" cluster_label_column_name: str = \"clusteringHDBSCANLabel\",\n",
610+
" cluster_medoid_column_name: str = \"clusteringHDBSCANMedoid\",\n",
608611
" centrality_column_name: str = \"pageRank\",\n",
609612
" x_position_column = 'embeddingVisualizationX',\n",
610613
" y_position_column = 'embeddingVisualizationY'\n",
@@ -668,6 +671,16 @@
668671
" label=f\"Cluster {cluster_label}\"\n",
669672
" )\n",
670673
"\n",
674+
" # Annotate medoids of the cluster\n",
675+
" medoids = cluster_nodes[cluster_nodes[cluster_medoid_column_name] == 1]\n",
676+
" for index, row in medoids.iterrows():\n",
677+
" plot.annotate(\n",
678+
" text=f\"{row['shortCodeUnitName']} ({row[cluster_label_column_name]})\",\n",
679+
" xy=(row[x_position_column], row[y_position_column]),\n",
680+
" xytext=(5, 5), # Offset the label in both x and y for better visibility\n",
681+
" **plot_annotation_style\n",
682+
" )\n",
683+
"\n",
671684
" # Plot noise points in gray\n",
672685
" plot.scatter(\n",
673686
" x=node_embeddings_noise_only[x_position_column],\n",
@@ -697,6 +710,7 @@
697710
" title: str,\n",
698711
" main_color_map: str = \"tab20\",\n",
699712
" cluster_label_column_name: str = \"clusteringHDBSCANLabel\",\n",
713+
" cluster_medoid_column_name: str = \"clusteringHDBSCANMedoid\",\n",
700714
" centrality_column_name: str = \"pageRank\",\n",
701715
" x_position_column = 'embeddingVisualizationX',\n",
702716
" y_position_column = 'embeddingVisualizationY'\n",
@@ -764,7 +778,17 @@
764778
" label=f\"Cluster {cluster_label}\"\n",
765779
" )\n",
766780
"\n",
767-
" # Plot noise points in gray\n",
781+
" # Annotate medoids of the cluster\n",
782+
" medoids = cluster_nodes[cluster_nodes[cluster_medoid_column_name] == 1]\n",
783+
" for index, row in medoids.iterrows():\n",
784+
" axis.annotate(\n",
785+
" text=f\"{row['shortCodeUnitName']} ({row[cluster_label_column_name]})\",\n",
786+
" xy=(row[x_position_column], row[y_position_column]),\n",
787+
" xytext=(5, 5), # Offset the label in both x and y for better visibility\n",
788+
" **plot_annotation_style\n",
789+
" )\n",
790+
"\n",
791+
" # Plot noise points in gray\n",
768792
" axis.scatter(\n",
769793
" x=node_embeddings_noise_only[x_position_column],\n",
770794
" y=node_embeddings_noise_only[y_position_column],\n",
@@ -986,6 +1010,7 @@
9861010
" AND codeUnit.clusteringHDBSCANLabel IS NOT NULL\n",
9871011
" AND codeUnit.clusteringHDBSCANProbability IS NOT NULL\n",
9881012
" AND codeUnit.clusteringHDBSCANNoise IS NOT NULL\n",
1013+
" AND codeUnit.clusteringHDBSCANMedoid IS NOT NULL\n",
9891014
" AND codeUnit.embeddingFastRandomProjectionVisualizationX IS NOT NULL\n",
9901015
" AND codeUnit.embeddingFastRandomProjectionVisualizationY IS NOT NULL\n",
9911016
" RETURN DISTINCT \n",
@@ -998,6 +1023,7 @@
9981023
" ,codeUnit.clusteringHDBSCANLabel AS clusteringHDBSCANLabel\n",
9991024
" ,codeUnit.clusteringHDBSCANProbability AS clusteringHDBSCANProbability\n",
10001025
" ,codeUnit.clusteringHDBSCANNoise AS clusteringHDBSCANNoise\n",
1026+
" ,codeUnit.clusteringHDBSCANMedoid AS clusteringHDBSCANMedoid\n",
10011027
" ,codeUnit.embeddingFastRandomProjectionVisualizationX AS embeddingVisualizationX\n",
10021028
" ,codeUnit.embeddingFastRandomProjectionVisualizationY AS embeddingVisualizationY\n",
10031029
"\"\"\"\n",

domains/anomaly-detection/features/AnomalyDetectionFeatures.cypher

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
AND codeUnit.clusteringHDBSCANLabel IS NOT NULL
1313
AND codeUnit.clusteringHDBSCANProbability IS NOT NULL
1414
AND codeUnit.clusteringHDBSCANNoise IS NOT NULL
15+
AND codeUnit.clusteringHDBSCANMedoid IS NOT NULL
1516
AND codeUnit.embeddingFastRandomProjectionVisualizationX IS NOT NULL
1617
AND codeUnit.embeddingFastRandomProjectionVisualizationY IS NOT NULL
1718
OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit)
@@ -34,6 +35,7 @@ OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS
3435
,codeUnit.clusteringHDBSCANLabel AS clusteringLabel
3536
,codeUnit.clusteringHDBSCANProbability AS clusteringProbability
3637
,codeUnit.clusteringHDBSCANNoise AS clusteringIsNoise
38+
,codeUnit.clusteringHDBSCANMedoid AS clusteringIsMedoid
3739
,codeUnit.embeddingFastRandomProjectionVisualizationX AS visualizationX
3840
,codeUnit.embeddingFastRandomProjectionVisualizationY AS visualizationY
3941
,coalesce(codeUnit.centralityPageRank, 0.00001) AS centrality

domains/anomaly-detection/tunedNodeEmbeddingClustering.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -262,15 +262,16 @@ def output_detailed_optuna_tuning_results(optimized_study: optuna.Study, name_of
262262

263263

264264
class TunedClusteringResult:
265-
def __init__(self, labels: numpy_typing.NDArray, probabilities: numpy_typing.NDArray):
265+
def __init__(self, labels: numpy_typing.NDArray, probabilities: numpy_typing.NDArray, medoids: numpy_typing.NDArray):
266266
self.labels = labels
267267
self.probabilities = probabilities
268+
self.medoids = medoids
268269
self.cluster_count = len(set(labels)) - (1 if -1 in labels else 0)
269270
self.noise_count = np.sum(labels == -1)
270271
self.noise_ratio = self.noise_count / len(labels) if len(labels) > 0 else 0
271272

272273
def __repr__(self):
273-
return f"TunedClusteringResult(cluster_count={self.cluster_count}, noise_count={self.noise_count}, noise_ratio={self.noise_ratio}, labels=[...], probabilities=[...], )"
274+
return f"TunedClusteringResult(cluster_count={self.cluster_count}, noise_count={self.noise_count}, noise_ratio={self.noise_ratio}, labels=[...], probabilities=[...], medoids=[...])"
274275

275276

276277
def tuned_hierarchical_density_based_spatial_clustering(embeddings: numpy_typing.NDArray, reference_community_ids: numpy_typing.NDArray) -> TunedClusteringResult:
@@ -310,10 +311,10 @@ def objective(trial):
310311
output_detailed_optuna_tuning_results(study, 'HDBSCAN')
311312

312313
# Run the clustering again with the best parameters
313-
cluster_algorithm = HDBSCAN(**base_clustering_parameter, **study.best_params, n_jobs=-1)
314+
cluster_algorithm = HDBSCAN(**base_clustering_parameter, **study.best_params, n_jobs=-1, store_centers='medoid')
314315
best_model = cluster_algorithm.fit(embeddings)
315316

316-
return TunedClusteringResult(best_model.labels_, best_model.probabilities_)
317+
return TunedClusteringResult(best_model.labels_, best_model.probabilities_, best_model.medoids_)
317318

318319

319320
class CommunityComparingScores:
@@ -352,6 +353,19 @@ def add_clustering_results_to_embeddings(embeddings: pd.DataFrame, clustering_re
352353
"""
353354
embeddings['clusteringTunedHDBSCANLabel'] = clustering_result.labels
354355
embeddings['clusteringTunedHDBSCANProbability'] = clustering_result.probabilities
356+
357+
def is_medoid(row, medoids):
358+
"""Check whether the embedding of the given row equals one of the given medoids (a medoid is the center node of a cluster and may act as its representative). Returns 1 if it does, otherwise 0."""
359+
for medoid in medoids:
360+
if np.array_equal(row['embedding'], medoid):
361+
return 1
362+
return 0
363+
364+
embeddings['clusteringTunedHDBSCANIsMedoid'] = embeddings.apply(
365+
lambda row: is_medoid(row, clustering_result.medoids),
366+
axis=1
367+
)
368+
355369
return embeddings
356370

357371

@@ -653,5 +667,6 @@ def objective(trial):
653667
'clusteringHDBSCANLabel': embeddings['clusteringTunedHDBSCANLabel'],
654668
'clusteringHDBSCANProbability': embeddings['clusteringTunedHDBSCANProbability'],
655669
'clusteringHDBSCANNoise': (embeddings['clusteringTunedHDBSCANLabel'] == -1).astype(int),
670+
'clusteringHDBSCANMedoid': embeddings['clusteringTunedHDBSCANIsMedoid'].astype(int),
656671
})
657672
write_batch_data_into_database(data_to_write, parameters.get_projection_node_label())

0 commit comments

Comments
 (0)