Skip to content

Commit 92dd427

Browse files
committed
Mark center (=medoid) of embedding clusters
1 parent 26863ea commit 92dd427

File tree

3 files changed

+67
-18
lines changed

3 files changed

+67
-18
lines changed

domains/anomaly-detection/explore/AnomalyDetectionExploration.ipynb

Lines changed: 44 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -556,6 +556,7 @@
556556
" AND codeUnit.clusteringHDBSCANLabel IS NOT NULL\n",
557557
" AND codeUnit.clusteringHDBSCANProbability IS NOT NULL\n",
558558
" AND codeUnit.clusteringHDBSCANNoise IS NOT NULL\n",
559+
" AND codeUnit.clusteringHDBSCANMedoid IS NOT NULL\n",
559560
" AND codeUnit.embeddingFastRandomProjectionVisualizationX IS NOT NULL\n",
560561
" AND codeUnit.embeddingFastRandomProjectionVisualizationY IS NOT NULL\n",
561562
" RETURN DISTINCT \n",
@@ -568,6 +569,7 @@
568569
" ,codeUnit.clusteringHDBSCANLabel AS clusteringHDBSCANLabel\n",
569570
" ,codeUnit.clusteringHDBSCANProbability AS clusteringHDBSCANProbability\n",
570571
" ,codeUnit.clusteringHDBSCANNoise AS clusteringHDBSCANNoise\n",
572+
" ,codeUnit.clusteringHDBSCANMedoid AS clusteringHDBSCANMedoid\n",
571573
" ,codeUnit.embeddingFastRandomProjectionVisualizationX AS embeddingVisualizationX\n",
572574
" ,codeUnit.embeddingFastRandomProjectionVisualizationY AS embeddingVisualizationY\n",
573575
"\"\"\"\n",
@@ -604,7 +606,9 @@
604606
" clustering_visualization_dataframe: pd.DataFrame,\n",
605607
" title: str,\n",
606608
" main_color_map: str = \"tab20\",\n",
609+
" code_unit_column_name: str = \"shortCodeUnitName\",\n",
607610
" cluster_label_column_name: str = \"clusteringHDBSCANLabel\",\n",
611+
" cluster_medoid_column_name: str = \"clusteringHDBSCANMedoid\",\n",
608612
" centrality_column_name: str = \"pageRank\",\n",
609613
" x_position_column = 'embeddingVisualizationX',\n",
610614
" y_position_column = 'embeddingVisualizationY'\n",
@@ -641,7 +645,7 @@
641645
" # we can adjust the alpha value for the KDE plot to visualize smaller clusters more clearly.\n",
642646
" # This way, larger clusters will have a lower alpha value, making them less prominent and less prone to overshadow smaller clusters.\n",
643647
" cluster_diameter = max_pairwise_distance(cluster_nodes[[x_position_column, y_position_column]].to_numpy())\n",
644-
" alpha = max((1.0 - (cluster_diameter / max_diameter)) * 0.45 - 0.25, 0.001)\n",
648+
" alpha = max((1.0 - (cluster_diameter / max_diameter)) * 0.45 - 0.25, 0.02)\n",
645649
"\n",
646650
" # KDE cloud shape\n",
647651
" if len(cluster_nodes) > 1 and (\n",
@@ -668,6 +672,16 @@
668672
" label=f\"Cluster {cluster_label}\"\n",
669673
" )\n",
670674
"\n",
675+
" # Annotate medoids of the cluster\n",
676+
" medoids = cluster_nodes[cluster_nodes[cluster_medoid_column_name] == 1]\n",
677+
" for index, row in medoids.iterrows():\n",
678+
" plot.annotate(\n",
679+
" text=f\"{row[code_unit_column_name]} ({row[cluster_label_column_name]})\",\n",
680+
" xy=(row[x_position_column], row[y_position_column]),\n",
681+
" xytext=(5, 5), # Offset for better visibility\n",
682+
" **plot_annotation_style\n",
683+
" )\n",
684+
"\n",
671685
" # Plot noise points in gray\n",
672686
" plot.scatter(\n",
673687
" x=node_embeddings_noise_only[x_position_column],\n",
@@ -678,10 +692,11 @@
678692
" label=\"Noise\"\n",
679693
" )\n",
680694
"\n",
681-
" legend = plot.legend(title=\"HDBSCAN Clusters\", loc=\"best\", prop={'size': 6}, ncols=2)\n",
682-
" # Workaround to set all legend dots to the same size\n",
683-
" for handle in legend.legend_handles:\n",
684-
" handle.set_sizes([30])\n",
695+
" # Legend not needed since the clusters are now annotated at their center (medoid)\n",
696+
" # legend = plot.legend(title=\"HDBSCAN Clusters\", loc=\"best\", prop={'size': 6}, ncols=2)\n",
697+
" # # Workaround to set all legend dots to the same size\n",
698+
" # for handle in legend.legend_handles:\n",
699+
" # handle.set_sizes([30])\n",
685700
" \n"
686701
]
687702
},
@@ -696,7 +711,9 @@
696711
" clustering_visualization_dataframe: pd.DataFrame,\n",
697712
" title: str,\n",
698713
" main_color_map: str = \"tab20\",\n",
714+
" code_unit_column_name: str = \"shortCodeUnitName\",\n",
699715
" cluster_label_column_name: str = \"clusteringHDBSCANLabel\",\n",
716+
" cluster_medoid_column_name: str = \"clusteringHDBSCANMedoid\",\n",
700717
" centrality_column_name: str = \"pageRank\",\n",
701718
" x_position_column = 'embeddingVisualizationX',\n",
702719
" y_position_column = 'embeddingVisualizationY'\n",
@@ -721,7 +738,7 @@
721738
" quartile_size = (n_clusters + 3) // 4 # ceil division\n",
722739
" quartiles = [clusters[i*quartile_size:(i+1)*quartile_size] for i in range(4)]\n",
723740
"\n",
724-
" figure, axes = plot.subplots(4, 1, figsize=(10, 32), squeeze=False)\n",
741+
" figure, axes = plot.subplots(4, 1, figsize=(10, 40), squeeze=False)\n",
725742
" figure.suptitle(title, fontsize=14)\n",
726743
"\n",
727744
" for index, cluster_group in enumerate(quartiles):\n",
@@ -742,7 +759,7 @@
742759
" # we can adjust the alpha value for the KDE plot to visualize smaller clusters more clearly.\n",
743760
" # This way, larger clusters will have a lower alpha value, making them less prominent and less prone to overshadow smaller clusters.\n",
744761
" cluster_diameter = max_pairwise_distance(cluster_nodes[[x_position_column, y_position_column]].to_numpy())\n",
745-
" alpha = max((1.0 - (cluster_diameter / quartile_diameter)) * 0.45 - 0.25, 0.001)\n",
762+
" alpha = max((1.0 - (cluster_diameter / quartile_diameter)) * 0.45 - 0.25, 0.02)\n",
746763
"\n",
747764
" if len(cluster_nodes) > 1 and cluster_diameter > 0:\n",
748765
" seaborn.kdeplot(\n",
@@ -764,7 +781,17 @@
764781
" label=f\"Cluster {cluster_label}\"\n",
765782
" )\n",
766783
"\n",
767-
" # Plot noise points in gray\n",
784+
" # Annotate medoids of the cluster\n",
785+
" medoids = cluster_nodes[cluster_nodes[cluster_medoid_column_name] == 1]\n",
786+
" for index, row in medoids.iterrows():\n",
787+
" axis.annotate(\n",
788+
" text=f\"{row[code_unit_column_name]} ({row[cluster_label_column_name]})\",\n",
789+
" xy=(row[x_position_column], row[y_position_column]),\n",
790+
"            xytext=(5, 5), # Offset for better visibility\n",
791+
" **plot_annotation_style\n",
792+
" )\n",
793+
"\n",
794+
" # Plot noise points in gray\n",
768795
" axis.scatter(\n",
769796
" x=node_embeddings_noise_only[x_position_column],\n",
770797
" y=node_embeddings_noise_only[y_position_column],\n",
@@ -775,12 +802,14 @@
775802
" )\n",
776803
"\n",
777804
" axis.set_title(f\"Quartile {index+1}: {len(cluster_group)} clusters\")\n",
778-
" legend = axis.legend(title=\"Cluster\", prop={'size': 6}, loc=\"best\", ncols=1)\n",
779-
" # Workaround to set all legend dots to the same size\n",
780-
" for handle in legend.legend_handles:\n",
781-
" handle.set_sizes([50])\n",
782805
"\n",
783-
" plot.tight_layout(rect=(0, 0, 1, 0.99))\n"
806+
" # Legend not needed since the clusters are now annotated at their center (medoid)\n",
807+
" # legend = axis.legend(title=\"Cluster\", prop={'size': 6}, loc=\"best\", ncols=1)\n",
808+
" # # Workaround to set all legend dots to the same size\n",
809+
" # for handle in legend.legend_handles:\n",
810+
" # handle.set_sizes([50])\n",
811+
"\n",
812+
" plot.tight_layout(rect=(0, 0, 1, 0.98))\n"
784813
]
785814
},
786815
{
@@ -804,7 +833,6 @@
804833
" return\n",
805834
" \n",
806835
" number_of_distinct_clusters = clustering_visualization_dataframe[cluster_label_column_name].nunique()\n",
807-
" print(f\"Number of distinct clusters: {number_of_distinct_clusters}\")\n",
808836
" if number_of_distinct_clusters > 30:\n",
809837
" plot_clusters_by_size_quartiles(\n",
810838
" clustering_visualization_dataframe=clustering_visualization_dataframe,\n",
@@ -986,6 +1014,7 @@
9861014
" AND codeUnit.clusteringHDBSCANLabel IS NOT NULL\n",
9871015
" AND codeUnit.clusteringHDBSCANProbability IS NOT NULL\n",
9881016
" AND codeUnit.clusteringHDBSCANNoise IS NOT NULL\n",
1017+
" AND codeUnit.clusteringHDBSCANMedoid IS NOT NULL\n",
9891018
" AND codeUnit.embeddingFastRandomProjectionVisualizationX IS NOT NULL\n",
9901019
" AND codeUnit.embeddingFastRandomProjectionVisualizationY IS NOT NULL\n",
9911020
" RETURN DISTINCT \n",
@@ -998,6 +1027,7 @@
9981027
" ,codeUnit.clusteringHDBSCANLabel AS clusteringHDBSCANLabel\n",
9991028
" ,codeUnit.clusteringHDBSCANProbability AS clusteringHDBSCANProbability\n",
10001029
" ,codeUnit.clusteringHDBSCANNoise AS clusteringHDBSCANNoise\n",
1030+
" ,codeUnit.clusteringHDBSCANMedoid AS clusteringHDBSCANMedoid\n",
10011031
" ,codeUnit.embeddingFastRandomProjectionVisualizationX AS embeddingVisualizationX\n",
10021032
" ,codeUnit.embeddingFastRandomProjectionVisualizationY AS embeddingVisualizationY\n",
10031033
"\"\"\"\n",

domains/anomaly-detection/features/AnomalyDetectionFeatures.cypher

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
AND codeUnit.clusteringHDBSCANLabel IS NOT NULL
1313
AND codeUnit.clusteringHDBSCANProbability IS NOT NULL
1414
AND codeUnit.clusteringHDBSCANNoise IS NOT NULL
15+
AND codeUnit.clusteringHDBSCANMedoid IS NOT NULL
1516
AND codeUnit.embeddingFastRandomProjectionVisualizationX IS NOT NULL
1617
AND codeUnit.embeddingFastRandomProjectionVisualizationY IS NOT NULL
1718
OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit)
@@ -34,6 +35,7 @@ OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS
3435
,codeUnit.clusteringHDBSCANLabel AS clusteringLabel
3536
,codeUnit.clusteringHDBSCANProbability AS clusteringProbability
3637
,codeUnit.clusteringHDBSCANNoise AS clusteringIsNoise
38+
,codeUnit.clusteringHDBSCANMedoid AS clusteringIsMedoid
3739
,codeUnit.embeddingFastRandomProjectionVisualizationX AS visualizationX
3840
,codeUnit.embeddingFastRandomProjectionVisualizationY AS visualizationY
3941
,coalesce(codeUnit.centralityPageRank, 0.00001) AS centrality

domains/anomaly-detection/tunedNodeEmbeddingClustering.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -262,15 +262,16 @@ def output_detailed_optuna_tuning_results(optimized_study: optuna.Study, name_of
262262

263263

264264
class TunedClusteringResult:
265-
def __init__(self, labels: numpy_typing.NDArray, probabilities: numpy_typing.NDArray):
265+
def __init__(self, labels: numpy_typing.NDArray, probabilities: numpy_typing.NDArray, medoids: numpy_typing.NDArray):
266266
self.labels = labels
267267
self.probabilities = probabilities
268+
self.medoids = medoids
268269
self.cluster_count = len(set(labels)) - (1 if -1 in labels else 0)
269270
self.noise_count = np.sum(labels == -1)
270271
self.noise_ratio = self.noise_count / len(labels) if len(labels) > 0 else 0
271272

272273
def __repr__(self):
273-
return f"TunedClusteringResult(cluster_count={self.cluster_count}, noise_count={self.noise_count}, noise_ratio={self.noise_ratio}, labels=[...], probabilities=[...], )"
274+
return f"TunedClusteringResult(cluster_count={self.cluster_count}, noise_count={self.noise_count}, noise_ratio={self.noise_ratio}, labels=[...], probabilities=[...], medoids=[...])"
274275

275276

276277
def tuned_hierarchical_density_based_spatial_clustering(embeddings: numpy_typing.NDArray, reference_community_ids: numpy_typing.NDArray) -> TunedClusteringResult:
@@ -310,10 +311,10 @@ def objective(trial):
310311
output_detailed_optuna_tuning_results(study, 'HDBSCAN')
311312

312313
# Run the clustering again with the best parameters
313-
cluster_algorithm = HDBSCAN(**base_clustering_parameter, **study.best_params, n_jobs=-1)
314+
cluster_algorithm = HDBSCAN(**base_clustering_parameter, **study.best_params, n_jobs=-1, store_centers='medoid')
314315
best_model = cluster_algorithm.fit(embeddings)
315316

316-
return TunedClusteringResult(best_model.labels_, best_model.probabilities_)
317+
return TunedClusteringResult(best_model.labels_, best_model.probabilities_, best_model.medoids_)
317318

318319

319320
class CommunityComparingScores:
@@ -352,6 +353,21 @@ def add_clustering_results_to_embeddings(embeddings: pd.DataFrame, clustering_re
352353
"""
353354
embeddings['clusteringTunedHDBSCANLabel'] = clustering_result.labels
354355
embeddings['clusteringTunedHDBSCANProbability'] = clustering_result.probabilities
356+
357+
assigned_labels = []
358+
359+
def is_medoid(row):
360+
""" Checks if the embedding of the given row is a medoid (=center node of the cluster that may act as a representative)."""
361+
for medoid in clustering_result.medoids:
362+
if row['clusteringTunedHDBSCANLabel'] in assigned_labels:
363+
return 0 # The cluster with this label already has a medoid assigned
364+
if np.array_equal(row['embedding'], medoid):
365+
assigned_labels.append(row['clusteringTunedHDBSCANLabel'])
366+
return 1
367+
return 0
368+
369+
embeddings['clusteringTunedHDBSCANIsMedoid'] = embeddings.apply(is_medoid, axis=1)
370+
355371
return embeddings
356372

357373

@@ -653,5 +669,6 @@ def objective(trial):
653669
'clusteringHDBSCANLabel': embeddings['clusteringTunedHDBSCANLabel'],
654670
'clusteringHDBSCANProbability': embeddings['clusteringTunedHDBSCANProbability'],
655671
'clusteringHDBSCANNoise': (embeddings['clusteringTunedHDBSCANLabel'] == -1).astype(int),
672+
'clusteringHDBSCANMedoid': embeddings['clusteringTunedHDBSCANIsMedoid'].astype(int),
656673
})
657674
write_batch_data_into_database(data_to_write, parameters.get_projection_node_label())

0 commit comments

Comments (0)