Skip to content

Commit 92dd427

Browse files
committed
Mark center (=medoid) of embedding clusters
1 parent 26863ea commit 92dd427

File tree

3 files changed

+67
-18
lines changed

3 files changed

+67
-18
lines changed

domains/anomaly-detection/explore/AnomalyDetectionExploration.ipynb

Lines changed: 44 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -556,6 +556,7 @@
556556
" AND codeUnit.clusteringHDBSCANLabel IS NOT NULL\n",
557557
" AND codeUnit.clusteringHDBSCANProbability IS NOT NULL\n",
558558
" AND codeUnit.clusteringHDBSCANNoise IS NOT NULL\n",
559+
" AND codeUnit.clusteringHDBSCANMedoid IS NOT NULL\n",
559560
" AND codeUnit.embeddingFastRandomProjectionVisualizationX IS NOT NULL\n",
560561
" AND codeUnit.embeddingFastRandomProjectionVisualizationY IS NOT NULL\n",
561562
" RETURN DISTINCT \n",
@@ -568,6 +569,7 @@
568569
" ,codeUnit.clusteringHDBSCANLabel AS clusteringHDBSCANLabel\n",
569570
" ,codeUnit.clusteringHDBSCANProbability AS clusteringHDBSCANProbability\n",
570571
" ,codeUnit.clusteringHDBSCANNoise AS clusteringHDBSCANNoise\n",
572+
" ,codeUnit.clusteringHDBSCANMedoid AS clusteringHDBSCANMedoid\n",
571573
" ,codeUnit.embeddingFastRandomProjectionVisualizationX AS embeddingVisualizationX\n",
572574
" ,codeUnit.embeddingFastRandomProjectionVisualizationY AS embeddingVisualizationY\n",
573575
"\"\"\"\n",
@@ -604,7 +606,9 @@
604606
" clustering_visualization_dataframe: pd.DataFrame,\n",
605607
" title: str,\n",
606608
" main_color_map: str = \"tab20\",\n",
609+
" code_unit_column_name: str = \"shortCodeUnitName\",\n",
607610
" cluster_label_column_name: str = \"clusteringHDBSCANLabel\",\n",
611+
" cluster_medoid_column_name: str = \"clusteringHDBSCANMedoid\",\n",
608612
" centrality_column_name: str = \"pageRank\",\n",
609613
" x_position_column = 'embeddingVisualizationX',\n",
610614
" y_position_column = 'embeddingVisualizationY'\n",
@@ -641,7 +645,7 @@
641645
" # we can adjust the alpha value for the KDE plot to visualize smaller clusters more clearly.\n",
642646
" # This way, larger clusters will have a lower alpha value, making them less prominent and less prone to overshadow smaller clusters.\n",
643647
" cluster_diameter = max_pairwise_distance(cluster_nodes[[x_position_column, y_position_column]].to_numpy())\n",
644-
" alpha = max((1.0 - (cluster_diameter / max_diameter)) * 0.45 - 0.25, 0.001)\n",
648+
" alpha = max((1.0 - (cluster_diameter / max_diameter)) * 0.45 - 0.25, 0.02)\n",
645649
"\n",
646650
" # KDE cloud shape\n",
647651
" if len(cluster_nodes) > 1 and (\n",
@@ -668,6 +672,16 @@
668672
" label=f\"Cluster {cluster_label}\"\n",
669673
" )\n",
670674
"\n",
675+
" # Annotate medoids of the cluster\n",
676+
" medoids = cluster_nodes[cluster_nodes[cluster_medoid_column_name] == 1]\n",
677+
" for index, row in medoids.iterrows():\n",
678+
" plot.annotate(\n",
679+
" text=f\"{row[code_unit_column_name]} ({row[cluster_label_column_name]})\",\n",
680+
" xy=(row[x_position_column], row[y_position_column]),\n",
681+
" xytext=(5, 5), # Offset for better visibility\n",
682+
" **plot_annotation_style\n",
683+
" )\n",
684+
"\n",
671685
" # Plot noise points in gray\n",
672686
" plot.scatter(\n",
673687
" x=node_embeddings_noise_only[x_position_column],\n",
@@ -678,10 +692,11 @@
678692
" label=\"Noise\"\n",
679693
" )\n",
680694
"\n",
681-
" legend = plot.legend(title=\"HDBSCAN Clusters\", loc=\"best\", prop={'size': 6}, ncols=2)\n",
682-
" # Workaround to set all legend dots to the same size\n",
683-
" for handle in legend.legend_handles:\n",
684-
" handle.set_sizes([30])\n",
695+
" # Legend not needed since the clusters are now annotated at their center (medoid)\n",
696+
" # legend = plot.legend(title=\"HDBSCAN Clusters\", loc=\"best\", prop={'size': 6}, ncols=2)\n",
697+
" # # Workaround to set all legend dots to the same size\n",
698+
" # for handle in legend.legend_handles:\n",
699+
" # handle.set_sizes([30])\n",
685700
" \n"
686701
]
687702
},
@@ -696,7 +711,9 @@
696711
" clustering_visualization_dataframe: pd.DataFrame,\n",
697712
" title: str,\n",
698713
" main_color_map: str = \"tab20\",\n",
714+
" code_unit_column_name: str = \"shortCodeUnitName\",\n",
699715
" cluster_label_column_name: str = \"clusteringHDBSCANLabel\",\n",
716+
" cluster_medoid_column_name: str = \"clusteringHDBSCANMedoid\",\n",
700717
" centrality_column_name: str = \"pageRank\",\n",
701718
" x_position_column = 'embeddingVisualizationX',\n",
702719
" y_position_column = 'embeddingVisualizationY'\n",
@@ -721,7 +738,7 @@
721738
" quartile_size = (n_clusters + 3) // 4 # ceil division\n",
722739
" quartiles = [clusters[i*quartile_size:(i+1)*quartile_size] for i in range(4)]\n",
723740
"\n",
724-
" figure, axes = plot.subplots(4, 1, figsize=(10, 32), squeeze=False)\n",
741+
" figure, axes = plot.subplots(4, 1, figsize=(10, 40), squeeze=False)\n",
725742
" figure.suptitle(title, fontsize=14)\n",
726743
"\n",
727744
" for index, cluster_group in enumerate(quartiles):\n",
@@ -742,7 +759,7 @@
742759
" # we can adjust the alpha value for the KDE plot to visualize smaller clusters more clearly.\n",
743760
" # This way, larger clusters will have a lower alpha value, making them less prominent and less prone to overshadow smaller clusters.\n",
744761
" cluster_diameter = max_pairwise_distance(cluster_nodes[[x_position_column, y_position_column]].to_numpy())\n",
745-
" alpha = max((1.0 - (cluster_diameter / quartile_diameter)) * 0.45 - 0.25, 0.001)\n",
762+
" alpha = max((1.0 - (cluster_diameter / quartile_diameter)) * 0.45 - 0.25, 0.02)\n",
746763
"\n",
747764
" if len(cluster_nodes) > 1 and cluster_diameter > 0:\n",
748765
" seaborn.kdeplot(\n",
@@ -764,7 +781,17 @@
764781
" label=f\"Cluster {cluster_label}\"\n",
765782
" )\n",
766783
"\n",
767-
" # Plot noise points in gray\n",
784+
" # Annotate medoids of the cluster\n",
785+
" medoids = cluster_nodes[cluster_nodes[cluster_medoid_column_name] == 1]\n",
786+
" for index, row in medoids.iterrows():\n",
787+
" axis.annotate(\n",
788+
" text=f\"{row[code_unit_column_name]} ({row[cluster_label_column_name]})\",\n",
789+
" xy=(row[x_position_column], row[y_position_column]),\n",
790+
"            xytext=(5, 5), # Offset for better visibility\n",
791+
" **plot_annotation_style\n",
792+
" )\n",
793+
"\n",
794+
" # Plot noise points in gray\n",
768795
" axis.scatter(\n",
769796
" x=node_embeddings_noise_only[x_position_column],\n",
770797
" y=node_embeddings_noise_only[y_position_column],\n",
@@ -775,12 +802,14 @@
775802
" )\n",
776803
"\n",
777804
" axis.set_title(f\"Quartile {index+1}: {len(cluster_group)} clusters\")\n",
778-
" legend = axis.legend(title=\"Cluster\", prop={'size': 6}, loc=\"best\", ncols=1)\n",
779-
" # Workaround to set all legend dots to the same size\n",
780-
" for handle in legend.legend_handles:\n",
781-
" handle.set_sizes([50])\n",
782805
"\n",
783-
" plot.tight_layout(rect=(0, 0, 1, 0.99))\n"
806+
" # Legend not needed since the clusters are now annotated at their center (medoid)\n",
807+
" # legend = axis.legend(title=\"Cluster\", prop={'size': 6}, loc=\"best\", ncols=1)\n",
808+
" # # Workaround to set all legend dots to the same size\n",
809+
" # for handle in legend.legend_handles:\n",
810+
" # handle.set_sizes([50])\n",
811+
"\n",
812+
" plot.tight_layout(rect=(0, 0, 1, 0.98))\n"
784813
]
785814
},
786815
{
@@ -804,7 +833,6 @@
804833
" return\n",
805834
" \n",
806835
" number_of_distinct_clusters = clustering_visualization_dataframe[cluster_label_column_name].nunique()\n",
807-
" print(f\"Number of distinct clusters: {number_of_distinct_clusters}\")\n",
808836
" if number_of_distinct_clusters > 30:\n",
809837
" plot_clusters_by_size_quartiles(\n",
810838
" clustering_visualization_dataframe=clustering_visualization_dataframe,\n",
@@ -986,6 +1014,7 @@
9861014
" AND codeUnit.clusteringHDBSCANLabel IS NOT NULL\n",
9871015
" AND codeUnit.clusteringHDBSCANProbability IS NOT NULL\n",
9881016
" AND codeUnit.clusteringHDBSCANNoise IS NOT NULL\n",
1017+
" AND codeUnit.clusteringHDBSCANMedoid IS NOT NULL\n",
9891018
" AND codeUnit.embeddingFastRandomProjectionVisualizationX IS NOT NULL\n",
9901019
" AND codeUnit.embeddingFastRandomProjectionVisualizationY IS NOT NULL\n",
9911020
" RETURN DISTINCT \n",
@@ -998,6 +1027,7 @@
9981027
" ,codeUnit.clusteringHDBSCANLabel AS clusteringHDBSCANLabel\n",
9991028
" ,codeUnit.clusteringHDBSCANProbability AS clusteringHDBSCANProbability\n",
10001029
" ,codeUnit.clusteringHDBSCANNoise AS clusteringHDBSCANNoise\n",
1030+
" ,codeUnit.clusteringHDBSCANMedoid AS clusteringHDBSCANMedoid\n",
10011031
" ,codeUnit.embeddingFastRandomProjectionVisualizationX AS embeddingVisualizationX\n",
10021032
" ,codeUnit.embeddingFastRandomProjectionVisualizationY AS embeddingVisualizationY\n",
10031033
"\"\"\"\n",

domains/anomaly-detection/features/AnomalyDetectionFeatures.cypher

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
AND codeUnit.clusteringHDBSCANLabel IS NOT NULL
1313
AND codeUnit.clusteringHDBSCANProbability IS NOT NULL
1414
AND codeUnit.clusteringHDBSCANNoise IS NOT NULL
15+
AND codeUnit.clusteringHDBSCANMedoid IS NOT NULL
1516
AND codeUnit.embeddingFastRandomProjectionVisualizationX IS NOT NULL
1617
AND codeUnit.embeddingFastRandomProjectionVisualizationY IS NOT NULL
1718
OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit)
@@ -34,6 +35,7 @@ OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS
3435
,codeUnit.clusteringHDBSCANLabel AS clusteringLabel
3536
,codeUnit.clusteringHDBSCANProbability AS clusteringProbability
3637
,codeUnit.clusteringHDBSCANNoise AS clusteringIsNoise
38+
,codeUnit.clusteringHDBSCANMedoid AS clusteringIsMedoid
3739
,codeUnit.embeddingFastRandomProjectionVisualizationX AS visualizationX
3840
,codeUnit.embeddingFastRandomProjectionVisualizationY AS visualizationY
3941
,coalesce(codeUnit.centralityPageRank, 0.00001) AS centrality

domains/anomaly-detection/tunedNodeEmbeddingClustering.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -262,15 +262,16 @@ def output_detailed_optuna_tuning_results(optimized_study: optuna.Study, name_of
262262

263263

264264
class TunedClusteringResult:
265-
def __init__(self, labels: numpy_typing.NDArray, probabilities: numpy_typing.NDArray):
265+
def __init__(self, labels: numpy_typing.NDArray, probabilities: numpy_typing.NDArray, medoids: numpy_typing.NDArray):
266266
self.labels = labels
267267
self.probabilities = probabilities
268+
self.medoids = medoids
268269
self.cluster_count = len(set(labels)) - (1 if -1 in labels else 0)
269270
self.noise_count = np.sum(labels == -1)
270271
self.noise_ratio = self.noise_count / len(labels) if len(labels) > 0 else 0
271272

272273
def __repr__(self):
273-
return f"TunedClusteringResult(cluster_count={self.cluster_count}, noise_count={self.noise_count}, noise_ratio={self.noise_ratio}, labels=[...], probabilities=[...], )"
274+
return f"TunedClusteringResult(cluster_count={self.cluster_count}, noise_count={self.noise_count}, noise_ratio={self.noise_ratio}, labels=[...], probabilities=[...], medoids=[...])"
274275

275276

276277
def tuned_hierarchical_density_based_spatial_clustering(embeddings: numpy_typing.NDArray, reference_community_ids: numpy_typing.NDArray) -> TunedClusteringResult:
@@ -310,10 +311,10 @@ def objective(trial):
310311
output_detailed_optuna_tuning_results(study, 'HDBSCAN')
311312

312313
# Run the clustering again with the best parameters
313-
cluster_algorithm = HDBSCAN(**base_clustering_parameter, **study.best_params, n_jobs=-1)
314+
cluster_algorithm = HDBSCAN(**base_clustering_parameter, **study.best_params, n_jobs=-1, store_centers='medoid')
314315
best_model = cluster_algorithm.fit(embeddings)
315316

316-
return TunedClusteringResult(best_model.labels_, best_model.probabilities_)
317+
return TunedClusteringResult(best_model.labels_, best_model.probabilities_, best_model.medoids_)
317318

318319

319320
class CommunityComparingScores:
@@ -352,6 +353,21 @@ def add_clustering_results_to_embeddings(embeddings: pd.DataFrame, clustering_re
352353
"""
353354
embeddings['clusteringTunedHDBSCANLabel'] = clustering_result.labels
354355
embeddings['clusteringTunedHDBSCANProbability'] = clustering_result.probabilities
356+
357+
assigned_labels = []
358+
359+
def is_medoid(row):
360+
""" Checks if the embedding of the given row is a medoid (=center node of the cluster that may act as a representative)."""
361+
for medoid in clustering_result.medoids:
362+
if row['clusteringTunedHDBSCANLabel'] in assigned_labels:
363+
return 0 # The cluster with this label already has a medoid assigned
364+
if np.array_equal(row['embedding'], medoid):
365+
assigned_labels.append(row['clusteringTunedHDBSCANLabel'])
366+
return 1
367+
return 0
368+
369+
embeddings['clusteringTunedHDBSCANIsMedoid'] = embeddings.apply(is_medoid, axis=1)
370+
355371
return embeddings
356372

357373

@@ -653,5 +669,6 @@ def objective(trial):
653669
'clusteringHDBSCANLabel': embeddings['clusteringTunedHDBSCANLabel'],
654670
'clusteringHDBSCANProbability': embeddings['clusteringTunedHDBSCANProbability'],
655671
'clusteringHDBSCANNoise': (embeddings['clusteringTunedHDBSCANLabel'] == -1).astype(int),
672+
'clusteringHDBSCANMedoid': embeddings['clusteringTunedHDBSCANIsMedoid'].astype(int),
656673
})
657674
write_batch_data_into_database(data_to_write, parameters.get_projection_node_label())

0 commit comments

Comments (0)