Plot clusters filtered by different criteria

JohT · JohT · commit 966c955fea41 · 2025-07-12T22:42:16.000+02:00
diff --git a/domains/anomaly-detection/explore/AnomalyDetectionExploration.ipynb b/domains/anomaly-detection/explore/AnomalyDetectionExploration.ipynb
@@ -700,7 +700,12 @@
     "    if clustering_visualization_dataframe.empty:\n",
     "        print(\"No projected data to plot available\")\n",
     "        return\n",
-    "\n",
+    "    \n",
+    "    def truncate(text: str, max_length: int):\n",
+    "        if len(text) <= max_length:\n",
+    "            return text\n",
+    "        return text[:max_length - 3] + \"...\"\n",
+    "    \n",
     "    # Create figure and subplots\n",
     "    plot.figure(figsize=(10, 10))\n",
     "\n",
@@ -760,7 +765,7 @@
     "        medoids = cluster_nodes[cluster_nodes[cluster_medoid_column_name] == 1]\n",
     "        for index, row in medoids.iterrows():\n",
     "            plot.annotate(\n",
-    "                text=f\"{row[code_unit_column_name]} ({row[cluster_label_column_name]})\",\n",
+    "                text=f\"{truncate(row[code_unit_column_name], 30)} ({row[cluster_label_column_name]})\",\n",
     "                xy=(row[x_position_column], row[y_position_column]),\n",
     "                xytext=(5, 5),  # Offset for better visibility\n",
     "                **plot_annotation_style\n",
@@ -774,169 +779,7 @@
     "        color='lightgrey',\n",
     "        alpha=0.4,\n",
     "        label=\"Noise\"\n",
-    "    )\n",
-    "\n",
-    "    # Legend not needed since the clusters are now annotated at their center (medoid)\n",
-    "    # legend = plot.legend(title=\"HDBSCAN Clusters\", loc=\"best\", prop={'size': 6}, ncols=2)\n",
-    "    # # Workaround to set all legend dots to the same size\n",
-    "    # for handle in legend.legend_handles:\n",
-    "    #     handle.set_sizes([30])\n",
-    "        \n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a13271b4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def plot_clusters_by_size_quartiles(\n",
-    "    clustering_visualization_dataframe: pd.DataFrame,\n",
-    "    title: str,\n",
-    "    main_color_map: str = \"tab20\",\n",
-    "    code_unit_column_name: str = \"shortCodeUnitName\",\n",
-    "    cluster_label_column_name: str = \"clusteringHDBSCANLabel\",\n",
-    "    cluster_medoid_column_name: str = \"clusteringHDBSCANMedoid\",\n",
-    "    centrality_column_name: str = \"pageRank\",\n",
-    "    x_position_column = 'embeddingVisualizationX',\n",
-    "    y_position_column = 'embeddingVisualizationY'\n",
-    ") -> None:\n",
-    "    if clustering_visualization_dataframe.empty:\n",
-    "        print(\"No projected data to plot available\")\n",
-    "        return\n",
-    "\n",
-    "    node_embeddings_noise_only = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column_name] == -1]\n",
-    "    # Exclude noise\n",
-    "    non_noise = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column_name] != -1]\n",
-    "    if non_noise.empty:\n",
-    "        print(\"No clusters to plot (all noise)\")\n",
-    "        return\n",
-    "\n",
-    "    # Compute cluster sizes\n",
-    "    cluster_sizes = non_noise.groupby(cluster_label_column_name).size().sort_values(ascending=False)\n",
-    "    n_clusters = len(cluster_sizes)\n",
-    "    clusters = cluster_sizes.index.tolist()\n",
-    "\n",
-    "    # Split clusters into 4 quartiles by size\n",
-    "    quartile_size = (n_clusters + 3) // 4  # ceil division\n",
-    "    quartiles = [clusters[i*quartile_size:(i+1)*quartile_size] for i in range(4)]\n",
-    "\n",
-    "    figure, axes = plot.subplots(4, 1, figsize=(10, 40), squeeze=False)\n",
-    "    figure.suptitle(title, fontsize=14)\n",
-    "\n",
-    "    for index, cluster_group in enumerate(quartiles):\n",
-    "        axis = axes[index][0]\n",
-    "        group_df = non_noise[non_noise[cluster_label_column_name].isin(cluster_group)]\n",
-    "        if group_df.empty:\n",
-    "            axis.set_title(f\"Quartile {index+1} (empty)\")\n",
-    "            continue\n",
-    "\n",
-    "        unique_labels = group_df[cluster_label_column_name].unique()\n",
-    "        color_palette = seaborn.color_palette(main_color_map, len(unique_labels))\n",
-    "        cluster_to_color = dict(zip(unique_labels, color_palette))\n",
-    "        quartile_diameter = max_pairwise_distance(group_df[[x_position_column, y_position_column]].to_numpy())\n",
-    "\n",
-    "        for cluster_label in unique_labels:\n",
-    "            cluster_nodes = group_df[group_df[cluster_label_column_name] == cluster_label]\n",
-    "            # By comparing the cluster diameter to the max diameter of all clusters in the quartile,\n",
-    "            # we can adjust the alpha value for the KDE plot to visualize smaller clusters more clearly.\n",
-    "            # This way, larger clusters will have a lower alpha value, making them less prominent and less prone to overshadow smaller clusters.\n",
-    "            cluster_diameter = max_pairwise_distance(cluster_nodes[[x_position_column, y_position_column]].to_numpy())\n",
-    "            alpha = max((1.0 - (cluster_diameter / quartile_diameter)) * 0.45 - 0.25, 0.02)\n",
-    "\n",
-    "            if len(cluster_nodes) > 1 and cluster_diameter > 0:\n",
-    "                seaborn.kdeplot(\n",
-    "                    x=cluster_nodes[x_position_column],\n",
-    "                    y=cluster_nodes[y_position_column],\n",
-    "                    fill=True,\n",
-    "                    alpha=alpha,\n",
-    "                    levels=2,\n",
-    "                    color=cluster_to_color[cluster_label],\n",
-    "                    ax=axis,\n",
-    "                    warn_singular=False,\n",
-    "                )\n",
-    "            axis.scatter(\n",
-    "                x=cluster_nodes[x_position_column],\n",
-    "                y=cluster_nodes[y_position_column],\n",
-    "                s=cluster_nodes[centrality_column_name] * 300 + 2,\n",
-    "                color=cluster_to_color[cluster_label],\n",
-    "                alpha=0.9,\n",
-    "                label=f\"Cluster {cluster_label}\"\n",
-    "            )\n",
-    "\n",
-    "            # Annotate medoids of the cluster\n",
-    "            medoids = cluster_nodes[cluster_nodes[cluster_medoid_column_name] == 1]\n",
-    "            for medoid_index, row in medoids.iterrows():\n",
-    "                axis.annotate(\n",
-    "                    text=f\"{row[code_unit_column_name]} ({row[cluster_label_column_name]})\",\n",
-    "                    xy=(row[x_position_column], row[y_position_column]),\n",
-    "                    xytext=(5, 5),  # Offset y position for better visibility\n",
-    "                    **plot_annotation_style\n",
-    "                )\n",
-    "\n",
-    "        # Plot noise points in gray\n",
-    "        axis.scatter(\n",
-    "            x=node_embeddings_noise_only[x_position_column],\n",
-    "            y=node_embeddings_noise_only[y_position_column],\n",
-    "            s=node_embeddings_noise_only[centrality_column_name] * 300 + 2,\n",
-    "            color='gainsboro',\n",
-    "            alpha=0.4,\n",
-    "            label=\"Noise\"\n",
-    "        )\n",
-    "\n",
-    "        axis.set_title(f\"Quartile {index+1}: {len(cluster_group)} clusters\")\n",
-    "\n",
-    "        # Legend not needed since the clusters are now annotated at their center (medoid)\n",
-    "        # legend = axis.legend(title=\"Cluster\", prop={'size': 6}, loc=\"best\", ncols=1)\n",
-    "        # # Workaround to set all legend dots to the same size\n",
-    "        # for handle in legend.legend_handles:\n",
-    "        #     handle.set_sizes([50])\n",
-    "\n",
-    "    plot.tight_layout(rect=(0, 0, 1, 0.98))\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "647e79e9",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def plot_clusters(\n",
-    "    clustering_visualization_dataframe: pd.DataFrame,\n",
-    "    title: str,\n",
-    "    main_color_map: str = \"tab20\",\n",
-    "    cluster_label_column_name: str = \"clusteringHDBSCANLabel\",\n",
-    "    centrality_column_name: str = \"pageRank\",\n",
-    "    x_position_column = 'embeddingVisualizationX',\n",
-    "    y_position_column = 'embeddingVisualizationY'\n",
-    ") -> None:\n",
-    "    if clustering_visualization_dataframe.empty:\n",
-    "        print(\"No projected data to plot available\")\n",
-    "        return\n",
-    "    \n",
-    "    number_of_distinct_clusters = clustering_visualization_dataframe[cluster_label_column_name].nunique()\n",
-    "    if number_of_distinct_clusters > 30:\n",
-    "        plot_clusters_by_size_quartiles(\n",
-    "            clustering_visualization_dataframe=clustering_visualization_dataframe,\n",
-    "            title=title,\n",
-    "            main_color_map=main_color_map,\n",
-    "            cluster_label_column_name=cluster_label_column_name,\n",
-    "            centrality_column_name=centrality_column_name,\n",
-    "            x_position_column=x_position_column,\n",
-    "            y_position_column=y_position_column\n",
-    "        )\n",
-    "    else:\n",
-    "        plot_all_clusters(\n",
-    "            clustering_visualization_dataframe=clustering_visualization_dataframe,\n",
-    "            title=title,\n",
-    "            main_color_map=main_color_map,\n",
-    "            cluster_label_column_name=cluster_label_column_name,\n",
-    "            centrality_column_name=centrality_column_name,\n",
-    "            x_position_column=x_position_column,\n",
-    "            y_position_column=y_position_column\n",
-    "        )"
+    "    )"
    ]
   },
   {