Skip to content

Commit 966c955

Browse files
committed
Plot clusters filtered by different criteria
1 parent ba0b090 commit 966c955

File tree

1 file changed

+8
-165
lines changed

1 file changed

+8
-165
lines changed

domains/anomaly-detection/explore/AnomalyDetectionExploration.ipynb

Lines changed: 8 additions & 165 deletions
Original file line numberDiff line numberDiff line change
@@ -700,7 +700,12 @@
700700
" if clustering_visualization_dataframe.empty:\n",
701701
" print(\"No projected data to plot available\")\n",
702702
" return\n",
703-
"\n",
703+
" \n",
704+
" def truncate(text: str, max_length: int):\n",
705+
" if len(text) <= max_length:\n",
706+
" return text\n",
707+
" return text[:max_length - 3] + \"...\"\n",
708+
" \n",
704709
" # Create figure and subplots\n",
705710
" plot.figure(figsize=(10, 10))\n",
706711
"\n",
@@ -760,7 +765,7 @@
760765
" medoids = cluster_nodes[cluster_nodes[cluster_medoid_column_name] == 1]\n",
761766
" for index, row in medoids.iterrows():\n",
762767
" plot.annotate(\n",
763-
" text=f\"{row[code_unit_column_name]} ({row[cluster_label_column_name]})\",\n",
768+
" text=f\"{truncate(row[code_unit_column_name], 30)} ({row[cluster_label_column_name]})\",\n",
764769
" xy=(row[x_position_column], row[y_position_column]),\n",
765770
" xytext=(5, 5), # Offset for better visibility\n",
766771
" **plot_annotation_style\n",
@@ -774,169 +779,7 @@
774779
" color='lightgrey',\n",
775780
" alpha=0.4,\n",
776781
" label=\"Noise\"\n",
777-
" )\n",
778-
"\n",
779-
" # Legend not needed since the clusters are now annotated at their center (medoid)\n",
780-
" # legend = plot.legend(title=\"HDBSCAN Clusters\", loc=\"best\", prop={'size': 6}, ncols=2)\n",
781-
" # # Workaround to set all legend dots to the same size\n",
782-
" # for handle in legend.legend_handles:\n",
783-
" # handle.set_sizes([30])\n",
784-
" \n"
785-
]
786-
},
787-
{
788-
"cell_type": "code",
789-
"execution_count": null,
790-
"id": "a13271b4",
791-
"metadata": {},
792-
"outputs": [],
793-
"source": [
794-
"def plot_clusters_by_size_quartiles(\n",
795-
" clustering_visualization_dataframe: pd.DataFrame,\n",
796-
" title: str,\n",
797-
" main_color_map: str = \"tab20\",\n",
798-
" code_unit_column_name: str = \"shortCodeUnitName\",\n",
799-
" cluster_label_column_name: str = \"clusteringHDBSCANLabel\",\n",
800-
" cluster_medoid_column_name: str = \"clusteringHDBSCANMedoid\",\n",
801-
" centrality_column_name: str = \"pageRank\",\n",
802-
" x_position_column = 'embeddingVisualizationX',\n",
803-
" y_position_column = 'embeddingVisualizationY'\n",
804-
") -> None:\n",
805-
" if clustering_visualization_dataframe.empty:\n",
806-
" print(\"No projected data to plot available\")\n",
807-
" return\n",
808-
"\n",
809-
" node_embeddings_noise_only = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column_name] == -1]\n",
810-
" # Exclude noise\n",
811-
" non_noise = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column_name] != -1]\n",
812-
" if non_noise.empty:\n",
813-
" print(\"No clusters to plot (all noise)\")\n",
814-
" return\n",
815-
"\n",
816-
" # Compute cluster sizes\n",
817-
" cluster_sizes = non_noise.groupby(cluster_label_column_name).size().sort_values(ascending=False)\n",
818-
" n_clusters = len(cluster_sizes)\n",
819-
" clusters = cluster_sizes.index.tolist()\n",
820-
"\n",
821-
" # Split clusters into 4 quartiles by size\n",
822-
" quartile_size = (n_clusters + 3) // 4 # ceil division\n",
823-
" quartiles = [clusters[i*quartile_size:(i+1)*quartile_size] for i in range(4)]\n",
824-
"\n",
825-
" figure, axes = plot.subplots(4, 1, figsize=(10, 40), squeeze=False)\n",
826-
" figure.suptitle(title, fontsize=14)\n",
827-
"\n",
828-
" for index, cluster_group in enumerate(quartiles):\n",
829-
" axis = axes[index][0]\n",
830-
" group_df = non_noise[non_noise[cluster_label_column_name].isin(cluster_group)]\n",
831-
" if group_df.empty:\n",
832-
" axis.set_title(f\"Quartile {index+1} (empty)\")\n",
833-
" continue\n",
834-
"\n",
835-
" unique_labels = group_df[cluster_label_column_name].unique()\n",
836-
" color_palette = seaborn.color_palette(main_color_map, len(unique_labels))\n",
837-
" cluster_to_color = dict(zip(unique_labels, color_palette))\n",
838-
" quartile_diameter = max_pairwise_distance(group_df[[x_position_column, y_position_column]].to_numpy())\n",
839-
"\n",
840-
" for cluster_label in unique_labels:\n",
841-
" cluster_nodes = group_df[group_df[cluster_label_column_name] == cluster_label]\n",
842-
" # By comparing the cluster diameter to the max diameter of all clusters in the quartile,\n",
843-
" # we can adjust the alpha value for the KDE plot to visualize smaller clusters more clearly.\n",
844-
" # This way, larger clusters will have a lower alpha value, making them less prominent and less prone to overshadow smaller clusters.\n",
845-
" cluster_diameter = max_pairwise_distance(cluster_nodes[[x_position_column, y_position_column]].to_numpy())\n",
846-
" alpha = max((1.0 - (cluster_diameter / quartile_diameter)) * 0.45 - 0.25, 0.02)\n",
847-
"\n",
848-
" if len(cluster_nodes) > 1 and cluster_diameter > 0:\n",
849-
" seaborn.kdeplot(\n",
850-
" x=cluster_nodes[x_position_column],\n",
851-
" y=cluster_nodes[y_position_column],\n",
852-
" fill=True,\n",
853-
" alpha=alpha,\n",
854-
" levels=2,\n",
855-
" color=cluster_to_color[cluster_label],\n",
856-
" ax=axis,\n",
857-
" warn_singular=False,\n",
858-
" )\n",
859-
" axis.scatter(\n",
860-
" x=cluster_nodes[x_position_column],\n",
861-
" y=cluster_nodes[y_position_column],\n",
862-
" s=cluster_nodes[centrality_column_name] * 300 + 2,\n",
863-
" color=cluster_to_color[cluster_label],\n",
864-
" alpha=0.9,\n",
865-
" label=f\"Cluster {cluster_label}\"\n",
866-
" )\n",
867-
"\n",
868-
" # Annotate medoids of the cluster\n",
869-
" medoids = cluster_nodes[cluster_nodes[cluster_medoid_column_name] == 1]\n",
870-
" for medoid_index, row in medoids.iterrows():\n",
871-
" axis.annotate(\n",
872-
" text=f\"{row[code_unit_column_name]} ({row[cluster_label_column_name]})\",\n",
873-
" xy=(row[x_position_column], row[y_position_column]),\n",
874-
" xytext=(5, 5), # Offset y position for better visibility\n",
875-
" **plot_annotation_style\n",
876-
" )\n",
877-
"\n",
878-
" # Plot noise points in gray\n",
879-
" axis.scatter(\n",
880-
" x=node_embeddings_noise_only[x_position_column],\n",
881-
" y=node_embeddings_noise_only[y_position_column],\n",
882-
" s=node_embeddings_noise_only[centrality_column_name] * 300 + 2,\n",
883-
" color='gainsboro',\n",
884-
" alpha=0.4,\n",
885-
" label=\"Noise\"\n",
886-
" )\n",
887-
"\n",
888-
" axis.set_title(f\"Quartile {index+1}: {len(cluster_group)} clusters\")\n",
889-
"\n",
890-
" # Legend not needed since the clusters are now annotated at their center (medoid)\n",
891-
" # legend = axis.legend(title=\"Cluster\", prop={'size': 6}, loc=\"best\", ncols=1)\n",
892-
" # # Workaround to set all legend dots to the same size\n",
893-
" # for handle in legend.legend_handles:\n",
894-
" # handle.set_sizes([50])\n",
895-
"\n",
896-
" plot.tight_layout(rect=(0, 0, 1, 0.98))\n"
897-
]
898-
},
899-
{
900-
"cell_type": "code",
901-
"execution_count": null,
902-
"id": "647e79e9",
903-
"metadata": {},
904-
"outputs": [],
905-
"source": [
906-
"def plot_clusters(\n",
907-
" clustering_visualization_dataframe: pd.DataFrame,\n",
908-
" title: str,\n",
909-
" main_color_map: str = \"tab20\",\n",
910-
" cluster_label_column_name: str = \"clusteringHDBSCANLabel\",\n",
911-
" centrality_column_name: str = \"pageRank\",\n",
912-
" x_position_column = 'embeddingVisualizationX',\n",
913-
" y_position_column = 'embeddingVisualizationY'\n",
914-
") -> None:\n",
915-
" if clustering_visualization_dataframe.empty:\n",
916-
" print(\"No projected data to plot available\")\n",
917-
" return\n",
918-
" \n",
919-
" number_of_distinct_clusters = clustering_visualization_dataframe[cluster_label_column_name].nunique()\n",
920-
" if number_of_distinct_clusters > 30:\n",
921-
" plot_clusters_by_size_quartiles(\n",
922-
" clustering_visualization_dataframe=clustering_visualization_dataframe,\n",
923-
" title=title,\n",
924-
" main_color_map=main_color_map,\n",
925-
" cluster_label_column_name=cluster_label_column_name,\n",
926-
" centrality_column_name=centrality_column_name,\n",
927-
" x_position_column=x_position_column,\n",
928-
" y_position_column=y_position_column\n",
929-
" )\n",
930-
" else:\n",
931-
" plot_all_clusters(\n",
932-
" clustering_visualization_dataframe=clustering_visualization_dataframe,\n",
933-
" title=title,\n",
934-
" main_color_map=main_color_map,\n",
935-
" cluster_label_column_name=cluster_label_column_name,\n",
936-
" centrality_column_name=centrality_column_name,\n",
937-
" x_position_column=x_position_column,\n",
938-
" y_position_column=y_position_column\n",
939-
" )"
782+
" )"
940783
]
941784
},
942785
{

0 commit comments

Comments
 (0)