Skip to content

Commit d79ffe5

Browse files
committed
Plot cluster noise and mark different anomalies
1 parent 92dd427 commit d79ffe5

File tree

1 file changed

+178
-3
lines changed

1 file changed

+178
-3
lines changed

domains/anomaly-detection/explore/AnomalyDetectionExploration.ipynb

Lines changed: 178 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -553,6 +553,9 @@
553553
" WHERE codeUnit.incomingDependencies IS NOT NULL\n",
554554
" AND codeUnit.outgoingDependencies IS NOT NULL\n",
555555
" AND codeUnit.centralityPageRank IS NOT NULL\n",
556+
" AND codeUnit.centralityArticleRank IS NOT NULL\n",
557+
" AND codeUnit.communityLocalClusteringCoefficient IS NOT NULL\n",
558+
" AND codeUnit.centralityBetweenness IS NOT NULL\n",
556559
" AND codeUnit.clusteringHDBSCANLabel IS NOT NULL\n",
557560
" AND codeUnit.clusteringHDBSCANProbability IS NOT NULL\n",
558561
" AND codeUnit.clusteringHDBSCANNoise IS NOT NULL\n",
@@ -566,15 +569,18 @@
566569
" ,codeUnit.incomingDependencies AS incomingDependencies\n",
567570
" ,codeUnit.outgoingDependencies AS outgoingDependencies\n",
568571
" ,codeUnit.centralityPageRank AS pageRank\n",
572+
" ,1.0 - codeUnit.communityLocalClusteringCoefficient AS inverseClusteringCoefficient\n",
573+
" ,codeUnit.centralityBetweenness AS betweenness\n",
574+
" ,codeUnit.centralityPageRank - codeUnit.centralityArticleRank AS pageToArticleRankDifference\n",
569575
" ,codeUnit.clusteringHDBSCANLabel AS clusteringHDBSCANLabel\n",
570576
" ,codeUnit.clusteringHDBSCANProbability AS clusteringHDBSCANProbability\n",
571577
" ,codeUnit.clusteringHDBSCANNoise AS clusteringHDBSCANNoise\n",
572578
" ,codeUnit.clusteringHDBSCANMedoid AS clusteringHDBSCANMedoid\n",
573579
" ,codeUnit.embeddingFastRandomProjectionVisualizationX AS embeddingVisualizationX\n",
574-
" ,codeUnit.embeddingFastRandomProjectionVisualizationY AS embeddingVisualizationY\n",
575-
"\"\"\"\n",
580+
" ,codeUnit.embeddingFastRandomProjectionVisualizationY AS embeddingVisualizationY\"\"\"\n",
576581
"\n",
577582
"java_package_clustering_features = query_cypher_to_data_frame(java_package_clustering_query)\n",
583+
"java_package_clustering_features['degree'] = java_package_clustering_features['incomingDependencies'] + java_package_clustering_features['outgoingDependencies']\n",
578584
"display(java_package_clustering_features.head(5))"
579585
]
580586
},
@@ -783,7 +789,7 @@
783789
"\n",
784790
" # Annotate medoids of the cluster\n",
785791
" medoids = cluster_nodes[cluster_nodes[cluster_medoid_column_name] == 1]\n",
786-
" for index, row in medoids.iterrows():\n",
792+
" for medoid_index, row in medoids.iterrows():\n",
787793
" axis.annotate(\n",
788794
" text=f\"{row[code_unit_column_name]} ({row[cluster_label_column_name]})\",\n",
789795
" xy=(row[x_position_column], row[y_position_column]),\n",
@@ -868,6 +874,122 @@
868874
")"
869875
]
870876
},
877+
{
878+
"cell_type": "code",
879+
"execution_count": null,
880+
"id": "c9580ddb",
881+
"metadata": {},
882+
"outputs": [],
883+
"source": [
884+
"def plot_cluster_noise(\n",
885+
" clustering_visualization_dataframe: pd.DataFrame,\n",
886+
" title: str,\n",
887+
" main_color_map: str = \"bwr\",\n",
888+
" code_unit_column_name: str = \"shortCodeUnitName\",\n",
889+
" cluster_label_column_name: str = \"clusteringHDBSCANLabel\",\n",
890+
" size_column_name: str = \"degree\",\n",
891+
" color_column_name: str = \"pageRank\",\n",
892+
" x_position_column = 'embeddingVisualizationX',\n",
893+
" y_position_column = 'embeddingVisualizationY'\n",
894+
") -> None:\n",
895+
" if clustering_visualization_dataframe.empty:\n",
896+
" print(\"No projected data to plot available\")\n",
897+
" return\n",
898+
"\n",
899+
" # Filter only noise points\n",
900+
" noise_points = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column_name] == -1]\n",
901+
" noise_points = noise_points.sort_values(by=size_column_name, ascending=False).reset_index(drop=True)\n",
902+
"\n",
903+
" if noise_points.empty:\n",
904+
" print(\"No noise points to plot.\")\n",
905+
" return\n",
906+
"\n",
907+
" plot.figure(figsize=(10, 10))\n",
908+
" plot.title(title)\n",
909+
"\n",
910+
" # Determine the color threshold for noise points\n",
911+
"    color_10th_highest_value = noise_points[color_column_name].nlargest(10).iloc[-1] # 10th largest value (or the smallest value if there are fewer than 10 noise points)\n",
912+
" color_90_quantile = noise_points[color_column_name].quantile(0.90)\n",
913+
" color_threshold = max(color_10th_highest_value, color_90_quantile)\n",
914+
"\n",
915+
"    # Color values at or above the threshold (the larger of the 10th highest value and the 90% quantile) red, the rest light grey\n",
916+
" colors = noise_points[color_column_name].apply(\n",
917+
" lambda x: \"red\" if x >= color_threshold else \"lightgrey\"\n",
918+
" )\n",
919+
" normalized_size = noise_points[size_column_name] / noise_points[size_column_name].max()\n",
920+
"\n",
921+
" # Scatter plot for noise points\n",
922+
" scatter = plot.scatter(\n",
923+
" x=noise_points[x_position_column],\n",
924+
" y=noise_points[y_position_column],\n",
925+
" s=normalized_size.clip(lower=0.01) * 800 + 2,\n",
926+
" c=colors,\n",
927+
" alpha=0.6\n",
928+
" )\n",
929+
"\n",
930+
"    # Annotate the 10 largest points (by the size column) and all points highlighted in red with their names\n",
931+
" for index, row in noise_points.iterrows():\n",
932+
" index = typing.cast(int, index)\n",
933+
" if colors[index] != 'red' and index >= 10:\n",
934+
" continue\n",
935+
" plot.annotate(\n",
936+
" text=row[code_unit_column_name],\n",
937+
" xy=(row[x_position_column], row[y_position_column]),\n",
938+
" xytext=(5, 5 + (index % 2) * 20), # Offset for better visibility\n",
939+
" **plot_annotation_style\n",
940+
" )\n",
941+
"\n",
942+
" plot.xlabel(x_position_column)\n",
943+
" plot.ylabel(y_position_column)\n",
944+
" plot.tight_layout()\n",
945+
" plot.show()"
946+
]
947+
},
948+
{
949+
"cell_type": "code",
950+
"execution_count": null,
951+
"id": "5c56606c",
952+
"metadata": {},
953+
"outputs": [],
954+
"source": [
955+
"plot_cluster_noise(\n",
956+
" clustering_visualization_dataframe=java_package_clustering_features,\n",
957+
" title=\"Java Package Clustering Noise - Noise points that are surprisingly central (color) or popular (size)\",\n",
958+
" size_column_name='degree',\n",
959+
" color_column_name='pageRank'\n",
960+
")"
961+
]
962+
},
963+
{
964+
"cell_type": "code",
965+
"execution_count": null,
966+
"id": "d9b2010c",
967+
"metadata": {},
968+
"outputs": [],
969+
"source": [
970+
"plot_cluster_noise(\n",
971+
" clustering_visualization_dataframe=java_package_clustering_features,\n",
972+
" title=\"Java Package Clustering Noise - Noise points that bridge flow (color) and are poorly integrated (size)\",\n",
973+
" size_column_name='inverseClusteringCoefficient',\n",
974+
" color_column_name='betweenness'\n",
975+
")"
976+
]
977+
},
978+
{
979+
"cell_type": "code",
980+
"execution_count": null,
981+
"id": "891d79b2",
982+
"metadata": {},
983+
"outputs": [],
984+
"source": [
985+
"plot_cluster_noise(\n",
986+
" clustering_visualization_dataframe=java_package_clustering_features,\n",
987+
" title=\"Java Package Clustering Noise - Noise points with role inversion (size), possibly violating layering or dependency direction (color)\",\n",
988+
" size_column_name='pageToArticleRankDifference',\n",
989+
" color_column_name='betweenness'\n",
990+
")"
991+
]
992+
},
871993
{
872994
"cell_type": "markdown",
873995
"id": "5682bb64",
@@ -1011,6 +1133,9 @@
10111133
" WHERE codeUnit.incomingDependencies IS NOT NULL\n",
10121134
" AND codeUnit.outgoingDependencies IS NOT NULL\n",
10131135
" AND codeUnit.centralityPageRank IS NOT NULL\n",
1136+
" AND codeUnit.centralityArticleRank IS NOT NULL\n",
1137+
" AND codeUnit.communityLocalClusteringCoefficient IS NOT NULL\n",
1138+
" AND codeUnit.centralityBetweenness IS NOT NULL\n",
10141139
" AND codeUnit.clusteringHDBSCANLabel IS NOT NULL\n",
10151140
" AND codeUnit.clusteringHDBSCANProbability IS NOT NULL\n",
10161141
" AND codeUnit.clusteringHDBSCANNoise IS NOT NULL\n",
@@ -1024,6 +1149,9 @@
10241149
" ,codeUnit.incomingDependencies AS incomingDependencies\n",
10251150
" ,codeUnit.outgoingDependencies AS outgoingDependencies\n",
10261151
" ,codeUnit.centralityPageRank AS pageRank\n",
1152+
" ,1.0 - codeUnit.communityLocalClusteringCoefficient AS inverseClusteringCoefficient\n",
1153+
" ,codeUnit.centralityPageRank - codeUnit.centralityArticleRank AS pageToArticleRankDifference\n",
1154+
" ,codeUnit.centralityBetweenness AS betweenness\n",
10271155
" ,codeUnit.clusteringHDBSCANLabel AS clusteringHDBSCANLabel\n",
10281156
" ,codeUnit.clusteringHDBSCANProbability AS clusteringHDBSCANProbability\n",
10291157
" ,codeUnit.clusteringHDBSCANNoise AS clusteringHDBSCANNoise\n",
@@ -1033,6 +1161,8 @@
10331161
"\"\"\"\n",
10341162
"\n",
10351163
"java_type_clustering_features = query_cypher_to_data_frame(java_type_clustering_query)\n",
1164+
"java_type_clustering_features['degree'] = java_type_clustering_features['incomingDependencies'] + java_type_clustering_features['outgoingDependencies']\n",
1165+
"\n",
10361166
"display(java_type_clustering_features.head(5))"
10371167
]
10381168
},
@@ -1048,6 +1178,51 @@
10481178
" title=\"Java Type Clustering Visualization\"\n",
10491179
")"
10501180
]
1181+
},
1182+
{
1183+
"cell_type": "code",
1184+
"execution_count": null,
1185+
"id": "d70ec20c",
1186+
"metadata": {},
1187+
"outputs": [],
1188+
"source": [
1189+
"plot_cluster_noise(\n",
1190+
" clustering_visualization_dataframe=java_type_clustering_features,\n",
1191+
" title=\"Java Type Clustering Noise - Noise points that are surprisingly central (color) or popular (size)\",\n",
1192+
" size_column_name='degree',\n",
1193+
" color_column_name='pageRank'\n",
1194+
")"
1195+
]
1196+
},
1197+
{
1198+
"cell_type": "code",
1199+
"execution_count": null,
1200+
"id": "e8d888be",
1201+
"metadata": {},
1202+
"outputs": [],
1203+
"source": [
1204+
"plot_cluster_noise(\n",
1205+
" clustering_visualization_dataframe=java_type_clustering_features,\n",
1206+
" title=\"Java Type Clustering Noise - Noise points that bridge flow (color) and are poorly integrated (size)\",\n",
1207+
" size_column_name='inverseClusteringCoefficient',\n",
1208+
" color_column_name='betweenness'\n",
1209+
")"
1210+
]
1211+
},
1212+
{
1213+
"cell_type": "code",
1214+
"execution_count": null,
1215+
"id": "c9921ad7",
1216+
"metadata": {},
1217+
"outputs": [],
1218+
"source": [
1219+
"plot_cluster_noise(\n",
1220+
" clustering_visualization_dataframe=java_type_clustering_features,\n",
1221+
" title=\"Java Type Clustering Noise - Noise points with role inversion (size), possibly violating layering or dependency direction (color)\",\n",
1222+
" size_column_name='pageToArticleRankDifference',\n",
1223+
" color_column_name='betweenness'\n",
1224+
")"
1225+
]
10511226
}
10521227
],
10531228
"metadata": {

0 commit comments

Comments
 (0)