Skip to content

Commit ba0b090

Browse files
committed
Plot cluster noise and mark different anomalies
1 parent a36d278 commit ba0b090

File tree

1 file changed

+177
-1
lines changed

1 file changed

+177
-1
lines changed

domains/anomaly-detection/explore/AnomalyDetectionExploration.ipynb

Lines changed: 177 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -553,6 +553,9 @@
553553
" WHERE codeUnit.incomingDependencies IS NOT NULL\n",
554554
" AND codeUnit.outgoingDependencies IS NOT NULL\n",
555555
" AND codeUnit.centralityPageRank IS NOT NULL\n",
556+
" AND codeUnit.centralityArticleRank IS NOT NULL\n",
557+
" AND codeUnit.communityLocalClusteringCoefficient IS NOT NULL\n",
558+
" AND codeUnit.centralityBetweenness IS NOT NULL\n",
556559
" AND codeUnit.clusteringHDBSCANLabel IS NOT NULL\n",
557560
" AND codeUnit.clusteringHDBSCANProbability IS NOT NULL\n",
558561
" AND codeUnit.clusteringHDBSCANNoise IS NOT NULL\n",
@@ -569,6 +572,9 @@
569572
" ,codeUnit.incomingDependencies AS incomingDependencies\n",
570573
" ,codeUnit.outgoingDependencies AS outgoingDependencies\n",
571574
" ,codeUnit.centralityPageRank AS pageRank\n",
575+
" ,1.0 - codeUnit.communityLocalClusteringCoefficient AS inverseClusteringCoefficient\n",
576+
" ,codeUnit.centralityBetweenness AS betweenness\n",
577+
" ,codeUnit.centralityPageRank - codeUnit.centralityArticleRank AS pageToArticleRankDifference\n",
572578
" ,codeUnit.clusteringHDBSCANLabel AS clusteringHDBSCANLabel\n",
573579
" ,codeUnit.clusteringHDBSCANProbability AS clusteringHDBSCANProbability\n",
574580
" ,codeUnit.clusteringHDBSCANNoise AS clusteringHDBSCANNoise\n",
@@ -581,6 +587,7 @@
581587
" \"\"\"\n",
582588
"\n",
583589
"java_package_clustering_features = query_cypher_to_data_frame(java_package_clustering_query)\n",
590+
"java_package_clustering_features['degree'] = java_package_clustering_features['incomingDependencies'] + java_package_clustering_features['outgoingDependencies']\n",
584591
"display(java_package_clustering_features.head(5))"
585592
]
586593
},
@@ -860,7 +867,7 @@
860867
"\n",
861868
" # Annotate medoids of the cluster\n",
862869
" medoids = cluster_nodes[cluster_nodes[cluster_medoid_column_name] == 1]\n",
863-
" for index, row in medoids.iterrows():\n",
870+
" for medoid_index, row in medoids.iterrows():\n",
864871
" axis.annotate(\n",
865872
" text=f\"{row[code_unit_column_name]} ({row[cluster_label_column_name]})\",\n",
866873
" xy=(row[x_position_column], row[y_position_column]),\n",
@@ -1064,6 +1071,122 @@
10641071
")"
10651072
]
10661073
},
1074+
{
1075+
"cell_type": "code",
1076+
"execution_count": null,
1077+
"id": "c9580ddb",
1078+
"metadata": {},
1079+
"outputs": [],
1080+
"source": [
1081+
"def plot_cluster_noise(\n",
1082+
" clustering_visualization_dataframe: pd.DataFrame,\n",
1083+
" title: str,\n",
1084+
" main_color_map: str = \"bwr\",\n",
1085+
" code_unit_column_name: str = \"shortCodeUnitName\",\n",
1086+
" cluster_label_column_name: str = \"clusteringHDBSCANLabel\",\n",
1087+
" size_column_name: str = \"degree\",\n",
1088+
" color_column_name: str = \"pageRank\",\n",
1089+
" x_position_column = 'embeddingVisualizationX',\n",
1090+
" y_position_column = 'embeddingVisualizationY'\n",
1091+
") -> None:\n",
1092+
" if clustering_visualization_dataframe.empty:\n",
1093+
" print(\"No projected data to plot available\")\n",
1094+
" return\n",
1095+
"\n",
1096+
" # Filter only noise points\n",
1097+
" noise_points = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column_name] == -1]\n",
1098+
" noise_points = noise_points.sort_values(by=size_column_name, ascending=False).reset_index(drop=True)\n",
1099+
"\n",
1100+
" if noise_points.empty:\n",
1101+
" print(\"No noise points to plot.\")\n",
1102+
" return\n",
1103+
"\n",
1104+
" plot.figure(figsize=(10, 10))\n",
1105+
" plot.title(title)\n",
1106+
"\n",
1107+
" # Determine the color threshold for noise points\n",
1108+
"    color_10th_highest_value = noise_points[color_column_name].nlargest(10).iloc[-1] # 10th largest value, or the smallest value if there are fewer than 10 noise points\n",
1109+
" color_90_quantile = noise_points[color_column_name].quantile(0.90)\n",
1110+
" color_threshold = max(color_10th_highest_value, color_90_quantile)\n",
1111+
"\n",
1112+
"    # Color values at or above the threshold (the larger of the 10th highest value and the 90% quantile) red, the rest light grey\n",
1113+
" colors = noise_points[color_column_name].apply(\n",
1114+
" lambda x: \"red\" if x >= color_threshold else \"lightgrey\"\n",
1115+
" )\n",
1116+
" normalized_size = noise_points[size_column_name] / noise_points[size_column_name].max()\n",
1117+
"\n",
1118+
" # Scatter plot for noise points\n",
1119+
" scatter = plot.scatter(\n",
1120+
" x=noise_points[x_position_column],\n",
1121+
" y=noise_points[y_position_column],\n",
1122+
" s=normalized_size.clip(lower=0.01) * 800 + 2,\n",
1123+
" c=colors,\n",
1124+
" alpha=0.6\n",
1125+
" )\n",
1126+
"\n",
1127+
" # Annotate the largest 10 points and all colored ones with their names\n",
1128+
" for index, row in noise_points.iterrows():\n",
1129+
" index = typing.cast(int, index)\n",
1130+
" if colors[index] != 'red' and index >= 10:\n",
1131+
" continue\n",
1132+
" plot.annotate(\n",
1133+
" text=row[code_unit_column_name],\n",
1134+
" xy=(row[x_position_column], row[y_position_column]),\n",
1135+
" xytext=(5, 5 + (index % 2) * 20), # Offset for better visibility\n",
1136+
" **plot_annotation_style\n",
1137+
" )\n",
1138+
"\n",
1139+
" plot.xlabel(x_position_column)\n",
1140+
" plot.ylabel(y_position_column)\n",
1141+
" plot.tight_layout()\n",
1142+
" plot.show()"
1143+
]
1144+
},
1145+
{
1146+
"cell_type": "code",
1147+
"execution_count": null,
1148+
"id": "5c56606c",
1149+
"metadata": {},
1150+
"outputs": [],
1151+
"source": [
1152+
"plot_cluster_noise(\n",
1153+
" clustering_visualization_dataframe=java_package_clustering_features,\n",
1154+
" title=\"Java Package Clustering Noise - Noise points that are surprisingly central (color) or popular (size)\",\n",
1155+
" size_column_name='degree',\n",
1156+
" color_column_name='pageRank'\n",
1157+
")"
1158+
]
1159+
},
1160+
{
1161+
"cell_type": "code",
1162+
"execution_count": null,
1163+
"id": "d9b2010c",
1164+
"metadata": {},
1165+
"outputs": [],
1166+
"source": [
1167+
"plot_cluster_noise(\n",
1168+
" clustering_visualization_dataframe=java_package_clustering_features,\n",
1169+
" title=\"Java Package Clustering Noise - Noise points that bridge flow (color) and are poorly integrated (size)\",\n",
1170+
" size_column_name='inverseClusteringCoefficient',\n",
1171+
" color_column_name='betweenness'\n",
1172+
")"
1173+
]
1174+
},
1175+
{
1176+
"cell_type": "code",
1177+
"execution_count": null,
1178+
"id": "891d79b2",
1179+
"metadata": {},
1180+
"outputs": [],
1181+
"source": [
1182+
"plot_cluster_noise(\n",
1183+
" clustering_visualization_dataframe=java_package_clustering_features,\n",
1184+
" title=\"Java Package Clustering Noise - Noise points with role inversion (size), possibly violating layering or dependency direction (color)\",\n",
1185+
" size_column_name='pageToArticleRankDifference',\n",
1186+
" color_column_name='betweenness'\n",
1187+
")"
1188+
]
1189+
},
10671190
{
10681191
"cell_type": "markdown",
10691192
"id": "5682bb64",
@@ -1207,6 +1330,9 @@
12071330
" WHERE codeUnit.incomingDependencies IS NOT NULL\n",
12081331
" AND codeUnit.outgoingDependencies IS NOT NULL\n",
12091332
" AND codeUnit.centralityPageRank IS NOT NULL\n",
1333+
" AND codeUnit.centralityArticleRank IS NOT NULL\n",
1334+
" AND codeUnit.communityLocalClusteringCoefficient IS NOT NULL\n",
1335+
" AND codeUnit.centralityBetweenness IS NOT NULL\n",
12101336
" AND codeUnit.clusteringHDBSCANLabel IS NOT NULL\n",
12111337
" AND codeUnit.clusteringHDBSCANProbability IS NOT NULL\n",
12121338
" AND codeUnit.clusteringHDBSCANNoise IS NOT NULL\n",
@@ -1223,6 +1349,9 @@
12231349
" ,codeUnit.incomingDependencies AS incomingDependencies\n",
12241350
" ,codeUnit.outgoingDependencies AS outgoingDependencies\n",
12251351
" ,codeUnit.centralityPageRank AS pageRank\n",
1352+
" ,1.0 - codeUnit.communityLocalClusteringCoefficient AS inverseClusteringCoefficient\n",
1353+
" ,codeUnit.centralityPageRank - codeUnit.centralityArticleRank AS pageToArticleRankDifference\n",
1354+
" ,codeUnit.centralityBetweenness AS betweenness\n",
12261355
" ,codeUnit.clusteringHDBSCANLabel AS clusteringHDBSCANLabel\n",
12271356
" ,codeUnit.clusteringHDBSCANProbability AS clusteringHDBSCANProbability\n",
12281357
" ,codeUnit.clusteringHDBSCANNoise AS clusteringHDBSCANNoise\n",
@@ -1235,6 +1364,8 @@
12351364
"\"\"\"\n",
12361365
"\n",
12371366
"java_type_clustering_features = query_cypher_to_data_frame(java_type_clustering_query)\n",
1367+
"java_type_clustering_features['degree'] = java_type_clustering_features['incomingDependencies'] + java_type_clustering_features['outgoingDependencies']\n",
1368+
"\n",
12381369
"display(java_type_clustering_features.head(5))"
12391370
]
12401371
},
@@ -1298,6 +1429,51 @@
12981429
" title=\"Java Type Clustering Visualization\"\n",
12991430
")"
13001431
]
1432+
},
1433+
{
1434+
"cell_type": "code",
1435+
"execution_count": null,
1436+
"id": "d70ec20c",
1437+
"metadata": {},
1438+
"outputs": [],
1439+
"source": [
1440+
"plot_cluster_noise(\n",
1441+
" clustering_visualization_dataframe=java_type_clustering_features,\n",
1442+
" title=\"Java Type Clustering Noise - Noise points that are surprisingly central (color) or popular (size)\",\n",
1443+
" size_column_name='degree',\n",
1444+
" color_column_name='pageRank'\n",
1445+
")"
1446+
]
1447+
},
1448+
{
1449+
"cell_type": "code",
1450+
"execution_count": null,
1451+
"id": "e8d888be",
1452+
"metadata": {},
1453+
"outputs": [],
1454+
"source": [
1455+
"plot_cluster_noise(\n",
1456+
" clustering_visualization_dataframe=java_type_clustering_features,\n",
1457+
" title=\"Java Type Clustering Noise - Noise points that bridge flow (color) and are poorly integrated (size)\",\n",
1458+
" size_column_name='inverseClusteringCoefficient',\n",
1459+
" color_column_name='betweenness'\n",
1460+
")"
1461+
]
1462+
},
1463+
{
1464+
"cell_type": "code",
1465+
"execution_count": null,
1466+
"id": "c9921ad7",
1467+
"metadata": {},
1468+
"outputs": [],
1469+
"source": [
1470+
"plot_cluster_noise(\n",
1471+
" clustering_visualization_dataframe=java_type_clustering_features,\n",
1472+
" title=\"Java Type Clustering Noise - Noise points with role inversion (size), possibly violating layering or dependency direction (color)\",\n",
1473+
" size_column_name='pageToArticleRankDifference',\n",
1474+
" color_column_name='betweenness'\n",
1475+
")"
1476+
]
13011477
}
13021478
],
13031479
"metadata": {

0 commit comments

Comments
 (0)