|
553 | 553 | " WHERE codeUnit.incomingDependencies IS NOT NULL\n",
|
554 | 554 | " AND codeUnit.outgoingDependencies IS NOT NULL\n",
|
555 | 555 | " AND codeUnit.centralityPageRank IS NOT NULL\n",
|
| 556 | + " AND codeUnit.centralityArticleRank IS NOT NULL\n", |
| 557 | + " AND codeUnit.communityLocalClusteringCoefficient IS NOT NULL\n", |
| 558 | + " AND codeUnit.centralityBetweenness IS NOT NULL\n", |
556 | 559 | " AND codeUnit.clusteringHDBSCANLabel IS NOT NULL\n",
|
557 | 560 | " AND codeUnit.clusteringHDBSCANProbability IS NOT NULL\n",
|
558 | 561 | " AND codeUnit.clusteringHDBSCANNoise IS NOT NULL\n",
|
|
569 | 572 | " ,codeUnit.incomingDependencies AS incomingDependencies\n",
|
570 | 573 | " ,codeUnit.outgoingDependencies AS outgoingDependencies\n",
|
571 | 574 | " ,codeUnit.centralityPageRank AS pageRank\n",
|
| 575 | + " ,1.0 - codeUnit.communityLocalClusteringCoefficient AS inverseClusteringCoefficient\n", |
| 576 | + " ,codeUnit.centralityBetweenness AS betweenness\n", |
| 577 | + " ,codeUnit.centralityPageRank - codeUnit.centralityArticleRank AS pageToArticleRankDifference\n", |
572 | 578 | " ,codeUnit.clusteringHDBSCANLabel AS clusteringHDBSCANLabel\n",
|
573 | 579 | " ,codeUnit.clusteringHDBSCANProbability AS clusteringHDBSCANProbability\n",
|
574 | 580 | " ,codeUnit.clusteringHDBSCANNoise AS clusteringHDBSCANNoise\n",
|
|
581 | 587 | " \"\"\"\n",
|
582 | 588 | "\n",
|
583 | 589 | "java_package_clustering_features = query_cypher_to_data_frame(java_package_clustering_query)\n",
|
| 590 | + "java_package_clustering_features['degree'] = java_package_clustering_features['incomingDependencies'] + java_package_clustering_features['outgoingDependencies']\n", |
584 | 591 | "display(java_package_clustering_features.head(5))"
|
585 | 592 | ]
|
586 | 593 | },
|
|
860 | 867 | "\n",
|
861 | 868 | " # Annotate medoids of the cluster\n",
|
862 | 869 | " medoids = cluster_nodes[cluster_nodes[cluster_medoid_column_name] == 1]\n",
|
863 |
| - " for index, row in medoids.iterrows():\n", |
| 870 | + " for medoid_index, row in medoids.iterrows():\n", |
864 | 871 | " axis.annotate(\n",
|
865 | 872 | " text=f\"{row[code_unit_column_name]} ({row[cluster_label_column_name]})\",\n",
|
866 | 873 | " xy=(row[x_position_column], row[y_position_column]),\n",
|
|
1064 | 1071 | ")"
|
1065 | 1072 | ]
|
1066 | 1073 | },
|
| 1074 | + { |
| 1075 | + "cell_type": "code", |
| 1076 | + "execution_count": null, |
| 1077 | + "id": "c9580ddb", |
| 1078 | + "metadata": {}, |
| 1079 | + "outputs": [], |
| 1080 | + "source": [ |
| 1081 | + "def plot_cluster_noise(\n", |
| 1082 | + " clustering_visualization_dataframe: pd.DataFrame,\n", |
| 1083 | + " title: str,\n", |
| 1084 | + " main_color_map: str = \"bwr\",\n", |
| 1085 | + " code_unit_column_name: str = \"shortCodeUnitName\",\n", |
| 1086 | + " cluster_label_column_name: str = \"clusteringHDBSCANLabel\",\n", |
| 1087 | + " size_column_name: str = \"degree\",\n", |
| 1088 | + " color_column_name: str = \"pageRank\",\n", |
| 1089 | + " x_position_column = 'embeddingVisualizationX',\n", |
| 1090 | + " y_position_column = 'embeddingVisualizationY'\n", |
| 1091 | + ") -> None:\n", |
| 1092 | + " if clustering_visualization_dataframe.empty:\n", |
| 1093 | + " print(\"No projected data to plot available\")\n", |
| 1094 | + " return\n", |
| 1095 | + "\n", |
| 1096 | + " # Filter only noise points\n", |
| 1097 | + " noise_points = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column_name] == -1]\n", |
| 1098 | + " noise_points = noise_points.sort_values(by=size_column_name, ascending=False).reset_index(drop=True)\n", |
| 1099 | + "\n", |
| 1100 | + " if noise_points.empty:\n", |
| 1101 | + " print(\"No noise points to plot.\")\n", |
| 1102 | + " return\n", |
| 1103 | + "\n", |
| 1104 | + " plot.figure(figsize=(10, 10))\n", |
| 1105 | + " plot.title(title)\n", |
| 1106 | + "\n", |
| 1107 | + " # Determine the color threshold for noise points\n", |
| 1108 | + " color_10th_highest_value = noise_points[color_column_name].nlargest(10).iloc[-1] # Get the 10th largest value (or the smallest one if there are fewer than 10 noise points)\n", |
| 1109 | + " color_90_quantile = noise_points[color_column_name].quantile(0.90)\n", |
| 1110 | + " color_threshold = max(color_10th_highest_value, color_90_quantile)\n", |
| 1111 | + "\n", |
| 1112 | + " # Color values at or above the threshold (the larger of the 10th highest value and the 90% quantile) red, the rest light grey\n", |
| 1113 | + " colors = noise_points[color_column_name].apply(\n", |
| 1114 | + " lambda x: \"red\" if x >= color_threshold else \"lightgrey\"\n", |
| 1115 | + " )\n", |
| 1116 | + " normalized_size = noise_points[size_column_name] / noise_points[size_column_name].max()\n", |
| 1117 | + "\n", |
| 1118 | + " # Scatter plot for noise points\n", |
| 1119 | + " scatter = plot.scatter(\n", |
| 1120 | + " x=noise_points[x_position_column],\n", |
| 1121 | + " y=noise_points[y_position_column],\n", |
| 1122 | + " s=normalized_size.clip(lower=0.01) * 800 + 2,\n", |
| 1123 | + " c=colors,\n", |
| 1124 | + " alpha=0.6\n", |
| 1125 | + " )\n", |
| 1126 | + "\n", |
| 1127 | + " # Annotate the 10 largest points (by size column) and all points colored red with their names\n", |
| 1128 | + " for index, row in noise_points.iterrows():\n", |
| 1129 | + " index = typing.cast(int, index)\n", |
| 1130 | + " if colors[index] != 'red' and index >= 10:\n", |
| 1131 | + " continue\n", |
| 1132 | + " plot.annotate(\n", |
| 1133 | + " text=row[code_unit_column_name],\n", |
| 1134 | + " xy=(row[x_position_column], row[y_position_column]),\n", |
| 1135 | + " xytext=(5, 5 + (index % 2) * 20), # Offset for better visibility\n", |
| 1136 | + " **plot_annotation_style\n", |
| 1137 | + " )\n", |
| 1138 | + "\n", |
| 1139 | + " plot.xlabel(x_position_column)\n", |
| 1140 | + " plot.ylabel(y_position_column)\n", |
| 1141 | + " plot.tight_layout()\n", |
| 1142 | + " plot.show()" |
| 1143 | + ] |
| 1144 | + }, |
| 1145 | + { |
| 1146 | + "cell_type": "code", |
| 1147 | + "execution_count": null, |
| 1148 | + "id": "5c56606c", |
| 1149 | + "metadata": {}, |
| 1150 | + "outputs": [], |
| 1151 | + "source": [ |
| 1152 | + "plot_cluster_noise(\n", |
| 1153 | + " clustering_visualization_dataframe=java_package_clustering_features,\n", |
| 1154 | + " title=\"Java Package Clustering Noise - Noise points that are surprisingly central (color) or popular (size)\",\n", |
| 1155 | + " size_column_name='degree',\n", |
| 1156 | + " color_column_name='pageRank'\n", |
| 1157 | + ")" |
| 1158 | + ] |
| 1159 | + }, |
| 1160 | + { |
| 1161 | + "cell_type": "code", |
| 1162 | + "execution_count": null, |
| 1163 | + "id": "d9b2010c", |
| 1164 | + "metadata": {}, |
| 1165 | + "outputs": [], |
| 1166 | + "source": [ |
| 1167 | + "plot_cluster_noise(\n", |
| 1168 | + " clustering_visualization_dataframe=java_package_clustering_features,\n", |
| 1169 | + " title=\"Java Package Clustering Noise - Noise points that bridge flow (color) and are poorly integrated (size)\",\n", |
| 1170 | + " size_column_name='inverseClusteringCoefficient',\n", |
| 1171 | + " color_column_name='betweenness'\n", |
| 1172 | + ")" |
| 1173 | + ] |
| 1174 | + }, |
| 1175 | + { |
| 1176 | + "cell_type": "code", |
| 1177 | + "execution_count": null, |
| 1178 | + "id": "891d79b2", |
| 1179 | + "metadata": {}, |
| 1180 | + "outputs": [], |
| 1181 | + "source": [ |
| 1182 | + "plot_cluster_noise(\n", |
| 1183 | + " clustering_visualization_dataframe=java_package_clustering_features,\n", |
| 1184 | + " title=\"Java Package Clustering Noise - Noise points with role inversion (size), possibly violating layering or dependency direction (color)\",\n", |
| 1185 | + " size_column_name='pageToArticleRankDifference',\n", |
| 1186 | + " color_column_name='betweenness'\n", |
| 1187 | + ")" |
| 1188 | + ] |
| 1189 | + }, |
1067 | 1190 | {
|
1068 | 1191 | "cell_type": "markdown",
|
1069 | 1192 | "id": "5682bb64",
|
|
1207 | 1330 | " WHERE codeUnit.incomingDependencies IS NOT NULL\n",
|
1208 | 1331 | " AND codeUnit.outgoingDependencies IS NOT NULL\n",
|
1209 | 1332 | " AND codeUnit.centralityPageRank IS NOT NULL\n",
|
| 1333 | + " AND codeUnit.centralityArticleRank IS NOT NULL\n", |
| 1334 | + " AND codeUnit.communityLocalClusteringCoefficient IS NOT NULL\n", |
| 1335 | + " AND codeUnit.centralityBetweenness IS NOT NULL\n", |
1210 | 1336 | " AND codeUnit.clusteringHDBSCANLabel IS NOT NULL\n",
|
1211 | 1337 | " AND codeUnit.clusteringHDBSCANProbability IS NOT NULL\n",
|
1212 | 1338 | " AND codeUnit.clusteringHDBSCANNoise IS NOT NULL\n",
|
|
1223 | 1349 | " ,codeUnit.incomingDependencies AS incomingDependencies\n",
|
1224 | 1350 | " ,codeUnit.outgoingDependencies AS outgoingDependencies\n",
|
1225 | 1351 | " ,codeUnit.centralityPageRank AS pageRank\n",
|
| 1352 | + " ,1.0 - codeUnit.communityLocalClusteringCoefficient AS inverseClusteringCoefficient\n", |
| 1353 | + " ,codeUnit.centralityPageRank - codeUnit.centralityArticleRank AS pageToArticleRankDifference\n", |
| 1354 | + " ,codeUnit.centralityBetweenness AS betweenness\n", |
1226 | 1355 | " ,codeUnit.clusteringHDBSCANLabel AS clusteringHDBSCANLabel\n",
|
1227 | 1356 | " ,codeUnit.clusteringHDBSCANProbability AS clusteringHDBSCANProbability\n",
|
1228 | 1357 | " ,codeUnit.clusteringHDBSCANNoise AS clusteringHDBSCANNoise\n",
|
|
1235 | 1364 | "\"\"\"\n",
|
1236 | 1365 | "\n",
|
1237 | 1366 | "java_type_clustering_features = query_cypher_to_data_frame(java_type_clustering_query)\n",
|
| 1367 | + "java_type_clustering_features['degree'] = java_type_clustering_features['incomingDependencies'] + java_type_clustering_features['outgoingDependencies']\n", |
| 1368 | + "\n", |
1238 | 1369 | "display(java_type_clustering_features.head(5))"
|
1239 | 1370 | ]
|
1240 | 1371 | },
|
|
1298 | 1429 | " title=\"Java Type Clustering Visualization\"\n",
|
1299 | 1430 | ")"
|
1300 | 1431 | ]
|
| 1432 | + }, |
| 1433 | + { |
| 1434 | + "cell_type": "code", |
| 1435 | + "execution_count": null, |
| 1436 | + "id": "d70ec20c", |
| 1437 | + "metadata": {}, |
| 1438 | + "outputs": [], |
| 1439 | + "source": [ |
| 1440 | + "plot_cluster_noise(\n", |
| 1441 | + " clustering_visualization_dataframe=java_type_clustering_features,\n", |
| 1442 | + " title=\"Java Type Clustering Noise - Noise points that are surprisingly central (color) or popular (size)\",\n", |
| 1443 | + " size_column_name='degree',\n", |
| 1444 | + " color_column_name='pageRank'\n", |
| 1445 | + ")" |
| 1446 | + ] |
| 1447 | + }, |
| 1448 | + { |
| 1449 | + "cell_type": "code", |
| 1450 | + "execution_count": null, |
| 1451 | + "id": "e8d888be", |
| 1452 | + "metadata": {}, |
| 1453 | + "outputs": [], |
| 1454 | + "source": [ |
| 1455 | + "plot_cluster_noise(\n", |
| 1456 | + " clustering_visualization_dataframe=java_type_clustering_features,\n", |
| 1457 | + " title=\"Java Type Clustering Noise - Noise points that bridge flow (color) and are poorly integrated (size)\",\n", |
| 1458 | + " size_column_name='inverseClusteringCoefficient',\n", |
| 1459 | + " color_column_name='betweenness'\n", |
| 1460 | + ")" |
| 1461 | + ] |
| 1462 | + }, |
| 1463 | + { |
| 1464 | + "cell_type": "code", |
| 1465 | + "execution_count": null, |
| 1466 | + "id": "c9921ad7", |
| 1467 | + "metadata": {}, |
| 1468 | + "outputs": [], |
| 1469 | + "source": [ |
| 1470 | + "plot_cluster_noise(\n", |
| 1471 | + " clustering_visualization_dataframe=java_type_clustering_features,\n", |
| 1472 | + " title=\"Java Type Clustering Noise - Noise points with role inversion (size), possibly violating layering or dependency direction (color)\",\n", |
| 1473 | + " size_column_name='pageToArticleRankDifference',\n", |
| 1474 | + " color_column_name='betweenness'\n", |
| 1475 | + ")" |
| 1476 | + ] |
1301 | 1477 | }
|
1302 | 1478 | ],
|
1303 | 1479 | "metadata": {
|
|
0 commit comments