|
553 | 553 | " WHERE codeUnit.incomingDependencies IS NOT NULL\n",
|
554 | 554 | " AND codeUnit.outgoingDependencies IS NOT NULL\n",
|
555 | 555 | " AND codeUnit.centralityPageRank IS NOT NULL\n",
|
| 556 | + " AND codeUnit.centralityArticleRank IS NOT NULL\n", |
| 557 | + " AND codeUnit.communityLocalClusteringCoefficient IS NOT NULL\n", |
| 558 | + " AND codeUnit.centralityBetweenness IS NOT NULL\n", |
556 | 559 | " AND codeUnit.clusteringHDBSCANLabel IS NOT NULL\n",
|
557 | 560 | " AND codeUnit.clusteringHDBSCANProbability IS NOT NULL\n",
|
558 | 561 | " AND codeUnit.clusteringHDBSCANNoise IS NOT NULL\n",
|
|
566 | 569 | " ,codeUnit.incomingDependencies AS incomingDependencies\n",
|
567 | 570 | " ,codeUnit.outgoingDependencies AS outgoingDependencies\n",
|
568 | 571 | " ,codeUnit.centralityPageRank AS pageRank\n",
|
| 572 | + " ,1.0 - codeUnit.communityLocalClusteringCoefficient AS inverseClusteringCoefficient\n", |
| 573 | + " ,codeUnit.centralityBetweenness AS betweenness\n", |
| 574 | + " ,codeUnit.centralityPageRank - codeUnit.centralityArticleRank AS pageToArticleRankDifference\n", |
569 | 575 | " ,codeUnit.clusteringHDBSCANLabel AS clusteringHDBSCANLabel\n",
|
570 | 576 | " ,codeUnit.clusteringHDBSCANProbability AS clusteringHDBSCANProbability\n",
|
571 | 577 | " ,codeUnit.clusteringHDBSCANNoise AS clusteringHDBSCANNoise\n",
|
572 | 578 | " ,codeUnit.clusteringHDBSCANMedoid AS clusteringHDBSCANMedoid\n",
|
573 | 579 | " ,codeUnit.embeddingFastRandomProjectionVisualizationX AS embeddingVisualizationX\n",
|
574 |
| - " ,codeUnit.embeddingFastRandomProjectionVisualizationY AS embeddingVisualizationY\n", |
575 |
| - "\"\"\"\n", |
| 580 | + " ,codeUnit.embeddingFastRandomProjectionVisualizationY AS embeddingVisualizationY\"\"\"\n", |
576 | 581 | "\n",
|
577 | 582 | "java_package_clustering_features = query_cypher_to_data_frame(java_package_clustering_query)\n",
|
| 583 | + "java_package_clustering_features['degree'] = java_package_clustering_features['incomingDependencies'] + java_package_clustering_features['outgoingDependencies']\n", |
578 | 584 | "display(java_package_clustering_features.head(5))"
|
579 | 585 | ]
|
580 | 586 | },
|
|
783 | 789 | "\n",
|
784 | 790 | " # Annotate medoids of the cluster\n",
|
785 | 791 | " medoids = cluster_nodes[cluster_nodes[cluster_medoid_column_name] == 1]\n",
|
786 |
| - " for index, row in medoids.iterrows():\n", |
| 792 | + " for medoid_index, row in medoids.iterrows():\n", |
787 | 793 | " axis.annotate(\n",
|
788 | 794 | " text=f\"{row[code_unit_column_name]} ({row[cluster_label_column_name]})\",\n",
|
789 | 795 | " xy=(row[x_position_column], row[y_position_column]),\n",
|
|
868 | 874 | ")"
|
869 | 875 | ]
|
870 | 876 | },
|
| 877 | + { |
| 878 | + "cell_type": "code", |
| 879 | + "execution_count": null, |
| 880 | + "id": "c9580ddb", |
| 881 | + "metadata": {}, |
| 882 | + "outputs": [], |
| 883 | + "source": [ |
| 884 | + "def plot_cluster_noise(\n", |
| 885 | + " clustering_visualization_dataframe: pd.DataFrame,\n", |
| 886 | + " title: str,\n", |
| 887 | + " main_color_map: str = \"bwr\",\n", |
| 888 | + " code_unit_column_name: str = \"shortCodeUnitName\",\n", |
| 889 | + " cluster_label_column_name: str = \"clusteringHDBSCANLabel\",\n", |
| 890 | + " size_column_name: str = \"degree\",\n", |
| 891 | + " color_column_name: str = \"pageRank\",\n", |
| 892 | + " x_position_column = 'embeddingVisualizationX',\n", |
| 893 | + " y_position_column = 'embeddingVisualizationY'\n", |
| 894 | + ") -> None:\n", |
| 895 | + " if clustering_visualization_dataframe.empty:\n", |
| 896 | + " print(\"No projected data to plot available\")\n", |
| 897 | + " return\n", |
| 898 | + "\n", |
| 899 | + " # Filter only noise points\n", |
| 900 | + " noise_points = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column_name] == -1]\n", |
| 901 | + " noise_points = noise_points.sort_values(by=size_column_name, ascending=False).reset_index(drop=True)\n", |
| 902 | + "\n", |
| 903 | + " if noise_points.empty:\n", |
| 904 | + " print(\"No noise points to plot.\")\n", |
| 905 | + " return\n", |
| 906 | + "\n", |
| 907 | + " plot.figure(figsize=(10, 10))\n", |
| 908 | + " plot.title(title)\n", |
| 909 | + "\n", |
| 910 | + " # Determine the color threshold for noise points\n", |
| 911 | + " color_10th_highest_value = noise_points[color_column_name].nlargest(10).iloc[-1] # Get the 10th largest value (the smallest one if there are fewer than 10 noise points)\n", |
| 912 | + " color_90_quantile = noise_points[color_column_name].quantile(0.90)\n", |
| 913 | + " color_threshold = max(color_10th_highest_value, color_90_quantile)\n", |
| 914 | + "\n", |
| 915 | + " # Color values at or above the threshold (the larger of the 10th highest value and the 90% quantile) red, the rest light grey\n", |
| 916 | + " colors = noise_points[color_column_name].apply(\n", |
| 917 | + " lambda x: \"red\" if x >= color_threshold else \"lightgrey\"\n", |
| 918 | + " )\n", |
| 919 | + " normalized_size = noise_points[size_column_name] / noise_points[size_column_name].max()\n", |
| 920 | + "\n", |
| 921 | + " # Scatter plot for noise points\n", |
| 922 | + " scatter = plot.scatter(\n", |
| 923 | + " x=noise_points[x_position_column],\n", |
| 924 | + " y=noise_points[y_position_column],\n", |
| 925 | + " s=normalized_size.clip(lower=0.01) * 800 + 2,\n", |
| 926 | + " c=colors,\n", |
| 927 | + " alpha=0.6\n", |
| 928 | + " )\n", |
| 929 | + "\n", |
| 930 | + " # Annotate the largest 10 points and all colored ones with their names\n", |
| 931 | + " for index, row in noise_points.iterrows():\n", |
| 932 | + " index = typing.cast(int, index)\n", |
| 933 | + " if colors[index] != 'red' and index >= 10:\n", |
| 934 | + " continue\n", |
| 935 | + " plot.annotate(\n", |
| 936 | + " text=row[code_unit_column_name],\n", |
| 937 | + " xy=(row[x_position_column], row[y_position_column]),\n", |
| 938 | + " xytext=(5, 5 + (index % 2) * 20), # Offset for better visibility\n", |
| 939 | + " **plot_annotation_style\n", |
| 940 | + " )\n", |
| 941 | + "\n", |
| 942 | + " plot.xlabel(x_position_column)\n", |
| 943 | + " plot.ylabel(y_position_column)\n", |
| 944 | + " plot.tight_layout()\n", |
| 945 | + " plot.show()" |
| 946 | + ] |
| 947 | + }, |
| 948 | + { |
| 949 | + "cell_type": "code", |
| 950 | + "execution_count": null, |
| 951 | + "id": "5c56606c", |
| 952 | + "metadata": {}, |
| 953 | + "outputs": [], |
| 954 | + "source": [ |
| 955 | + "plot_cluster_noise(\n", |
| 956 | + " clustering_visualization_dataframe=java_package_clustering_features,\n", |
| 957 | + " title=\"Java Package Clustering Noise - Noise points that are surprisingly central (color) or popular (size)\",\n", |
| 958 | + " size_column_name='degree',\n", |
| 959 | + " color_column_name='pageRank'\n", |
| 960 | + ")" |
| 961 | + ] |
| 962 | + }, |
| 963 | + { |
| 964 | + "cell_type": "code", |
| 965 | + "execution_count": null, |
| 966 | + "id": "d9b2010c", |
| 967 | + "metadata": {}, |
| 968 | + "outputs": [], |
| 969 | + "source": [ |
| 970 | + "plot_cluster_noise(\n", |
| 971 | + " clustering_visualization_dataframe=java_package_clustering_features,\n", |
| 972 | + " title=\"Java Package Clustering Noise - Noise points that bridge flow (color) and are poorly integrated (size)\",\n", |
| 973 | + " size_column_name='inverseClusteringCoefficient',\n", |
| 974 | + " color_column_name='betweenness'\n", |
| 975 | + ")" |
| 976 | + ] |
| 977 | + }, |
| 978 | + { |
| 979 | + "cell_type": "code", |
| 980 | + "execution_count": null, |
| 981 | + "id": "891d79b2", |
| 982 | + "metadata": {}, |
| 983 | + "outputs": [], |
| 984 | + "source": [ |
| 985 | + "plot_cluster_noise(\n", |
| 986 | + " clustering_visualization_dataframe=java_package_clustering_features,\n", |
| 987 | + " title=\"Java Package Clustering Noise - Noise points with role inversion (size), possibly violating layering or dependency direction (color)\",\n", |
| 988 | + " size_column_name='pageToArticleRankDifference',\n", |
| 989 | + " color_column_name='betweenness'\n", |
| 990 | + ")" |
| 991 | + ] |
| 992 | + }, |
871 | 993 | {
|
872 | 994 | "cell_type": "markdown",
|
873 | 995 | "id": "5682bb64",
|
|
1011 | 1133 | " WHERE codeUnit.incomingDependencies IS NOT NULL\n",
|
1012 | 1134 | " AND codeUnit.outgoingDependencies IS NOT NULL\n",
|
1013 | 1135 | " AND codeUnit.centralityPageRank IS NOT NULL\n",
|
| 1136 | + " AND codeUnit.centralityArticleRank IS NOT NULL\n", |
| 1137 | + " AND codeUnit.communityLocalClusteringCoefficient IS NOT NULL\n", |
| 1138 | + " AND codeUnit.centralityBetweenness IS NOT NULL\n", |
1014 | 1139 | " AND codeUnit.clusteringHDBSCANLabel IS NOT NULL\n",
|
1015 | 1140 | " AND codeUnit.clusteringHDBSCANProbability IS NOT NULL\n",
|
1016 | 1141 | " AND codeUnit.clusteringHDBSCANNoise IS NOT NULL\n",
|
|
1024 | 1149 | " ,codeUnit.incomingDependencies AS incomingDependencies\n",
|
1025 | 1150 | " ,codeUnit.outgoingDependencies AS outgoingDependencies\n",
|
1026 | 1151 | " ,codeUnit.centralityPageRank AS pageRank\n",
|
| 1152 | + " ,1.0 - codeUnit.communityLocalClusteringCoefficient AS inverseClusteringCoefficient\n", |
| 1153 | + " ,codeUnit.centralityPageRank - codeUnit.centralityArticleRank AS pageToArticleRankDifference\n", |
| 1154 | + " ,codeUnit.centralityBetweenness AS betweenness\n", |
1027 | 1155 | " ,codeUnit.clusteringHDBSCANLabel AS clusteringHDBSCANLabel\n",
|
1028 | 1156 | " ,codeUnit.clusteringHDBSCANProbability AS clusteringHDBSCANProbability\n",
|
1029 | 1157 | " ,codeUnit.clusteringHDBSCANNoise AS clusteringHDBSCANNoise\n",
|
|
1033 | 1161 | "\"\"\"\n",
|
1034 | 1162 | "\n",
|
1035 | 1163 | "java_type_clustering_features = query_cypher_to_data_frame(java_type_clustering_query)\n",
|
| 1164 | + "java_type_clustering_features['degree'] = java_type_clustering_features['incomingDependencies'] + java_type_clustering_features['outgoingDependencies']\n", |
| 1165 | + "\n", |
1036 | 1166 | "display(java_type_clustering_features.head(5))"
|
1037 | 1167 | ]
|
1038 | 1168 | },
|
|
1048 | 1178 | " title=\"Java Type Clustering Visualization\"\n",
|
1049 | 1179 | ")"
|
1050 | 1180 | ]
|
| 1181 | + }, |
| 1182 | + { |
| 1183 | + "cell_type": "code", |
| 1184 | + "execution_count": null, |
| 1185 | + "id": "d70ec20c", |
| 1186 | + "metadata": {}, |
| 1187 | + "outputs": [], |
| 1188 | + "source": [ |
| 1189 | + "plot_cluster_noise(\n", |
| 1190 | + " clustering_visualization_dataframe=java_type_clustering_features,\n", |
| 1191 | + " title=\"Java Type Clustering Noise - Noise points that are surprisingly central (color) or popular (size)\",\n", |
| 1192 | + " size_column_name='degree',\n", |
| 1193 | + " color_column_name='pageRank'\n", |
| 1194 | + ")" |
| 1195 | + ] |
| 1196 | + }, |
| 1197 | + { |
| 1198 | + "cell_type": "code", |
| 1199 | + "execution_count": null, |
| 1200 | + "id": "e8d888be", |
| 1201 | + "metadata": {}, |
| 1202 | + "outputs": [], |
| 1203 | + "source": [ |
| 1204 | + "plot_cluster_noise(\n", |
| 1205 | + " clustering_visualization_dataframe=java_type_clustering_features,\n", |
| 1206 | + " title=\"Java Type Clustering Noise - Noise points that bridge flow (color) and are poorly integrated (size)\",\n", |
| 1207 | + " size_column_name='inverseClusteringCoefficient',\n", |
| 1208 | + " color_column_name='betweenness'\n", |
| 1209 | + ")" |
| 1210 | + ] |
| 1211 | + }, |
| 1212 | + { |
| 1213 | + "cell_type": "code", |
| 1214 | + "execution_count": null, |
| 1215 | + "id": "c9921ad7", |
| 1216 | + "metadata": {}, |
| 1217 | + "outputs": [], |
| 1218 | + "source": [ |
| 1219 | + "plot_cluster_noise(\n", |
| 1220 | + " clustering_visualization_dataframe=java_type_clustering_features,\n", |
| 1221 | + " title=\"Java Type Clustering Noise - Noise points with role inversion (size), possibly violating layering or dependency direction (color)\",\n", |
| 1222 | + " size_column_name='pageToArticleRankDifference',\n", |
| 1223 | + " color_column_name='betweenness'\n", |
| 1224 | + ")" |
| 1225 | + ] |
1051 | 1226 | }
|
1052 | 1227 | ],
|
1053 | 1228 | "metadata": {
|
|
0 commit comments