Plot cluster noise and mark different anomalies

JohT · JohT · commit d79ffe56cc2c · 2025-07-10T08:36:27.000+02:00
diff --git a/domains/anomaly-detection/explore/AnomalyDetectionExploration.ipynb b/domains/anomaly-detection/explore/AnomalyDetectionExploration.ipynb
@@ -553,6 +553,9 @@
     "    WHERE codeUnit.incomingDependencies                        IS NOT NULL\n",
     "      AND codeUnit.outgoingDependencies                        IS NOT NULL\n",
     "      AND codeUnit.centralityPageRank                          IS NOT NULL\n",
+    "      AND codeUnit.centralityArticleRank                       IS NOT NULL\n",
+    "      AND codeUnit.communityLocalClusteringCoefficient         IS NOT NULL\n",
+    "      AND codeUnit.centralityBetweenness                       IS NOT NULL\n",
     "      AND codeUnit.clusteringHDBSCANLabel                      IS NOT NULL\n",
     "      AND codeUnit.clusteringHDBSCANProbability                IS NOT NULL\n",
     "      AND codeUnit.clusteringHDBSCANNoise                      IS NOT NULL\n",
@@ -566,15 +569,18 @@
     "        ,codeUnit.incomingDependencies                        AS incomingDependencies\n",
     "        ,codeUnit.outgoingDependencies                        AS outgoingDependencies\n",
     "        ,codeUnit.centralityPageRank                          AS pageRank\n",
+    "        ,1.0 - codeUnit.communityLocalClusteringCoefficient   AS inverseClusteringCoefficient\n",
+    "        ,codeUnit.centralityBetweenness                       AS betweenness\n",
+    "        ,codeUnit.centralityPageRank - codeUnit.centralityArticleRank AS pageToArticleRankDifference\n",
     "        ,codeUnit.clusteringHDBSCANLabel                      AS clusteringHDBSCANLabel\n",
     "        ,codeUnit.clusteringHDBSCANProbability                AS clusteringHDBSCANProbability\n",
     "        ,codeUnit.clusteringHDBSCANNoise                      AS clusteringHDBSCANNoise\n",
     "        ,codeUnit.clusteringHDBSCANMedoid                     AS clusteringHDBSCANMedoid\n",
     "        ,codeUnit.embeddingFastRandomProjectionVisualizationX AS embeddingVisualizationX\n",
-    "        ,codeUnit.embeddingFastRandomProjectionVisualizationY AS embeddingVisualizationY\n",
-    "\"\"\"\n",
+    "        ,codeUnit.embeddingFastRandomProjectionVisualizationY AS embeddingVisualizationY\"\"\"\n",
     "\n",
     "java_package_clustering_features = query_cypher_to_data_frame(java_package_clustering_query)\n",
+    "java_package_clustering_features['degree'] = java_package_clustering_features['incomingDependencies'] + java_package_clustering_features['outgoingDependencies']\n",
     "display(java_package_clustering_features.head(5))"
    ]
   },
@@ -783,7 +789,7 @@
     "\n",
     "            # Annotate medoids of the cluster\n",
     "            medoids = cluster_nodes[cluster_nodes[cluster_medoid_column_name] == 1]\n",
-    "            for index, row in medoids.iterrows():\n",
+    "            for medoid_index, row in medoids.iterrows():\n",
     "                axis.annotate(\n",
     "                    text=f\"{row[code_unit_column_name]} ({row[cluster_label_column_name]})\",\n",
     "                    xy=(row[x_position_column], row[y_position_column]),\n",
@@ -868,6 +874,122 @@
     ")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c9580ddb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def plot_cluster_noise(\n",
+    "    clustering_visualization_dataframe: pd.DataFrame,\n",
+    "    title: str,\n",
+    "    main_color_map: str = \"bwr\",\n",
+    "    code_unit_column_name: str = \"shortCodeUnitName\",\n",
+    "    cluster_label_column_name: str = \"clusteringHDBSCANLabel\",\n",
+    "    size_column_name: str = \"degree\",\n",
+    "    color_column_name: str = \"pageRank\",\n",
+    "    x_position_column: str = 'embeddingVisualizationX',\n",
+    "    y_position_column: str = 'embeddingVisualizationY'\n",
+    ") -> None:\n",
+    "    \"\"\"\n",
+    "    Plot only the noise points (cluster label -1) of a clustering result as a scatter plot.\n",
+    "\n",
+    "    The marker size reflects `size_column_name` relative to its largest value among the noise points.\n",
+    "    Points whose `color_column_name` value reaches the highlight threshold are drawn red, all others light grey.\n",
+    "    The highlight threshold is the larger of the 10th highest value and the 90% quantile.\n",
+    "    The 10 largest points by size and all red points are annotated with `code_unit_column_name`.\n",
+    "\n",
+    "    Prints a message and returns without plotting if there is no data or there are no noise points.\n",
+    "    `main_color_map` is currently not used.\n",
+    "    \"\"\"\n",
+    "    if clustering_visualization_dataframe.empty:\n",
+    "        print(\"No projected data to plot available\")\n",
+    "        return\n",
+    "\n",
+    "    # Keep only noise points, largest first, so that the row position doubles as the size rank\n",
+    "    is_noise = clustering_visualization_dataframe[cluster_label_column_name] == -1\n",
+    "    noise_points = clustering_visualization_dataframe[is_noise]\n",
+    "    noise_points = noise_points.sort_values(by=size_column_name, ascending=False).reset_index(drop=True)\n",
+    "\n",
+    "    if noise_points.empty:\n",
+    "        print(\"No noise points to plot.\")\n",
+    "        return\n",
+    "\n",
+    "    plot.figure(figsize=(10, 10))\n",
+    "    plot.title(title)\n",
+    "\n",
+    "    # Highlight a point only if it reaches both the 10th highest value and the 90% quantile (larger of both)\n",
+    "    color_values = noise_points[color_column_name]\n",
+    "    color_10th_highest_value = color_values.nlargest(10).iloc[-1]  # Lowest of the (up to) 10 largest values\n",
+    "    color_90_quantile = color_values.quantile(0.90)\n",
+    "    color_threshold = max(color_10th_highest_value, color_90_quantile)\n",
+    "\n",
+    "    is_highlighted = color_values >= color_threshold\n",
+    "    colors = is_highlighted.map({True: \"red\", False: \"lightgrey\"})\n",
+    "\n",
+    "    # Scale sizes relative to the largest value. Without a positive maximum (all values zero or negative)\n",
+    "    # the division would yield NaN/infinite sizes or invert the order, so every point gets the minimum size.\n",
+    "    size_values = noise_points[size_column_name]\n",
+    "    largest_size_value = size_values.max()\n",
+    "    if largest_size_value > 0:\n",
+    "        normalized_size = size_values / largest_size_value\n",
+    "    else:\n",
+    "        normalized_size = pd.Series(0.0, index=size_values.index)\n",
+    "\n",
+    "    # Scatter plot for noise points\n",
+    "    plot.scatter(\n",
+    "        x=noise_points[x_position_column],\n",
+    "        y=noise_points[y_position_column],\n",
+    "        s=normalized_size.clip(lower=0.01) * 800 + 2,\n",
+    "        c=colors,\n",
+    "        alpha=0.6\n",
+    "    )\n",
+    "\n",
+    "    # Annotate the largest 10 points and all highlighted ones with their names\n",
+    "    for position, (_, row) in enumerate(noise_points.iterrows()):\n",
+    "        if position >= 10 and not is_highlighted.iloc[position]:\n",
+    "            continue\n",
+    "        plot.annotate(\n",
+    "            text=row[code_unit_column_name],\n",
+    "            xy=(row[x_position_column], row[y_position_column]),\n",
+    "            xytext=(5, 5 + (position % 2) * 20),  # Alternate the vertical offset to reduce label overlap\n",
+    "            **plot_annotation_style\n",
+    "        )\n",
+    "\n",
+    "    plot.xlabel(x_position_column)\n",
+    "    plot.ylabel(y_position_column)\n",
+    "    plot.tight_layout()\n",
+    "    plot.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5c56606c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Java package noise points: size by degree, color by PageRank\n",
+    "plot_cluster_noise(\n",
+    "    title=\"Java Package Clustering Noise - Noise points that are surprisingly central (color) or popular (size)\",\n",
+    "    clustering_visualization_dataframe=java_package_clustering_features,\n",
+    "    color_column_name=\"pageRank\",\n",
+    "    size_column_name=\"degree\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d9b2010c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Java package noise points: size by inverse clustering coefficient, color by betweenness\n",
+    "plot_cluster_noise(\n",
+    "    title=\"Java Package Clustering Noise - Noise points that bridge flow (color) and are poorly integrated (size)\",\n",
+    "    clustering_visualization_dataframe=java_package_clustering_features,\n",
+    "    color_column_name=\"betweenness\",\n",
+    "    size_column_name=\"inverseClusteringCoefficient\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "891d79b2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Java package noise points: size by PageRank minus ArticleRank, color by betweenness\n",
+    "plot_cluster_noise(\n",
+    "    title=\"Java Package Clustering Noise - Noise points with role inversion (size), possibly violating layering or dependency direction (color)\",\n",
+    "    clustering_visualization_dataframe=java_package_clustering_features,\n",
+    "    color_column_name=\"betweenness\",\n",
+    "    size_column_name=\"pageToArticleRankDifference\",\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "5682bb64",
@@ -1011,6 +1133,9 @@
     "    WHERE codeUnit.incomingDependencies                        IS NOT NULL\n",
     "      AND codeUnit.outgoingDependencies                        IS NOT NULL\n",
     "      AND codeUnit.centralityPageRank                          IS NOT NULL\n",
+    "      AND codeUnit.centralityArticleRank                       IS NOT NULL\n",
+    "      AND codeUnit.communityLocalClusteringCoefficient         IS NOT NULL\n",
+    "      AND codeUnit.centralityBetweenness                       IS NOT NULL\n",
     "      AND codeUnit.clusteringHDBSCANLabel                      IS NOT NULL\n",
     "      AND codeUnit.clusteringHDBSCANProbability                IS NOT NULL\n",
     "      AND codeUnit.clusteringHDBSCANNoise                      IS NOT NULL\n",
@@ -1024,6 +1149,9 @@
     "        ,codeUnit.incomingDependencies                        AS incomingDependencies\n",
     "        ,codeUnit.outgoingDependencies                        AS outgoingDependencies\n",
     "        ,codeUnit.centralityPageRank                          AS pageRank\n",
+    "        ,1.0 - codeUnit.communityLocalClusteringCoefficient   AS inverseClusteringCoefficient\n",
+    "        ,codeUnit.centralityPageRank - codeUnit.centralityArticleRank AS pageToArticleRankDifference\n",
+    "        ,codeUnit.centralityBetweenness                       AS betweenness\n",
     "        ,codeUnit.clusteringHDBSCANLabel                      AS clusteringHDBSCANLabel\n",
     "        ,codeUnit.clusteringHDBSCANProbability                AS clusteringHDBSCANProbability\n",
     "        ,codeUnit.clusteringHDBSCANNoise                      AS clusteringHDBSCANNoise\n",
@@ -1033,6 +1161,8 @@
     "\"\"\"\n",
     "\n",
     "java_type_clustering_features = query_cypher_to_data_frame(java_type_clustering_query)\n",
+    "java_type_clustering_features['degree'] = java_type_clustering_features['incomingDependencies'] + java_type_clustering_features['outgoingDependencies']\n",
+    "\n",
     "display(java_type_clustering_features.head(5))"
    ]
   },
@@ -1048,6 +1178,51 @@
     "    title=\"Java Type Clustering Visualization\"\n",
     ")"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d70ec20c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Java type noise points: size by degree, color by PageRank\n",
+    "plot_cluster_noise(\n",
+    "    title=\"Java Type Clustering Noise - Noise points that are surprisingly central (color) or popular (size)\",\n",
+    "    clustering_visualization_dataframe=java_type_clustering_features,\n",
+    "    color_column_name=\"pageRank\",\n",
+    "    size_column_name=\"degree\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e8d888be",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Java type noise points: size by inverse clustering coefficient, color by betweenness\n",
+    "plot_cluster_noise(\n",
+    "    title=\"Java Type Clustering Noise - Noise points that bridge flow (color) and are poorly integrated (size)\",\n",
+    "    clustering_visualization_dataframe=java_type_clustering_features,\n",
+    "    color_column_name=\"betweenness\",\n",
+    "    size_column_name=\"inverseClusteringCoefficient\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c9921ad7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Java type noise points: size by PageRank minus ArticleRank, color by betweenness\n",
+    "plot_cluster_noise(\n",
+    "    title=\"Java Type Clustering Noise - Noise points with role inversion (size), possibly violating layering or dependency direction (color)\",\n",
+    "    clustering_visualization_dataframe=java_type_clustering_features,\n",
+    "    color_column_name=\"betweenness\",\n",
+    "    size_column_name=\"pageToArticleRankDifference\",\n",
+    ")"
+   ]
   }
  ],
  "metadata": {