|
556 | 556 | " AND codeUnit.clusteringHDBSCANLabel IS NOT NULL\n",
|
557 | 557 | " AND codeUnit.clusteringHDBSCANProbability IS NOT NULL\n",
|
558 | 558 | " AND codeUnit.clusteringHDBSCANNoise IS NOT NULL\n",
|
| 559 | + " AND codeUnit.clusteringHDBSCANMedoid IS NOT NULL\n", |
559 | 560 | " AND codeUnit.embeddingFastRandomProjectionVisualizationX IS NOT NULL\n",
|
560 | 561 | " AND codeUnit.embeddingFastRandomProjectionVisualizationY IS NOT NULL\n",
|
561 | 562 | " RETURN DISTINCT \n",
|
|
568 | 569 | " ,codeUnit.clusteringHDBSCANLabel AS clusteringHDBSCANLabel\n",
|
569 | 570 | " ,codeUnit.clusteringHDBSCANProbability AS clusteringHDBSCANProbability\n",
|
570 | 571 | " ,codeUnit.clusteringHDBSCANNoise AS clusteringHDBSCANNoise\n",
|
| 572 | + " ,codeUnit.clusteringHDBSCANMedoid AS clusteringHDBSCANMedoid\n", |
571 | 573 | " ,codeUnit.embeddingFastRandomProjectionVisualizationX AS embeddingVisualizationX\n",
|
572 | 574 | " ,codeUnit.embeddingFastRandomProjectionVisualizationY AS embeddingVisualizationY\n",
|
573 | 575 | "\"\"\"\n",
|
|
604 | 606 | " clustering_visualization_dataframe: pd.DataFrame,\n",
|
605 | 607 | " title: str,\n",
|
606 | 608 | " main_color_map: str = \"tab20\",\n",
|
| 609 | + " code_unit_column_name: str = \"shortCodeUnitName\",\n", |
607 | 610 | " cluster_label_column_name: str = \"clusteringHDBSCANLabel\",\n",
|
| 611 | + " cluster_medoid_column_name: str = \"clusteringHDBSCANMedoid\",\n", |
608 | 612 | " centrality_column_name: str = \"pageRank\",\n",
|
609 | 613 | " x_position_column = 'embeddingVisualizationX',\n",
|
610 | 614 | " y_position_column = 'embeddingVisualizationY'\n",
|
|
641 | 645 | " # we can adjust the alpha value for the KDE plot to visualize smaller clusters more clearly.\n",
|
642 | 646 | " # This way, larger clusters will have a lower alpha value, making them less prominent and less prone to overshadow smaller clusters.\n",
|
643 | 647 | " cluster_diameter = max_pairwise_distance(cluster_nodes[[x_position_column, y_position_column]].to_numpy())\n",
|
644 |
| - " alpha = max((1.0 - (cluster_diameter / max_diameter)) * 0.45 - 0.25, 0.001)\n", |
| 648 | + " alpha = max((1.0 - (cluster_diameter / max_diameter)) * 0.45 - 0.25, 0.02)\n", |
645 | 649 | "\n",
|
646 | 650 | " # KDE cloud shape\n",
|
647 | 651 | " if len(cluster_nodes) > 1 and (\n",
|
|
668 | 672 | " label=f\"Cluster {cluster_label}\"\n",
|
669 | 673 | " )\n",
|
670 | 674 | "\n",
|
| 675 | + " # Annotate medoids of the cluster\n", |
| 676 | + " medoids = cluster_nodes[cluster_nodes[cluster_medoid_column_name] == 1]\n", |
| 677 | + " for index, row in medoids.iterrows():\n", |
| 678 | + " plot.annotate(\n", |
| 679 | + " text=f\"{row[code_unit_column_name]} ({row[cluster_label_column_name]})\",\n", |
| 680 | + " xy=(row[x_position_column], row[y_position_column]),\n", |
| 681 | + " xytext=(5, 5), # Offset for better visibility\n", |
| 682 | + " **plot_annotation_style\n", |
| 683 | + " )\n", |
| 684 | + "\n", |
671 | 685 | " # Plot noise points in gray\n",
|
672 | 686 | " plot.scatter(\n",
|
673 | 687 | " x=node_embeddings_noise_only[x_position_column],\n",
|
|
678 | 692 | " label=\"Noise\"\n",
|
679 | 693 | " )\n",
|
680 | 694 | "\n",
|
681 |
| - " legend = plot.legend(title=\"HDBSCAN Clusters\", loc=\"best\", prop={'size': 6}, ncols=2)\n", |
682 |
| - " # Workaround to set all legend dots to the same size\n", |
683 |
| - " for handle in legend.legend_handles:\n", |
684 |
| - " handle.set_sizes([30])\n", |
| 695 | + " # Legend not needed since the clusters are now annotated at their center (medoid)\n", |
| 696 | + " # legend = plot.legend(title=\"HDBSCAN Clusters\", loc=\"best\", prop={'size': 6}, ncols=2)\n", |
| 697 | + " # # Workaround to set all legend dots to the same size\n", |
| 698 | + " # for handle in legend.legend_handles:\n", |
| 699 | + " # handle.set_sizes([30])\n", |
685 | 700 | " \n"
|
686 | 701 | ]
|
687 | 702 | },
|
|
696 | 711 | " clustering_visualization_dataframe: pd.DataFrame,\n",
|
697 | 712 | " title: str,\n",
|
698 | 713 | " main_color_map: str = \"tab20\",\n",
|
| 714 | + " code_unit_column_name: str = \"shortCodeUnitName\",\n", |
699 | 715 | " cluster_label_column_name: str = \"clusteringHDBSCANLabel\",\n",
|
| 716 | + " cluster_medoid_column_name: str = \"clusteringHDBSCANMedoid\",\n", |
700 | 717 | " centrality_column_name: str = \"pageRank\",\n",
|
701 | 718 | " x_position_column = 'embeddingVisualizationX',\n",
|
702 | 719 | " y_position_column = 'embeddingVisualizationY'\n",
|
|
721 | 738 | " quartile_size = (n_clusters + 3) // 4 # ceil division\n",
|
722 | 739 | " quartiles = [clusters[i*quartile_size:(i+1)*quartile_size] for i in range(4)]\n",
|
723 | 740 | "\n",
|
724 |
| - " figure, axes = plot.subplots(4, 1, figsize=(10, 32), squeeze=False)\n", |
| 741 | + " figure, axes = plot.subplots(4, 1, figsize=(10, 40), squeeze=False)\n", |
725 | 742 | " figure.suptitle(title, fontsize=14)\n",
|
726 | 743 | "\n",
|
727 | 744 | " for index, cluster_group in enumerate(quartiles):\n",
|
|
742 | 759 | " # we can adjust the alpha value for the KDE plot to visualize smaller clusters more clearly.\n",
|
743 | 760 | " # This way, larger clusters will have a lower alpha value, making them less prominent and less prone to overshadow smaller clusters.\n",
|
744 | 761 | " cluster_diameter = max_pairwise_distance(cluster_nodes[[x_position_column, y_position_column]].to_numpy())\n",
|
745 |
| - " alpha = max((1.0 - (cluster_diameter / quartile_diameter)) * 0.45 - 0.25, 0.001)\n", |
| 762 | + " alpha = max((1.0 - (cluster_diameter / quartile_diameter)) * 0.45 - 0.25, 0.02)\n", |
746 | 763 | "\n",
|
747 | 764 | " if len(cluster_nodes) > 1 and cluster_diameter > 0:\n",
|
748 | 765 | " seaborn.kdeplot(\n",
|
|
764 | 781 | " label=f\"Cluster {cluster_label}\"\n",
|
765 | 782 | " )\n",
|
766 | 783 | "\n",
|
767 |
| - " # Plot noise points in gray\n", |
| 784 | + " # Annotate medoids of the cluster\n", |
| 785 | + " medoids = cluster_nodes[cluster_nodes[cluster_medoid_column_name] == 1]\n", |
| 786 | + " for index, row in medoids.iterrows():\n", |
| 787 | + " axis.annotate(\n", |
| 788 | + " text=f\"{row[code_unit_column_name]} ({row[cluster_label_column_name]})\",\n", |
| 789 | + " xy=(row[x_position_column], row[y_position_column]),\n", |
| 790 | + " xytext=(5, 5), # Offset y position for better visibility\n", |
| 791 | + " **plot_annotation_style\n", |
| 792 | + " )\n", |
| 793 | + "\n", |
| 794 | + " # Plot noise points in gray\n", |
768 | 795 | " axis.scatter(\n",
|
769 | 796 | " x=node_embeddings_noise_only[x_position_column],\n",
|
770 | 797 | " y=node_embeddings_noise_only[y_position_column],\n",
|
|
775 | 802 | " )\n",
|
776 | 803 | "\n",
|
777 | 804 | " axis.set_title(f\"Quartile {index+1}: {len(cluster_group)} clusters\")\n",
|
778 |
| - " legend = axis.legend(title=\"Cluster\", prop={'size': 6}, loc=\"best\", ncols=1)\n", |
779 |
| - " # Workaround to set all legend dots to the same size\n", |
780 |
| - " for handle in legend.legend_handles:\n", |
781 |
| - " handle.set_sizes([50])\n", |
782 | 805 | "\n",
|
783 |
| - " plot.tight_layout(rect=(0, 0, 1, 0.99))\n" |
| 806 | + " # Legend not needed since the clusters are now annotated at their center (medoid)\n", |
| 807 | + " # legend = axis.legend(title=\"Cluster\", prop={'size': 6}, loc=\"best\", ncols=1)\n", |
| 808 | + " # # Workaround to set all legend dots to the same size\n", |
| 809 | + " # for handle in legend.legend_handles:\n", |
| 810 | + " # handle.set_sizes([50])\n", |
| 811 | + "\n", |
| 812 | + " plot.tight_layout(rect=(0, 0, 1, 0.98))\n" |
784 | 813 | ]
|
785 | 814 | },
|
786 | 815 | {
|
|
804 | 833 | " return\n",
|
805 | 834 | " \n",
|
806 | 835 | " number_of_distinct_clusters = clustering_visualization_dataframe[cluster_label_column_name].nunique()\n",
|
807 |
| - " print(f\"Number of distinct clusters: {number_of_distinct_clusters}\")\n", |
808 | 836 | " if number_of_distinct_clusters > 30:\n",
|
809 | 837 | " plot_clusters_by_size_quartiles(\n",
|
810 | 838 | " clustering_visualization_dataframe=clustering_visualization_dataframe,\n",
|
|
986 | 1014 | " AND codeUnit.clusteringHDBSCANLabel IS NOT NULL\n",
|
987 | 1015 | " AND codeUnit.clusteringHDBSCANProbability IS NOT NULL\n",
|
988 | 1016 | " AND codeUnit.clusteringHDBSCANNoise IS NOT NULL\n",
|
| 1017 | + " AND codeUnit.clusteringHDBSCANMedoid IS NOT NULL\n", |
989 | 1018 | " AND codeUnit.embeddingFastRandomProjectionVisualizationX IS NOT NULL\n",
|
990 | 1019 | " AND codeUnit.embeddingFastRandomProjectionVisualizationY IS NOT NULL\n",
|
991 | 1020 | " RETURN DISTINCT \n",
|
|
998 | 1027 | " ,codeUnit.clusteringHDBSCANLabel AS clusteringHDBSCANLabel\n",
|
999 | 1028 | " ,codeUnit.clusteringHDBSCANProbability AS clusteringHDBSCANProbability\n",
|
1000 | 1029 | " ,codeUnit.clusteringHDBSCANNoise AS clusteringHDBSCANNoise\n",
|
| 1030 | + " ,codeUnit.clusteringHDBSCANMedoid AS clusteringHDBSCANMedoid\n", |
1001 | 1031 | " ,codeUnit.embeddingFastRandomProjectionVisualizationX AS embeddingVisualizationX\n",
|
1002 | 1032 | " ,codeUnit.embeddingFastRandomProjectionVisualizationY AS embeddingVisualizationY\n",
|
1003 | 1033 | "\"\"\"\n",
|
|
0 commit comments