|
700 | 700 | " if clustering_visualization_dataframe.empty:\n",
|
701 | 701 | " print(\"No projected data to plot available\")\n",
|
702 | 702 | " return\n",
|
703 |
| - "\n", |
| 703 | + " \n", |
| 704 | + " def truncate(text: str, max_length: int):\n", |
| 705 | + " if len(text) <= max_length:\n", |
| 706 | + " return text\n", |
| 707 | + " return text[:max_length - 3] + \"...\"\n", |
| 708 | + " \n", |
704 | 709 | " # Create figure and subplots\n",
|
705 | 710 | " plot.figure(figsize=(10, 10))\n",
|
706 | 711 | "\n",
|
|
760 | 765 | " medoids = cluster_nodes[cluster_nodes[cluster_medoid_column_name] == 1]\n",
|
761 | 766 | " for index, row in medoids.iterrows():\n",
|
762 | 767 | " plot.annotate(\n",
|
763 |
| - " text=f\"{row[code_unit_column_name]} ({row[cluster_label_column_name]})\",\n", |
| 768 | + " text=f\"{truncate(row[code_unit_column_name], 30)} ({row[cluster_label_column_name]})\",\n", |
764 | 769 | " xy=(row[x_position_column], row[y_position_column]),\n",
|
765 | 770 | " xytext=(5, 5), # Offset for better visibility\n",
|
766 | 771 | " **plot_annotation_style\n",
|
|
774 | 779 | " color='lightgrey',\n",
|
775 | 780 | " alpha=0.4,\n",
|
776 | 781 | " label=\"Noise\"\n",
|
777 |
| - " )\n", |
778 |
| - "\n", |
779 |
| - " # Legend not needed since the clusters are now annotated at their center (medoid)\n", |
780 |
| - " # legend = plot.legend(title=\"HDBSCAN Clusters\", loc=\"best\", prop={'size': 6}, ncols=2)\n", |
781 |
| - " # # Workaround to set all legend dots to the same size\n", |
782 |
| - " # for handle in legend.legend_handles:\n", |
783 |
| - " # handle.set_sizes([30])\n", |
784 |
| - " \n" |
785 |
| - ] |
786 |
| - }, |
787 |
| - { |
788 |
| - "cell_type": "code", |
789 |
| - "execution_count": null, |
790 |
| - "id": "a13271b4", |
791 |
| - "metadata": {}, |
792 |
| - "outputs": [], |
793 |
| - "source": [ |
794 |
| - "def plot_clusters_by_size_quartiles(\n", |
795 |
| - " clustering_visualization_dataframe: pd.DataFrame,\n", |
796 |
| - " title: str,\n", |
797 |
| - " main_color_map: str = \"tab20\",\n", |
798 |
| - " code_unit_column_name: str = \"shortCodeUnitName\",\n", |
799 |
| - " cluster_label_column_name: str = \"clusteringHDBSCANLabel\",\n", |
800 |
| - " cluster_medoid_column_name: str = \"clusteringHDBSCANMedoid\",\n", |
801 |
| - " centrality_column_name: str = \"pageRank\",\n", |
802 |
| - " x_position_column = 'embeddingVisualizationX',\n", |
803 |
| - " y_position_column = 'embeddingVisualizationY'\n", |
804 |
| - ") -> None:\n", |
805 |
| - " if clustering_visualization_dataframe.empty:\n", |
806 |
| - " print(\"No projected data to plot available\")\n", |
807 |
| - " return\n", |
808 |
| - "\n", |
809 |
| - " node_embeddings_noise_only = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column_name] == -1]\n", |
810 |
| - " # Exclude noise\n", |
811 |
| - " non_noise = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column_name] != -1]\n", |
812 |
| - " if non_noise.empty:\n", |
813 |
| - " print(\"No clusters to plot (all noise)\")\n", |
814 |
| - " return\n", |
815 |
| - "\n", |
816 |
| - " # Compute cluster sizes\n", |
817 |
| - " cluster_sizes = non_noise.groupby(cluster_label_column_name).size().sort_values(ascending=False)\n", |
818 |
| - " n_clusters = len(cluster_sizes)\n", |
819 |
| - " clusters = cluster_sizes.index.tolist()\n", |
820 |
| - "\n", |
821 |
| - " # Split clusters into 4 quartiles by size\n", |
822 |
| - " quartile_size = (n_clusters + 3) // 4 # ceil division\n", |
823 |
| - " quartiles = [clusters[i*quartile_size:(i+1)*quartile_size] for i in range(4)]\n", |
824 |
| - "\n", |
825 |
| - " figure, axes = plot.subplots(4, 1, figsize=(10, 40), squeeze=False)\n", |
826 |
| - " figure.suptitle(title, fontsize=14)\n", |
827 |
| - "\n", |
828 |
| - " for index, cluster_group in enumerate(quartiles):\n", |
829 |
| - " axis = axes[index][0]\n", |
830 |
| - " group_df = non_noise[non_noise[cluster_label_column_name].isin(cluster_group)]\n", |
831 |
| - " if group_df.empty:\n", |
832 |
| - " axis.set_title(f\"Quartile {index+1} (empty)\")\n", |
833 |
| - " continue\n", |
834 |
| - "\n", |
835 |
| - " unique_labels = group_df[cluster_label_column_name].unique()\n", |
836 |
| - " color_palette = seaborn.color_palette(main_color_map, len(unique_labels))\n", |
837 |
| - " cluster_to_color = dict(zip(unique_labels, color_palette))\n", |
838 |
| - " quartile_diameter = max_pairwise_distance(group_df[[x_position_column, y_position_column]].to_numpy())\n", |
839 |
| - "\n", |
840 |
| - " for cluster_label in unique_labels:\n", |
841 |
| - " cluster_nodes = group_df[group_df[cluster_label_column_name] == cluster_label]\n", |
842 |
| - " # By comparing the cluster diameter to the max diameter of all clusters in the quartile,\n", |
843 |
| - " # we can adjust the alpha value for the KDE plot to visualize smaller clusters more clearly.\n", |
844 |
| - " # This way, larger clusters will have a lower alpha value, making them less prominent and less prone to overshadow smaller clusters.\n", |
845 |
| - " cluster_diameter = max_pairwise_distance(cluster_nodes[[x_position_column, y_position_column]].to_numpy())\n", |
846 |
| - " alpha = max((1.0 - (cluster_diameter / quartile_diameter)) * 0.45 - 0.25, 0.02)\n", |
847 |
| - "\n", |
848 |
| - " if len(cluster_nodes) > 1 and cluster_diameter > 0:\n", |
849 |
| - " seaborn.kdeplot(\n", |
850 |
| - " x=cluster_nodes[x_position_column],\n", |
851 |
| - " y=cluster_nodes[y_position_column],\n", |
852 |
| - " fill=True,\n", |
853 |
| - " alpha=alpha,\n", |
854 |
| - " levels=2,\n", |
855 |
| - " color=cluster_to_color[cluster_label],\n", |
856 |
| - " ax=axis,\n", |
857 |
| - " warn_singular=False,\n", |
858 |
| - " )\n", |
859 |
| - " axis.scatter(\n", |
860 |
| - " x=cluster_nodes[x_position_column],\n", |
861 |
| - " y=cluster_nodes[y_position_column],\n", |
862 |
| - " s=cluster_nodes[centrality_column_name] * 300 + 2,\n", |
863 |
| - " color=cluster_to_color[cluster_label],\n", |
864 |
| - " alpha=0.9,\n", |
865 |
| - " label=f\"Cluster {cluster_label}\"\n", |
866 |
| - " )\n", |
867 |
| - "\n", |
868 |
| - " # Annotate medoids of the cluster\n", |
869 |
| - " medoids = cluster_nodes[cluster_nodes[cluster_medoid_column_name] == 1]\n", |
870 |
| - " for medoid_index, row in medoids.iterrows():\n", |
871 |
| - " axis.annotate(\n", |
872 |
| - " text=f\"{row[code_unit_column_name]} ({row[cluster_label_column_name]})\",\n", |
873 |
| - " xy=(row[x_position_column], row[y_position_column]),\n", |
874 |
| - " xytext=(5, 5), # Offset y position for better visibility\n", |
875 |
| - " **plot_annotation_style\n", |
876 |
| - " )\n", |
877 |
| - "\n", |
878 |
| - " # Plot noise points in gray\n", |
879 |
| - " axis.scatter(\n", |
880 |
| - " x=node_embeddings_noise_only[x_position_column],\n", |
881 |
| - " y=node_embeddings_noise_only[y_position_column],\n", |
882 |
| - " s=node_embeddings_noise_only[centrality_column_name] * 300 + 2,\n", |
883 |
| - " color='gainsboro',\n", |
884 |
| - " alpha=0.4,\n", |
885 |
| - " label=\"Noise\"\n", |
886 |
| - " )\n", |
887 |
| - "\n", |
888 |
| - " axis.set_title(f\"Quartile {index+1}: {len(cluster_group)} clusters\")\n", |
889 |
| - "\n", |
890 |
| - " # Legend not needed since the clusters are now annotated at their center (medoid)\n", |
891 |
| - " # legend = axis.legend(title=\"Cluster\", prop={'size': 6}, loc=\"best\", ncols=1)\n", |
892 |
| - " # # Workaround to set all legend dots to the same size\n", |
893 |
| - " # for handle in legend.legend_handles:\n", |
894 |
| - " # handle.set_sizes([50])\n", |
895 |
| - "\n", |
896 |
| - " plot.tight_layout(rect=(0, 0, 1, 0.98))\n" |
897 |
| - ] |
898 |
| - }, |
899 |
| - { |
900 |
| - "cell_type": "code", |
901 |
| - "execution_count": null, |
902 |
| - "id": "647e79e9", |
903 |
| - "metadata": {}, |
904 |
| - "outputs": [], |
905 |
| - "source": [ |
906 |
| - "def plot_clusters(\n", |
907 |
| - " clustering_visualization_dataframe: pd.DataFrame,\n", |
908 |
| - " title: str,\n", |
909 |
| - " main_color_map: str = \"tab20\",\n", |
910 |
| - " cluster_label_column_name: str = \"clusteringHDBSCANLabel\",\n", |
911 |
| - " centrality_column_name: str = \"pageRank\",\n", |
912 |
| - " x_position_column = 'embeddingVisualizationX',\n", |
913 |
| - " y_position_column = 'embeddingVisualizationY'\n", |
914 |
| - ") -> None:\n", |
915 |
| - " if clustering_visualization_dataframe.empty:\n", |
916 |
| - " print(\"No projected data to plot available\")\n", |
917 |
| - " return\n", |
918 |
| - " \n", |
919 |
| - " number_of_distinct_clusters = clustering_visualization_dataframe[cluster_label_column_name].nunique()\n", |
920 |
| - " if number_of_distinct_clusters > 30:\n", |
921 |
| - " plot_clusters_by_size_quartiles(\n", |
922 |
| - " clustering_visualization_dataframe=clustering_visualization_dataframe,\n", |
923 |
| - " title=title,\n", |
924 |
| - " main_color_map=main_color_map,\n", |
925 |
| - " cluster_label_column_name=cluster_label_column_name,\n", |
926 |
| - " centrality_column_name=centrality_column_name,\n", |
927 |
| - " x_position_column=x_position_column,\n", |
928 |
| - " y_position_column=y_position_column\n", |
929 |
| - " )\n", |
930 |
| - " else:\n", |
931 |
| - " plot_all_clusters(\n", |
932 |
| - " clustering_visualization_dataframe=clustering_visualization_dataframe,\n", |
933 |
| - " title=title,\n", |
934 |
| - " main_color_map=main_color_map,\n", |
935 |
| - " cluster_label_column_name=cluster_label_column_name,\n", |
936 |
| - " centrality_column_name=centrality_column_name,\n", |
937 |
| - " x_position_column=x_position_column,\n", |
938 |
| - " y_position_column=y_position_column\n", |
939 |
| - " )" |
| 782 | + " )" |
940 | 783 | ]
|
941 | 784 | },
|
942 | 785 | {
|
|
0 commit comments