4
4
# This is useful for understanding code structure, detecting modular boundaries, and identifying anomalies or outliers in large software systems without requiring manual labeling.
5
5
# It takes the code structure as a graph in Neo4j and generates node embeddings using Fast Random Projection (FastRP).
6
6
# These embeddings capture structural similarity and are clustered using HDBSCAN to assign labels or detect noise.
7
- # For visualization, the embeddings are reduced to 2D using t-SNE.
8
7
# All results — including embeddings, cluster labels, and 2D coordinates — are written back to Neo4j for further use.
9
8
10
9
# Prerequisite:
25
24
26
25
from neo4j import GraphDatabase , Driver
27
26
28
- from openTSNE .sklearn import TSNE
29
-
30
- from sklearn .base import BaseEstimator
27
+ # from sklearn.base import BaseEstimator # Extend sklearn's BaseEstimator to enable e.g. GridSearchCV for hyperparameter tuning.
31
28
from sklearn .metrics import adjusted_rand_score , adjusted_mutual_info_score , normalized_mutual_info_score
32
29
from sklearn .cluster import HDBSCAN # type: ignore
33
30
38
35
39
36
40
37
class Parameters :
41
- required_parameters_ = ["projection_name" , "projection_node_label" , "projection_weight_property" , "community_property" ]
38
+ required_parameters_ = ["projection_name" , "projection_node_label" , "projection_weight_property" , "community_property" , "embedding_property" ]
42
39
43
40
def __init__ (self , input_parameters : typing .Dict [str , str ], verbose : bool = False ):
44
41
self .query_parameters_ = input_parameters .copy () # copy enforces immutability
@@ -63,9 +60,6 @@ def log_dependency_versions_() -> None:
63
60
from sklearn import __version__ as sklearn_version
64
61
print ('scikit-learn version: {}' .format (sklearn_version ))
65
62
66
- from openTSNE import __version__ as openTSNE_version
67
- print ('openTSNE version: {}' .format (openTSNE_version ))
68
-
69
63
from neo4j import __version__ as neo4j_version
70
64
print ('neo4j version: {}' .format (neo4j_version ))
71
65
@@ -116,6 +110,9 @@ def get_projection_name(self) -> str:
116
110
def get_projection_node_label (self ) -> str :
117
111
return self .query_parameters_ ["projection_node_label" ]
118
112
113
+ def get_embedding_property (self ) -> str :
114
+ return self .query_parameters_ ["embedding_property" ]
115
+
119
116
def is_verbose (self ) -> bool :
120
117
return self .verbose_
121
118
@@ -513,7 +510,8 @@ def __init__(self,
513
510
forth_iteration_weight : float = 1.0 ,
514
511
):
515
512
self .parameters_ = parameters
516
- self .verbose = parameters .is_verbose ()
513
+ self .verbose_ = parameters .is_verbose ()
514
+ self .write_property_ = parameters .get_embedding_property ()
517
515
518
516
self .embedding_dimension = embedding_dimension
519
517
self .random_seed = random_seed
@@ -526,15 +524,15 @@ def __to_algorithm_parameters(self) -> typing.Dict['str', 'str']:
526
524
"normalization_strength" : str (self .normalization_strength ),
527
525
"forth_iteration_weight" : str (self .forth_iteration_weight ),
528
526
"embedding_random_seed" : str (self .random_seed ),
529
- "write_property" : "embeddingsFastRandomProjectionForClustering" ,
527
+ "write_property" : str ( self . write_property_ ) ,
530
528
** self .parameters_ .get_query_parameters ()
531
529
}
532
530
533
531
def __run_algorithm (self ) -> pd .DataFrame :
534
532
algorithm_parameters = self .__to_algorithm_parameters ()
535
533
# For Debugging:
536
534
# print("Generating embeddings using Neo4j Graph Data Science with the following parameters: " + str(algorithm_parameters))
537
- if self .verbose :
535
+ if self .verbose_ :
538
536
return query_cypher_to_data_frame (self .cypher_query_for_generating_embeddings_ , parameters = algorithm_parameters )
539
537
540
538
return query_cypher_to_data_frame_suppress_warnings (self .cypher_query_for_generating_embeddings_ , parameters = algorithm_parameters )
@@ -568,12 +566,12 @@ def write_embeddings(self) -> typing.Self:
568
566
This is useful for further processing or analysis of the embeddings.
569
567
"""
570
568
algorithm_parameters = self .__to_algorithm_parameters ()
571
- if self .verbose :
569
+ if self .verbose_ :
572
570
print ("" )
573
571
print ("Writing embeddings to Neo4j with the following parameters: " + str (algorithm_parameters ))
574
572
print ("" )
575
573
576
- if self .verbose :
574
+ if self .verbose_ :
577
575
query_cypher_to_data_frame (self .cypher_query_for_writing_embeddings_ , parameters = algorithm_parameters )
578
576
else :
579
577
query_cypher_to_data_frame_suppress_warnings (self .cypher_query_for_writing_embeddings_ , parameters = algorithm_parameters )
@@ -633,63 +631,27 @@ def objective(trial):
633
631
return TuneableFastRandomProjectionNodeEmbeddings (parameters , ** study .best_params ).fit ()
634
632
635
633
636
- def prepare_node_embeddings_for_2d_visualization (embeddings : pd .DataFrame ) -> pd .DataFrame :
637
- """
638
- Reduces the dimensionality of the node embeddings (e.g. 64 floating point numbers in an array)
639
- to two dimensions for 2D visualization.
640
- see https://opentsne.readthedocs.io
641
- """
642
-
643
- if embeddings .empty :
644
- print ("No projected data for node embeddings dimensionality reduction available" )
645
- return embeddings
646
-
647
- # Calling the fit_transform method just with a list doesn't work.
648
- # It leads to an error with the following message: 'list' object has no attribute 'shape'
649
- # This can be solved by converting the list to a numpy array using np.array(..).
650
- # See https://bobbyhadz.com/blog/python-attributeerror-list-object-has-no-attribute-shape
651
- embeddings_as_numpy_array = np .array (embeddings .embedding .to_list ())
652
-
653
- # Use t-distributed Stochastic Neighbor Embedding (t-SNE) to reduce the dimensionality
654
- # of the previously calculated node embeddings to 2 dimensions for visualization
655
- t_distributed_stochastic_neighbor_embedding = TSNE (n_components = 2 , verbose = False , random_state = 47 )
656
- two_dimension_node_embeddings = t_distributed_stochastic_neighbor_embedding .fit_transform (embeddings_as_numpy_array )
657
- # display(two_dimension_node_embeddings.shape) # Display the shape of the t-SNE result
658
-
659
- # Create a new DataFrame with the results of the 2 dimensional node embeddings
660
- # and the code unit and artifact name of the query above as preparation for the plot
661
- embeddings ['embeddingVisualizationX' ] = [value [0 ] for value in two_dimension_node_embeddings ]
662
- embeddings ['embeddingVisualizationY' ] = [value [1 ] for value in two_dimension_node_embeddings ]
663
-
664
- return embeddings
665
-
666
-
667
- def execute_tuned_node_embeddings_clustering (parameters : Parameters ) -> None :
668
- tuned_fast_random_projection = get_tuned_fast_random_projection_node_embeddings (parameters )
669
- embeddings = tuned_fast_random_projection .get_embeddings ()
670
- clustering_results = coordinate_tuned_hierarchical_density_based_spatial_clustering (embeddings )
671
- if parameters .is_verbose ():
672
- print ("HDBSCAN clustered labels by their size descending (top 10):" , clustering_results .clustering_results_distribution .head (10 ))
673
- print ("HDBSCAN clustered labels by their probability descending (top 10):" , clustering_results .clustering_results_distribution .sort_values (by = 'probability' , ascending = False ).head (10 ))
674
-
675
- embeddings = prepare_node_embeddings_for_2d_visualization (clustering_results .embeddings )
676
-
677
- tuned_fast_random_projection .write_embeddings ()
678
- data_to_write = pd .DataFrame (data = {
679
- 'nodeElementId' : embeddings ["nodeElementId" ],
680
- 'clusteringHDBSCANLabel' : embeddings ['clusteringTunedHDBSCANLabel' ],
681
- 'clusteringHDBSCANProbability' : embeddings ['clusteringTunedHDBSCANProbability' ],
682
- 'clusteringHDBSCANNoise' : (embeddings ['clusteringTunedHDBSCANLabel' ] == - 1 ).astype (int ),
683
- 'embeddingFastRandomProjectionVisualizationX' : embeddings ["embeddingVisualizationX" ],
684
- 'embeddingFastRandomProjectionVisualizationY' : embeddings ["embeddingVisualizationY" ],
685
- })
686
- write_batch_data_into_database (data_to_write , parameters .get_projection_node_label ())
687
-
688
634
# ------------------------------------------------------------------------------------------------------------
689
635
# MAIN
690
636
# ------------------------------------------------------------------------------------------------------------
691
637
692
638
693
639
parameters = parse_input_parameters ()
694
640
driver = get_graph_database_driver ()
695
- execute_tuned_node_embeddings_clustering (parameters )
641
+
642
+ tuned_fast_random_projection = get_tuned_fast_random_projection_node_embeddings (parameters )
643
+ embeddings = tuned_fast_random_projection .get_embeddings ()
644
+
645
+ clustering_results = coordinate_tuned_hierarchical_density_based_spatial_clustering (embeddings )
646
+ if parameters .is_verbose ():
647
+ print ("HDBSCAN clustered labels by their size descending (top 10):" , clustering_results .clustering_results_distribution .head (10 ))
648
+ print ("HDBSCAN clustered labels by their probability descending (top 10):" , clustering_results .clustering_results_distribution .sort_values (by = 'probability' , ascending = False ).head (10 ))
649
+
650
+ tuned_fast_random_projection .write_embeddings ()
651
+ data_to_write = pd .DataFrame (data = {
652
+ 'nodeElementId' : embeddings ["nodeElementId" ],
653
+ 'clusteringHDBSCANLabel' : embeddings ['clusteringTunedHDBSCANLabel' ],
654
+ 'clusteringHDBSCANProbability' : embeddings ['clusteringTunedHDBSCANProbability' ],
655
+ 'clusteringHDBSCANNoise' : (embeddings ['clusteringTunedHDBSCANLabel' ] == - 1 ).astype (int ),
656
+ })
657
+ write_batch_data_into_database (data_to_write , parameters .get_projection_node_label ())
0 commit comments