Add Hierarchical Density-Based Spatial Clustering (HDBSCAN) Community Detection #376

Open · wants to merge 2 commits into base: main
Community_Detection/Community_Detection_11a_HDBSCAN_Estimate.cypher
@@ -0,0 +1,26 @@
// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - Estimate

CALL gds.hdbscan.write.estimate(
$dependencies_projection + '-cleaned', {
nodeProperty: $dependencies_projection_node_embeddings_property,
writeProperty: $dependencies_projection_write_property,
samples: 3
})
YIELD requiredMemory
,nodeCount
,relationshipCount
,bytesMin
,bytesMax
,heapPercentageMin
,heapPercentageMax
,treeView
,mapView
RETURN requiredMemory
,nodeCount
,relationshipCount
,bytesMin
,bytesMax
,heapPercentageMin
,heapPercentageMax
,treeView
//,mapView // doesn't work on Windows with Git Bash (jq version jq-1.7-dirty)
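
Note: the estimate variant only calculates the expected memory footprint of the algorithm on the given projection without executing HDBSCAN itself, so it can safely be run before the mutate and write steps below.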
Community_Detection/Community_Detection_11b_HDBSCAN_Statistics.cypher
@@ -0,0 +1,9 @@
// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - Statistics

CALL gds.hdbscan.stats(
$dependencies_projection + '-cleaned', {
nodeProperty: $dependencies_projection_node_embeddings_property,
samples: 3
})
YIELD nodeCount, numberOfClusters, numberOfNoisePoints, preProcessingMillis, computeMillis, postProcessingMillis
RETURN nodeCount, numberOfClusters, numberOfNoisePoints, preProcessingMillis, computeMillis, postProcessingMillis
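
Note: numberOfNoisePoints counts the nodes that HDBSCAN leaves unassigned; unlike partitioning algorithms such as Leiden, HDBSCAN explicitly labels low-density points as noise instead of forcing them into a cluster.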
Community_Detection/Community_Detection_11c_HDBSCAN_Mutate.cypher
@@ -0,0 +1,10 @@
// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - Mutate

CALL gds.hdbscan.mutate(
$dependencies_projection + '-cleaned', {
nodeProperty: $dependencies_projection_node_embeddings_property,
mutateProperty: $dependencies_projection_write_property,
samples: 3
})
YIELD nodeCount, numberOfClusters, numberOfNoisePoints, nodePropertiesWritten, preProcessingMillis, computeMillis, postProcessingMillis, mutateMillis
RETURN nodeCount, numberOfClusters, numberOfNoisePoints, nodePropertiesWritten, preProcessingMillis, computeMillis, postProcessingMillis, mutateMillis
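
Note: the mutate variant stores the cluster labels only in the in-memory projection; CommunityCsv.sh below later persists them to the database via Dependencies_9_Write_Mutated.cypher.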
@@ -0,0 +1,20 @@
// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - Stream

CALL gds.hdbscan.stream(
$dependencies_projection + '-cleaned', {
nodeProperty: $dependencies_projection_node_embeddings_property,
samples: 3
})
YIELD nodeId, label
WITH gds.util.asNode(nodeId) AS member
,label
WITH member
,coalesce(member.fqn, member.fileName, member.name) AS memberName
,label
WITH count(DISTINCT member) AS memberCount
,collect(DISTINCT memberName) AS memberNames
,label
RETURN memberCount
,label
,memberNames
ORDER BY memberCount DESC, label ASC
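
Note: the stream variant emits one row per node (nodeId, label); the query above aggregates these rows so that each CSV line describes one detected community with its member count and collected member names.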
@@ -0,0 +1,25 @@
// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - write node property e.g. communityHdbscanLabel

CALL gds.hdbscan.write(
$dependencies_projection + '-cleaned', {
nodeProperty: $dependencies_projection_node_embeddings_property,
writeProperty: $dependencies_projection_write_property,
samples: 3
})
// Samples = 3 turned out to be needed for
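// Note (assumption): in GDS, `samples` controls the number of neighbors used to compute core distances, comparable to `min_samples` in the HDBSCAN literature; larger values make the clustering more conservative.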
YIELD nodeCount
,numberOfClusters
,numberOfNoisePoints
,preProcessingMillis
,computeMillis
,writeMillis
,postProcessingMillis
,nodePropertiesWritten
RETURN nodeCount
,numberOfClusters
,numberOfNoisePoints
,preProcessingMillis
,computeMillis
,writeMillis
,postProcessingMillis
,nodePropertiesWritten
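
Note: in contrast to the mutate-then-write pipeline used in CommunityCsv.sh below, this write variant persists the labels directly to the database in a single call.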
77 changes: 73 additions & 4 deletions scripts/reports/CommunityCsv.sh
@@ -242,6 +242,72 @@ detectCommunitiesWithKCoreDecomposition() {
calculateCommunityMetrics "${@}" "${writePropertyName}"
}

# Node Embeddings using Fast Random Projection
#
# Required Parameters:
# - dependencies_projection=...
# Name prefix for the in-memory projection name for dependencies. Example: "package"
# - dependencies_projection_node=...
# Label of the nodes that will be used for the projection. Example: "Package"
# - dependencies_projection_weight_property=...
# Name of the node property that contains the dependency weight. Example: "weight"
# - dependencies_projection_node_embeddings_property=...
# Name of the node property that will contain the node embeddings. Example: "embeddingsFastRandomProjectionForHDBSCAN"
nodeEmbeddingsWithFastRandomProjectionForHDBSCAN() {
local embeddingProperty
embeddingProperty=$( extractQueryParameter "dependencies_projection_node_embeddings_property" "${@}")

local NODE_EMBEDDINGS_CYPHER_DIR="${CYPHER_DIR}/Node_Embeddings"
local mutatePropertyName="dependencies_projection_write_property=${embeddingProperty}"
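# A dimension of 2 is presumably chosen because density-based clustering such as HDBSCAN loses distance contrast in high-dimensional embedding spaces.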
local embeddingsDimension="dependencies_projection_embedding_dimension=2"

# Run the algorithm and write the result into the in-memory projection ("mutate")
execute_cypher "${NODE_EMBEDDINGS_CYPHER_DIR}/Node_Embeddings_1c_Fast_Random_Projection_Mutate.cypher" "${@}" "${mutatePropertyName}" ${embeddingsDimension}
}
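
# For orientation, a minimal sketch of what the referenced Cypher file
# (Node_Embeddings_1c_Fast_Random_Projection_Mutate.cypher) is assumed to contain.
# This is an assumption based on the parameter conventions above, not the actual repository file:
#
#   CALL gds.fastRP.mutate($dependencies_projection + '-cleaned', {
#       embeddingDimension: toInteger($dependencies_projection_embedding_dimension),
#       relationshipWeightProperty: $dependencies_projection_weight_property,
#       mutateProperty: $dependencies_projection_write_property
#   })
#   YIELD nodeCount, nodePropertiesWritten
#   RETURN nodeCount, nodePropertiesWritten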

# Community Detection using Hierarchical Density-Based Spatial Clustering (HDBSCAN) Algorithm
#
# Required Parameters:
# - dependencies_projection=...
# Name prefix for the in-memory projection name for dependencies. Example: "package"
# - dependencies_projection_node=...
# Label of the nodes that will be used for the projection. Example: "Package"
# - dependencies_projection_weight_property=...
# Name of the node property that contains the dependency weight. Example: "weight"
#
# Special Requirements:
# - This algorithm needs a node property with an array of floats to compute clusters.
# One possible way is to use node embeddings for that (like FastRP).
detectCommunitiesWithHDBSCAN() {
local COMMUNITY_DETECTION_CYPHER_DIR="${CYPHER_DIR}/Community_Detection"
local PROJECTION_CYPHER_DIR="${CYPHER_DIR}/Dependencies_Projection"

local writePropertyName="dependencies_projection_write_property=communityFastRpHdbscanLabel"
local writeLabelName="dependencies_projection_write_label=HDBSCAN"
local embeddingProperty="dependencies_projection_node_embeddings_property=embeddingsFastRandomProjection2dHDBSCAN"

nodeEmbeddingsWithFastRandomProjectionForHDBSCAN "${@}" ${embeddingProperty}

# Estimate memory requirements and calculate statistics
execute_cypher "${COMMUNITY_DETECTION_CYPHER_DIR}/Community_Detection_11a_HDBSCAN_Estimate.cypher" "${@}" ${embeddingProperty} "${writePropertyName}"
execute_cypher "${COMMUNITY_DETECTION_CYPHER_DIR}/Community_Detection_11b_HDBSCAN_Statistics.cypher" "${@}" ${embeddingProperty}

# Run the algorithm and write the result into the in-memory projection ("mutate")
execute_cypher "${COMMUNITY_DETECTION_CYPHER_DIR}/Community_Detection_11c_HDBSCAN_Mutate.cypher" "${@}" ${embeddingProperty} "${writePropertyName}"

# Stream to CSV
local nodeLabel
nodeLabel=$( extractQueryParameter "dependencies_projection_node" "${@}")
execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_8_Stream_Mutated_Grouped.cypher" "${@}" "${writePropertyName}" > "${FULL_REPORT_DIRECTORY}/${nodeLabel}_Communities_HDBSCAN.csv"

# Update Graph (node properties and labels) using the already mutated property projection
execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_9_Write_Mutated.cypher" "${@}" "${writePropertyName}"
execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_10_Delete_Label.cypher" "${@}" "${writePropertyName}" "${writeLabelName}"
execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_11_Add_Label.cypher" "${@}" "${writePropertyName}" "${writeLabelName}"

calculateCommunityMetrics "${@}" "${writePropertyName}"
}

# Community Detection using the Approximate Maximum k-cut Algorithm
#
# Required Parameters:
@@ -402,6 +468,7 @@ detectCommunities() {
time detectCommunitiesWithKCoreDecomposition "${@}"
time detectCommunitiesWithApproximateMaximumKCut "${@}"
time calculateLocalClusteringCoefficient "${@}"

compareCommunityDetectionResults "${@}"
listAllResults "${@}"
}
@@ -415,7 +482,7 @@ ARTIFACT_GAMMA="dependencies_leiden_gamma=1.11" # default = 1.00
ARTIFACT_KCUT="dependencies_maxkcut=5" # default = 2

if createUndirectedDependencyProjection "${ARTIFACT_PROJECTION}" "${ARTIFACT_NODE}" "${ARTIFACT_WEIGHT}"; then
detectCommunities "${ARTIFACT_PROJECTION}" "${ARTIFACT_NODE}" "${ARTIFACT_WEIGHT}" "${ARTIFACT_GAMMA}" "${ARTIFACT_KCUT}"
detectCommunities "${ARTIFACT_PROJECTION}" "${ARTIFACT_NODE}" "${ARTIFACT_WEIGHT}" "${ARTIFACT_GAMMA}" "${ARTIFACT_KCUT}" # "${ARTIFACT_NODE_EMBEDDINGS}"
writeLeidenModularity "${ARTIFACT_PROJECTION}" "${ARTIFACT_NODE}" "${ARTIFACT_WEIGHT}"
fi

@@ -430,7 +497,9 @@ PACKAGE_KCUT="dependencies_maxkcut=20" # default = 2
if createUndirectedDependencyProjection "${PACKAGE_PROJECTION}" "${PACKAGE_NODE}" "${PACKAGE_WEIGHT}"; then
detectCommunities "${PACKAGE_PROJECTION}" "${PACKAGE_NODE}" "${PACKAGE_WEIGHT}" "${PACKAGE_GAMMA}" "${PACKAGE_KCUT}"
writeLeidenModularity "${PACKAGE_PROJECTION}" "${PACKAGE_NODE}" "${PACKAGE_WEIGHT}"


detectCommunitiesWithHDBSCAN "${PACKAGE_PROJECTION}" "${PACKAGE_NODE}" "${PACKAGE_WEIGHT}"

# Package Community Detection - Special CSV Queries after update
execute_cypher "${CYPHER_DIR}/Community_Detection/Which_package_community_spans_several_artifacts_and_how_are_the_packages_distributed.cypher" > "${FULL_REPORT_DIRECTORY}/Package_Communities_Leiden_That_Span_Multiple_Artifacts.csv"
fi
@@ -444,8 +513,8 @@ TYPE_GAMMA="dependencies_leiden_gamma=5.00" # default = 1.00
TYPE_KCUT="dependencies_maxkcut=100" # default = 2

if createUndirectedJavaTypeDependencyProjection "${TYPE_PROJECTION}"; then
detectCommunities "${TYPE_PROJECTION}" "${TYPE_NODE}" "${TYPE_WEIGHT}" "${TYPE_GAMMA}" "${TYPE_KCUT}"

detectCommunities "${TYPE_PROJECTION}" "${TYPE_NODE}" "${TYPE_WEIGHT}" "${TYPE_GAMMA}" "${TYPE_KCUT}" "${TYPE_NODE_EMBEDDINGS}"
detectCommunitiesWithHDBSCAN "${TYPE_PROJECTION}" "${TYPE_NODE}" "${TYPE_WEIGHT}"
# Type Community Detection - Special CSV Queries after update
execute_cypher "${CYPHER_DIR}/Community_Detection/Which_type_community_spans_several_artifacts_and_how_are_the_types_distributed.cypher" > "${FULL_REPORT_DIRECTORY}/Type_Communities_Leiden_That_Span_Multiple_Artifacts.csv"
execute_cypher "${CYPHER_DIR}/Community_Detection/Type_communities_with_few_members_in_foreign_packages.cypher" > "${FULL_REPORT_DIRECTORY}/Type_communities_with_few_members_in_foreign_packages.csv"
2 changes: 0 additions & 2 deletions scripts/reports/compilations/CsvReports.sh
@@ -17,8 +17,6 @@ LOG_GROUP_END=${LOG_GROUP_END:-"::endgroup::"} # Prefix to end a log group. Defa
# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
# This way non-standard tools like readlink aren't needed.
REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )}
echo "CsvReports: REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR}"

REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR:-$(dirname -- "${REPORT_COMPILATIONS_SCRIPT_DIR}")}
echo "CsvReports: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}"

12 changes: 6 additions & 6 deletions scripts/reports/compilations/JupyterReports.sh
@@ -20,18 +20,18 @@ LOG_GROUP_END=${LOG_GROUP_END:-"::endgroup::"} # Prefix to end a log group. Defa
# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
# This way non-standard tools like readlink aren't needed.
REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )}
echo "JupyterReports: REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR}"

REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR:-$(dirname -- "${REPORT_COMPILATIONS_SCRIPT_DIR}")}
echo "JupyterReports: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}"

# Get the "scripts" directory by taking the scripts report path and going one directory up.
SCRIPTS_DIR=${SCRIPTS_DIR:-$(dirname -- "${REPORTS_SCRIPT_DIR}")}
echo "JupyterReports: SCRIPTS_DIR=${SCRIPTS_DIR}"

# Get the "jupyter" directory by taking the path of the scripts directory, going up one directory and then changing into "jupyter".
JUPYTER_NOTEBOOK_DIRECTORY=${JUPYTER_NOTEBOOK_DIRECTORY:-"${SCRIPTS_DIR}/../jupyter"} # Repository directory containing the Jupyter Notebooks

echo "${LOG_GROUP_START}Initialize Jupyter Notebook Reports";
echo "JupyterReports: REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR}"
echo "JupyterReports: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}"
echo "JupyterReports: SCRIPTS_DIR=${SCRIPTS_DIR}"
echo "JupyterReports: JUPYTER_NOTEBOOK_DIRECTORY=${JUPYTER_NOTEBOOK_DIRECTORY}"
echo "${LOG_GROUP_END}";

# Run all Jupyter notebooks
for jupyter_notebook_file in "${JUPYTER_NOTEBOOK_DIRECTORY}"/*.ipynb; do
6 changes: 4 additions & 2 deletions scripts/reports/compilations/VisualizationReports.sh
@@ -20,10 +20,12 @@ LOG_GROUP_END=${LOG_GROUP_END:-"::endgroup::"} # Prefix to end a log group. Defa
# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
# This way non-standard tools like readlink aren't needed.
REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )}
echo "VisualizationReports: REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR}"

REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR:-$(dirname -- "${REPORT_COMPILATIONS_SCRIPT_DIR}")}

echo "${LOG_GROUP_START}Initialize Visualization Reports";
echo "VisualizationReports: REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR}"
echo "VisualizationReports: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}"
echo "${LOG_GROUP_END}";

# Run all visualization scripts
for visualization_script_file in "${REPORTS_SCRIPT_DIR}"/*Visualization.sh; do
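
Note: CsvReports.sh drops its REPORT_COMPILATIONS_SCRIPT_DIR echo entirely, while JupyterReports.sh and VisualizationReports.sh move their directory echoes into the collapsible LOG_GROUP_START/LOG_GROUP_END sections, keeping CI log output compact.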