Add Hierarchical Density-Based Spatial Clustering (HDBSCAN) Community Detection #376

Open · wants to merge 2 commits into base: main
Community_Detection/Community_Detection_11a_HDBSCAN_Estimate.cypher
@@ -0,0 +1,26 @@
// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - Estimate

CALL gds.hdbscan.write.estimate(
$dependencies_projection + '-cleaned', {
nodeProperty: $dependencies_projection_node_embeddings_property,
writeProperty: $dependencies_projection_write_property,
samples: 3
})
YIELD requiredMemory
,nodeCount
,relationshipCount
,bytesMin
,bytesMax
,heapPercentageMin
,heapPercentageMax
,treeView
,mapView
RETURN requiredMemory
,nodeCount
,relationshipCount
,bytesMin
,bytesMax
,heapPercentageMin
,heapPercentageMax
,treeView
//,mapView // doesn't work on Windows with Git Bash (jq version jq-1.7-dirty)
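
Note: the estimate variant only calculates the expected memory footprint of the algorithm on the given projection without executing HDBSCAN itself, so it can safely be run before the mutate and write steps below.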
Community_Detection/Community_Detection_11b_HDBSCAN_Statistics.cypher
@@ -0,0 +1,9 @@
// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - Statistics

CALL gds.hdbscan.stats(
$dependencies_projection + '-cleaned', {
nodeProperty: $dependencies_projection_node_embeddings_property,
samples: 3
})
YIELD nodeCount, numberOfClusters, numberOfNoisePoints, preProcessingMillis, computeMillis, postProcessingMillis
RETURN nodeCount, numberOfClusters, numberOfNoisePoints, preProcessingMillis, computeMillis, postProcessingMillis
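
Note: numberOfNoisePoints counts the nodes that HDBSCAN leaves unassigned; unlike partitioning algorithms such as Leiden, HDBSCAN explicitly labels low-density points as noise instead of forcing them into a cluster.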
Community_Detection/Community_Detection_11c_HDBSCAN_Mutate.cypher
@@ -0,0 +1,10 @@
// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - Mutate

CALL gds.hdbscan.mutate(
$dependencies_projection + '-cleaned', {
nodeProperty: $dependencies_projection_node_embeddings_property,
mutateProperty: $dependencies_projection_write_property,
samples: 3
})
YIELD nodeCount, numberOfClusters, numberOfNoisePoints, nodePropertiesWritten, preProcessingMillis, computeMillis, postProcessingMillis, mutateMillis
RETURN nodeCount, numberOfClusters, numberOfNoisePoints, nodePropertiesWritten, preProcessingMillis, computeMillis, postProcessingMillis, mutateMillis
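
Note: the mutate variant stores the cluster labels only in the in-memory projection; CommunityCsv.sh below later persists them to the database via Dependencies_9_Write_Mutated.cypher.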
@@ -0,0 +1,20 @@
// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - Stream

CALL gds.hdbscan.stream(
$dependencies_projection + '-cleaned', {
nodeProperty: $dependencies_projection_node_embeddings_property,
samples: 3
})
YIELD nodeId, label
WITH gds.util.asNode(nodeId) AS member
,label
WITH member
,coalesce(member.fqn, member.fileName, member.name) AS memberName
,label
WITH count(DISTINCT member) AS memberCount
,collect(DISTINCT memberName) AS memberNames
,label
RETURN memberCount
,label
,memberNames
ORDER BY memberCount DESC, label ASC
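
Note: the stream variant emits one row per node (nodeId, label); the query above aggregates these rows so that each CSV line describes one detected community with its member count and collected member names.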
@@ -0,0 +1,25 @@
// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - write node property e.g. communityHdbscanLabel

CALL gds.hdbscan.write(
$dependencies_projection + '-cleaned', {
nodeProperty: $dependencies_projection_node_embeddings_property,
writeProperty: $dependencies_projection_write_property,
samples: 3
})
// Samples = 3 turned out to be needed for
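// Note (assumption): in GDS, `samples` controls the number of neighbors used to compute core distances, comparable to `min_samples` in the HDBSCAN literature; larger values make the clustering more conservative.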
YIELD nodeCount
,numberOfClusters
,numberOfNoisePoints
,preProcessingMillis
,computeMillis
,writeMillis
,postProcessingMillis
,nodePropertiesWritten
RETURN nodeCount
,numberOfClusters
,numberOfNoisePoints
,preProcessingMillis
,computeMillis
,writeMillis
,postProcessingMillis
,nodePropertiesWritten
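
Note: in contrast to the mutate-then-write pipeline used in CommunityCsv.sh below, this write variant persists the labels directly to the database in a single call.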
77 changes: 73 additions & 4 deletions scripts/reports/CommunityCsv.sh
@@ -242,6 +242,72 @@ detectCommunitiesWithKCoreDecomposition() {
calculateCommunityMetrics "${@}" "${writePropertyName}"
}

# Node Embeddings using Fast Random Projection
#
# Required Parameters:
# - dependencies_projection=...
# Name prefix for the in-memory projection name for dependencies. Example: "package"
# - dependencies_projection_node=...
# Label of the nodes that will be used for the projection. Example: "Package"
# - dependencies_projection_weight_property=...
# Name of the node property that contains the dependency weight. Example: "weight"
# - dependencies_projection_node_embeddings_property=...
# Name of the node property that will contain the node embeddings. Example: "embeddingsFastRandomProjectionForHDBSCAN"
nodeEmbeddingsWithFastRandomProjectionForHDBSCAN() {
local embeddingProperty
embeddingProperty=$( extractQueryParameter "dependencies_projection_node_embeddings_property" "${@}")

local NODE_EMBEDDINGS_CYPHER_DIR="${CYPHER_DIR}/Node_Embeddings"
local mutatePropertyName="dependencies_projection_write_property=${embeddingProperty}"
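# A dimension of 2 is presumably chosen because density-based clustering such as HDBSCAN loses distance contrast in high-dimensional embedding spaces.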
local embeddingsDimension="dependencies_projection_embedding_dimension=2"

# Run the algorithm and write the result into the in-memory projection ("mutate")
execute_cypher "${NODE_EMBEDDINGS_CYPHER_DIR}/Node_Embeddings_1c_Fast_Random_Projection_Mutate.cypher" "${@}" "${mutatePropertyName}" ${embeddingsDimension}
}
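
# For orientation, a minimal sketch of what the referenced Cypher file
# (Node_Embeddings_1c_Fast_Random_Projection_Mutate.cypher) is assumed to contain.
# This is an assumption based on the parameter conventions above, not the actual repository file:
#
#   CALL gds.fastRP.mutate($dependencies_projection + '-cleaned', {
#       embeddingDimension: toInteger($dependencies_projection_embedding_dimension),
#       relationshipWeightProperty: $dependencies_projection_weight_property,
#       mutateProperty: $dependencies_projection_write_property
#   })
#   YIELD nodeCount, nodePropertiesWritten
#   RETURN nodeCount, nodePropertiesWritten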

# Community Detection using Hierarchical Density-Based Spatial Clustering (HDBSCAN) Algorithm
#
# Required Parameters:
# - dependencies_projection=...
# Name prefix for the in-memory projection name for dependencies. Example: "package"
# - dependencies_projection_node=...
# Label of the nodes that will be used for the projection. Example: "Package"
# - dependencies_projection_weight_property=...
# Name of the node property that contains the dependency weight. Example: "weight"
#
# Special Requirements:
# - This algorithm needs a node property with an array of floats to compute clusters.
# One possible way is to use node embeddings for that (like FastRP).
detectCommunitiesWithHDBSCAN() {
local COMMUNITY_DETECTION_CYPHER_DIR="${CYPHER_DIR}/Community_Detection"
local PROJECTION_CYPHER_DIR="${CYPHER_DIR}/Dependencies_Projection"

local writePropertyName="dependencies_projection_write_property=communityFastRpHdbscanLabel"
local writeLabelName="dependencies_projection_write_label=HDBSCAN"
local embeddingProperty="dependencies_projection_node_embeddings_property=embeddingsFastRandomProjection2dHDBSCAN"

nodeEmbeddingsWithFastRandomProjectionForHDBSCAN "${@}" ${embeddingProperty}

# Estimate memory requirements and calculate statistics
execute_cypher "${COMMUNITY_DETECTION_CYPHER_DIR}/Community_Detection_11a_HDBSCAN_Estimate.cypher" "${@}" ${embeddingProperty} "${writePropertyName}"
execute_cypher "${COMMUNITY_DETECTION_CYPHER_DIR}/Community_Detection_11b_HDBSCAN_Statistics.cypher" "${@}" ${embeddingProperty}

# Run the algorithm and write the result into the in-memory projection ("mutate")
execute_cypher "${COMMUNITY_DETECTION_CYPHER_DIR}/Community_Detection_11c_HDBSCAN_Mutate.cypher" "${@}" ${embeddingProperty} "${writePropertyName}"

# Stream to CSV
local nodeLabel
nodeLabel=$( extractQueryParameter "dependencies_projection_node" "${@}")
execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_8_Stream_Mutated_Grouped.cypher" "${@}" "${writePropertyName}" > "${FULL_REPORT_DIRECTORY}/${nodeLabel}_Communities_HDBSCAN.csv"

# Update Graph (node properties and labels) using the already mutated property projection
execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_9_Write_Mutated.cypher" "${@}" "${writePropertyName}"
execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_10_Delete_Label.cypher" "${@}" "${writePropertyName}" "${writeLabelName}"
execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_11_Add_Label.cypher" "${@}" "${writePropertyName}" "${writeLabelName}"

calculateCommunityMetrics "${@}" "${writePropertyName}"
}

# Community Detection using the Approximate Maximum k-cut Algorithm
#
# Required Parameters:
@@ -402,6 +468,7 @@ detectCommunities() {
time detectCommunitiesWithKCoreDecomposition "${@}"
time detectCommunitiesWithApproximateMaximumKCut "${@}"
time calculateLocalClusteringCoefficient "${@}"

compareCommunityDetectionResults "${@}"
listAllResults "${@}"
}
@@ -415,7 +482,7 @@ ARTIFACT_GAMMA="dependencies_leiden_gamma=1.11" # default = 1.00
ARTIFACT_KCUT="dependencies_maxkcut=5" # default = 2

if createUndirectedDependencyProjection "${ARTIFACT_PROJECTION}" "${ARTIFACT_NODE}" "${ARTIFACT_WEIGHT}"; then
detectCommunities "${ARTIFACT_PROJECTION}" "${ARTIFACT_NODE}" "${ARTIFACT_WEIGHT}" "${ARTIFACT_GAMMA}" "${ARTIFACT_KCUT}"
detectCommunities "${ARTIFACT_PROJECTION}" "${ARTIFACT_NODE}" "${ARTIFACT_WEIGHT}" "${ARTIFACT_GAMMA}" "${ARTIFACT_KCUT}" # "${ARTIFACT_NODE_EMBEDDINGS}"
writeLeidenModularity "${ARTIFACT_PROJECTION}" "${ARTIFACT_NODE}" "${ARTIFACT_WEIGHT}"
fi

@@ -430,7 +497,9 @@ PACKAGE_KCUT="dependencies_maxkcut=20" # default = 2
if createUndirectedDependencyProjection "${PACKAGE_PROJECTION}" "${PACKAGE_NODE}" "${PACKAGE_WEIGHT}"; then
detectCommunities "${PACKAGE_PROJECTION}" "${PACKAGE_NODE}" "${PACKAGE_WEIGHT}" "${PACKAGE_GAMMA}" "${PACKAGE_KCUT}"
writeLeidenModularity "${PACKAGE_PROJECTION}" "${PACKAGE_NODE}" "${PACKAGE_WEIGHT}"


detectCommunitiesWithHDBSCAN "${PACKAGE_PROJECTION}" "${PACKAGE_NODE}" "${PACKAGE_WEIGHT}"

# Package Community Detection - Special CSV Queries after update
execute_cypher "${CYPHER_DIR}/Community_Detection/Which_package_community_spans_several_artifacts_and_how_are_the_packages_distributed.cypher" > "${FULL_REPORT_DIRECTORY}/Package_Communities_Leiden_That_Span_Multiple_Artifacts.csv"
fi
@@ -444,8 +513,8 @@ TYPE_GAMMA="dependencies_leiden_gamma=5.00" # default = 1.00
TYPE_KCUT="dependencies_maxkcut=100" # default = 2

if createUndirectedJavaTypeDependencyProjection "${TYPE_PROJECTION}"; then
detectCommunities "${TYPE_PROJECTION}" "${TYPE_NODE}" "${TYPE_WEIGHT}" "${TYPE_GAMMA}" "${TYPE_KCUT}"

detectCommunities "${TYPE_PROJECTION}" "${TYPE_NODE}" "${TYPE_WEIGHT}" "${TYPE_GAMMA}" "${TYPE_KCUT}" "${TYPE_NODE_EMBEDDINGS}"
detectCommunitiesWithHDBSCAN "${TYPE_PROJECTION}" "${TYPE_NODE}" "${TYPE_WEIGHT}"
# Type Community Detection - Special CSV Queries after update
execute_cypher "${CYPHER_DIR}/Community_Detection/Which_type_community_spans_several_artifacts_and_how_are_the_types_distributed.cypher" > "${FULL_REPORT_DIRECTORY}/Type_Communities_Leiden_That_Span_Multiple_Artifacts.csv"
execute_cypher "${CYPHER_DIR}/Community_Detection/Type_communities_with_few_members_in_foreign_packages.cypher" > "${FULL_REPORT_DIRECTORY}/Type_communities_with_few_members_in_foreign_packages.csv"
2 changes: 0 additions & 2 deletions scripts/reports/compilations/CsvReports.sh
@@ -17,8 +17,6 @@ LOG_GROUP_END=${LOG_GROUP_END:-"::endgroup::"} # Prefix to end a log group. Defa
# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
# This way non-standard tools like readlink aren't needed.
REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )}
echo "CsvReports: REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR}"

REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR:-$(dirname -- "${REPORT_COMPILATIONS_SCRIPT_DIR}")}
echo "CsvReports: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}"

12 changes: 6 additions & 6 deletions scripts/reports/compilations/JupyterReports.sh
@@ -20,18 +20,18 @@ LOG_GROUP_END=${LOG_GROUP_END:-"::endgroup::"} # Prefix to end a log group. Defa
# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
# This way non-standard tools like readlink aren't needed.
REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )}
echo "JupyterReports: REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR}"

REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR:-$(dirname -- "${REPORT_COMPILATIONS_SCRIPT_DIR}")}
echo "JupyterReports: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}"

# Get the "scripts" directory by taking the scripts report path and going one directory up.
SCRIPTS_DIR=${SCRIPTS_DIR:-$(dirname -- "${REPORTS_SCRIPT_DIR}")}
echo "JupyterReports: SCRIPTS_DIR=${SCRIPTS_DIR}"

# Get the "jupyter" directory by taking the path of the scripts directory, going up one directory and then changing into "jupyter".
JUPYTER_NOTEBOOK_DIRECTORY=${JUPYTER_NOTEBOOK_DIRECTORY:-"${SCRIPTS_DIR}/../jupyter"} # Repository directory containing the Jupyter Notebooks

echo "${LOG_GROUP_START}Initialize Jupyter Notebook Reports";
echo "JupyterReports: REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR}"
echo "JupyterReports: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}"
echo "JupyterReports: SCRIPTS_DIR=${SCRIPTS_DIR}"
echo "JupyterReports: JUPYTER_NOTEBOOK_DIRECTORY=${JUPYTER_NOTEBOOK_DIRECTORY}"
echo "${LOG_GROUP_END}";

# Run all Jupyter notebooks
for jupyter_notebook_file in "${JUPYTER_NOTEBOOK_DIRECTORY}"/*.ipynb; do
6 changes: 4 additions & 2 deletions scripts/reports/compilations/VisualizationReports.sh
@@ -20,10 +20,12 @@ LOG_GROUP_END=${LOG_GROUP_END:-"::endgroup::"} # Prefix to end a log group. Defa
# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
# This way non-standard tools like readlink aren't needed.
REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )}
echo "VisualizationReports: REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR}"

REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR:-$(dirname -- "${REPORT_COMPILATIONS_SCRIPT_DIR}")}

echo "${LOG_GROUP_START}Initialize Visualization Reports";
echo "VisualizationReports: REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR}"
echo "VisualizationReports: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}"
echo "${LOG_GROUP_END}";

# Run all visualization scripts
for visualization_script_file in "${REPORTS_SCRIPT_DIR}"/*Visualization.sh; do
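
Note: CsvReports.sh drops its REPORT_COMPILATIONS_SCRIPT_DIR echo entirely, while JupyterReports.sh and VisualizationReports.sh move their directory echoes into the collapsible LOG_GROUP_START/LOG_GROUP_END sections, keeping CI log output compact.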