diff --git a/.github/workflows/internal-java-code-analysis.yml b/.github/workflows/internal-java-code-analysis.yml
index ba1ca75a2..e6376b5af 100644
--- a/.github/workflows/internal-java-code-analysis.yml
+++ b/.github/workflows/internal-java-code-analysis.yml
@@ -119,4 +119,5 @@ jobs:
     with:
       analysis-name: ${{ needs.prepare-code-to-analyze.outputs.analysis-name }}
       artifacts-upload-name: ${{ needs.prepare-code-to-analyze.outputs.artifacts-upload-name }}
-      sources-upload-name: ${{ needs.prepare-code-to-analyze.outputs.sources-upload-name }}
\ No newline at end of file
+      sources-upload-name: ${{ needs.prepare-code-to-analyze.outputs.sources-upload-name }}
+      jupyter-pdf: "false"
\ No newline at end of file
diff --git a/.github/workflows/internal-typescript-code-analysis.yml b/.github/workflows/internal-typescript-code-analysis.yml
index 5991cd0f6..2fe272cb9 100644
--- a/.github/workflows/internal-typescript-code-analysis.yml
+++ b/.github/workflows/internal-typescript-code-analysis.yml
@@ -117,4 +117,5 @@ jobs:
     uses: ./.github/workflows/public-analyze-code-graph.yml
     with:
       analysis-name: ${{ needs.prepare-code-to-analyze.outputs.analysis-name }}
-      sources-upload-name: ${{ needs.prepare-code-to-analyze.outputs.sources-upload-name }}
\ No newline at end of file
+      sources-upload-name: ${{ needs.prepare-code-to-analyze.outputs.sources-upload-name }}
+      jupyter-pdf: "false"
\ No newline at end of file
diff --git a/.github/workflows/public-analyze-code-graph.yml b/.github/workflows/public-analyze-code-graph.yml
index 4ae3efeb3..cde4f78c3 100644
--- a/.github/workflows/public-analyze-code-graph.yml
+++ b/.github/workflows/public-analyze-code-graph.yml
@@ -55,6 +55,12 @@ on:
         required: false
         type: number
         default: 5
+      jupyter-pdf:
+        description: >
+          Enable PDF generation for Jupyter Notebooks ("true") or disable it ("false").
+        required: false
+        type: string
+        default: 'true'
     outputs:
       uploaded-analysis-results:
         description: >
@@ -159,7 +165,7 @@ jobs:
         shell: bash -el {0}
         env:
           NEO4J_INITIAL_PASSWORD: ${{ steps.generate-neo4j-initial-password.outputs.neo4j-initial-password }}
-          ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION: "true"
+          ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION: ${{ inputs.jupyter-pdf }}
           IMPORT_GIT_LOG_DATA_IF_SOURCE_IS_PRESENT: "" # Options: "none", "aggregated", "full". default = "plugin" or ""
           PREPARE_CONDA_ENVIRONMENT: "false" # Had already been done in step with id "prepare-conda-environment".
         run: |
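
Callers that want to keep PDF generation can simply omit the new input, since it defaults to "true". A minimal sketch of a caller job within the same repository; the job name and the analysis-name value are illustrative assumptions, not taken from this change:

jobs:
  analyze-code-graph:   # hypothetical job name
    uses: ./.github/workflows/public-analyze-code-graph.yml
    with:
      analysis-name: "MyProject-1.0.0"   # hypothetical value
      jupyter-pdf: "false"               # skip Jupyter Notebook PDF rendering; omit or set to "true" to generate PDFs
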
diff --git a/cypher/Community_Detection/Community_Detection_11a_HDBSCAN_Estimate.cypher b/cypher/Community_Detection/Community_Detection_11a_HDBSCAN_Estimate.cypher
new file mode 100644
index 000000000..6dca99219
--- /dev/null
+++ b/cypher/Community_Detection/Community_Detection_11a_HDBSCAN_Estimate.cypher
@@ -0,0 +1,26 @@
+// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - Estimate
+
+CALL gds.hdbscan.write.estimate(
+    $dependencies_projection + '-cleaned', {
+    nodeProperty: $dependencies_projection_node_embeddings_property,
+    writeProperty: $dependencies_projection_write_property,
+    samples: 3
+})
+ YIELD requiredMemory
+      ,nodeCount
+      ,relationshipCount
+      ,bytesMin
+      ,bytesMax
+      ,heapPercentageMin
+      ,heapPercentageMax
+      ,treeView
+      ,mapView
+RETURN requiredMemory
+      ,nodeCount
+      ,relationshipCount
+      ,bytesMin
+      ,bytesMax
+      ,heapPercentageMin
+      ,heapPercentageMax
+      ,treeView
+      //,mapView //doesn't work on Windows with git bash jq version jq-1.7-dirty
\ No newline at end of file
diff --git a/cypher/Community_Detection/Community_Detection_11b_HDBSCAN_Statistics.cypher b/cypher/Community_Detection/Community_Detection_11b_HDBSCAN_Statistics.cypher
new file mode 100644
index 000000000..3529c7a96
--- /dev/null
+++ b/cypher/Community_Detection/Community_Detection_11b_HDBSCAN_Statistics.cypher
@@ -0,0 +1,9 @@
+// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - Statistics
+
+CALL gds.hdbscan.stats(
+    $dependencies_projection + '-cleaned', {
+    nodeProperty: $dependencies_projection_node_embeddings_property,
+    samples: 3
+})
+ YIELD nodeCount, numberOfClusters, numberOfNoisePoints, preProcessingMillis, computeMillis, postProcessingMillis
+RETURN nodeCount, numberOfClusters, numberOfNoisePoints, preProcessingMillis, computeMillis, postProcessingMillis
\ No newline at end of file
diff --git a/cypher/Community_Detection/Community_Detection_11c_HDBSCAN_Mutate.cypher b/cypher/Community_Detection/Community_Detection_11c_HDBSCAN_Mutate.cypher
new file mode 100644
index 000000000..e86e6f899
--- /dev/null
+++ b/cypher/Community_Detection/Community_Detection_11c_HDBSCAN_Mutate.cypher
@@ -0,0 +1,10 @@
+// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - Mutate
+
+CALL gds.hdbscan.mutate(
+    $dependencies_projection + '-cleaned', {
+    nodeProperty: $dependencies_projection_node_embeddings_property,
+    mutateProperty: $dependencies_projection_write_property,
+    samples: 3
+})
+ YIELD nodeCount, numberOfClusters, numberOfNoisePoints, nodePropertiesWritten, preProcessingMillis, computeMillis, postProcessingMillis, mutateMillis
+RETURN nodeCount, numberOfClusters, numberOfNoisePoints, nodePropertiesWritten, preProcessingMillis, computeMillis, postProcessingMillis, mutateMillis
\ No newline at end of file
diff --git a/cypher/Community_Detection/Community_Detection_11d_HDBSCAN_Stream.cypher b/cypher/Community_Detection/Community_Detection_11d_HDBSCAN_Stream.cypher
new file mode 100644
index 000000000..436089a0e
--- /dev/null
+++ b/cypher/Community_Detection/Community_Detection_11d_HDBSCAN_Stream.cypher
@@ -0,0 +1,20 @@
+// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - Stream
+
+CALL gds.hdbscan.stream(
+    $dependencies_projection + '-cleaned', {
+    nodeProperty: $dependencies_projection_node_embeddings_property,
+    samples: 3
+})
+ YIELD nodeId, label
+ WITH gds.util.asNode(nodeId) AS member
+     ,label
+ WITH member
+     ,coalesce(member.fqn, member.fileName, member.name) AS memberName
+     ,label
+ WITH count(DISTINCT member)       AS memberCount
+     ,collect(DISTINCT memberName) AS memberNames
+     ,label
+RETURN memberCount
+      ,label
+      ,memberNames
+ ORDER BY memberCount DESC, label ASC
\ No newline at end of file
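
The queries above (and the write variant that follows) all expect query parameters that are normally supplied by the report scripts. As a rough sketch, assuming a package-level projection and the property names used by CommunityCsv.sh further below, the parameters could also be set manually in Neo4j Browser or cypher-shell before running one of them:

:param dependencies_projection => 'package'
:param dependencies_projection_node_embeddings_property => 'embeddingsFastRandomProjection2dHDBSCAN'
:param dependencies_projection_write_property => 'communityFastRpHdbscanLabel'

Each query then operates on the in-memory graph "package-cleaned", since '-cleaned' is appended to the projection name.
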
diff --git a/cypher/Community_Detection/Community_Detection_11e_HDBSCAN_Write.cypher b/cypher/Community_Detection/Community_Detection_11e_HDBSCAN_Write.cypher
new file mode 100644
index 000000000..7f720607c
--- /dev/null
+++ b/cypher/Community_Detection/Community_Detection_11e_HDBSCAN_Write.cypher
@@ -0,0 +1,25 @@
+// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - write node property e.g. communityHdbscanLabel
+
+CALL gds.hdbscan.write(
+    $dependencies_projection + '-cleaned', {
+    nodeProperty: $dependencies_projection_node_embeddings_property,
+    writeProperty: $dependencies_projection_write_property,
+    samples: 3
+})
+// Samples = 3 turned out to be needed for
+YIELD nodeCount
+     ,numberOfClusters
+     ,numberOfNoisePoints
+     ,preProcessingMillis
+     ,computeMillis
+     ,writeMillis
+     ,postProcessingMillis
+     ,nodePropertiesWritten
+RETURN nodeCount
+      ,numberOfClusters
+      ,numberOfNoisePoints
+      ,preProcessingMillis
+      ,computeMillis
+      ,writeMillis
+      ,postProcessingMillis
+      ,nodePropertiesWritten
\ No newline at end of file
diff --git a/scripts/executeJupyterNotebook.sh b/scripts/executeJupyterNotebook.sh
index 7dbf3ef9d..4e852011b 100755
--- a/scripts/executeJupyterNotebook.sh
+++ b/scripts/executeJupyterNotebook.sh
@@ -20,7 +20,10 @@
 # Fail on any error ("-e" = exit on first error, "-o pipefail" exist on errors within piped commands)
 set -o errexit -o pipefail
 
-ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION=${ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION:-""} # Enable PDF generation for Jupyter Notebooks if set to any non empty value e.g. "true"
+ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION=${ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION:-""} # Enable PDF generation for Jupyter Notebooks if set to any non-empty value like "true", or disable it with "" or "false".
+if [ "${ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION}" == "false" ]; then
+  ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION="" # Disable PDF generation if it was explicitly set to "false"
+fi
 
 ## Get this "scripts" directory if not already set
 # Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution.
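
To illustrate the new toggle: an explicit "false" now behaves the same as leaving the variable empty. A hedged sketch, assuming the script is invoked with a notebook file as its argument (the notebook name is just an example):

ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION="false" ./scripts/executeJupyterNotebook.sh ./jupyter/ExampleNotebook.ipynb  # no PDF is generated
ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION="true"  ./scripts/executeJupyterNotebook.sh ./jupyter/ExampleNotebook.ipynb  # PDF generation stays enabled
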
Example: "embeddingsFastRandomProjectionForHDBSCAN" +nodeEmbeddingsWithFastRandomProjectionForHDBSCAN() { + local embeddingProperty + embeddingProperty=$( extractQueryParameter "dependencies_projection_node_embeddings_property" "${@}") + + local NODE_EMBEDDINGS_CYPHER_DIR="${CYPHER_DIR}/Node_Embeddings" + local mutatePropertyName="dependencies_projection_write_property=${embeddingProperty}" + local embeddingsDimension="dependencies_projection_embedding_dimension=2" + + # Run the algorithm and write the result into the in-memory projection ("mutate") + execute_cypher "${NODE_EMBEDDINGS_CYPHER_DIR}/Node_Embeddings_1c_Fast_Random_Projection_Mutate.cypher" "${@}" "${mutatePropertyName}" ${embeddingsDimension} +} + +# Community Detection using Hierarchical Density-Based Spatial Clustering (HDBSCAN) Algorithm +# +# Required Parameters: +# - dependencies_projection=... +# Name prefix for the in-memory projection name for dependencies. Example: "package" +# - dependencies_projection_node=... +# Label of the nodes that will be used for the projection. Example: "Package" +# - dependencies_projection_weight_property=... +# Name of the node property that contains the dependency weight. Example: "weight" +# +# Special Requirements: +# - This algorithm needs a node property with an array of floats to compute clusters. +# One possible way is to use node embeddings for that (like FastRP). +detectCommunitiesWithHDBSCAN() { + local COMMUNITY_DETECTION_CYPHER_DIR="${CYPHER_DIR}/Community_Detection" + local PROJECTION_CYPHER_DIR="${CYPHER_DIR}/Dependencies_Projection" + + local writePropertyName="dependencies_projection_write_property=communityFastRpHdbscanLabel" + local writeLabelName="dependencies_projection_write_label=HDBSCAN" + local embeddingProperty="dependencies_projection_node_embeddings_property=embeddingsFastRandomProjection2dHDBSCAN" + + nodeEmbeddingsWithFastRandomProjectionForHDBSCAN "${@}" ${embeddingProperty} + + # Statistics + execute_cypher "${COMMUNITY_DETECTION_CYPHER_DIR}/Community_Detection_11a_HDBSCAN_Estimate.cypher" "${@}" ${embeddingProperty} "${writePropertyName}" + execute_cypher "${COMMUNITY_DETECTION_CYPHER_DIR}/Community_Detection_11b_HDBSCAN_Statistics.cypher" "${@}" ${embeddingProperty} + + # Run the algorithm and write the result into the in-memory projection ("mutate") + execute_cypher "${COMMUNITY_DETECTION_CYPHER_DIR}/Community_Detection_11c_HDBSCAN_Mutate.cypher" "${@}" ${embeddingProperty} "${writePropertyName}" + + # Stream to CSV + local nodeLabel + nodeLabel=$( extractQueryParameter "dependencies_projection_node" "${@}") + execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_8_Stream_Mutated_Grouped.cypher" "${@}" "${writePropertyName}" > "${FULL_REPORT_DIRECTORY}/${nodeLabel}_Communities_HDBSCAN.csv" + + # Update Graph (node properties and labels) using the already mutated property projection + execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_9_Write_Mutated.cypher" "${@}" "${writePropertyName}" + execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_10_Delete_Label.cypher" "${@}" "${writePropertyName}" "${writeLabelName}" + execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_11_Add_Label.cypher" "${@}" "${writePropertyName}" "${writeLabelName}" + + calculateCommunityMetrics "${@}" "${writePropertyName}" +} + # Community Detection using the Approximate Maximum k-cut Algorithm # # Required Parameters: @@ -402,6 +468,7 @@ detectCommunities() { time detectCommunitiesWithKCoreDecomposition "${@}" time detectCommunitiesWithApproximateMaximumKCut "${@}" time 
@@ -402,6 +468,7 @@ detectCommunities() {
     time detectCommunitiesWithKCoreDecomposition "${@}"
     time detectCommunitiesWithApproximateMaximumKCut "${@}"
     time calculateLocalClusteringCoefficient "${@}"
+    compareCommunityDetectionResults "${@}"
 
     listAllResults "${@}"
 }
@@ -415,7 +482,7 @@ ARTIFACT_GAMMA="dependencies_leiden_gamma=1.11" # default = 1.00
 ARTIFACT_KCUT="dependencies_maxkcut=5" # default = 2
 
 if createUndirectedDependencyProjection "${ARTIFACT_PROJECTION}" "${ARTIFACT_NODE}" "${ARTIFACT_WEIGHT}"; then
-    detectCommunities "${ARTIFACT_PROJECTION}" "${ARTIFACT_NODE}" "${ARTIFACT_WEIGHT}" "${ARTIFACT_GAMMA}" "${ARTIFACT_KCUT}"
+    detectCommunities "${ARTIFACT_PROJECTION}" "${ARTIFACT_NODE}" "${ARTIFACT_WEIGHT}" "${ARTIFACT_GAMMA}" "${ARTIFACT_KCUT}" # "${ARTIFACT_NODE_EMBEDDINGS}"
     writeLeidenModularity "${ARTIFACT_PROJECTION}" "${ARTIFACT_NODE}" "${ARTIFACT_WEIGHT}"
 fi
@@ -430,7 +497,9 @@ PACKAGE_KCUT="dependencies_maxkcut=20" # default = 2
 if createUndirectedDependencyProjection "${PACKAGE_PROJECTION}" "${PACKAGE_NODE}" "${PACKAGE_WEIGHT}"; then
     detectCommunities "${PACKAGE_PROJECTION}" "${PACKAGE_NODE}" "${PACKAGE_WEIGHT}" "${PACKAGE_GAMMA}" "${PACKAGE_KCUT}"
     writeLeidenModularity "${PACKAGE_PROJECTION}" "${PACKAGE_NODE}" "${PACKAGE_WEIGHT}"
-    
+
+    detectCommunitiesWithHDBSCAN "${PACKAGE_PROJECTION}" "${PACKAGE_NODE}" "${PACKAGE_WEIGHT}"
+
     # Package Community Detection - Special CSV Queries after update
     execute_cypher "${CYPHER_DIR}/Community_Detection/Which_package_community_spans_several_artifacts_and_how_are_the_packages_distributed.cypher" > "${FULL_REPORT_DIRECTORY}/Package_Communities_Leiden_That_Span_Multiple_Artifacts.csv"
 fi
@@ -444,8 +513,8 @@ TYPE_GAMMA="dependencies_leiden_gamma=5.00" # default = 1.00
 TYPE_KCUT="dependencies_maxkcut=100" # default = 2
 
 if createUndirectedJavaTypeDependencyProjection "${TYPE_PROJECTION}"; then
-    detectCommunities "${TYPE_PROJECTION}" "${TYPE_NODE}" "${TYPE_WEIGHT}" "${TYPE_GAMMA}" "${TYPE_KCUT}"
-
+    detectCommunities "${TYPE_PROJECTION}" "${TYPE_NODE}" "${TYPE_WEIGHT}" "${TYPE_GAMMA}" "${TYPE_KCUT}" "${TYPE_NODE_EMBEDDINGS}"
+    detectCommunitiesWithHDBSCAN "${TYPE_PROJECTION}" "${TYPE_NODE}" "${TYPE_WEIGHT}"
     # Type Community Detection - Special CSV Queries after update
     execute_cypher "${CYPHER_DIR}/Community_Detection/Which_type_community_spans_several_artifacts_and_how_are_the_types_distributed.cypher" > "${FULL_REPORT_DIRECTORY}/Type_Communities_Leiden_That_Span_Multiple_Artifacts.csv"
     execute_cypher "${CYPHER_DIR}/Community_Detection/Type_communities_with_few_members_in_foreign_packages.cypher" > "${FULL_REPORT_DIRECTORY}/Type_communities_with_few_members_in_foreign_packages.csv"
diff --git a/scripts/reports/compilations/CsvReports.sh b/scripts/reports/compilations/CsvReports.sh
index e06668ac8..cd4112fb4 100755
--- a/scripts/reports/compilations/CsvReports.sh
+++ b/scripts/reports/compilations/CsvReports.sh
@@ -17,8 +17,6 @@ LOG_GROUP_END=${LOG_GROUP_END:-"::endgroup::"} # Prefix to end a log group. Defa
 # CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
 # This way non-standard tools like readlink aren't needed.
 REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )}
cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} -echo "CsvReports: REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR}" - REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR:-$(dirname -- "${REPORT_COMPILATIONS_SCRIPT_DIR}")} echo "CsvReports: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}" diff --git a/scripts/reports/compilations/JupyterReports.sh b/scripts/reports/compilations/JupyterReports.sh index 82ae9d571..1e345116b 100755 --- a/scripts/reports/compilations/JupyterReports.sh +++ b/scripts/reports/compilations/JupyterReports.sh @@ -20,18 +20,18 @@ LOG_GROUP_END=${LOG_GROUP_END:-"::endgroup::"} # Prefix to end a log group. Defa # CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes. # This way non-standard tools like readlink aren't needed. REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} -echo "JupyterReports: REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR}" - REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR:-$(dirname -- "${REPORT_COMPILATIONS_SCRIPT_DIR}")} -echo "JupyterReports: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}" - # Get the "scripts" directory by taking the scripts report path and going one directory up. SCRIPTS_DIR=${SCRIPTS_DIR:-$(dirname -- "${REPORTS_SCRIPT_DIR}")} -echo "JupyterReports: SCRIPTS_DIR=${SCRIPTS_DIR}" - # Get the "jupyter" directory by taking the path of the scripts directory, going up one directory and change then into "jupyter". JUPYTER_NOTEBOOK_DIRECTORY=${JUPYTER_NOTEBOOK_DIRECTORY:-"${SCRIPTS_DIR}/../jupyter"} # Repository directory containing the Jupyter Notebooks + +echo "${LOG_GROUP_START}Initialize Jupyter Notebook Reports"; +echo "JupyterReports: REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR}" +echo "JupyterReports: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}" +echo "JupyterReports: SCRIPTS_DIR=${SCRIPTS_DIR}" echo "JupyterReports: JUPYTER_NOTEBOOK_DIRECTORY=${JUPYTER_NOTEBOOK_DIRECTORY}" +echo "${LOG_GROUP_END}"; # Run all jupiter notebooks for jupyter_notebook_file in "${JUPYTER_NOTEBOOK_DIRECTORY}"/*.ipynb; do diff --git a/scripts/reports/compilations/VisualizationReports.sh b/scripts/reports/compilations/VisualizationReports.sh index 63ace1b88..1b5bb2f0c 100755 --- a/scripts/reports/compilations/VisualizationReports.sh +++ b/scripts/reports/compilations/VisualizationReports.sh @@ -20,10 +20,12 @@ LOG_GROUP_END=${LOG_GROUP_END:-"::endgroup::"} # Prefix to end a log group. Defa # CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes. # This way non-standard tools like readlink aren't needed. REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} -echo "VisualizationReports: REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR}" - REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR:-$(dirname -- "${REPORT_COMPILATIONS_SCRIPT_DIR}")} + +echo "${LOG_GROUP_START}Initialize Visualization Reports"; +echo "VisualizationReports: REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR}" echo "VisualizationReports: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}" +echo "${LOG_GROUP_END}"; # Run all visualization scripts for visualization_script_file in "${REPORTS_SCRIPT_DIR}"/*Visualization.sh; do