diff --git a/.gitignore b/.gitignore index ba8fd3cce..9f9a8293a 100644 --- a/.gitignore +++ b/.gitignore @@ -94,4 +94,7 @@ coverage/ *.nbconvert* # Python environments -.conda \ No newline at end of file +.conda + +# Optuna (and other) Database data +*.db \ No newline at end of file diff --git a/COMMANDS.md b/COMMANDS.md index acb850198..5c30f0099 100644 --- a/COMMANDS.md +++ b/COMMANDS.md @@ -8,6 +8,7 @@ - [Examples](#examples) - [Start an analysis with CSV reports only](#start-an-analysis-with-csv-reports-only) - [Start an analysis with Jupyter reports only](#start-an-analysis-with-jupyter-reports-only) + - [Start an analysis with Python reports only](#start-an-analysis-with-python-reports-only) - [Start an analysis with PDF generation](#start-an-analysis-with-pdf-generation) - [Start an analysis without importing git log data](#start-an-analysis-without-importing-git-log-data) - [Only run setup and explore the Graph manually](#only-run-setup-and-explore-the-graph-manually) @@ -102,6 +103,14 @@ If only the Jupyter reports are needed e.g. when the CSV reports had already bee ./../../scripts/analysis/analyze.sh --report Jupyter ``` +#### Start an analysis with Python reports only + +If you only need Python reports, e.g. to skip the Chromium Browser dependency, this can be done with: + +```shell +./../../scripts/analysis/analyze.sh --report Python +``` + #### Start an analysis with PDF generation Note: Generating a PDF from a Jupyter notebook using [nbconvert](https://nbconvert.readthedocs.io) takes some time and might even fail due to a timeout error. diff --git a/GETTING_STARTED.md b/GETTING_STARTED.md index 9f138d666..8850965e2 100644 --- a/GETTING_STARTED.md +++ b/GETTING_STARTED.md @@ -84,16 +84,22 @@ Use these optional command line options as needed: ./../../scripts/analysis/analyze.sh --report Csv ``` -- Jupyter notebook reports when Python and Conda are installed: +- Jupyter notebook reports when Python and Conda are installed (and Chromium Browser for PDF generation): ```shell ./../../scripts/analysis/analyze.sh --report Jupyter ``` +- Python reports when Python and Conda are installed (without Chromium Browser for PDF generation): + + ```shell + ./../../scripts/analysis/analyze.sh --report Python + ``` + - Graph visualizations when Node.js and npm are installed: ```shell - ./../../scripts/analysis/analyze.sh --report Jupyter + ./../../scripts/analysis/analyze.sh --report Visualization ``` - All reports with Python, Conda, Node.js and npm installed: diff --git a/README.md b/README.md index 2ce0fd259..79a2b0f11 100644 --- a/README.md +++ b/README.md @@ -148,6 +148,9 @@ The [Code Structure Analysis Pipeline](./.github/workflows/internal-java-code-an - [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver) - [openTSNE](https://github.com/pavlin-policar/openTSNE) - [wordcloud](https://github.com/amueller/word_cloud) + - [umap](https://umap-learn.readthedocs.io) + - [scikit-learn](https://scikit-learn.org) + - [optuna](https://optuna.org) - [Graph Visualization](./graph-visualization/README.md) uses [node.js](https://nodejs.org/de) and the dependencies listed in [package.json](./graph-visualization/package.json). - [HPCC-Systems (High Performance Computing Cluster) Web-Assembly (JavaScript)](https://github.com/hpcc-systems/hpcc-js-wasm) containing a wrapper for GraphViz to visualize graph structures. 
- [GraphViz](https://gitlab.com/graphviz/graphviz) for CLI Graph Visualization diff --git a/cypher/Community_Detection/Community_Detection_2b_Leiden_Tuneable_Statistics.cypher b/cypher/Community_Detection/Community_Detection_2b_Leiden_Tuneable_Statistics.cypher new file mode 100644 index 000000000..117669b2d --- /dev/null +++ b/cypher/Community_Detection/Community_Detection_2b_Leiden_Tuneable_Statistics.cypher @@ -0,0 +1,31 @@ +//Community Detection Leiden Statistics + +CALL gds.leiden.stats( + $dependencies_projection + '-cleaned', { + gamma: toFloat($dependencies_leiden_gamma), + theta: toFloat($dependencies_leiden_theta), + maxLevels: toInteger($dependencies_leiden_max_levels), + tolerance: 0.0000001, + consecutiveIds: true, + relationshipWeightProperty: $dependencies_projection_weight_property +}) +YIELD nodeCount + ,communityCount + ,ranLevels + ,modularity + ,modularities + ,communityDistribution +RETURN nodeCount + ,communityCount + ,ranLevels + ,modularity + ,modularities + ,communityDistribution.min + ,communityDistribution.mean + ,communityDistribution.max + ,communityDistribution.p50 + ,communityDistribution.p75 + ,communityDistribution.p90 + ,communityDistribution.p95 + ,communityDistribution.p99 + ,communityDistribution.p999 \ No newline at end of file diff --git a/cypher/Community_Detection/Community_Detection_2d_Leiden_Tuneable_Write.cypher b/cypher/Community_Detection/Community_Detection_2d_Leiden_Tuneable_Write.cypher new file mode 100644 index 000000000..7eb67a102 --- /dev/null +++ b/cypher/Community_Detection/Community_Detection_2d_Leiden_Tuneable_Write.cypher @@ -0,0 +1,40 @@ +//Community Detection Leiden Write property communityLeidenId + +CALL gds.leiden.write( + $dependencies_projection + '-cleaned', { + gamma: toFloat($dependencies_leiden_gamma), + theta: toFloat($dependencies_leiden_theta), + maxLevels: toInteger($dependencies_leiden_max_levels), + tolerance: 0.0000001, + consecutiveIds: true, + relationshipWeightProperty: $dependencies_projection_weight_property, + writeProperty: $dependencies_projection_write_property +}) +YIELD preProcessingMillis + ,computeMillis + ,writeMillis + ,postProcessingMillis + ,nodePropertiesWritten + ,communityCount + ,ranLevels + ,modularity + ,modularities + ,communityDistribution +RETURN preProcessingMillis + ,computeMillis + ,writeMillis + ,postProcessingMillis + ,nodePropertiesWritten + ,communityCount + ,ranLevels + ,modularity + ,communityDistribution.min + ,communityDistribution.mean + ,communityDistribution.max + ,communityDistribution.p50 + ,communityDistribution.p75 + ,communityDistribution.p90 + ,communityDistribution.p95 + ,communityDistribution.p99 + ,communityDistribution.p999 + ,modularities \ No newline at end of file diff --git a/cypher/Dependencies_Projection/Dependencies_13_Sample_Projected_Graph.cypher b/cypher/Dependencies_Projection/Dependencies_13_Sample_Projected_Graph.cypher new file mode 100644 index 000000000..d9935d272 --- /dev/null +++ b/cypher/Dependencies_Projection/Dependencies_13_Sample_Projected_Graph.cypher @@ -0,0 +1,11 @@ +// Creates a smaller projection by sampling the original graph using "Common Neighbour Aware Random Walk" + +CALL gds.graph.sample.cnarw( + $dependencies_projection + '-sampled-cleaned', + $dependencies_projection, + { + samplingRatio: toFloat($dependencies_projection_sampling_ratio) + } +) +YIELD graphName, fromGraphName, nodeCount, relationshipCount, startNodeCount, projectMillis +RETURN graphName, fromGraphName, nodeCount, relationshipCount, startNodeCount, projectMillis \ 
No newline at end of file diff --git a/cypher/Dependencies_Projection/Dependencies_14_Write_Batch_Data.cypher b/cypher/Dependencies_Projection/Dependencies_14_Write_Batch_Data.cypher new file mode 100644 index 000000000..69fc5be00 --- /dev/null +++ b/cypher/Dependencies_Projection/Dependencies_14_Write_Batch_Data.cypher @@ -0,0 +1,7 @@ +// Writes batch data back into the database for code units when working with a dependencies projection. Variables: dependencies_projection_rows, dependencies_projection_node + +UNWIND $dependencies_projection_rows AS row +MATCH (codeUnit) +WHERE elementId(codeUnit) = row.nodeId + AND $dependencies_projection_node IN labels(codeUnit) + SET codeUnit += row.properties \ No newline at end of file diff --git a/cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher b/cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher index 0353d3a16..d1ad6d8f7 100644 --- a/cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher +++ b/cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher @@ -16,8 +16,9 @@ OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName RETURN DISTINCT coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName - ,codeUnit.name AS shortCodeUnitName - ,coalesce(artifactName, projectName) AS projectName - ,coalesce(codeUnit.communityLeidenId, 0) AS communityId + ,codeUnit.name AS shortCodeUnitName + ,elementId(codeUnit) AS nodeElementId + ,coalesce(artifactName, projectName) AS projectName + ,coalesce(codeUnit.communityLeidenId, 0) AS communityId ,coalesce(codeUnit.centralityPageRank, 0.01) AS centrality ,embedding \ No newline at end of file diff --git a/cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Tuneable_Stream.cypher b/cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Tuneable_Stream.cypher new file mode 100644 index 000000000..59046b681 --- /dev/null +++ b/cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Tuneable_Stream.cypher @@ -0,0 +1,26 @@ +// Node Embeddings 1d using Fast Random Projection: Stream for Hyper-Parameter tuning. Requires "Add_file_name and_extension.cypher". 
+ +CALL gds.fastRP.stream( + $dependencies_projection + '-cleaned', { + embeddingDimension: toInteger($dependencies_projection_embedding_dimension) + ,randomSeed: toInteger($dependencies_projection_embedding_random_seed) + ,normalizationStrength: toFloat($dependencies_projection_fast_random_projection_normalization_strength) + ,iterationWeights: [0.0, 0.0, 1.0, toFloat($dependencies_projection_fast_random_projection_forth_iteration_weight)] + ,relationshipWeightProperty: $dependencies_projection_weight_property + } +) +YIELD nodeId, embedding + WITH gds.util.asNode(nodeId) AS codeUnit + ,embedding +OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit) + WITH *, artifact.name AS artifactName +OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit) + WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName + RETURN DISTINCT + coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName + ,codeUnit.name AS shortCodeUnitName + ,elementId(codeUnit) AS nodeElementId + ,coalesce(artifactName, projectName) AS projectName + ,coalesce(codeUnit.communityLeidenIdTuned, codeUnit.communityLeidenId, 0) AS communityId + ,coalesce(codeUnit.centralityPageRank, 0.01) AS centrality + ,embedding \ No newline at end of file diff --git a/cypher/Node_Embeddings/Node_Embeddings_1e_Fast_Random_Projection_Tuneable_Write.cypher b/cypher/Node_Embeddings/Node_Embeddings_1e_Fast_Random_Projection_Tuneable_Write.cypher new file mode 100644 index 000000000..f47ca7ab5 --- /dev/null +++ b/cypher/Node_Embeddings/Node_Embeddings_1e_Fast_Random_Projection_Tuneable_Write.cypher @@ -0,0 +1,14 @@ +// Node Embeddings 1e using Fast Random Projection: Write for tuned hyper-parameters. 
+ +CALL gds.fastRP.write( + $dependencies_projection + '-cleaned', { + embeddingDimension: toInteger($dependencies_projection_embedding_dimension) + ,randomSeed: toInteger($dependencies_projection_embedding_random_seed) + ,normalizationStrength: toFloat($dependencies_projection_fast_random_projection_normalization_strength) + ,iterationWeights: [0.0, 0.0, 1.0, toFloat($dependencies_projection_fast_random_projection_forth_iteration_weight)] + ,relationshipWeightProperty: $dependencies_projection_weight_property + ,writeProperty: $dependencies_projection_write_property + } +) + YIELD nodeCount, nodePropertiesWritten, preProcessingMillis, computeMillis, writeMillis +RETURN nodeCount, nodePropertiesWritten, preProcessingMillis, computeMillis, writeMillis \ No newline at end of file diff --git a/cypher/Node_Embeddings/Node_Embeddings_2d_Hash_GNN_Stream.cypher b/cypher/Node_Embeddings/Node_Embeddings_2d_Hash_GNN_Stream.cypher index 661e555b2..150efed36 100644 --- a/cypher/Node_Embeddings/Node_Embeddings_2d_Hash_GNN_Stream.cypher +++ b/cypher/Node_Embeddings/Node_Embeddings_2d_Hash_GNN_Stream.cypher @@ -22,8 +22,9 @@ OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName RETURN DISTINCT coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName - ,codeUnit.name AS shortCodeUnitName - ,coalesce(artifactName, projectName) AS projectName - ,coalesce(codeUnit.communityLeidenId, 0) AS communityId - ,coalesce(codeUnit.centralityPageRank, 0.01) AS centrality - ,embedding \ No newline at end of file + ,codeUnit.name AS shortCodeUnitName + ,elementId(codeUnit) AS nodeElementId + ,coalesce(artifactName, projectName) AS projectName + ,coalesce(codeUnit.communityLeidenId, 0) AS communityId + ,coalesce(codeUnit.centralityPageRank, 0.01) AS centrality + ,embedding \ No newline at end of file diff --git a/cypher/Node_Embeddings/Node_Embeddings_2d_Hash_GNN_Tuneable_Stream.cypher b/cypher/Node_Embeddings/Node_Embeddings_2d_Hash_GNN_Tuneable_Stream.cypher new file mode 100644 index 000000000..0171c029f --- /dev/null +++ b/cypher/Node_Embeddings/Node_Embeddings_2d_Hash_GNN_Tuneable_Stream.cypher @@ -0,0 +1,30 @@ +// Node Embeddings 2c using Hash GNN (Graph Neural Networks): Stream. Requires "Add_file_name and_extension.cypher". 
+ +CALL gds.beta.hashgnn.stream( + $dependencies_projection + '-cleaned', { + embeddingDensity: toInteger($dependencies_projection_embedding_dimension) * 2 * toInteger($dependencies_projection_hashgnn_dimension_multiplier) + ,randomSeed: toInteger($dependencies_projection_embedding_random_seed) + ,iterations: toInteger($dependencies_projection_hashgnn_iterations) + ,generateFeatures: { + dimension: toInteger($dependencies_projection_embedding_dimension) * 4 * toInteger($dependencies_projection_hashgnn_dimension_multiplier) + ,densityLevel: toInteger($dependencies_projection_hashgnn_density_level) + } + ,outputDimension: toInteger($dependencies_projection_embedding_dimension) + ,neighborInfluence: toFloat($dependencies_projection_hashgnn_neighbor_influence) + } +) +YIELD nodeId, embedding + WITH gds.util.asNode(nodeId) AS codeUnit + ,embedding +OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit) + WITH *, artifact.name AS artifactName +OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit) + WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName + RETURN DISTINCT + coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName + ,codeUnit.name AS shortCodeUnitName + ,elementId(codeUnit) AS nodeElementId + ,coalesce(artifactName, projectName) AS projectName + ,coalesce(codeUnit.communityLeidenIdTuned, codeUnit.communityLeidenId, 0) AS communityId + ,coalesce(codeUnit.centralityPageRank, 0.01) AS centrality + ,embedding \ No newline at end of file diff --git a/cypher/Node_Embeddings/Node_Embeddings_3d_Node2Vec_Stream.cypher b/cypher/Node_Embeddings/Node_Embeddings_3d_Node2Vec_Stream.cypher index 6bbebe3d5..28021d236 100644 --- a/cypher/Node_Embeddings/Node_Embeddings_3d_Node2Vec_Stream.cypher +++ b/cypher/Node_Embeddings/Node_Embeddings_3d_Node2Vec_Stream.cypher @@ -17,8 +17,9 @@ OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName RETURN DISTINCT coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName - ,codeUnit.name AS shortCodeUnitName - ,coalesce(artifactName, projectName) AS projectName - ,coalesce(codeUnit.communityLeidenId, 0) AS communityId - ,coalesce(codeUnit.centralityPageRank, 0.01) AS centrality - ,embedding \ No newline at end of file + ,codeUnit.name AS shortCodeUnitName + ,elementId(codeUnit) AS nodeElementId + ,coalesce(artifactName, projectName) AS projectName + ,coalesce(codeUnit.communityLeidenId, 0) AS communityId + ,coalesce(codeUnit.centralityPageRank, 0.01) AS centrality + ,embedding \ No newline at end of file diff --git a/cypher/Node_Embeddings/Node_Embeddings_3d_Node2Vec_Tuneable_Stream.cypher b/cypher/Node_Embeddings/Node_Embeddings_3d_Node2Vec_Tuneable_Stream.cypher new file mode 100644 index 000000000..9777117a5 --- /dev/null +++ b/cypher/Node_Embeddings/Node_Embeddings_3d_Node2Vec_Tuneable_Stream.cypher @@ -0,0 +1,32 @@ +// Node Embeddings 3c using Node2Vec: Stream. Requires "Add_file_name and_extension.cypher". 
+ +CALL gds.node2vec.stream( + $dependencies_projection + '-cleaned', { + embeddingDimension: toInteger($dependencies_projection_embedding_dimension) + ,randomSeed: toInteger($dependencies_projection_embedding_random_seed) + ,iterations: toInteger($dependencies_projection_node2vec_iterations) + ,inOutFactor: toFloat($dependencies_projection_node2vec_in_out_factor) + ,returnFactor: toFloat($dependencies_projection_node2vec_return_factor) + ,windowSize: toInteger($dependencies_projection_node2vec_window_size) + ,walksPerNode: toInteger($dependencies_projection_node2vec_walks_per_node) + ,walkLength: toInteger($dependencies_projection_node2vec_walk_length) + ,negativeSamplingRate: toInteger($dependencies_projection_node2vec_negative_sampling_rate) + ,positiveSamplingFactor: toFloat($dependencies_projection_node2vec_positive_sampling_factor) + ,relationshipWeightProperty: $dependencies_projection_weight_property + } +) +YIELD nodeId, embedding + WITH gds.util.asNode(nodeId) AS codeUnit + ,embedding +OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit) + WITH *, artifact.name AS artifactName +OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit) + WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName + RETURN DISTINCT + coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName + ,codeUnit.name AS shortCodeUnitName + ,elementId(codeUnit) AS nodeElementId + ,coalesce(artifactName, projectName) AS projectName + ,coalesce(codeUnit.communityLeidenIdTuned, codeUnit.communityLeidenId, 0) AS communityId + ,coalesce(codeUnit.centralityPageRank, 0.01) AS centrality + ,embedding \ No newline at end of file diff --git a/cypher/Validation/ValidateAlwaysFalse.cypher b/cypher/Validation/ValidateAlwaysFalse.cypher new file mode 100644 index 000000000..fec63bb61 --- /dev/null +++ b/cypher/Validation/ValidateAlwaysFalse.cypher @@ -0,0 +1,3 @@ +// Will never return any results so that the validation will always fail. This is helpful for Jupyter Notebooks that should not be executed automatically. + +MATCH (nothing) RETURN nothing LIMIT 0 \ No newline at end of file diff --git a/domains/anomaly-detection/anomalyDetectionCsv.sh b/domains/anomaly-detection/anomalyDetectionCsv.sh new file mode 100755 index 000000000..d30399b91 --- /dev/null +++ b/domains/anomaly-detection/anomalyDetectionCsv.sh @@ -0,0 +1,151 @@ +#!/usr/bin/env bash + +# Pipeline that coordinates anomaly detection using the Graph Data Science Library of Neo4j. +# It requires an already running Neo4j graph database with already scanned and analyzed artifacts. +# The results will be written into the sub directory reports/anomaly-detection. + +# Note that "scripts/prepareAnalysis.sh" is required to run prior to this script. + +# Requires executeQueryFunctions.sh, projectionFunctions.sh, cleanupAfterReportGeneration.sh + +# Fail on any error ("-e" = exit on first error, "-o pipefail" exist on errors within piped commands) +set -o errexit -o pipefail + +# Overrideable Constants (defaults also defined in sub scripts) +REPORTS_DIRECTORY=${REPORTS_DIRECTORY:-"reports"} + +## Get this "scripts/reports" directory if not already set +# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution. +# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes. +# This way non-standard tools like readlink aren't needed. 
+ANOMALY_DETECTION_SCRIPT_DIR=${ANOMALY_DETECTION_SCRIPT_DIR:-$(CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)} +echo "anomalyDetectionCsv: ANOMALY_DETECTION_SCRIPT_DIR=${ANOMALY_DETECTION_SCRIPT_DIR}" +# Get the "scripts" directory by taking the path of this script and going one directory up. +SCRIPTS_DIR=${SCRIPTS_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/../../scripts"} # Repository directory containing the shell scripts +# Get the "cypher" query directory for gathering features. +ANOMALY_DETECTION_FEATURE_CYPHER_DIR=${ANOMALY_DETECTION_FEATURE_CYPHER_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/features"} +ANOMALY_DETECTION_QUERY_CYPHER_DIR=${ANOMALY_DETECTION_QUERY_CYPHER_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/queries"} + +# Define functions to execute a cypher query from within a given file (first and only argument) like "execute_cypher" +source "${SCRIPTS_DIR}/executeQueryFunctions.sh" + +# Define functions to create and delete Graph Projections like "createUndirectedDependencyProjection" +source "${SCRIPTS_DIR}/projectionFunctions.sh" + +# Query or recalculate features. +# +# Required Parameters: +# - projection_name=... +# Name prefix for the in-memory projection name. Example: "package-anomaly-detection" +# - projection_node_label=... +# Label of the nodes that will be used for the projection. Example: "Package" +# - projection_weight_property=... +# Name of the node property that contains the dependency weight. Example: "weight" +anomaly_detection_features() { + local nodeLabel + nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" ) + + echo "anomalyDetectionCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Collecting features for ${nodeLabel} nodes..." + + # Determine the Betweenness centrality (with the directed graph projection) if not already done + execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Betweenness-Exists.cypher" \ + "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Betweenness-Write.cypher" "${@}" + # Determine the local clustering coefficient if not already done + execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-LocalClusteringCoefficient-Exists.cypher" \ + "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-LocalClusteringCoefficient-Write.cypher" "${@}" + # Determine the page rank if not already done + execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageRank-Exists.cypher" \ + "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageRank-Write.cypher" "${@}" + # Determine the article rank if not already done + execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-ArticleRank-Exists.cypher" \ + "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageRank-Write.cypher" "${@}" +} +# Run queries to find anomalies in the graph. +# +# Required Parameters: +# - projection_node_label=... +# Label of the nodes that will be used for the projection. Example: "Package" +anomaly_detection_queries() { + local nodeLabel + nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" ) + + echo "anomalyDetectionCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Executing Queries for ${nodeLabel} nodes..." 
+ execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPotentialImbalancedRoles.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${nodeLabel}AnomalyDetection_PotentialImbalancedRoles.csv" + execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPotentialOverEngineerOrIsolated.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${nodeLabel}AnomalyDetection_PotentialOverEngineerOrIsolated.csv" + + execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionHiddenBridgeNodes.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${nodeLabel}AnomalyDetection_HiddenBridgeNodes.csv" + execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPopularBottlenecks.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${nodeLabel}AnomalyDetection_PopularBottlenecks.csv" + execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionSilentCoordinators.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${nodeLabel}AnomalyDetection_SilentCoordinators.csv" + execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionOverReferencesUtilities.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${nodeLabel}AnomalyDetection_OverReferencesUtilities.csv" + execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionFragileStructuralBridges.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${nodeLabel}AnomalyDetection_FragileStructuralBridges.csv" + execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionDependencyHungryOrchestrators.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${nodeLabel}AnomalyDetection_DependencyHungryOrchestrators.csv" + execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionUnexpectedCentralNodes.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${nodeLabel}AnomalyDetection_UnexpectedCentralNodes.csv" +} + +# Run the anomaly detection pipeline. +# +# Required Parameters: +# - projection_name=... +# Name prefix for the in-memory projection name. Example: "package-anomaly-detection" +# - projection_node_label=... +# Label of the nodes that will be used for the projection. Example: "Package" +# - projection_weight_property=... +# Name of the node property that contains the dependency weight. 
Example: "weight" +anomaly_detection_csv_reports() { + time anomaly_detection_features "${@}" + time anomaly_detection_queries "${@}" +} + +# Create report directory +REPORT_NAME="anomaly-detection" +FULL_REPORT_DIRECTORY="${REPORTS_DIRECTORY}/${REPORT_NAME}" +mkdir -p "${FULL_REPORT_DIRECTORY}" + +# Query Parameter key pairs for projection and algorithm side +PROJECTION_NAME="dependencies_projection" +ALGORITHM_PROJECTION="projection_name" + +PROJECTION_NODE="dependencies_projection_node" +ALGORITHM_NODE="projection_node_label" + +PROJECTION_WEIGHT="dependencies_projection_weight_property" +ALGORITHM_WEIGHT="projection_weight_property" + +# Code independent algorithm parameters +COMMUNITY_PROPERTY="community_property=communityLeidenIdTuned" +EMBEDDING_PROPERTY="embedding_property=embeddingsFastRandomProjectionTunedForClustering" + +# -- Java Artifact Node Embeddings ------------------------------- + +if createUndirectedDependencyProjection "${PROJECTION_NAME}=artifact-anomaly-detection" "${PROJECTION_NODE}=Artifact" "${PROJECTION_WEIGHT}=weight"; then + createDirectedDependencyProjection "${PROJECTION_NAME}=artifact-anomaly-detection-directed" "${PROJECTION_NODE}=Artifact" "${PROJECTION_WEIGHT}=weight" + anomaly_detection_csv_reports "${ALGORITHM_PROJECTION}=artifact-anomaly-detection" "${ALGORITHM_NODE}=Artifact" "${ALGORITHM_WEIGHT}=weight" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}" +fi + +# -- Java Package Node Embeddings -------------------------------- + +if createUndirectedDependencyProjection "${PROJECTION_NAME}=package-anomaly-detection" "${PROJECTION_NODE}=Package" "${PROJECTION_WEIGHT}=weight25PercentInterfaces"; then + createDirectedDependencyProjection "${PROJECTION_NAME}=package-anomaly-detection-directed" "${PROJECTION_NODE}=Package" "${PROJECTION_WEIGHT}=weight25PercentInterfaces" + anomaly_detection_csv_reports "${ALGORITHM_PROJECTION}=package-anomaly-detection" "${ALGORITHM_NODE}=Package" "${ALGORITHM_WEIGHT}=weight25PercentInterfaces" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}" +fi + +# -- Java Type Node Embeddings ----------------------------------- + +if createUndirectedJavaTypeDependencyProjection "${PROJECTION_NAME}=type-anomaly-detection"; then + createDirectedJavaTypeDependencyProjection "${PROJECTION_NAME}=type-anomaly-detection-directed" + anomaly_detection_csv_reports "${ALGORITHM_PROJECTION}=type-anomaly-detection" "${ALGORITHM_NODE}=Type" "${ALGORITHM_WEIGHT}=weight" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}" +fi + +# -- Typescript Module Node Embeddings --------------------------- + +if createUndirectedDependencyProjection "${PROJECTION_NAME}=typescript-module-embedding" "${PROJECTION_NODE}=Module" "${PROJECTION_WEIGHT}=lowCouplingElement25PercentWeight"; then + createDirectedDependencyProjection "${PROJECTION_NAME}=typescript-module-embedding-directed" "${PROJECTION_NODE}=Module" "${PROJECTION_WEIGHT}=lowCouplingElement25PercentWeight" + anomaly_detection_csv_reports "${ALGORITHM_PROJECTION}=typescript-module-embedding" "${ALGORITHM_NODE}=Module" "${ALGORITHM_WEIGHT}=lowCouplingElement25PercentWeight" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}" +fi + +# --------------------------------------------------------------- + +# Clean-up after report generation. Empty reports will be deleted. +source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${FULL_REPORT_DIRECTORY}" + +echo "anomalyDetectionCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Successfully finished." 
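The helper functions above read their settings from positional key=value arguments (for example projection_node_label=Package) through extractQueryParameter, which is sourced from scripts/executeQueryFunctions.sh and is not part of this diff. The following standalone sketch only illustrates the assumed lookup behaviour; the function name with the _sketch suffix and its implementation are hypothetical and not taken from the change set:

```shell
# Hypothetical stand-in to illustrate how a "key=value" argument could be looked up.
# The real extractQueryParameter in scripts/executeQueryFunctions.sh may differ.
extract_query_parameter_sketch() {
    local key="${1}"
    shift
    local argument
    for argument in "$@"; do
        if [[ "${argument}" == "${key}="* ]]; then
            echo "${argument#*=}" # print everything after the first "="
            return 0
        fi
    done
    return 1
}

extract_query_parameter_sketch "projection_node_label" \
    "projection_name=package-anomaly-detection" "projection_node_label=Package" # prints "Package"
```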
\ No newline at end of file diff --git a/domains/anomaly-detection/anomalyDetectionPlots.py b/domains/anomaly-detection/anomalyDetectionPlots.py new file mode 100755 index 000000000..3de4d8aab --- /dev/null +++ b/domains/anomaly-detection/anomalyDetectionPlots.py @@ -0,0 +1,857 @@ +#!/usr/bin/env python + +# This Python script creates plots that might help to find anomalies in code. +# It queries the Graph database, aggregates the data from previously calculated metrics and plots the diagrams into files. +# +# Input Parameters: +# - --verbose: for finer grained log, optional +# - query_parameters: e.g. "projection_node_label=Package", required +# +# Requires: +# - tunedLeidenCommunityDetection.py +# - tunedNodeEmbeddingClustering +# - umap2dNodeEmbeddings.py + +import typing + +import os +import sys +import argparse +import pprint +import contextlib + +import pandas as pd +import numpy as np + +from neo4j import GraphDatabase, Driver + +import matplotlib.pyplot as plot +import seaborn + + +class Parameters: + required_parameters_ = ["projection_node_label"] + + def __init__(self, input_parameters: typing.Dict[str, str], report_directory: str = "", verbose: bool = False): + self.query_parameters_ = input_parameters.copy() # copy enforces immutability + self.report_directory = report_directory + self.verbose_ = verbose + + def __repr__(self): + pretty_dict = pprint.pformat(self.query_parameters_, indent=4) + return f"Parameters: verbose={self.verbose_}, report_directory={self.report_directory}, query_parameters:\n{pretty_dict}" + + @staticmethod + def log_dependency_versions_() -> None: + print('---------------------------------------') + + print('Python version: {}'.format(sys.version)) + + from numpy import __version__ as numpy_version + print('numpy version: {}'.format(numpy_version)) + + from pandas import __version__ as pandas_version + print('pandas version: {}'.format(pandas_version)) + + from matplotlib import __version__ as matplotlib_version + print('matplotlib version: {}'.format(matplotlib_version)) + + from seaborn import __version__ as seaborn_version # type: ignore + print('seaborn version: {}'.format(seaborn_version)) + + from neo4j import __version__ as neo4j_version + print('neo4j version: {}'.format(neo4j_version)) + + print('---------------------------------------') + + @classmethod + def from_input_parameters(cls, input_parameters: typing.Dict[str, str], report_directory: str = "", verbose: bool = False): + """ + Creates a Parameters instance from a dictionary of input parameters. + The dictionary must contain the following keys: + - "projection_node_label": The node type of the projection. 
+ """ + missing_parameters = [parameter for parameter in cls.required_parameters_ if parameter not in input_parameters] + if missing_parameters: + raise ValueError("Missing parameters:", missing_parameters) + created_parameters = cls(input_parameters, report_directory, verbose) + if created_parameters.is_verbose(): + print(created_parameters) + cls.log_dependency_versions_() + return created_parameters + + @classmethod + def example(cls): + return cls({"projection_node_label": "Package"}) + + def get_query_parameters(self) -> typing.Dict[str, str]: + return self.query_parameters_.copy() # copy enforces immutability + + def get_projection_node_label(self) -> str: + return self.query_parameters_["projection_node_label"] + + def get_report_directory(self) -> str: + return self.report_directory + + def is_verbose(self) -> bool: + return self.verbose_ + + +def parse_input_parameters() -> Parameters: + # Convert list of "key=value" strings to a dictionary + def parse_key_value_list(param_list: typing.List[str]) -> typing.Dict[str, str]: + param_dict = {} + for item in param_list: + if '=' in item: + key, value = item.split('=', 1) + param_dict[key] = value + return param_dict + + parser = argparse.ArgumentParser( + description="Creates plots based on previously calculated metrics and clustering results that might help to find anomalies in the analyzed code.") + parser.add_argument('--verbose', action='store_true', help='Enable verbose mode to log all details') + parser.add_argument('--report_directory', type=str, default="", help='Path to the report directory') + parser.add_argument('query_parameters', nargs='*', type=str, help='List of key=value Cypher query parameters') + parser.set_defaults(verbose=False) + args = parser.parse_args() + return Parameters.from_input_parameters(parse_key_value_list(args.query_parameters), args.report_directory, args.verbose) + + +def get_graph_database_driver() -> Driver: + driver = GraphDatabase.driver( + uri="bolt://localhost:7687", + auth=("neo4j", os.environ.get("NEO4J_INITIAL_PASSWORD")) + ) + driver.verify_connectivity() + print("anomalyDetectionPlots: Successfully connected to Neo4j") + return driver + + +def query_cypher_to_data_frame(query: typing.LiteralString, parameters: typing.Optional[typing.Dict[str, typing.Any]] = None): + records, summary, keys = driver.execute_query(query, parameters_=parameters) + print(f"anomalyDetectionPlots: Successfully queried data from Neo4j after {summary.result_available_after} ms") + return pd.DataFrame([record.values() for record in records], columns=keys) + + +def query_cypher_to_data_frame_suppress_warnings(query: typing.LiteralString, parameters: typing.Optional[typing.Dict[str, typing.Any]] = None): + """ + Executes the given Cypher query and returns the result as a pandas DataFrame. + This function suppresses any warnings or error messages that would normally be printed to stderr. + This is useful when you want to run a query without cluttering the output with warnings. + Parameters: + - query: The Cypher query to execute. + - parameters: Optional dictionary of parameters to pass to the Cypher query. + Returns: + - A pandas DataFrame containing the results of the Cypher query. 
+ """ + with open(os.devnull, 'w') as devnull, contextlib.redirect_stderr(devnull): + return query_cypher_to_data_frame(query, parameters) + + +def query_data(input_parameters: Parameters = Parameters.example()) -> pd.DataFrame: + + query: typing.LiteralString = """ + MATCH (codeUnit) + WHERE $projection_node_label IN labels(codeUnit) + AND codeUnit.incomingDependencies IS NOT NULL + AND codeUnit.outgoingDependencies IS NOT NULL + AND codeUnit.centralityArticleRank IS NOT NULL + AND codeUnit.centralityPageRank IS NOT NULL + AND codeUnit.centralityBetweenness IS NOT NULL + AND codeUnit.communityLocalClusteringCoefficient IS NOT NULL + AND codeUnit.clusteringHDBSCANLabel IS NOT NULL + AND codeUnit.clusteringHDBSCANProbability IS NOT NULL + AND codeUnit.clusteringHDBSCANNoise IS NOT NULL + AND codeUnit.clusteringHDBSCANMedoid IS NOT NULL + AND codeUnit.clusteringHDBSCANSize IS NOT NULL + AND codeUnit.clusteringHDBSCANRadiusMax IS NOT NULL + AND codeUnit.clusteringHDBSCANRadiusAverage IS NOT NULL + AND codeUnit.embeddingFastRandomProjectionVisualizationX IS NOT NULL + AND codeUnit.embeddingFastRandomProjectionVisualizationY IS NOT NULL + OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit) + WITH *, artifact.name AS artifactName + OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit) + WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName + RETURN DISTINCT + coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName + ,codeUnit.name AS shortCodeUnitName + ,coalesce(artifactName, projectName) AS projectName + ,codeUnit.incomingDependencies AS incomingDependencies + ,codeUnit.outgoingDependencies AS outgoingDependencies + ,codeUnit.centralityArticleRank AS articleRank + ,codeUnit.centralityPageRank AS pageRank + ,codeUnit.centralityBetweenness AS betweenness + ,codeUnit.communityLocalClusteringCoefficient AS clusteringCoefficient + ,1.0 - codeUnit.communityLocalClusteringCoefficient AS inverseClusteringCoefficient + ,codeUnit.centralityPageRank - codeUnit.centralityArticleRank AS pageToArticleRankDifference + ,codeUnit.incomingDependencies + codeUnit.outgoingDependencies AS degree + ,codeUnit.clusteringHDBSCANLabel AS clusterLabel + ,codeUnit.clusteringHDBSCANProbability AS clusterProbability + ,codeUnit.clusteringHDBSCANNoise AS clusterNoise + ,codeUnit.clusteringHDBSCANMedoid AS clusterMedoid + ,codeUnit.clusteringHDBSCANSize AS clusterSize + ,codeUnit.clusteringHDBSCANRadiusMax AS clusterRadiusMax + ,codeUnit.clusteringHDBSCANRadiusAverage AS clusterRadiusAverage + ,codeUnit.embeddingFastRandomProjectionVisualizationX AS embeddingVisualizationX + ,codeUnit.embeddingFastRandomProjectionVisualizationY AS embeddingVisualizationY + """ + if parameters.is_verbose(): + return query_cypher_to_data_frame(query, parameters=input_parameters.get_query_parameters()) + return query_cypher_to_data_frame_suppress_warnings(query, parameters=input_parameters.get_query_parameters()) + + +def enhance_data_with_visualization_cluster_diameter( + clustering_visualization_dataframe: pd.DataFrame, + result_diameter_column_name: str = 'clusterVisualizationDiameter', + cluster_label_column_name: str = "clusterLabel", + x_position_column: str = "embeddingVisualizationX", + y_position_column: str = "embeddingVisualizationY", +): + def max_pairwise_distance(points): + if len(points) < 2: + return 0.0 + # Efficient vectorized pairwise distance computation + dists = np.sqrt( + np.sum((points[:, 
np.newaxis, :] - points[np.newaxis, :, :]) ** 2, axis=-1) + ) + return np.max(dists) + + cluster_diameters = {} + unique_cluster_labels = clustering_visualization_dataframe[cluster_label_column_name].unique() + for cluster_label in unique_cluster_labels: + if cluster_label == -1: + cluster_diameters[-1] = 0.0 + continue + + cluster_nodes = clustering_visualization_dataframe[ + clustering_visualization_dataframe[cluster_label_column_name] == cluster_label + ] + cluster_diameters[cluster_label] = max_pairwise_distance(cluster_nodes[[x_position_column, y_position_column]].to_numpy()) + + clustering_visualization_dataframe[result_diameter_column_name] = clustering_visualization_dataframe[cluster_label_column_name].map(cluster_diameters) + + +def get_clusters_by_criteria( + data: pd.DataFrame, + by: str, + ascending: bool = True, + cluster_count: int = 10, + label_column_name: str = 'clusterLabel' +) -> pd.DataFrame: + """ + Returns the rows for the "cluster_count" clusters with the largest (ascending=False) or smallest(ascending=True) + value in the column specified with "by". Noise (labeled with -1) remains unfiltered. + """ + if ascending: + threshold = data.groupby(by=label_column_name)[by].min().nsmallest(cluster_count).iloc[-1] + # print(f"Ascending threshold is {threshold} for {by}.") + return data[(data[by] <= threshold) | (data[label_column_name] == -1)] + + threshold = data.groupby(by=label_column_name)[by].max().nlargest(cluster_count).iloc[-1] + # print(f"Descending threshold is {threshold} for {by}.") + return data[(data[by] >= threshold) | (data[label_column_name] == -1)] + + +plot_annotation_style: dict = { + 'textcoords': 'offset points', + 'arrowprops': dict(arrowstyle='->', color='black', alpha=0.3), + 'fontsize': 6, + 'backgroundcolor': 'white', + 'bbox': dict(boxstyle='round,pad=0.4', + edgecolor='silver', + facecolor='whitesmoke', + alpha=1 + ) +} + + +def get_file_path(name: str, parameters: Parameters, extension: str = 'svg') -> str: + name = parameters.get_report_directory() + '/' + name.replace(' ', '_') + '.' + extension + if parameters.is_verbose(): + print(f"Saving file {name}") + return name + + +def plot_standard_deviation_lines(color: typing.LiteralString, mean: float, standard_deviation: float, standard_deviation_factor: int = 0) -> None: + """ + Plots vertical lines for the mean + factor times standard deviation (z-score references). + """ + # Vertical line for the standard deviation + positive_standard_deviation = mean + (standard_deviation_factor * standard_deviation) + horizontal_line_label = f'Mean + {standard_deviation_factor} x Standard Deviation: {positive_standard_deviation:.2f}' if standard_deviation_factor != 0 else f'Mean: {mean:.2f}' + + plot.axvline(positive_standard_deviation, color=color, linestyle='dashed', linewidth=1, label=horizontal_line_label) + + if standard_deviation_factor != 0: + negative_standard_deviation = mean - (standard_deviation_factor * standard_deviation) + plot.axvline(negative_standard_deviation, color=color, linestyle='dashed', linewidth=1) + + plot.legend() + + +def plot_difference_between_article_and_page_rank( + page_ranks: pd.Series, + article_ranks: pd.Series, + short_names: pd.Series, + title: str, + plot_file_path: str +) -> None: + """ + Plots the difference between Article Rank and Page Rank for Java packages. + + Parameters + ---------- + page_ranks : pd.Series + DataFrame column containing Page Rank values. + article_ranks : pd.Series + DataFrame column containing Article Rank values. 
+ short_names : pd.Series + DataFrame column containing short names of the code units. + title: str + """ + if page_ranks.empty or article_ranks.empty or short_names.empty: + print("No data available to plot.") + return + + # Calculate the difference between Article Rank and Page Rank + page_to_article_rank_difference = page_ranks - article_ranks + + plot.figure(figsize=(10, 6)) + plot.hist(page_to_article_rank_difference, bins=50, color='blue', alpha=0.7, edgecolor='black') + plot.title(title) + plot.xlabel('Absolute difference between Page Rank and Article Rank') + plot.ylabel('Frequency') + plot.xlim(left=page_to_article_rank_difference.min(), right=page_to_article_rank_difference.max()) + plot.yscale('log') # Use logarithmic scale for better visibility of differences + plot.grid(True) + plot.tight_layout() + + mean_difference = page_to_article_rank_difference.mean() + standard_deviation = page_to_article_rank_difference.std() + + # Vertical line for the mean + plot_standard_deviation_lines('red', mean_difference, standard_deviation, standard_deviation_factor=0) + # Vertical line for the standard deviation + mean (=z-score of 1) + plot_standard_deviation_lines('orange', mean_difference, standard_deviation, standard_deviation_factor=1) + # Vertical line for 2 x standard deviations + mean (=z-score of 2) + plot_standard_deviation_lines('green', mean_difference, standard_deviation, standard_deviation_factor=2) + + def annotate_outliers(outliers: pd.DataFrame) -> None: + if outliers.empty: + return + for dataframe_index, row in outliers.iterrows(): + index = typing.cast(int, dataframe_index) + value = row['pageToArticleRankDifference'] + x_index_offset = - index * 10 if value > 0 else + index * 10 + plot.annotate( + text=f'{row['shortName']} (rank #{row['page_rank_ranking']}, #{row['article_rank_ranking']})', + xy=(value, 1), + xytext=(value + x_index_offset, 60), + rotation=90, + **plot_annotation_style, + ) + + # Merge all series into a single DataFrame for easier handling + page_to_article_rank_dataframe = pd.DataFrame({ + 'shortName': short_names, + 'pageRank': page_ranks, + 'articleRank': article_ranks, + 'pageToArticleRankDifference': page_to_article_rank_difference, + 'page_rank_ranking': page_ranks.rank().astype(int), + 'article_rank_ranking': article_ranks.rank().astype(int) + }, index=page_ranks.index) + + # Annotate values above z-score of 2 with their names + positive_z_score_2 = mean_difference + 2 * standard_deviation + positive_outliers = page_to_article_rank_dataframe[page_to_article_rank_difference > positive_z_score_2].sort_values(by='pageToArticleRankDifference', ascending=False).reset_index().head(5) + annotate_outliers(positive_outliers) + + # Annotate values below z-score of -2 with their names + negative_z_score_2 = mean_difference - 2 * standard_deviation + negative_outliers = page_to_article_rank_dataframe[page_to_article_rank_difference < negative_z_score_2].sort_values(by='pageToArticleRankDifference', ascending=True).reset_index().head(5) + annotate_outliers(negative_outliers) + + plot.savefig(plot_file_path) + + +def plot_clustering_coefficient_distribution(clustering_coefficients: pd.Series, title: str, plot_file_path: str) -> None: + """ + Plots the distribution of clustering coefficients. + + Parameters + ---------- + clustering_coefficients : pd.Series + Series containing clustering coefficient values. 
+ """ + if clustering_coefficients.empty: + print("No data available to plot.") + return + + plot.figure(figsize=(10, 6)) + plot.hist(clustering_coefficients, bins=40, color='blue', alpha=0.7, edgecolor='black') + plot.title(title) + plot.xlabel('Clustering Coefficient') + plot.ylabel('Frequency') + plot.xlim(left=clustering_coefficients.min(), right=clustering_coefficients.max()) + # plot.yscale('log') # Use logarithmic scale for better visibility of differences + plot.grid(True) + plot.tight_layout() + + mean = clustering_coefficients.mean() + standard_deviation = clustering_coefficients.std() + + # Vertical line for the mean + plot_standard_deviation_lines('red', mean, standard_deviation, standard_deviation_factor=0) + # Vertical line for 1 x standard deviations + mean (=z-score of 1) + plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=1) + + plot.savefig(plot_file_path) + + +def plot_clustering_coefficient_vs_page_rank( + clustering_coefficients: pd.Series, + page_ranks: pd.Series, + short_names: pd.Series, + clustering_noise: pd.Series, + title: str, + plot_file_path: str +) -> None: + """ + Plots the relationship between clustering coefficients and Page Rank values. + + Parameters + ---------- + clustering_coefficients : pd.Series + Series containing clustering coefficient values. + page_ranks : pd.Series + Series containing Page Rank values. + short_names : pd.Series + Series containing short names of the code units. + clustering_noise : pd.Series + Series indicating whether the code unit is noise (value = 1) or not (value = 0) from the clustering algorithm. + """ + if clustering_coefficients.empty or page_ranks.empty or short_names.empty: + print("No data available to plot.") + return + + color = clustering_noise.map({0: 'blue', 1: 'gray'}) + + plot.figure(figsize=(10, 6)) + plot.scatter(x=clustering_coefficients, y=page_ranks, alpha=0.7, color=color) + plot.title(title) + plot.xlabel('Clustering Coefficient') + plot.ylabel('Page Rank') + + # Add legend: grey = noise, blue = non-noise + scatter = plot.scatter([], [], color='blue', label='Non-Noise', alpha=0.7) + scatter_noise = plot.scatter([], [], color='gray', label='Noise', alpha=0.7) + plot.legend(handles=[scatter, scatter_noise], loc='upper right', title='Clustering Noise') + + # Merge all series into a single DataFrame for easier handling + combined_data = pd.DataFrame({ + 'shortName': short_names, + 'clusteringCoefficient': clustering_coefficients, + 'pageRank': page_ranks, + 'clusterNoise': clustering_noise, + }, index=clustering_coefficients.index) + + # Annotate points with their names. 
Filter out values with a page rank smaller than 1.5 standard deviations + mean_page_rank = page_ranks.mean() + standard_deviation_page_rank = page_ranks.std() + threshold_page_rank = mean_page_rank + 1.5 * standard_deviation_page_rank + significant_points = combined_data[combined_data['pageRank'] > threshold_page_rank].reset_index(drop=True).head(10) + for dataframe_index, row in significant_points.iterrows(): + index = typing.cast(int, dataframe_index) + plot.annotate( + text=row['shortName'], + xy=(row['clusteringCoefficient'], row['pageRank']), + xytext=(5, 5 + index * 10), # Offset y position for better visibility + **plot_annotation_style + ) + + # Annotate points with the highest clustering coefficients (top 20) and only show the lowest 5 page ranks + combined_data['page_rank_ranking'] = combined_data['pageRank'].rank(ascending=False).astype(int) + combined_data['clustering_coefficient_ranking'] = combined_data['clusteringCoefficient'].rank(ascending=False).astype(int) + top_clustering_coefficients = combined_data.sort_values(by='clusteringCoefficient', ascending=False).reset_index(drop=True).head(20) + top_clustering_coefficients = top_clustering_coefficients.sort_values(by='pageRank', ascending=True).reset_index(drop=True).head(5) + for dataframe_index, row in top_clustering_coefficients.iterrows(): + index = typing.cast(int, dataframe_index) + plot.annotate( + text=f"{row['shortName']} (score {row['pageRank']:.4f})", + xy=(row['clusteringCoefficient'], row['pageRank']), + xytext=(5, 5 + index * 10), # Offset y position for better visibility + **plot_annotation_style + ) + + # plot.yscale('log') # Use logarithmic scale for better visibility of differences + plot.grid(True) + plot.tight_layout() + + plot.savefig(plot_file_path) + + +def plot_clusters( + clustering_visualization_dataframe: pd.DataFrame, + title: str, + plot_file_path: str, + main_color_map: str = "tab20", + code_unit_column_name: str = "shortCodeUnitName", + cluster_label_column_name: str = "clusterLabel", + cluster_medoid_column_name: str = "clusterMedoid", + centrality_column_name: str = "pageRank", + x_position_column: str = 'embeddingVisualizationX', + y_position_column: str = 'embeddingVisualizationY', + cluster_visualization_diameter_column='clusterVisualizationDiameter' +) -> None: + if clustering_visualization_dataframe.empty: + print("No projected data to plot available") + return + + def truncate(text: str, max_length: int): + if len(text) <= max_length: + return text + return text[:max_length - 3] + "..." 
+ + # Create figure and subplots + plot.figure(figsize=(10, 10)) + + # Setup columns + node_size_column = centrality_column_name + + # Separate HDBSCAN non-noise and noise nodes + node_embeddings_without_noise = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column_name] != -1] + node_embeddings_noise_only = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column_name] == -1] + + # ------------------------------------------ + # Subplot: HDBSCAN Clustering with KDE + # ------------------------------------------ + plot.title(title) + + unique_cluster_labels = node_embeddings_without_noise[cluster_label_column_name].unique() + hdbscan_color_palette = seaborn.color_palette(main_color_map, len(unique_cluster_labels)) + hdbscan_cluster_to_color = dict(zip(unique_cluster_labels, hdbscan_color_palette)) + + max_visualization_diameter = node_embeddings_without_noise[cluster_visualization_diameter_column].max() + visualization_diameter_normalization_factor = max_visualization_diameter * 2 + + for cluster_label in unique_cluster_labels: + cluster_nodes = node_embeddings_without_noise[ + node_embeddings_without_noise[cluster_label_column_name] == cluster_label + ] + # By comparing the cluster diameter to the max diameter of all clusters in the quartile, + # we can adjust the alpha value for the KDE plot to visualize smaller clusters more clearly. + # This way, larger clusters will have a lower alpha value, making them less prominent and less prone to overshadow smaller clusters. + cluster_diameter = cluster_nodes.iloc[0][cluster_visualization_diameter_column] + alpha = max((1.0 - (cluster_diameter / (visualization_diameter_normalization_factor))) * 0.45 - 0.25, 0.02) + + # KDE cloud shape + if len(cluster_nodes) > 1 and ( + cluster_nodes[x_position_column].std() > 0 or cluster_nodes[y_position_column].std() > 0 + ): + seaborn.kdeplot( + x=cluster_nodes[x_position_column], + y=cluster_nodes[y_position_column], + fill=True, + alpha=alpha, + levels=2, + color=hdbscan_cluster_to_color[cluster_label], + ax=plot.gca(), # Use current axes + warn_singular=False, + ) + + # Node scatter points + plot.scatter( + x=cluster_nodes[x_position_column], + y=cluster_nodes[y_position_column], + s=cluster_nodes[node_size_column] * 200 + 2, + color=hdbscan_cluster_to_color[cluster_label], + alpha=0.9, + label=f"Cluster {cluster_label}" + ) + + # Annotate medoids of the cluster + medoids = cluster_nodes[cluster_nodes[cluster_medoid_column_name] == 1] + for index, row in medoids.iterrows(): + plot.annotate( + text=f"{truncate(row[code_unit_column_name], 30)} ({row[cluster_label_column_name]})", + xy=(row[x_position_column], row[y_position_column]), + xytext=(5, 5), # Offset for better visibility + **plot_annotation_style + ) + + # Plot noise points in gray + plot.scatter( + x=node_embeddings_noise_only[x_position_column], + y=node_embeddings_noise_only[y_position_column], + s=node_embeddings_noise_only[node_size_column] * 200 + 2, + color='lightgrey', + alpha=0.4, + label="Noise" + ) + + plot.savefig(plot_file_path) + + +def plot_clusters_probabilities( + clustering_visualization_dataframe: pd.DataFrame, + title: str, + plot_file_path: str, + code_unit_column: str = "shortCodeUnitName", + cluster_label_column: str = "clusterLabel", + cluster_medoid_column: str = "clusterMedoid", + cluster_size_column: str = "clusterSize", + cluster_probability_column: str = "clusterProbability", + size_column: str = "pageRank", + x_position_column: str = 
'embeddingVisualizationX', + y_position_column: str = 'embeddingVisualizationY', +) -> None: + + if clustering_visualization_dataframe.empty: + print("No projected data to plot available") + return + + def truncate(text: str, max_length: int): + if len(text) <= max_length: + return text + return text[:max_length - 3] + "..." + + cluster_noise = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column] == -1] + cluster_non_noise = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column] != -1] + cluster_even_labels = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column] % 2 == 0] + cluster_odd_labels = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column] % 2 == 1] + + plot.figure(figsize=(10, 10)) + plot.title(title) + + # Plot noise + plot.scatter( + x=cluster_noise[x_position_column], + y=cluster_noise[y_position_column], + s=cluster_noise[size_column] * 200 + 3, + color='lightgrey', + alpha=0.5, + label='Noise' + ) + + # Plot even labels + plot.scatter( + x=cluster_even_labels[x_position_column], + y=cluster_even_labels[y_position_column], + s=cluster_even_labels[size_column] * 200 + 3, + c=cluster_even_labels[cluster_probability_column], + vmin=0.6, + vmax=1.0, + cmap='Greens', + alpha=0.8, + label='Even Label' + ) + + # Plot odd labels + plot.scatter( + x=cluster_odd_labels[x_position_column], + y=cluster_odd_labels[y_position_column], + s=cluster_odd_labels[size_column] * 200 + 3, + c=cluster_odd_labels[cluster_probability_column], + vmin=0.6, + vmax=1.0, + cmap='Blues', + alpha=0.8, + label='Odd Label' + ) + + # Annotate medoids of the cluster + cluster_medoids = cluster_non_noise[cluster_non_noise[cluster_medoid_column] == 1].sort_values(by=cluster_size_column, ascending=False).head(20) + for index, row in cluster_medoids.iterrows(): + mean_cluster_probability = cluster_non_noise[cluster_non_noise[cluster_label_column] == row[cluster_label_column]][cluster_probability_column].mean() + plot.annotate( + text=f"{row[cluster_label_column]}:{truncate(row[code_unit_column], 20)} ({mean_cluster_probability:.4f})", + xy=(row[x_position_column], row[y_position_column]), + xytext=(5, 5), + alpha=0.5, + **plot_annotation_style + ) + + lowest_probabilities = cluster_non_noise.sort_values(by=cluster_probability_column, ascending=True).reset_index().head(10) + for dataframe_index, row in lowest_probabilities.iterrows(): + index = typing.cast(int, dataframe_index) + plot.annotate( + text=f"!{row[cluster_label_column]}:{truncate(row[code_unit_column], 20)} ({row[cluster_probability_column]:.4f})", + xy=(row[x_position_column], row[y_position_column]), + xytext=(5, 5 + index * 10), + color='red', + **plot_annotation_style + ) + + plot.savefig(plot_file_path) + + +def plot_cluster_noise( + clustering_visualization_dataframe: pd.DataFrame, + title: str, + plot_file_path: str, + code_unit_column_name: str = "shortCodeUnitName", + cluster_label_column_name: str = "clusterLabel", + size_column_name: str = "degree", + color_column_name: str = "pageRank", + x_position_column='embeddingVisualizationX', + y_position_column='embeddingVisualizationY' +) -> None: + if clustering_visualization_dataframe.empty: + print("No projected data to plot available") + return + + # Filter only noise points + noise_points = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column_name] == -1] + noise_points = 
noise_points.sort_values(by=size_column_name, ascending=False).reset_index(drop=True) + + if noise_points.empty: + print("No noise points to plot.") + return + + plot.figure(figsize=(10, 10)) + plot.title(title) + + # Determine the color threshold for noise points + color_10th_highest_value = noise_points[color_column_name].nlargest(10).iloc[-1] # Get the 10th largest value + color_90_quantile = noise_points[color_column_name].quantile(0.90) + color_threshold = max(color_10th_highest_value, color_90_quantile) + + # Color the color column values above the 90% quantile threshold red, the rest light grey + colors = noise_points[color_column_name].apply( + lambda x: "red" if x >= color_threshold else "lightgrey" + ) + normalized_size = noise_points[size_column_name] / noise_points[size_column_name].max() + + # Scatter plot for noise points + plot.scatter( + x=noise_points[x_position_column], + y=noise_points[y_position_column], + s=normalized_size.clip(lower=0.01) * 800 + 2, + c=colors, + alpha=0.6 + ) + + # Annotate the largest 10 points and all colored ones with their names + for index, row in noise_points.iterrows(): + index = typing.cast(int, index) + if colors[index] != 'red' and index >= 10: + continue + plot.annotate( + text=row[code_unit_column_name], + xy=(row[x_position_column], row[y_position_column]), + xytext=(5, 5 + (index % 2) * 20), # Offset for better visibility + **plot_annotation_style + ) + + plot.xlabel(x_position_column) + plot.ylabel(y_position_column) + plot.tight_layout() + + plot.savefig(plot_file_path) + + +# ------------------------------------------------------------------------------------------------------------ +# MAIN +# ------------------------------------------------------------------------------------------------------------ + +parameters = parse_input_parameters() +plot_type = parameters.get_projection_node_label() +report_directory = parameters.get_report_directory() + +driver = get_graph_database_driver() +data = query_data(parameters) +enhance_data_with_visualization_cluster_diameter(data) + +overall_cluster_count = data['clusterLabel'].nunique() + +plot_difference_between_article_and_page_rank( + data['pageRank'], + data['articleRank'], + data['shortCodeUnitName'], + title=f"{plot_type} Distribution of Page Rank - Article Rank Difference", + plot_file_path=get_file_path(f"{plot_type}_PageRank_Minus_ArticleRank_Distribution", parameters) +) + +plot_clustering_coefficient_distribution( + data['clusteringCoefficient'], + title=f"{plot_type} Distribution of Clustering Coefficients", + plot_file_path=get_file_path(f"{plot_type}_ClusteringCoefficient_distribution", parameters) +) + +plot_clustering_coefficient_vs_page_rank( + data['clusteringCoefficient'], + data['pageRank'], + data['shortCodeUnitName'], + data['clusterNoise'], + title=f"{plot_type} Clustering Coefficient versus Page Rank", + plot_file_path=get_file_path(f"{plot_type}_ClusteringCoefficient_versus_PageRank", parameters) +) + +if (overall_cluster_count < 20): + print(f"anomalyDetectionPlots: Less than 20 clusters: {overall_cluster_count}. Only one plot containing all clusters will be created.") + plot_clusters( + clustering_visualization_dataframe=data, + title=f"{plot_type} All Clusters Overall (less than 20)", + plot_file_path=get_file_path(f"{plot_type}_Clusters_Overall", parameters) + ) +else: + print(f"anomalyDetectionPlots: More than 20 clusters: {overall_cluster_count}. 
Different plots focussing on different features like cluster size will be created.") + clusters_by_largest_size = get_clusters_by_criteria( + data, by='clusterSize', ascending=False, cluster_count=20 + ) + plot_clusters( + clustering_visualization_dataframe=clusters_by_largest_size, + title=f"{plot_type} Clusters with the largest size", + plot_file_path=get_file_path(f"{plot_type}_Clusters_largest_size", parameters) + ) + + clusters_by_largest_max_radius = get_clusters_by_criteria( + data, by='clusterRadiusMax', ascending=False, cluster_count=20 + ) + plot_clusters( + clustering_visualization_dataframe=clusters_by_largest_max_radius, + title=f"{plot_type} Clusters with the largest max radius", + plot_file_path=get_file_path(f"{plot_type}_Clusters_largest_max_radius", parameters) + ) + + clusters_by_largest_average_radius = get_clusters_by_criteria( + data, by='clusterRadiusAverage', ascending=False, cluster_count=20 + ) + plot_clusters( + clustering_visualization_dataframe=clusters_by_largest_average_radius, + title=f"{plot_type} Clusters with the largest average radius", + plot_file_path=get_file_path(f"{plot_type}_Clusters_largest_average_radius", parameters) + ) + +plot_clusters_probabilities( + clustering_visualization_dataframe=data, + title=f"{plot_type} Clustering Probabilities (red=high uncertainty)", + plot_file_path=get_file_path(f"{plot_type}_Cluster_probabilities", parameters) +) + +plot_cluster_noise( + clustering_visualization_dataframe=data, + title=f"{plot_type} Clustering Noise points that are surprisingly central (color) or popular (size)", + size_column_name='degree', + color_column_name='pageRank', + plot_file_path=get_file_path(f"{plot_type}_ClusterNoise_highly_central_and_popular", parameters) +) + +plot_cluster_noise( + clustering_visualization_dataframe=data, + title=f"{plot_type} Clustering Noise points that bridge flow (color) and are poorly integrated (size)", + size_column_name='inverseClusteringCoefficient', + color_column_name='betweenness', + plot_file_path=get_file_path(f"{plot_type}_ClusterNoise_poorly_integrated_bridges", parameters) +) + +plot_cluster_noise( + clustering_visualization_dataframe=data, + title=f"{plot_type} Clustering Noise points with role inversion (size), possibly violating layering or dependency direction (color)", + size_column_name='pageToArticleRankDifference', + color_column_name='betweenness', + plot_file_path=get_file_path(f"{plot_type}_ClusterNoise_role_inverted_bridges", parameters) +) + +driver.close() diff --git a/domains/anomaly-detection/anomalyDetectionPython.sh b/domains/anomaly-detection/anomalyDetectionPython.sh new file mode 100755 index 000000000..185a2f010 --- /dev/null +++ b/domains/anomaly-detection/anomalyDetectionPython.sh @@ -0,0 +1,183 @@ +#!/usr/bin/env bash + +# Pipeline that coordinates anomaly detection using the Graph Data Science Library of Neo4j. +# It requires an already running Neo4j graph database with already scanned and analyzed artifacts. +# The results will be written into the sub directory reports/anomaly-detection. + +# Note that "scripts/prepareAnalysis.sh" is required to run prior to this script. 
+ +# Requires executeQueryFunctions.sh, projectionFunctions.sh, cleanupAfterReportGeneration.sh + +# Fail on any error ("-e" = exit on first error, "-o pipefail" exist on errors within piped commands) +set -o errexit -o pipefail + +# Overrideable Constants (defaults also defined in sub scripts) +REPORTS_DIRECTORY=${REPORTS_DIRECTORY:-"reports"} + +## Get this "scripts/reports" directory if not already set +# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution. +# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes. +# This way non-standard tools like readlink aren't needed. +ANOMALY_DETECTION_SCRIPT_DIR=${ANOMALY_DETECTION_SCRIPT_DIR:-$(CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)} +echo "anomalyDetectionPipeline: ANOMALY_DETECTION_SCRIPT_DIR=${ANOMALY_DETECTION_SCRIPT_DIR}" +# Get the "scripts" directory by taking the path of this script and going one directory up. +SCRIPTS_DIR=${SCRIPTS_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/../../scripts"} # Repository directory containing the shell scripts +# Get the "cypher" query directory for gathering features. +ANOMALY_DETECTION_FEATURE_CYPHER_DIR=${ANOMALY_DETECTION_FEATURE_CYPHER_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/features"} +ANOMALY_DETECTION_QUERY_CYPHER_DIR=${ANOMALY_DETECTION_QUERY_CYPHER_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/queries"} + +# Function to display script usage +usage() { + echo -e "${COLOR_ERROR}" >&2 + echo "Usage: $0 [--verbose]" >&2 + echo -e "${COLOR_DEFAULT}" >&2 + exit 1 +} + +# Default values +verboseMode="" # either "" or "--verbose" + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + key="$1" + value="${2}" + + case ${key} in + --verbose) + verboseMode="--verbose" + ;; + *) + echo -e "${COLOR_ERROR}anomalyDetectionPipeline: Error: Unknown option: ${key}${COLOR_DEFAULT}" >&2 + usage + ;; + esac + shift || true # ignore error when there are no more arguments +done + +# Define functions to execute a cypher query from within a given file (first and only argument) like "execute_cypher" +source "${SCRIPTS_DIR}/executeQueryFunctions.sh" + +# Define functions to create and delete Graph Projections like "createUndirectedDependencyProjection" +source "${SCRIPTS_DIR}/projectionFunctions.sh" + +# Query or recalculate features. +# +# Required Parameters: +# - projection_name=... +# Name prefix for the in-memory projection name. Example: "package-anomaly-detection" +# - projection_node_label=... +# Label of the nodes that will be used for the projection. Example: "Package" +# - projection_weight_property=... +# Name of the node property that contains the dependency weight. Example: "weight" +anomaly_detection_features() { + local nodeLabel + nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" ) + echo "anomalyDetectionPipeline: $(date +'%Y-%m-%dT%H:%M:%S%z') Collecting features for ${nodeLabel} nodes..." 
+ + # Determine the Betweenness centrality (with the directed graph projection) if not already done + execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Betweenness-Exists.cypher" \ + "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Betweenness-Write.cypher" "${@}" + # Determine the local clustering coefficient if not already done + execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-LocalClusteringCoefficient-Exists.cypher" \ + "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-LocalClusteringCoefficient-Write.cypher" "${@}" + # Determine the page rank if not already done + execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageRank-Exists.cypher" \ + "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageRank-Write.cypher" "${@}" + # Determine the article rank if not already done + execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-ArticleRank-Exists.cypher" \ + "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-ArticleRank-Write.cypher" "${@}" +} + +# Execute the Python scripts for anomaly detection. +# +# Required Parameters: +# - projection_name=... +# Name prefix for the in-memory projection name. Example: "package-anomaly-detection" +# - projection_node_label=... +# Label of the nodes that will be used for the projection. Example: "Package" +# - projection_weight_property=... +# Name of the node property that contains the dependency weight. Example: "weight" +anomaly_detection_using_python() { + local nodeLabel + nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" ) + echo "anomalyDetectionPipeline: $(date +'%Y-%m-%dT%H:%M:%S%z') Executing Python scripts for ${nodeLabel} nodes..." + + # Get tuned Leiden communities as a reference to tune clustering + time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedLeidenCommunityDetection.py" "${@}" ${verboseMode} + # Tuned Fast Random Projection and tuned HDBSCAN clustering + time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedNodeEmbeddingClustering.py" "${@}" ${verboseMode} + # Reduce the dimensionality of the node embeddings down to 2D for visualization using UMAP + time "${ANOMALY_DETECTION_SCRIPT_DIR}/umap2dNodeEmbeddings.py" "${@}" ${verboseMode} + + time "${ANOMALY_DETECTION_SCRIPT_DIR}/anomalyDetectionPlots.py" "${@}" "--report_directory" "${FULL_REPORT_DIRECTORY}" ${verboseMode} + # Query Results: Output all collected features into a CSV file. + execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeatures.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${nodeLabel}AnomalyDetection_Features.csv" +} + +# Run the anomaly detection pipeline. +# +# Required Parameters: +# - projection_name=... +# Name prefix for the in-memory projection name. Example: "package-anomaly-detection" +# - projection_node_label=... +# Label of the nodes that will be used for the projection. Example: "Package" +# - projection_weight_property=... +# Name of the node property that contains the dependency weight.
Example: "weight" +anomaly_detection_python_reports() { + time anomaly_detection_features "${@}" + anomaly_detection_using_python "${@}" +} + +# Create report directory +REPORT_NAME="anomaly-detection" +FULL_REPORT_DIRECTORY="${REPORTS_DIRECTORY}/${REPORT_NAME}" +mkdir -p "${FULL_REPORT_DIRECTORY}" + +# Query Parameter key pairs for projection and algorithm side +PROJECTION_NAME="dependencies_projection" +ALGORITHM_PROJECTION="projection_name" + +PROJECTION_NODE="dependencies_projection_node" +ALGORITHM_NODE="projection_node_label" + +PROJECTION_WEIGHT="dependencies_projection_weight_property" +ALGORITHM_WEIGHT="projection_weight_property" + +# Code independent algorithm parameters +COMMUNITY_PROPERTY="community_property=communityLeidenIdTuned" +EMBEDDING_PROPERTY="embedding_property=embeddingsFastRandomProjectionTunedForClustering" + +# -- Java Artifact Node Embeddings ------------------------------- + +if createUndirectedDependencyProjection "${PROJECTION_NAME}=artifact-anomaly-detection" "${PROJECTION_NODE}=Artifact" "${PROJECTION_WEIGHT}=weight"; then + createDirectedDependencyProjection "${PROJECTION_NAME}=artifact-anomaly-detection-directed" "${PROJECTION_NODE}=Artifact" "${PROJECTION_WEIGHT}=weight" + anomaly_detection_python_reports "${ALGORITHM_PROJECTION}=artifact-anomaly-detection" "${ALGORITHM_NODE}=Artifact" "${ALGORITHM_WEIGHT}=weight" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}" +fi + +# -- Java Package Node Embeddings -------------------------------- + +if createUndirectedDependencyProjection "${PROJECTION_NAME}=package-anomaly-detection" "${PROJECTION_NODE}=Package" "${PROJECTION_WEIGHT}=weight25PercentInterfaces"; then + createDirectedDependencyProjection "${PROJECTION_NAME}=package-anomaly-detection-directed" "${PROJECTION_NODE}=Package" "${PROJECTION_WEIGHT}=weight25PercentInterfaces" + anomaly_detection_python_reports "${ALGORITHM_PROJECTION}=package-anomaly-detection" "${ALGORITHM_NODE}=Package" "${ALGORITHM_WEIGHT}=weight25PercentInterfaces" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}" +fi + +# -- Java Type Node Embeddings ----------------------------------- + +if createUndirectedJavaTypeDependencyProjection "${PROJECTION_NAME}=type-anomaly-detection"; then + createDirectedJavaTypeDependencyProjection "${PROJECTION_NAME}=type-anomaly-detection-directed" + anomaly_detection_python_reports "${ALGORITHM_PROJECTION}=type-anomaly-detection" "${ALGORITHM_NODE}=Type" "${ALGORITHM_WEIGHT}=weight" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}" +fi + +# -- Typescript Module Node Embeddings --------------------------- + +if createUndirectedDependencyProjection "${PROJECTION_NAME}=typescript-module-embedding" "${PROJECTION_NODE}=Module" "${PROJECTION_WEIGHT}=lowCouplingElement25PercentWeight"; then + createDirectedDependencyProjection "${PROJECTION_NAME}=typescript-module-embedding-directed" "${PROJECTION_NODE}=Module" "${PROJECTION_WEIGHT}=lowCouplingElement25PercentWeight" + anomaly_detection_python_reports "${ALGORITHM_PROJECTION}=typescript-module-embedding" "${ALGORITHM_NODE}=Module" "${ALGORITHM_WEIGHT}=lowCouplingElement25PercentWeight" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}" +fi + +# --------------------------------------------------------------- + +# Clean-up after report generation. Empty reports will be deleted. +source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${FULL_REPORT_DIRECTORY}" + +echo "anomalyDetectionPipeline: $(date +'%Y-%m-%dT%H:%M:%S%z') Successfully finished." 
\ No newline at end of file diff --git a/domains/anomaly-detection/explore/AnomalyDetectionExploration.ipynb b/domains/anomaly-detection/explore/AnomalyDetectionExploration.ipynb new file mode 100644 index 000000000..3d2fb3be4 --- /dev/null +++ b/domains/anomaly-detection/explore/AnomalyDetectionExploration.ipynb @@ -0,0 +1,1383 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "2f0eabc4", + "metadata": {}, + "source": [ + "# Anomaly Detection - Manual Exploration\n", + "\n", + "This notebook demonstrates different methods of anomaly detection on static code analysis data using jQAssistant and Neo4j. It plots the results of different approaches, ranging from plain queries to statistical methods. The focus is on detecting anomalies in the data, which can be useful for identifying potential issues or areas for improvement in the codebase.\n", + "
\n", + "\n", + "### References\n", + "- [jqassistant](https://jqassistant.org)\n", + "- [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver/current)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4191f259", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import typing\n", + "\n", + "from IPython.display import display\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "import matplotlib.pyplot as plot\n", + "import seaborn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0676813", + "metadata": {}, + "outputs": [], + "source": [ + "#The following cell uses the build-in %html \"magic\" to override the CSS style for tables to a much smaller size.\n", + "#This is especially needed for PDF export of tables with multiple columns." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebac1bb9", + "metadata": {}, + "outputs": [], + "source": [ + "%%html\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07319282", + "metadata": {}, + "outputs": [], + "source": [ + "# Main Colormap\n", + "# main_color_map = 'nipy_spectral'\n", + "main_color_map = 'viridis'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8ef41ff", + "metadata": {}, + "outputs": [], + "source": [ + "from sys import version as python_version\n", + "print('Python version: {}'.format(python_version))\n", + "\n", + "from numpy import __version__ as numpy_version\n", + "print('numpy version: {}'.format(numpy_version))\n", + "\n", + "from pandas import __version__ as pandas_version\n", + "print('pandas version: {}'.format(pandas_version))\n", + "\n", + "from matplotlib import __version__ as matplotlib_version\n", + "print('matplotlib version: {}'.format(matplotlib_version))\n", + "\n", + "from seaborn import __version__ as seaborn_version # type: ignore\n", + "print('seaborn version: {}'.format(seaborn_version))\n", + "\n", + "from neo4j import __version__ as neo4j_version\n", + "print('neo4j version: {}'.format(neo4j_version))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c5dab37", + "metadata": {}, + "outputs": [], + "source": [ + "# Please set the environment variable \"NEO4J_INITIAL_PASSWORD\" in your shell \n", + "# before starting jupyter notebook to provide the password for the user \"neo4j\". 
\n", + "# It is not recommended to hardcode the password into jupyter notebook for security reasons.\n", + "from neo4j import GraphDatabase\n", + "\n", + "driver = GraphDatabase.driver(\n", + " uri=\"bolt://localhost:7687\", \n", + " auth=(\"neo4j\", os.environ.get(\"NEO4J_INITIAL_PASSWORD\"))\n", + ")\n", + "driver.verify_connectivity()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1db254b", + "metadata": {}, + "outputs": [], + "source": [ + "def query_cypher_to_data_frame(query: typing.LiteralString, parameters: typing.Optional[typing.Dict[str, typing.Any]] = None):\n", + " records, summary, keys = driver.execute_query(query, parameters_=parameters)\n", + " return pd.DataFrame([record.values() for record in records], columns=keys)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7656bd5", + "metadata": {}, + "outputs": [], + "source": [ + "plot_annotation_style: dict = {\n", + " 'textcoords': 'offset points',\n", + " 'arrowprops': dict(arrowstyle='->', color='black', alpha=0.3),\n", + " 'fontsize': 6,\n", + " 'backgroundcolor': 'white',\n", + " 'bbox': dict(boxstyle='round,pad=0.4',\n", + " edgecolor='silver',\n", + " facecolor='whitesmoke',\n", + " alpha=1\n", + " )\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "0c68aa20", + "metadata": {}, + "source": [ + "## 1. Java Packages" + ] + }, + { + "cell_type": "markdown", + "id": "c927388f", + "metadata": {}, + "source": [ + "### 1.1 Differences between Page Rank and Article Rank\n", + "\n", + "A high difference between Page Rank and Article Rank can reveal nodes with imbalanced roles — e.g. utility code that is highly depended on but does not depend on much else.\n", + "\n", + "PageRank measures how important a node is by who depends on it (high in-degree weight) while ArticleRank measures how important a node is based on how many other nodes it links to (outgoing edges matter more).\n", + "\n", + "Nodes with low PageRank but high ArticleRank may be coordination-heavy, which could signal:\n", + "- Unusual architecture\n", + "- Utility overuse\n", + "- Monolithic patterns\n", + "\n", + "These are often design smells or potential anomalies in large-scale codebases." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c26f8f19", + "metadata": {}, + "outputs": [], + "source": [ + "java_package_centrality_features_query = \"\"\"\n", + " MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit:Java:Package)\n", + " WHERE codeUnit.incomingDependencies IS NOT NULL\n", + " AND codeUnit.outgoingDependencies IS NOT NULL\n", + " AND codeUnit.centralityArticleRank IS NOT NULL\n", + " AND codeUnit.centralityPageRank IS NOT NULL\n", + " AND codeUnit.centralityBetweenness IS NOT NULL\n", + " RETURN DISTINCT \n", + " codeUnit.fqn AS codeUnitName\n", + " ,codeUnit.name AS shortCodeUnitName\n", + " ,artifact.name AS projectName\n", + " ,codeUnit.incomingDependencies AS incomingDependencies\n", + " ,codeUnit.outgoingDependencies AS outgoingDependencies\n", + " ,codeUnit.centralityArticleRank AS articleRank\n", + " ,codeUnit.centralityPageRank AS pageRank\n", + " ,codeUnit.centralityBetweenness AS betweenness\n", + "\"\"\"\n", + "\n", + "java_package_centrality_features = query_cypher_to_data_frame(java_package_centrality_features_query)\n", + "display(java_package_centrality_features.head(5))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb417d4a", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_standard_deviation_lines(color: typing.LiteralString, mean: float, standard_deviation: float, standard_deviation_factor: int = 0) -> None:\n", + " \"\"\"\n", + " Plots vertical lines for the mean + factor times standard deviation (z-score references).\n", + " \"\"\"\n", + " # Vertical line for the standard deviation\n", + " positive_standard_deviation = mean + (standard_deviation_factor * standard_deviation)\n", + " horizontal_line_label = f'Mean + {standard_deviation_factor} x Standard Deviation: {positive_standard_deviation:.2f}' if standard_deviation_factor != 0 else f'Mean: {mean:.2f}'\n", + " \n", + " plot.axvline(positive_standard_deviation, color=color, linestyle='dashed', linewidth=1, label=horizontal_line_label)\n", + " \n", + " if standard_deviation_factor != 0:\n", + " negative_standard_deviation = mean - (standard_deviation_factor * standard_deviation)\n", + " plot.axvline(negative_standard_deviation, color=color, linestyle='dashed', linewidth=1)\n", + " \n", + " plot.legend()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ccbf588f", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_difference_between_article_and_page_rank(\n", + " page_ranks: pd.Series, \n", + " article_ranks: pd.Series,\n", + " short_names: pd.Series,\n", + ") -> None:\n", + " \"\"\"\n", + " Plots the difference between Article Rank and Page Rank for Java packages.\n", + " \n", + " Parameters\n", + " ----------\n", + " page_ranks : pd.Series\n", + " DataFrame column containing Page Rank values.\n", + " article_ranks : pd.Series\n", + " DataFrame column containing Article Rank values.\n", + " short_names : pd.Series\n", + " DataFrame column containing short names of the code units.\n", + " \"\"\"\n", + " if page_ranks.empty or article_ranks.empty or short_names.empty:\n", + " print(\"No data available to plot.\")\n", + " return\n", + "\n", + " # Calculate the difference between Article Rank and Page Rank\n", + " page_to_article_rank_difference = page_ranks - article_ranks\n", + "\n", + " plot.figure(figsize=(10, 6))\n", + " plot.hist(page_to_article_rank_difference, bins=50, color='blue', alpha=0.7, edgecolor='black')\n", + " plot.title('Distribution of Page Rank - Article Rank Difference')\n", + " 
plot.xlabel('Absolute difference between Page Rank and Article Rank')\n", + " plot.ylabel('Frequency')\n", + " plot.xlim(left=page_to_article_rank_difference.min(), right=page_to_article_rank_difference.max())\n", + " plot.yscale('log') # Use logarithmic scale for better visibility of differences\n", + " plot.grid(True)\n", + " plot.tight_layout()\n", + "\n", + " mean_difference = page_to_article_rank_difference.mean()\n", + " standard_deviation = page_to_article_rank_difference.std()\n", + " \n", + " # Vertical line for the mean\n", + " plot_standard_deviation_lines('red', mean_difference, standard_deviation, standard_deviation_factor=0)\n", + " # Vertical line for the standard deviation + mean (=z-score of 1)\n", + " plot_standard_deviation_lines('orange', mean_difference, standard_deviation, standard_deviation_factor=1)\n", + " # Vertical line for 2 x standard deviations + mean (=z-score of 2)\n", + " plot_standard_deviation_lines('green', mean_difference, standard_deviation, standard_deviation_factor=2)\n", + "\n", + " def annotate_outliers(outliers: pd.DataFrame) -> None:\n", + " if outliers.empty:\n", + " return\n", + " for dataframe_index, row in outliers.iterrows():\n", + " index = typing.cast(int, dataframe_index)\n", + " value = row['pageToArticleRankDifference']\n", + " x_index_offset = - index * 10 if value > 0 else + index * 10\n", + " plot.annotate(\n", + " text=f'{row['shortName']} (rank #{row['page_rank_ranking']}, #{row['article_rank_ranking']})',\n", + " xy=(value, 1),\n", + " xytext=(value + x_index_offset, 60),\n", + " rotation=90,\n", + " **plot_annotation_style,\n", + " )\n", + "\n", + " # Merge all series into a single DataFrame for easier handling\n", + " page_to_article_rank_dataframe = pd.DataFrame({\n", + " 'shortName': short_names,\n", + " 'pageRank': page_ranks,\n", + " 'articleRank': article_ranks,\n", + " 'pageToArticleRankDifference': page_to_article_rank_difference,\n", + " 'page_rank_ranking': page_ranks.rank().astype(int),\n", + " 'article_rank_ranking': article_ranks.rank().astype(int)\n", + " }, index=page_ranks.index)\n", + "\n", + " # Annotate values above z-score of 2 with their names\n", + " positive_z_score_2 = mean_difference + 2 * standard_deviation\n", + " positive_outliers = page_to_article_rank_dataframe[page_to_article_rank_difference > positive_z_score_2].sort_values(by='pageToArticleRankDifference', ascending=False).reset_index().head(5)\n", + " annotate_outliers(positive_outliers)\n", + "\n", + " # Annotate values below z-score of -2 with their names\n", + " negative_z_score_2 = mean_difference - 2 * standard_deviation\n", + " negative_outliers = page_to_article_rank_dataframe[page_to_article_rank_difference < negative_z_score_2].sort_values(by='pageToArticleRankDifference', ascending=True).reset_index().head(5)\n", + " annotate_outliers(negative_outliers)\n", + "\n", + " plot.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0636511b", + "metadata": {}, + "outputs": [], + "source": [ + "plot_difference_between_article_and_page_rank(\n", + " java_package_centrality_features['pageRank'],\n", + " java_package_centrality_features['articleRank'],\n", + " java_package_centrality_features['shortCodeUnitName']\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "2dec26a8", + "metadata": {}, + "source": [ + "### 1.2 Local Clustering Coefficient\n", + "\n", + "The local clustering coefficient is a measure of how connected a node's neighbors are to each other.\n", + "A high local clustering coefficient indicates 
that a node's neighbors are well-connected, which can suggest a tightly-knit group of related components or classes.\n", + "A low local clustering coefficient may indicate that a node's neighbors are not well-connected, which can suggest a more loosely-coupled architecture or potential design smells." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "740699a8", + "metadata": {}, + "outputs": [], + "source": [ + "java_package_clustering_coefficient_query = \"\"\"\n", + " MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit:Java:Package)\n", + " WHERE codeUnit.incomingDependencies IS NOT NULL\n", + " AND codeUnit.outgoingDependencies IS NOT NULL\n", + " AND codeUnit.centralityPageRank IS NOT NULL\n", + " AND codeUnit.communityLocalClusteringCoefficient IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANNoise IS NOT NULL\n", + " RETURN DISTINCT \n", + " codeUnit.fqn AS codeUnitName\n", + " ,codeUnit.name AS shortCodeUnitName\n", + " ,artifact.name AS projectName\n", + " ,codeUnit.incomingDependencies AS incomingDependencies\n", + " ,codeUnit.outgoingDependencies AS outgoingDependencies\n", + " ,codeUnit.centralityPageRank AS pageRank\n", + " ,codeUnit.communityLocalClusteringCoefficient AS clusteringCoefficient\n", + " ,codeUnit.clusteringHDBSCANNoise AS clusterNoise\n", + "\"\"\"\n", + "\n", + "java_package_clustering_coefficient_features = query_cypher_to_data_frame(java_package_clustering_coefficient_query)\n", + "display(java_package_clustering_coefficient_features.head(5))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed900c59", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_clustering_coefficient_distribution(clustering_coefficients: pd.Series) -> None:\n", + " \"\"\"\n", + " Plots the distribution of clustering coefficients.\n", + " \n", + " Parameters\n", + " ----------\n", + " clustering_coefficients : pd.Series\n", + " Series containing clustering coefficient values.\n", + " \"\"\"\n", + " if clustering_coefficients.empty:\n", + " print(\"No data available to plot.\")\n", + " return\n", + "\n", + " plot.figure(figsize=(10, 6))\n", + " plot.figure(figsize=(10, 6))\n", + " plot.hist(clustering_coefficients, bins=40, color='blue', alpha=0.7, edgecolor='black')\n", + " plot.title('Distribution of Clustering Coefficients')\n", + " plot.xlabel('Clustering Coefficient')\n", + " plot.ylabel('Frequency')\n", + " plot.xlim(left=clustering_coefficients.min(), right=clustering_coefficients.max())\n", + " # plot.yscale('log') # Use logarithmic scale for better visibility of differences\n", + " plot.grid(True)\n", + " plot.tight_layout()\n", + "\n", + " mean = clustering_coefficients.mean()\n", + " standard_deviation = clustering_coefficients.std()\n", + "\n", + " # Vertical line for the mean\n", + " plot_standard_deviation_lines('red', mean, standard_deviation, standard_deviation_factor=0)\n", + " # Vertical line for 1 x standard deviations + mean (=z-score of 1)\n", + " plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=1)\n", + "\n", + " plot.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b05994e5", + "metadata": {}, + "outputs": [], + "source": [ + "plot_clustering_coefficient_distribution(java_package_clustering_coefficient_features['clusteringCoefficient'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a4f46116", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_clustering_coefficient_vs_page_rank(\n", + " 
clustering_coefficients: pd.Series, \n", + " page_ranks: pd.Series,\n", + " short_names: pd.Series,\n", + " clustering_noise: pd.Series,\n", + ") -> None:\n", + " \"\"\"\n", + " Plots the relationship between clustering coefficients and Page Rank values.\n", + " \n", + " Parameters\n", + " ----------\n", + " clustering_coefficients : pd.Series\n", + " Series containing clustering coefficient values.\n", + " page_ranks : pd.Series\n", + " Series containing Page Rank values.\n", + " short_names : pd.Series\n", + " Series containing short names of the code units.\n", + " clustering_noise : pd.Series\n", + " Series indicating whether the code unit is noise (value = 1) nor not (value = 0) from the clustering algorithm.\n", + " \"\"\"\n", + " if clustering_coefficients.empty or page_ranks.empty or short_names.empty:\n", + " print(\"No data available to plot.\")\n", + " return\n", + "\n", + " color = clustering_noise.map({0: 'blue', 1: 'gray'})\n", + "\n", + " plot.figure(figsize=(10, 6))\n", + " plot.scatter(x=clustering_coefficients, y=page_ranks, alpha=0.7, color=color)\n", + " plot.title('Clustering Coefficient vs Page Rank')\n", + " plot.xlabel('Clustering Coefficient')\n", + " plot.ylabel('Page Rank')\n", + "\n", + " # Add color bar: grey = noise, blue = non-noise\n", + " scatter = plot.scatter([], [], color='blue', label='Non-Noise', alpha=0.7)\n", + " scatter_noise = plot.scatter([], [], color='gray', label='Noise', alpha=0.7)\n", + " plot.legend(handles=[scatter, scatter_noise], loc='upper right', title='Clustering Noise')\n", + " \n", + " # Merge all series into a single DataFrame for easier handling\n", + " combined_data = pd.DataFrame({\n", + " 'shortName': short_names,\n", + " 'clusteringCoefficient': clustering_coefficients,\n", + " 'pageRank': page_ranks,\n", + " 'clusterNoise': clustering_noise,\n", + " }, index=clustering_coefficients.index)\n", + "\n", + " # Annotate points with their names. 
Filter out values with a page rank smaller than 1.5 standard deviations\n", + " mean_page_rank = page_ranks.mean()\n", + " standard_deviation_page_rank = page_ranks.std()\n", + " threshold_page_rank = mean_page_rank + 1.5 * standard_deviation_page_rank\n", + " significant_points = combined_data[combined_data['pageRank'] > threshold_page_rank].reset_index(drop=True).head(10)\n", + " for dataframe_index, row in significant_points.iterrows():\n", + " index = typing.cast(int, dataframe_index)\n", + " plot.annotate(\n", + " text=row['shortName'],\n", + " xy=(row['clusteringCoefficient'], row['pageRank']),\n", + " xytext=(5, 5 + index * 10), # Offset y position for better visibility\n", + " **plot_annotation_style\n", + " )\n", + "\n", + " # Annotate points with the highest clustering coefficients (top 20) and only show the lowest 5 page ranks\n", + " combined_data['page_rank_ranking'] = combined_data['pageRank'].rank(ascending=False).astype(int)\n", + " combined_data['clustering_coefficient_ranking'] = combined_data['clusteringCoefficient'].rank(ascending=False).astype(int)\n", + " top_clustering_coefficients = combined_data.sort_values(by='clusteringCoefficient', ascending=False).reset_index(drop=True).head(20)\n", + " top_clustering_coefficients = top_clustering_coefficients.sort_values(by='pageRank', ascending=True).reset_index(drop=True).head(5)\n", + " for dataframe_index, row in top_clustering_coefficients.iterrows():\n", + " index = typing.cast(int, dataframe_index)\n", + " plot.annotate(\n", + " text=f\"{row['shortName']} (score {row['pageRank']:.4f})\",\n", + " xy=(row['clusteringCoefficient'], row['pageRank']),\n", + " xytext=(5, 5 + index * 10), # Offset y position for better visibility\n", + " **plot_annotation_style\n", + " )\n", + "\n", + " #plot.yscale('log') # Use logarithmic scale for better visibility of differences\n", + " plot.grid(True)\n", + " plot.tight_layout()\n", + " plot.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af2fad9a", + "metadata": {}, + "outputs": [], + "source": [ + "plot_clustering_coefficient_vs_page_rank(\n", + " java_package_clustering_coefficient_features['clusteringCoefficient'],\n", + " java_package_centrality_features['pageRank'],\n", + " java_package_clustering_coefficient_features['shortCodeUnitName'],\n", + " java_package_clustering_coefficient_features['clusterNoise']\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "630f5e4b", + "metadata": {}, + "source": [ + "### 1.3 HDBSCAN Clusters\n", + "\n", + "HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) is a clustering algorithm that can identify clusters of varying densities and shapes. It is particularly useful for detecting anomalies in data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8cb4eea3", + "metadata": {}, + "outputs": [], + "source": [ + "java_package_clustering_query = \"\"\"\n", + " MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit:Java:Package)\n", + " WHERE codeUnit.incomingDependencies IS NOT NULL\n", + " AND codeUnit.outgoingDependencies IS NOT NULL\n", + " AND codeUnit.centralityPageRank IS NOT NULL\n", + " AND codeUnit.centralityArticleRank IS NOT NULL\n", + " AND codeUnit.communityLocalClusteringCoefficient IS NOT NULL\n", + " AND codeUnit.centralityBetweenness IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANLabel IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANProbability IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANNoise IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANMedoid IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANSize IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANRadiusMax IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANRadiusAverage IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANNormalizedDistanceToMedoid IS NOT NULL\n", + " AND codeUnit.embeddingFastRandomProjectionVisualizationX IS NOT NULL\n", + " AND codeUnit.embeddingFastRandomProjectionVisualizationY IS NOT NULL\n", + " RETURN DISTINCT \n", + " codeUnit.fqn AS codeUnitName\n", + " ,codeUnit.name AS shortCodeUnitName\n", + " ,artifact.name AS projectName\n", + " ,codeUnit.incomingDependencies AS incomingDependencies\n", + " ,codeUnit.outgoingDependencies AS outgoingDependencies\n", + " ,codeUnit.centralityPageRank AS pageRank\n", + " ,1.0 - codeUnit.communityLocalClusteringCoefficient AS inverseClusteringCoefficient\n", + " ,codeUnit.centralityBetweenness AS betweenness\n", + " ,codeUnit.centralityPageRank - codeUnit.centralityArticleRank AS pageToArticleRankDifference\n", + " ,codeUnit.clusteringHDBSCANLabel AS clusterLabel\n", + " ,codeUnit.clusteringHDBSCANProbability AS clusterProbability\n", + " ,codeUnit.clusteringHDBSCANNoise AS clusterNoise\n", + " ,codeUnit.clusteringHDBSCANMedoid AS clusterMedoid\n", + " ,codeUnit.clusteringHDBSCANSize AS clusterSize\n", + " ,codeUnit.clusteringHDBSCANRadiusMax AS clusterRadiusMax\n", + " ,codeUnit.clusteringHDBSCANRadiusAverage AS clusterRadiusAverage\n", + " ,codeUnit.clusteringHDBSCANNormalizedDistanceToMedoid AS clusterNormalizedDistanceToMedoid\n", + " ,codeUnit.embeddingFastRandomProjectionVisualizationX AS embeddingVisualizationX\n", + " ,codeUnit.embeddingFastRandomProjectionVisualizationY AS embeddingVisualizationY\n", + " \"\"\"\n", + "\n", + "java_package_clustering_features = query_cypher_to_data_frame(java_package_clustering_query)\n", + "java_package_clustering_features['degree'] = java_package_clustering_features['incomingDependencies'] + java_package_clustering_features['outgoingDependencies']\n", + "display(java_package_clustering_features.head(5))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2dd4ac72", + "metadata": {}, + "outputs": [], + "source": [ + "def add_visualization_cluster_diameter(\n", + " clustering_visualization_dataframe: pd.DataFrame,\n", + " result_diameter_column_name: str = 'clusterVisualizationDiameter',\n", + " cluster_label_column_name: str = \"clusterLabel\",\n", + " x_position_column: str = \"embeddingVisualizationX\",\n", + " y_position_column: str = \"embeddingVisualizationY\",\n", + "):\n", + " \n", + " def max_pairwise_distance(points):\n", + " if len(points) < 2:\n", + " return 0.0\n", + " # Efficient vectorized pairwise distance computation\n", + " dists = np.sqrt(\n", + " 
np.sum((points[:, np.newaxis, :] - points[np.newaxis, :, :]) ** 2, axis=-1)\n", + " )\n", + " return np.max(dists)\n", + " \n", + " unique_cluster_labels = clustering_visualization_dataframe[cluster_label_column_name].unique()\n", + " \n", + " if len(unique_cluster_labels) == 0:\n", + " return \n", + "\n", + " cluster_diameters = {}\n", + " for cluster_label in unique_cluster_labels:\n", + " if cluster_label == -1:\n", + " cluster_diameters[-1] = 0.0\n", + " continue\n", + " \n", + " cluster_nodes = clustering_visualization_dataframe[\n", + " clustering_visualization_dataframe[cluster_label_column_name] == cluster_label\n", + " ]\n", + " cluster_diameters[cluster_label] = max_pairwise_distance(cluster_nodes[[x_position_column, y_position_column]].to_numpy())\n", + "\n", + " if cluster_diameters:\n", + " clustering_visualization_dataframe[result_diameter_column_name] = clustering_visualization_dataframe[cluster_label_column_name].map(cluster_diameters)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "443e5e1e", + "metadata": {}, + "outputs": [], + "source": [ + "add_visualization_cluster_diameter(java_package_clustering_features)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18125c61", + "metadata": {}, + "outputs": [], + "source": [ + "def get_clusters_by_criteria(\n", + " dataframe: pd.DataFrame, \n", + " by: str, \n", + " ascending: bool = True, \n", + " cluster_count: int = 10, \n", + " label_column_name: str = 'clusterLabel'\n", + " ) -> pd.DataFrame:\n", + " \"\"\" \n", + " Returns the rows for the \"cluster_count\" clusters with the largest (ascending=False) or smallest(ascending=True)\n", + " value in the column specified with \"by\". Noise (labeled with -1) remains unfiltered.\n", + " \"\"\"\n", + " if ascending:\n", + " threshold = dataframe.groupby(by=label_column_name)[by].min().nsmallest(cluster_count).iloc[-1]\n", + " #print(f\"Ascending threshold is {threshold} for {by}.\")\n", + " return dataframe[(dataframe[by] <= threshold) | (dataframe[label_column_name] == -1)]\n", + " \n", + " threshold = dataframe.groupby(by=label_column_name)[by].max().nlargest(cluster_count).iloc[-1]\n", + " #print(f\"Descending threshold is {threshold} for {by}.\")\n", + " return dataframe[(dataframe[by] >= threshold) | (dataframe[label_column_name] == -1)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a8f41fb", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_clusters(\n", + " clustering_visualization_dataframe: pd.DataFrame,\n", + " title: str,\n", + " main_color_map: str = \"tab20\",\n", + " code_unit_column_name: str = \"shortCodeUnitName\",\n", + " cluster_label_column_name: str = \"clusterLabel\",\n", + " cluster_medoid_column_name: str = \"clusterMedoid\",\n", + " centrality_column_name: str = \"pageRank\",\n", + " x_position_column: str = 'embeddingVisualizationX',\n", + " y_position_column: str = 'embeddingVisualizationY',\n", + " cluster_visualization_diameter_column = 'clusterVisualizationDiameter'\n", + ") -> None:\n", + " \n", + " if clustering_visualization_dataframe.empty:\n", + " print(\"No projected data to plot available\")\n", + " return\n", + " \n", + " def truncate(text: str, max_length: int):\n", + " if len(text) <= max_length:\n", + " return text\n", + " return text[:max_length - 3] + \"...\"\n", + " \n", + " # Create figure and subplots\n", + " plot.figure(figsize=(10, 10))\n", + "\n", + " # Setup columns\n", + " node_size_column = centrality_column_name\n", + "\n", + " # Separate 
HDBSCAN non-noise and noise nodes\n", + " node_embeddings_without_noise = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column_name] != -1]\n", + " node_embeddings_noise_only = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column_name] == -1]\n", + "\n", + " # ------------------------------------------\n", + " # Subplot: HDBSCAN Clustering with KDE\n", + " # ------------------------------------------\n", + " plot.title(title)\n", + "\n", + " unique_cluster_labels = node_embeddings_without_noise[cluster_label_column_name].unique()\n", + " hdbscan_color_palette = seaborn.color_palette(main_color_map, len(unique_cluster_labels))\n", + " hdbscan_cluster_to_color = dict(zip(unique_cluster_labels, hdbscan_color_palette))\n", + "\n", + " max_visualization_diameter = node_embeddings_without_noise[cluster_visualization_diameter_column].max()\n", + " visualization_diameter_normalization_factor = max_visualization_diameter * 2\n", + "\n", + " for cluster_label in unique_cluster_labels:\n", + " cluster_nodes = node_embeddings_without_noise[\n", + " node_embeddings_without_noise[cluster_label_column_name] == cluster_label\n", + " ]\n", + " # By comparing the cluster diameter to the max diameter of all clusters in the quartile,\n", + " # we can adjust the alpha value for the KDE plot to visualize smaller clusters more clearly.\n", + " # This way, larger clusters will have a lower alpha value, making them less prominent and less prone to overshadow smaller clusters.\n", + " cluster_diameter = cluster_nodes.iloc[0][cluster_visualization_diameter_column]\n", + " alpha = max((1.0 - (cluster_diameter / (visualization_diameter_normalization_factor))) * 0.45 - 0.25, 0.02)\n", + "\n", + " # KDE cloud shape\n", + " if len(cluster_nodes) > 1 and (\n", + " cluster_nodes[x_position_column].std() > 0 or cluster_nodes[y_position_column].std() > 0\n", + " ):\n", + " seaborn.kdeplot(\n", + " x=cluster_nodes[x_position_column],\n", + " y=cluster_nodes[y_position_column],\n", + " fill=True,\n", + " alpha=alpha,\n", + " levels=2,\n", + " color=hdbscan_cluster_to_color[cluster_label],\n", + " ax=plot.gca(), # Use current axes\n", + " warn_singular=False,\n", + " )\n", + "\n", + " # Node scatter points\n", + " plot.scatter(\n", + " x=cluster_nodes[x_position_column],\n", + " y=cluster_nodes[y_position_column],\n", + " s=cluster_nodes[node_size_column] * 200 + 2,\n", + " color=hdbscan_cluster_to_color[cluster_label],\n", + " alpha=0.9,\n", + " label=f\"Cluster {cluster_label}\"\n", + " )\n", + "\n", + " # Annotate medoids of the cluster\n", + " medoids = cluster_nodes[cluster_nodes[cluster_medoid_column_name] == 1]\n", + " for index, row in medoids.iterrows():\n", + " plot.annotate(\n", + " text=f\"{truncate(row[code_unit_column_name], 30)} ({row[cluster_label_column_name]})\",\n", + " xy=(row[x_position_column], row[y_position_column]),\n", + " xytext=(5, 5), # Offset for better visibility\n", + " **plot_annotation_style\n", + " )\n", + "\n", + " # Plot noise points in gray\n", + " plot.scatter(\n", + " x=node_embeddings_noise_only[x_position_column],\n", + " y=node_embeddings_noise_only[y_position_column],\n", + " s=node_embeddings_noise_only[node_size_column] * 200 + 2,\n", + " color='lightgrey',\n", + " alpha=0.4,\n", + " label=\"Noise\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "def7fb66", + "metadata": {}, + "outputs": [], + "source": [ + 
"java_package_clustering_features_filtered=get_clusters_by_criteria(\n", + " java_package_clustering_features, by='clusterSize', ascending=False, cluster_count=20\n", + ")\n", + "plot_clusters(\n", + " clustering_visualization_dataframe=java_package_clustering_features_filtered,\n", + " title=\"Java Package Clusters with the largest size\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce1c113f", + "metadata": {}, + "outputs": [], + "source": [ + "java_package_clustering_features_filtered=get_clusters_by_criteria(\n", + " java_package_clustering_features, by='clusterRadiusMax', ascending=False, cluster_count=20\n", + ")\n", + "plot_clusters(\n", + " clustering_visualization_dataframe=java_package_clustering_features_filtered,\n", + " title=\"Java Package Clusters with the biggest max radius\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec816783", + "metadata": {}, + "outputs": [], + "source": [ + "java_package_clustering_features_filtered=get_clusters_by_criteria(\n", + " java_package_clustering_features, by='clusterRadiusAverage', ascending=False, cluster_count=20\n", + ")\n", + "plot_clusters(\n", + " clustering_visualization_dataframe=java_package_clustering_features_filtered,\n", + " title=\"Java Package Clusters with the biggest average radius\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37b3b601", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_clusters_probabilities(\n", + " clustering_visualization_dataframe: pd.DataFrame,\n", + " title: str,\n", + " code_unit_column: str = \"shortCodeUnitName\",\n", + " cluster_label_column: str = \"clusterLabel\",\n", + " cluster_medoid_column: str = \"clusterMedoid\",\n", + " cluster_size_column: str = \"clusterSize\",\n", + " cluster_probability_column: str = \"clusterProbability\",\n", + " size_column: str = \"pageRank\",\n", + " x_position_column: str = 'embeddingVisualizationX',\n", + " y_position_column: str = 'embeddingVisualizationY',\n", + ") -> None:\n", + " \n", + " if clustering_visualization_dataframe.empty:\n", + " print(\"No projected data to plot available\")\n", + " return\n", + " \n", + " def truncate(text: str, max_length: int):\n", + " if len(text) <= max_length:\n", + " return text\n", + " return text[:max_length - 3] + \"...\"\n", + " \n", + " cluster_noise = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column] == -1]\n", + " cluster_non_noise = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column] != -1]\n", + " cluster_even_labels = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column] % 2 == 0]\n", + " cluster_odd_labels = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column] % 2 == 1]\n", + "\n", + " plot.figure(figsize=(10, 10))\n", + " plot.title(title)\n", + "\n", + " # Plot noise\n", + " plot.scatter(\n", + " x=cluster_noise[x_position_column],\n", + " y=cluster_noise[y_position_column],\n", + " s=cluster_noise[size_column] * 200 + 3,\n", + " color='lightgrey',\n", + " alpha=0.5,\n", + " label='Noise'\n", + " )\n", + "\n", + " # Plot even labels\n", + " plot.scatter(\n", + " x=cluster_even_labels[x_position_column],\n", + " y=cluster_even_labels[y_position_column],\n", + " s=cluster_even_labels[size_column] * 200 + 3,\n", + " c=cluster_even_labels[cluster_probability_column],\n", + " vmin=0.6,\n", + " vmax=1.0,\n", + " cmap='Greens',\n", + " 
alpha=0.8,\n", + " label='Even Label'\n", + " )\n", + "\n", + " # Plot odd labels\n", + " plot.scatter(\n", + " x=cluster_odd_labels[x_position_column],\n", + " y=cluster_odd_labels[y_position_column],\n", + " s=cluster_odd_labels[size_column] * 200 + 3,\n", + " c=cluster_odd_labels[cluster_probability_column],\n", + " vmin=0.6,\n", + " vmax=1.0,\n", + " cmap='Blues',\n", + " alpha=0.8,\n", + " label='Odd Label'\n", + " )\n", + "\n", + " # Annotate medoids of the cluster\n", + " cluster_medoids = cluster_non_noise[cluster_non_noise[cluster_medoid_column] == 1].sort_values(by=cluster_size_column, ascending=False).head(20)\n", + " for index, row in cluster_medoids.iterrows():\n", + " mean_cluster_probability = cluster_non_noise[cluster_non_noise[cluster_label_column] == row[cluster_label_column]][cluster_probability_column].mean()\n", + " plot.annotate(\n", + " text=f\"{row[cluster_label_column]}:{truncate(row[code_unit_column], 20)} ({mean_cluster_probability:.4f})\",\n", + " xy=(row[x_position_column], row[y_position_column]),\n", + " xytext=(5, 5),\n", + " alpha=0.5,\n", + " **plot_annotation_style\n", + " )\n", + "\n", + " lowest_probabilities = cluster_non_noise.sort_values(by=cluster_probability_column, ascending=True).reset_index().head(10)\n", + " for dataframe_index, row in lowest_probabilities.iterrows():\n", + " index = typing.cast(int, dataframe_index)\n", + " plot.annotate(\n", + " text=f\"!{row[cluster_label_column]}:{truncate(row[code_unit_column], 20)} ({row[cluster_probability_column]:.4f})\",\n", + " xy=(row[x_position_column], row[y_position_column]),\n", + " xytext=(5, 5 + index * 10),\n", + " color='red',\n", + " **plot_annotation_style\n", + " )\n", + "\n", + " plot.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9dca6a98", + "metadata": {}, + "outputs": [], + "source": [ + "plot_clusters_probabilities(java_package_clustering_features, \"Java Package Clustering Probabilities (red=high uncertainty)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9580ddb", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_cluster_noise(\n", + " clustering_visualization_dataframe: pd.DataFrame,\n", + " title: str,\n", + " code_unit_column_name: str = \"shortCodeUnitName\",\n", + " cluster_label_column_name: str = \"clusterLabel\",\n", + " size_column_name: str = \"degree\",\n", + " color_column_name: str = \"pageRank\",\n", + " x_position_column = 'embeddingVisualizationX',\n", + " y_position_column = 'embeddingVisualizationY'\n", + ") -> None:\n", + " if clustering_visualization_dataframe.empty:\n", + " print(\"No projected data to plot available\")\n", + " return\n", + "\n", + " # Filter only noise points\n", + " noise_points = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column_name] == -1]\n", + " noise_points = noise_points.sort_values(by=size_column_name, ascending=False).reset_index(drop=True)\n", + "\n", + " if noise_points.empty:\n", + " print(\"No noise points to plot.\")\n", + " return\n", + "\n", + " plot.figure(figsize=(10, 10))\n", + " plot.title(title)\n", + "\n", + " # Determine the color threshold for noise points\n", + " color_10th_highest_value = noise_points[color_column_name].nlargest(10).iloc[-1] # Get the 10th largest value\n", + " color_90_quantile = noise_points[color_column_name].quantile(0.90)\n", + " color_threshold = max(color_10th_highest_value, color_90_quantile)\n", + "\n", + " # Color the color column values above the 90% quantile threshold red, 
the rest light grey \n", + " colors = noise_points[color_column_name].apply(\n", + " lambda x: \"red\" if x >= color_threshold else \"lightgrey\"\n", + " )\n", + " normalized_size = noise_points[size_column_name] / noise_points[size_column_name].max()\n", + "\n", + " # Scatter plot for noise points\n", + " scatter = plot.scatter(\n", + " x=noise_points[x_position_column],\n", + " y=noise_points[y_position_column],\n", + " s=normalized_size.clip(lower=0.01) * 800 + 2,\n", + " c=colors,\n", + " alpha=0.6\n", + " )\n", + "\n", + " # Annotate the largest 10 points and all colored ones with their names\n", + " for index, row in noise_points.iterrows():\n", + " index = typing.cast(int, index)\n", + " if colors[index] != 'red' and index >= 10:\n", + " continue\n", + " plot.annotate(\n", + " text=row[code_unit_column_name],\n", + " xy=(row[x_position_column], row[y_position_column]),\n", + " xytext=(5, 5 + (index % 2) * 20), # Offset for better visibility\n", + " **plot_annotation_style\n", + " )\n", + "\n", + " plot.xlabel(x_position_column)\n", + " plot.ylabel(y_position_column)\n", + " plot.tight_layout()\n", + " plot.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c56606c", + "metadata": {}, + "outputs": [], + "source": [ + "plot_cluster_noise(\n", + " clustering_visualization_dataframe=java_package_clustering_features,\n", + " title=\"Java Package Clustering Noise points that are surprisingly central (color) or popular (size)\",\n", + " size_column_name='degree',\n", + " color_column_name='pageRank'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9b2010c", + "metadata": {}, + "outputs": [], + "source": [ + "plot_cluster_noise(\n", + " clustering_visualization_dataframe=java_package_clustering_features,\n", + " title=\"Java Package Clustering Noise points that bridge flow (color) and are poorly integrated (size)\",\n", + " size_column_name='inverseClusteringCoefficient',\n", + " color_column_name='betweenness'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "891d79b2", + "metadata": {}, + "outputs": [], + "source": [ + "plot_cluster_noise(\n", + " clustering_visualization_dataframe=java_package_clustering_features,\n", + " title=\"Java Package Clustering Noise points with role inversion (size), possibly violating layering or dependency direction (color)\",\n", + " size_column_name='pageToArticleRankDifference',\n", + " color_column_name='betweenness'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "5682bb64", + "metadata": {}, + "source": [ + "## 2. 
Java Types" + ] + }, + { + "cell_type": "markdown", + "id": "25370d7f", + "metadata": {}, + "source": [ + "### 2.1 Differences between Page Rand and Article Rank\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "080d6c0e", + "metadata": {}, + "outputs": [], + "source": [ + "java_type_anomaly_detection_centrality_features_query = \"\"\"\n", + " MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit:Java:Type)\n", + " WHERE codeUnit.incomingDependencies IS NOT NULL\n", + " AND codeUnit.outgoingDependencies IS NOT NULL\n", + " AND codeUnit.centralityArticleRank IS NOT NULL\n", + " AND codeUnit.centralityPageRank IS NOT NULL\n", + " AND codeUnit.centralityBetweenness IS NOT NULL\n", + " RETURN DISTINCT \n", + " codeUnit.fqn AS codeUnitName\n", + " ,codeUnit.name AS shortCodeUnitName\n", + " ,artifact.name AS projectName\n", + " ,codeUnit.incomingDependencies AS incomingDependencies\n", + " ,codeUnit.outgoingDependencies AS outgoingDependencies\n", + " ,codeUnit.centralityArticleRank AS articleRank\n", + " ,codeUnit.centralityPageRank AS pageRank\n", + " ,codeUnit.centralityBetweenness AS betweenness\n", + "\"\"\"\n", + "\n", + "java_type_anomaly_detection_centrality_features = query_cypher_to_data_frame(java_type_anomaly_detection_centrality_features_query)\n", + "display(java_type_anomaly_detection_centrality_features.head(5))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6119f10", + "metadata": {}, + "outputs": [], + "source": [ + "plot_difference_between_article_and_page_rank(\n", + " java_type_anomaly_detection_centrality_features['pageRank'],\n", + " java_type_anomaly_detection_centrality_features['articleRank'],\n", + " java_type_anomaly_detection_centrality_features['shortCodeUnitName']\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "89d4965c", + "metadata": {}, + "source": [ + "### 2.2 Local Clustering Coefficient" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18501d1d", + "metadata": {}, + "outputs": [], + "source": [ + "java_type_clustering_coefficient_query = \"\"\"\n", + " MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit:Java:Type)\n", + " WHERE codeUnit.incomingDependencies IS NOT NULL\n", + " AND codeUnit.outgoingDependencies IS NOT NULL\n", + " AND codeUnit.centralityPageRank IS NOT NULL\n", + " AND codeUnit.communityLocalClusteringCoefficient IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANNoise IS NOT NULL\n", + " RETURN DISTINCT \n", + " codeUnit.fqn AS codeUnitName\n", + " ,codeUnit.name AS shortCodeUnitName\n", + " ,artifact.name AS projectName\n", + " ,codeUnit.incomingDependencies AS incomingDependencies\n", + " ,codeUnit.outgoingDependencies AS outgoingDependencies\n", + " ,codeUnit.centralityPageRank AS pageRank\n", + " ,codeUnit.communityLocalClusteringCoefficient AS clusteringCoefficient\n", + " ,codeUnit.clusteringHDBSCANNoise AS clusterNoise\n", + "\"\"\"\n", + "\n", + "java_type_clustering_coefficient_features = query_cypher_to_data_frame(java_type_clustering_coefficient_query)\n", + "display(java_type_clustering_coefficient_features.head(5))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "289c2fe3", + "metadata": {}, + "outputs": [], + "source": [ + "plot_clustering_coefficient_distribution(java_type_clustering_coefficient_features['clusteringCoefficient'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "320858c9", + "metadata": {}, + "outputs": [], + "source": [ + "plot_clustering_coefficient_vs_page_rank(\n", + " 
java_type_clustering_coefficient_features['clusteringCoefficient'],\n", + " java_type_clustering_coefficient_features['pageRank'],\n", + " java_type_clustering_coefficient_features['shortCodeUnitName'],\n", + " java_type_clustering_coefficient_features['clusterNoise']\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "69256999", + "metadata": {}, + "source": [ + "### 2.3 HDBSCAN Clusters\n", + "\n", + "HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) is a clustering algorithm that can identify clusters of varying densities and shapes. It is particularly useful for detecting anomalies in data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "765cb8bb", + "metadata": {}, + "outputs": [], + "source": [ + "java_type_clustering_query = \"\"\"\n", + " MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit:Java:Type)\n", + " WHERE codeUnit.incomingDependencies IS NOT NULL\n", + " AND codeUnit.outgoingDependencies IS NOT NULL\n", + " AND codeUnit.centralityPageRank IS NOT NULL\n", + " AND codeUnit.centralityArticleRank IS NOT NULL\n", + " AND codeUnit.communityLocalClusteringCoefficient IS NOT NULL\n", + " AND codeUnit.centralityBetweenness IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANLabel IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANProbability IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANNoise IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANMedoid IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANSize IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANRadiusMax IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANRadiusAverage IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANNormalizedDistanceToMedoid IS NOT NULL\n", + " AND codeUnit.embeddingFastRandomProjectionVisualizationX IS NOT NULL\n", + " AND codeUnit.embeddingFastRandomProjectionVisualizationY IS NOT NULL\n", + " RETURN DISTINCT \n", + " codeUnit.fqn AS codeUnitName\n", + " ,codeUnit.name AS shortCodeUnitName\n", + " ,artifact.name AS projectName\n", + " ,codeUnit.incomingDependencies AS incomingDependencies\n", + " ,codeUnit.outgoingDependencies AS outgoingDependencies\n", + " ,codeUnit.centralityPageRank AS pageRank\n", + " ,1.0 - codeUnit.communityLocalClusteringCoefficient AS inverseClusteringCoefficient\n", + " ,codeUnit.centralityBetweenness AS betweenness\n", + " ,codeUnit.centralityPageRank - codeUnit.centralityArticleRank AS pageToArticleRankDifference\n", + " ,codeUnit.clusteringHDBSCANLabel AS clusterLabel\n", + " ,codeUnit.clusteringHDBSCANProbability AS clusterProbability\n", + " ,codeUnit.clusteringHDBSCANNoise AS clusterNoise\n", + " ,codeUnit.clusteringHDBSCANMedoid AS clusterMedoid\n", + " ,codeUnit.clusteringHDBSCANSize AS clusterSize\n", + " ,codeUnit.clusteringHDBSCANRadiusMax AS clusterRadiusMax\n", + " ,codeUnit.clusteringHDBSCANRadiusAverage AS clusterRadiusAverage\n", + " ,codeUnit.clusteringHDBSCANNormalizedDistanceToMedoid AS clusterNormalizedDistanceToMedoid\n", + " ,codeUnit.embeddingFastRandomProjectionVisualizationX AS embeddingVisualizationX\n", + " ,codeUnit.embeddingFastRandomProjectionVisualizationY AS embeddingVisualizationY\n", + "\"\"\"\n", + "\n", + "java_type_clustering_features = query_cypher_to_data_frame(java_type_clustering_query)\n", + "java_type_clustering_features['degree'] = java_type_clustering_features['incomingDependencies'] + java_type_clustering_features['outgoingDependencies']\n", + "\n", + "display(java_type_clustering_features.head(5))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"id": "cbadf787", + "metadata": {}, + "outputs": [], + "source": [ + "add_visualization_cluster_diameter(java_type_clustering_features)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ec974d1", + "metadata": {}, + "outputs": [], + "source": [ + "java_type_clustering_features_filtered=get_clusters_by_criteria(\n", + " java_type_clustering_features, by='clusterSize', ascending=False, cluster_count=20\n", + ")\n", + "plot_clusters(\n", + " clustering_visualization_dataframe=java_type_clustering_features_filtered,\n", + " title=\"Java Type Clusters with the largest size\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "881783e1", + "metadata": {}, + "outputs": [], + "source": [ + "java_type_clustering_features_filtered=get_clusters_by_criteria(\n", + " java_type_clustering_features, by='clusterRadiusMax', ascending=False, cluster_count=20\n", + ")\n", + "plot_clusters(\n", + " clustering_visualization_dataframe=java_type_clustering_features_filtered,\n", + " title=\"Java Type Clusters with the biggest max radius\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ace9b99", + "metadata": {}, + "outputs": [], + "source": [ + "java_type_clustering_features_filtered=get_clusters_by_criteria(\n", + " java_type_clustering_features, by='clusterRadiusAverage', ascending=False, cluster_count=20\n", + ")\n", + "plot_clusters(\n", + " clustering_visualization_dataframe=java_type_clustering_features_filtered,\n", + " title=\"Java Type Clusters with the biggest average radius\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a80c6a0a", + "metadata": {}, + "outputs": [], + "source": [ + "plot_clusters_probabilities(java_type_clustering_features, \"Java Type Clustering Probabilities (red=high uncertainty)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d70ec20c", + "metadata": {}, + "outputs": [], + "source": [ + "plot_cluster_noise(\n", + " clustering_visualization_dataframe=java_type_clustering_features,\n", + " title=\"Java Type Clustering Noise points that are surprisingly central (color) or popular (size)\",\n", + " size_column_name='degree',\n", + " color_column_name='pageRank'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8d888be", + "metadata": {}, + "outputs": [], + "source": [ + "plot_cluster_noise(\n", + " clustering_visualization_dataframe=java_type_clustering_features,\n", + " title=\"Java Type Clustering Noise points that bridge flow (color) and are poorly integrated (size)\",\n", + " size_column_name='inverseClusteringCoefficient',\n", + " color_column_name='betweenness'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9921ad7", + "metadata": {}, + "outputs": [], + "source": [ + "plot_cluster_noise(\n", + " clustering_visualization_dataframe=java_type_clustering_features,\n", + " title=\"Java Type Clustering Noise points with role inversion (size), possibly violating layering or dependency direction (color)\",\n", + " size_column_name='pageToArticleRankDifference',\n", + " color_column_name='betweenness'\n", + ")" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "JohT" + } + ], + "code_graph_analysis_pipeline_data_validation": "ValidateAlwaysFalse", + "kernelspec": { + "display_name": "codegraph", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + 
"mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + }, + "title": "Anomaly Detection - Manual Exploration" + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb b/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb new file mode 100644 index 000000000..2eecc7424 --- /dev/null +++ b/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb @@ -0,0 +1,843 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "2f0eabc4", + "metadata": {}, + "source": [ + "# Anomaly Detection with Isolation Forest - Manual Exploration\n", + "\n", + "This notebook demonstrates anomaly detection with Isolation Forest for static code analysis gathered by using jQAssistant and Neo4j. The focus is on detecting anomalies in the data, which can be useful for identifying potential issues or areas for improvement in the codebase.\n", + "\n", + "
\n", + "\n", + "### References\n", + "- [jqassistant](https://jqassistant.org)\n", + "- [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver/current)" + ] + }, + { + "cell_type": "markdown", + "id": "cee1e4fb", + "metadata": {}, + "source": [ + "## Features overview\n", + "\n", + "| **Feature** | **Type** | **What it Measures** | **Why It’s Useful** |\n", + "| -------------------------------- | ------------------ | ------------------------------------------- | ------------------------------------------- |\n", + "| `PageRank` | Centrality | Popularity / referenced code | High = many dependents |\n", + "| `ArticleRank` | Centrality | How much the code depends on others | High = high dependency |\n", + "| `PageRank - ArticleRank` | Relative Rank | Role inversion / architectural layering | Highlights mismatches |\n", + "| `Betweenness Centrality` | Centrality | Bridge or control nodes | High = structural chokepoints |\n", + "| `Local Clustering Coefficient` | Structural | Local cohesion / modularity | Low = isolated node in a clique-like region |\n", + "| `Degree` (Total and In/Out) | Structural | Connectivity | Raw values may dominate |\n", + "| `Node Embedding` (PCA reduced) | Latent | Structural and semantic similarity | Captures latent position in graph |\n", + "| `Normalized Cluster Distance` | Geometric | Relative to cluster radius | Adds context to position |\n", + "| `1.0 - HDBSCAN membership probability` | Cluster Confidence | How confidently HDBSCAN clustered this node, 1-x inverted | High score = likely anomaly |\n", + "| `Average Cluster Radius` | Cluster Context | How tight or spread out the cluster is | Highly spread clusters may be a less meaningful one |\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4191f259", + "metadata": {}, + "outputs": [], + "source": [ + "import typing\n", + "import numpy.typing as numpy_typing\n", + "\n", + "import os\n", + "from IPython.display import display\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.ensemble import IsolationForest, RandomForestClassifier\n", + "\n", + "import matplotlib.pyplot as plot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0676813", + "metadata": {}, + "outputs": [], + "source": [ + "#The following cell uses the build-in %html \"magic\" to override the CSS style for tables to a much smaller size.\n", + "#This is especially needed for PDF export of tables with multiple columns." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebac1bb9", + "metadata": {}, + "outputs": [], + "source": [ + "%%html\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07319282", + "metadata": {}, + "outputs": [], + "source": [ + "# Main Colormap\n", + "# main_color_map = 'nipy_spectral'\n", + "main_color_map = 'viridis'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8ef41ff", + "metadata": {}, + "outputs": [], + "source": [ + "from sys import version as python_version\n", + "print('Python version: {}'.format(python_version))\n", + "\n", + "from numpy import __version__ as numpy_version\n", + "print('numpy version: {}'.format(numpy_version))\n", + "\n", + "from pandas import __version__ as pandas_version\n", + "print('pandas version: {}'.format(pandas_version))\n", + "\n", + "from sklearn import __version__ as sklearn_version\n", + "print('sklearn version: {}'.format(sklearn_version))\n", + "\n", + "from matplotlib import __version__ as matplotlib_version\n", + "print('matplotlib version: {}'.format(matplotlib_version))\n", + "\n", + "from neo4j import __version__ as neo4j_version\n", + "print('neo4j version: {}'.format(neo4j_version))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c5dab37", + "metadata": {}, + "outputs": [], + "source": [ + "# Please set the environment variable \"NEO4J_INITIAL_PASSWORD\" in your shell \n", + "# before starting jupyter notebook to provide the password for the user \"neo4j\". \n", + "# It is not recommended to hardcode the password into jupyter notebook for security reasons.\n", + "from neo4j import GraphDatabase\n", + "\n", + "driver = GraphDatabase.driver(\n", + " uri=\"bolt://localhost:7687\", \n", + " auth=(\"neo4j\", os.environ.get(\"NEO4J_INITIAL_PASSWORD\"))\n", + ")\n", + "driver.verify_connectivity()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1db254b", + "metadata": {}, + "outputs": [], + "source": [ + "def query_cypher_to_data_frame(query: typing.LiteralString, parameters: typing.Optional[typing.Dict[str, typing.Any]] = None):\n", + " records, summary, keys = driver.execute_query(query, parameters_=parameters)\n", + " return pd.DataFrame([record.values() for record in records], columns=keys)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7656bd5", + "metadata": {}, + "outputs": [], + "source": [ + "plot_annotation_style: dict = {\n", + " 'textcoords': 'offset points',\n", + " 'arrowprops': dict(arrowstyle='->', color='black', alpha=0.3),\n", + " 'fontsize': 6,\n", + " 'backgroundcolor': 'white',\n", + " 'bbox': dict(boxstyle='round,pad=0.4',\n", + " edgecolor='silver',\n", + " facecolor='whitesmoke',\n", + " alpha=1\n", + " )\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "0c68aa20", + "metadata": {}, + "source": [ + "## 1. Java Packages" + ] + }, + { + "cell_type": "markdown", + "id": "c927388f", + "metadata": {}, + "source": [ + "### 1.1 Query Features\n", + "\n", + "Query all features that are relevant for anomaly detection. Some of them come from precalculated clustering (HDBSCAN), node embeddings (Fast Random Projection), community detection algorithms (Leiden, Local Clustering Coefficient), centrality algorithms (Page Rank, Article Rank, Betweenness) and classical metrics like the in-/out-degree." 
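The Cypher query in the next cell returns these node properties (and some derived columns) directly. As a purely illustrative pandas sketch of the derived columns described in the feature overview above (the column names are assumptions based on the aliases used elsewhere in this notebook, not part of the original cells):

```python
# Illustrative sketch only: derive the engineered anomaly-detection features in pandas.
# Column names (incomingDependencies, pageRank, articleRank, clusterProbability, ...) are
# assumed from the query aliases used in the surrounding notebook cells.
import pandas as pd


def add_engineered_features(features: pd.DataFrame) -> pd.DataFrame:
    # Total connectivity of a code unit (see "Degree" in the feature overview).
    features["degree"] = features["incomingDependencies"] + features["outgoingDependencies"]
    # Gap between being referenced (PageRank) and depending on others (ArticleRank);
    # the feature overview reads large gaps as possible role inversion or layering mismatches.
    features["pageToArticleRankDifference"] = features["pageRank"] - features["articleRank"]
    # Invert the HDBSCAN membership probability so that higher values indicate likely outliers.
    features["clusterApproximateOutlierScore"] = 1.0 - features["clusterProbability"]
    return features
```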
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c26f8f19", + "metadata": {}, + "outputs": [], + "source": [ + "java_package_anomaly_detection_features_query = \"\"\"\n", + " MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit:Java:Package)\n", + " WHERE codeUnit.incomingDependencies IS NOT NULL\n", + " AND codeUnit.outgoingDependencies IS NOT NULL\n", + " and codeUnit.embeddingsFastRandomProjectionTunedForClustering IS NOT NULL\n", + " AND codeUnit.centralityPageRank IS NOT NULL\n", + " AND codeUnit.centralityArticleRank IS NOT NULL\n", + " AND codeUnit.centralityBetweenness IS NOT NULL\n", + " AND codeUnit.communityLocalClusteringCoefficient IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANProbability IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANNoise IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANMedoid IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANRadiusAverage IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANNormalizedDistanceToMedoid IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANSize IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANLabel IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANMedoid IS NOT NULL\n", + " AND codeUnit.embeddingFastRandomProjectionVisualizationX IS NOT NULL\n", + " AND codeUnit.embeddingFastRandomProjectionVisualizationY IS NOT NULL\n", + " RETURN DISTINCT \n", + " codeUnit.fqn AS codeUnitName\n", + " ,codeUnit.name AS shortCodeUnitName\n", + " ,artifact.name AS projectName\n", + " ,codeUnit.incomingDependencies AS incomingDependencies\n", + " ,codeUnit.outgoingDependencies AS outgoingDependencies\n", + " ,codeUnit.incomingDependencies + codeUnit.outgoingDependencies AS degree\n", + " ,codeUnit.embeddingsFastRandomProjectionTunedForClustering AS embedding\n", + " ,codeUnit.centralityPageRank AS pageRank\n", + " ,codeUnit.centralityArticleRank AS articleRank\n", + " ,codeUnit.centralityPageRank - codeUnit.centralityArticleRank AS pageToArticleRankDifference\n", + " ,codeUnit.centralityBetweenness AS betweenness\n", + " ,codeUnit.communityLocalClusteringCoefficient AS locallusteringCoefficient\n", + " ,1.0 - codeUnit.clusteringHDBSCANProbability AS clusterApproximateOutlierScore\n", + " ,codeUnit.clusteringHDBSCANNoise AS clusterNoise\n", + " ,codeUnit.clusteringHDBSCANRadiusAverage AS clusterRadiusAverage\n", + " ,codeUnit.clusteringHDBSCANNormalizedDistanceToMedoid AS clusterDistanceToMedoid\n", + " ,codeUnit.clusteringHDBSCANSize AS clusterSize\n", + " ,codeUnit.clusteringHDBSCANLabel AS clusterLabel\n", + " ,codeUnit.clusteringHDBSCANMedoid AS clusterMedoid\n", + " ,codeUnit.embeddingFastRandomProjectionVisualizationX AS embeddingVisualizationX\n", + " ,codeUnit.embeddingFastRandomProjectionVisualizationY AS embeddingVisualizationY\n", + "\"\"\"\n", + "\n", + "java_package_anomaly_detection_features = query_cypher_to_data_frame(java_package_anomaly_detection_features_query)\n", + "java_package_features_to_standardize = java_package_anomaly_detection_features.columns.drop(['codeUnitName', 'shortCodeUnitName', 'projectName', 'embedding', 'clusterLabel', 'clusterSize', 'clusterMedoid', 'embeddingVisualizationX', 'embeddingVisualizationY']).to_list()\n", + "\n", + "display(java_package_anomaly_detection_features.head(5))" + ] + }, + { + "cell_type": "markdown", + "id": "ff9f1415", + "metadata": {}, + "source": [ + "### 1.2 Data preparation\n", + "\n", + "Prepare the data by standardizing numeric fields and reducing the dimensionality of the node embeddings to not dominate the results." 
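The following cells implement this with `validate_data`, `standardize_features` and a PCA helper that additionally bounds the number of components. As a compact sketch of the same idea (an illustration assuming a DataFrame with an `embedding` column of float arrays, not the notebook's own implementation):

```python
# Sketch: standardize numeric features and compress the node embedding with PCA so that
# its many dimensions do not dominate the downstream anomaly detection.
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


def prepare_features(features: pd.DataFrame, numeric_columns: list[str],
                     embedding_column: str = "embedding", target_variance: float = 0.90) -> np.ndarray:
    standardized = StandardScaler().fit_transform(features[numeric_columns])
    embeddings = np.stack(features[embedding_column].apply(np.array).to_list())
    # A float n_components lets PCA pick the smallest dimensionality reaching the target variance
    # (the notebook's own helper below additionally enforces a minimum and maximum dimension).
    reduced = PCA(n_components=target_variance, svd_solver="full").fit_transform(embeddings)
    return np.hstack([standardized, reduced])
```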
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2ebaa37", + "metadata": {}, + "outputs": [], + "source": [ + "def validate_data(features: pd.DataFrame) -> None:\n", + " if features.empty:\n", + " print(\"Data Validation Info: No data\")\n", + "\n", + " if features.isnull().values.any():\n", + " raise RuntimeError(\"Data Validation Error: Some values are null. Fix the wrong values or filter them out.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d81f593", + "metadata": {}, + "outputs": [], + "source": [ + "validate_data(java_package_anomaly_detection_features)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae1b7103", + "metadata": {}, + "outputs": [], + "source": [ + "def standardize_features(features: pd.DataFrame, feature_list: list[str]) -> numpy_typing.NDArray:\n", + " features_to_scale = features[feature_list]\n", + " scaler = StandardScaler()\n", + " return scaler.fit_transform(features_to_scale)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2de5ade1", + "metadata": {}, + "outputs": [], + "source": [ + "java_package_anomaly_detection_features_standardized = standardize_features(java_package_anomaly_detection_features, java_package_features_to_standardize)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf5f02ad", + "metadata": {}, + "outputs": [], + "source": [ + "def reduce_dimensionality_of_node_embeddings(\n", + " features: pd.DataFrame, \n", + " min_dimensions: int = 20, \n", + " max_dimensions: int = 40, \n", + " target_variance: float = 0.90,\n", + " embedding_column_name: str = 'embedding'\n", + ") -> numpy_typing.NDArray:\n", + " \"\"\"\n", + " Automatically reduce the dimensionality of node embeddings using Principal Component Analysis (PCA)\n", + " to reach a target explained variance ratio with the lowest possible number of components (output dimensions).\n", + "\n", + " Parameters:\n", + " - features (pd.DataFrame) with a column 'embedding', where every value contains a float array with original dimensions.\n", + " - min_dimensions: Even if possible with the given variance, don't go below this number of dimensions for the output\n", + " - max_dimensions: Return at most the max number of dimensions, even if that means, that the target variance can't be met.\n", + " - target_variance (float): Cumulative variance threshold (default: 0.90)\n", + " - embedding_column_name (string): Defaults to 'embedding'\n", + "\n", + " Returns: Reduced embeddings as an numpy array\n", + " \"\"\"\n", + "\n", + " # Convert the input and get the original dimension\n", + " embeddings = np.stack(features[embedding_column_name].apply(np.array).tolist())\n", + " original_dimension = embeddings.shape[1]\n", + "\n", + " # Fit PCA without dimensionality reduction to get explained variance\n", + " full_principal_component_analysis_without_reduction = PCA()\n", + " full_principal_component_analysis_without_reduction.fit(embeddings)\n", + "\n", + " # Find smallest number of components to reach target variance\n", + " cumulative_variance = np.cumsum(full_principal_component_analysis_without_reduction.explained_variance_ratio_)\n", + " best_n_components = np.searchsorted(cumulative_variance, target_variance) + 1\n", + " best_n_components = max(best_n_components, min_dimensions) # Use at least min_dimensions\n", + " best_n_components = min(best_n_components, max_dimensions) # Use at most max_dimensions\n", + "\n", + " # Apply PCA with optimal number of components\n", + " 
principal_component_analysis = PCA(n_components=best_n_components)\n", + "    java_type_anomaly_detection_node_embeddings_reduced = principal_component_analysis.fit_transform(embeddings)\n", + "\n", + "    explained_variance_ratio_sum = sum(principal_component_analysis.explained_variance_ratio_)\n", + "    print(f\"Dimensionality reduction from {original_dimension} to {best_n_components} (min {min_dimensions}) of node embeddings using Principal Component Analysis (PCA): Explained variance is {explained_variance_ratio_sum:.4f}.\")\n", + "\n", + "    return java_type_anomaly_detection_node_embeddings_reduced\n", + "    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9a33f00", + "metadata": {}, + "outputs": [], + "source": [ + "java_package_anomaly_detection_node_embeddings_reduced = reduce_dimensionality_of_node_embeddings(java_package_anomaly_detection_features)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca2d5044", + "metadata": {}, + "outputs": [], + "source": [ + "java_package_anomaly_detection_features_prepared = np.hstack([java_package_anomaly_detection_features_standardized, java_package_anomaly_detection_node_embeddings_reduced])\n", + "java_package_anomaly_detection_feature_names = list(java_package_features_to_standardize) + [f'pca_{i}' for i in range(java_package_anomaly_detection_node_embeddings_reduced.shape[1])]" + ] + }, + { + "cell_type": "markdown", + "id": "980e72e7", + "metadata": {}, + "source": [ + "### 1.3 List the top 10 anomalies found using Isolation Forest\n", + "\n", + "> The IsolationForest 'isolates' observations by randomly selecting a feature and then randomly selecting a split value between the maximum and minimum values of the selected feature." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4219727a", + "metadata": {}, + "outputs": [], + "source": [ + "def detect_anomalies(\n", + "    prepared_features: numpy_typing.NDArray, \n", + "    original_features: pd.DataFrame,\n", + "    anomaly_label_column: str = 'anomalyLabel',\n", + "    anomaly_score_column: str = 'anomalyScore',\n", + ") -> pd.DataFrame:\n", + "    isolation_forest = IsolationForest(n_estimators=200, contamination=0.05, random_state=42)\n", + "    anomaly_score = isolation_forest.fit_predict(prepared_features)\n", + "\n", + "    original_features[anomaly_label_column] = anomaly_score * -1 # 1 = anomaly, -1 = no anomaly\n", + "    original_features[anomaly_score_column] = isolation_forest.decision_function(prepared_features) * -1 # higher = more anomalous\n", + "    return original_features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41fb094f", + "metadata": {}, + "outputs": [], + "source": [ + "java_package_anomaly_detection_features = detect_anomalies(java_package_anomaly_detection_features_prepared, java_package_anomaly_detection_features)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3eeac684", + "metadata": {}, + "outputs": [], + "source": [ + "def get_top_10_anomalies(\n", + "    anomaly_detected_features: pd.DataFrame, \n", + "    anomaly_label_column: str = \"anomalyLabel\",\n", + "    anomaly_score_column: str = \"anomalyScore\"\n", + ") -> pd.DataFrame:\n", + "    anomalies = anomaly_detected_features[anomaly_detected_features[anomaly_label_column] == 1] # label 1 marks anomalies (see detect_anomalies above)\n", + "    return anomalies.sort_values(by=anomaly_score_column, ascending=False).reset_index(drop=True).head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87b43abf", + "metadata": {}, + "outputs": [], + "source": [ + 
"display(get_top_10_anomalies(java_package_anomaly_detection_features))" + ] + }, + { + "cell_type": "markdown", + "id": "efa822ca", + "metadata": {}, + "source": [ + "### 1.4 Plot the 20 most influential features\n", + "\n", + "Use Random Forest as a proxy to estimate the importance of each feature contributing to the anomaly score." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24427977", + "metadata": {}, + "outputs": [], + "source": [ + "def get_feature_importances(\n", + " anomaly_detected_features: pd.DataFrame, \n", + " prepared_features: numpy_typing.NDArray,\n", + " anomaly_label_column: str = \"anomalyLabel\",\n", + ") -> numpy_typing.NDArray:\n", + " \"\"\"\n", + " Use Random Forest as a proxy model to find out which are the most important features for the anomaly detection model (Isolation Forest).\n", + " This helps to see if embedding components dominate (top 10 filled with them), and then tune accordingly.\n", + " \"\"\"\n", + " # Use IsolationForest labels as a \"pseudo ground truth\"\n", + " y_pseudo = (anomaly_detected_features[anomaly_label_column] == -1).astype(int)\n", + "\n", + " # Fit classifier to match the IF model\n", + " proxy_random_forest = RandomForestClassifier(n_estimators=100, random_state=42)\n", + " proxy_random_forest.fit(prepared_features, y_pseudo)\n", + "\n", + " return proxy_random_forest.feature_importances_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "97b21d49", + "metadata": {}, + "outputs": [], + "source": [ + "java_package_anomaly_detection_importances = get_feature_importances(java_package_anomaly_detection_features, java_package_anomaly_detection_features_prepared)\n", + "java_package_anomaly_detection_importances_series = pd.Series(java_package_anomaly_detection_importances, index=java_package_anomaly_detection_feature_names).sort_values(ascending=False)\n", + "#display(java_type_anomaly_detection_importances_series.head(10))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14d0b03e", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_feature_importances(feature_importances_series: pd.Series, title_prefix: str) -> None:\n", + " feature_importances_series.head(20).plot(\n", + " kind='barh',\n", + " figsize=(10, 6),\n", + " color='skyblue',\n", + " title=f\"{title_prefix}: Top 20 Feature Importances (Random Forest Proxy)\",\n", + " xlabel=\"Importance\"\n", + " )\n", + " plot.gca().invert_yaxis() # Most important feature at the top\n", + " plot.tight_layout()\n", + " plot.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "974a2bae", + "metadata": {}, + "outputs": [], + "source": [ + "plot_feature_importances(java_package_anomaly_detection_importances_series, title_prefix='Java Packages')" + ] + }, + { + "cell_type": "markdown", + "id": "c9dd6246", + "metadata": {}, + "source": [ + "### 1.5. Plot anomalies\n", + "\n", + "Plots clustered nodes and highlights anomalies." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab1e76ab", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_anomalies(\n", + " clustering_visualization_dataframe: pd.DataFrame,\n", + " title_prefix: str,\n", + " code_unit_column: str = \"shortCodeUnitName\",\n", + " cluster_label_column: str = \"clusterLabel\",\n", + " cluster_medoid_column: str = \"clusterMedoid\",\n", + " cluster_size_column: str = \"clusterSize\",\n", + " anomaly_label_column: str = \"anomalyLabel\",\n", + " anomaly_score_column: str = \"anomalyScore\",\n", + " page_rank_column: str = \"pageRank\",\n", + " x_position_column: str = 'embeddingVisualizationX',\n", + " y_position_column: str = 'embeddingVisualizationY',\n", + ") -> None:\n", + " \n", + " if clustering_visualization_dataframe.empty:\n", + " print(\"No projected data to plot available\")\n", + " return\n", + " \n", + " def truncate(text: str, max_length: int):\n", + " if len(text) <= max_length:\n", + " return text\n", + " return text[:max_length - 3] + \"...\"\n", + " \n", + " cluster_anomalies = clustering_visualization_dataframe[clustering_visualization_dataframe[anomaly_label_column] == 1]\n", + " cluster_without_anomalies = clustering_visualization_dataframe[clustering_visualization_dataframe[anomaly_label_column] != 1]\n", + " cluster_noise = cluster_without_anomalies[cluster_without_anomalies[cluster_label_column] == -1]\n", + " cluster_non_noise = cluster_without_anomalies[cluster_without_anomalies[cluster_label_column] != -1]\n", + "\n", + " plot.figure(figsize=(10, 10))\n", + " plot.title(title_prefix + ' (size=PageRank, color=ClusterLabel, red=Anomaly)')\n", + "\n", + " # Plot noise\n", + " plot.scatter(\n", + " x=cluster_noise[x_position_column],\n", + " y=cluster_noise[y_position_column],\n", + " s=cluster_noise[page_rank_column] * 200 + 4,\n", + " color='lightgrey',\n", + " alpha=0.5,\n", + " label='Noise'\n", + " )\n", + "\n", + " # Plot clusters\n", + " plot.scatter(\n", + " x=cluster_non_noise[x_position_column],\n", + " y=cluster_non_noise[y_position_column],\n", + " s=cluster_non_noise[page_rank_column] * 200 + 4,\n", + " c=cluster_non_noise[cluster_label_column],\n", + " cmap='tab20',\n", + " alpha=0.7,\n", + " label='Clusters'\n", + " )\n", + "\n", + " # Plot anomalies\n", + " plot.scatter(\n", + " x=cluster_anomalies[x_position_column],\n", + " y=cluster_anomalies[y_position_column],\n", + " s=cluster_anomalies[page_rank_column] * 200 + 4,\n", + " c=cluster_anomalies[anomaly_score_column],\n", + " cmap=\"Reds\",\n", + " alpha=0.9,\n", + " label='Anomaly'\n", + " )\n", + "\n", + " # Annotate medoids of the cluster\n", + " cluster_medoids = cluster_non_noise[cluster_non_noise[cluster_medoid_column] == 1].sort_values(by=cluster_size_column, ascending=False).head(20)\n", + " for index, row in cluster_medoids.iterrows():\n", + " plot.annotate(\n", + " text=f\"{row[cluster_label_column]}:{truncate(row[code_unit_column], 20)} ({row[anomaly_score_column]:.4f})\",\n", + " xy=(row[x_position_column], row[y_position_column]),\n", + " xytext=(5, 5),\n", + " alpha=0.4,\n", + " **plot_annotation_style\n", + " )\n", + "\n", + " anomalies = cluster_anomalies.sort_values(by=anomaly_score_column, ascending=False).reset_index(drop=True).head(6)\n", + " for dataframe_index, row in anomalies.iterrows():\n", + " index = typing.cast(int, dataframe_index)\n", + " plot.annotate(\n", + " text=f\"{row[cluster_label_column]}:{truncate(row[code_unit_column], 20)} ({row[anomaly_score_column]:.4f})\",\n", + " 
xy=(row[x_position_column], row[y_position_column]),\n", + " xytext=(5, 5 + (index % 5) * 10),\n", + " color='red',\n", + " **plot_annotation_style\n", + " )\n", + "\n", + " plot.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aea29887", + "metadata": {}, + "outputs": [], + "source": [ + "plot_anomalies(java_package_anomaly_detection_features, title_prefix=\"Java Package Anomalies\")" + ] + }, + { + "cell_type": "markdown", + "id": "5682bb64", + "metadata": {}, + "source": [ + "## 2. Java Types" + ] + }, + { + "cell_type": "markdown", + "id": "25370d7f", + "metadata": {}, + "source": [ + "### 2.1 Query Features\n", + "\n", + "Query all features that are relevant for anomaly detection. Some of them come from precalculated clustering (HDBSCAN), node embeddings (Fast Random Projection), community detection algorithms (Leiden, Local Clustering Coefficient), centrality algorithms (Page Rank, Article Rank, Betweenness) and classical metrics like the in-/out-degree.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3db1ba29", + "metadata": {}, + "outputs": [], + "source": [ + "java_type_anomaly_detection_features_query = \"\"\"\n", + " MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit:Java:Type)\n", + " WHERE codeUnit.incomingDependencies IS NOT NULL\n", + " AND codeUnit.outgoingDependencies IS NOT NULL\n", + " and codeUnit.embeddingsFastRandomProjectionTunedForClustering IS NOT NULL\n", + " AND codeUnit.centralityPageRank IS NOT NULL\n", + " AND codeUnit.centralityArticleRank IS NOT NULL\n", + " AND codeUnit.centralityBetweenness IS NOT NULL\n", + " AND codeUnit.communityLocalClusteringCoefficient IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANProbability IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANNoise IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANMedoid IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANRadiusAverage IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANNormalizedDistanceToMedoid IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANLabel IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANSize IS NOT NULL\n", + " AND codeUnit.clusteringHDBSCANMedoid IS NOT NULL\n", + " AND codeUnit.embeddingFastRandomProjectionVisualizationX IS NOT NULL\n", + " AND codeUnit.embeddingFastRandomProjectionVisualizationY IS NOT NULL\n", + " RETURN DISTINCT \n", + " codeUnit.fqn AS codeUnitName\n", + " ,codeUnit.name AS shortCodeUnitName\n", + " ,artifact.name AS projectName\n", + " ,codeUnit.incomingDependencies AS incomingDependencies\n", + " ,codeUnit.outgoingDependencies AS outgoingDependencies\n", + " ,codeUnit.incomingDependencies + codeUnit.outgoingDependencies AS degree\n", + " ,codeUnit.embeddingsFastRandomProjectionTunedForClustering AS embedding\n", + " ,codeUnit.centralityPageRank AS pageRank\n", + " ,codeUnit.centralityArticleRank AS articleRank\n", + " ,codeUnit.centralityPageRank - codeUnit.centralityArticleRank AS pageToArticleRankDifference\n", + " ,codeUnit.centralityBetweenness AS betweenness\n", + " ,codeUnit.communityLocalClusteringCoefficient AS locallusteringCoefficient\n", + " ,1.0 - codeUnit.clusteringHDBSCANProbability AS clusterApproximateOutlierScore\n", + " ,codeUnit.clusteringHDBSCANNoise AS clusterNoise\n", + " ,codeUnit.clusteringHDBSCANRadiusAverage AS clusterRadiusAverage\n", + " ,codeUnit.clusteringHDBSCANNormalizedDistanceToMedoid AS clusterDistanceToMedoid\n", + " ,codeUnit.clusteringHDBSCANLabel AS clusterLabel\n", + " ,codeUnit.clusteringHDBSCANSize AS clusterSize\n", + " 
,codeUnit.clusteringHDBSCANMedoid AS clusterMedoid\n", + "         ,codeUnit.embeddingFastRandomProjectionVisualizationX AS embeddingVisualizationX\n", + "         ,codeUnit.embeddingFastRandomProjectionVisualizationY AS embeddingVisualizationY\n", + "\"\"\"\n", + "\n", + "java_type_anomaly_detection_features = query_cypher_to_data_frame(java_type_anomaly_detection_features_query)\n", + "java_type_features_to_standardize = java_type_anomaly_detection_features.columns.drop(['codeUnitName', 'shortCodeUnitName', 'projectName', 'embedding', 'clusterLabel', 'clusterSize', 'clusterMedoid', 'embeddingVisualizationX', 'embeddingVisualizationY']).to_list()\n", + "\n", + "display(java_type_anomaly_detection_features.head(5))" + ] + }, + { + "cell_type": "markdown", + "id": "f4c67ed0", + "metadata": {}, + "source": [ + "### 2.2 Data preparation\n", + "\n", + "Prepare the data by standardizing numeric fields and reducing the dimensionality of the node embeddings to not dominate the results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "681090a6", + "metadata": {}, + "outputs": [], + "source": [ + "validate_data(java_type_anomaly_detection_features)\n", + "java_type_anomaly_detection_features_standardized = standardize_features(java_type_anomaly_detection_features, java_type_features_to_standardize)\n", + "java_type_anomaly_detection_node_embeddings_reduced = reduce_dimensionality_of_node_embeddings(java_type_anomaly_detection_features)\n", + "\n", + "java_type_anomaly_detection_features_prepared = np.hstack([java_type_anomaly_detection_features_standardized, java_type_anomaly_detection_node_embeddings_reduced])\n", + "java_type_anomaly_detection_feature_names = list(java_type_features_to_standardize) + [f'pca_{i}' for i in range(java_type_anomaly_detection_node_embeddings_reduced.shape[1])]" + ] + }, + { + "cell_type": "markdown", + "id": "4ce7ac1b", + "metadata": {}, + "source": [ + "### 2.3 List the top 10 anomalies found using Isolation Forest\n", + "\n", + "> The IsolationForest 'isolates' observations by randomly selecting a feature and then randomly selecting a split value between the maximum and minimum values of the selected feature." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66b9a864", + "metadata": {}, + "outputs": [], + "source": [ + "java_type_anomaly_detection_features = detect_anomalies(java_type_anomaly_detection_features_prepared, java_type_anomaly_detection_features)\n", + "display(get_top_10_anomalies(java_type_anomaly_detection_features))" + ] + }, + { + "cell_type": "markdown", + "id": "4e565f84", + "metadata": {}, + "source": [ + "### 2.4 Plot the 20 most influential features\n", + "\n", + "Use Random Forest as a proxy to estimate the importance of each feature contributing to the anomaly score."
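As noted in `get_feature_importances` (section 1.4), it is worth checking whether the PCA-reduced embedding components dominate the ranking. A small, purely illustrative helper for that check (the `pca_` prefix matches the feature names constructed above; the commented usage line is hypothetical):

```python
# Sketch: count how many of the proxy model's top-ranked features are PCA embedding components.
import pandas as pd


def count_embedding_components_in_top(importances: pd.Series, top_n: int = 10) -> int:
    top_features = importances.sort_values(ascending=False).head(top_n)
    return sum(1 for feature_name in top_features.index if str(feature_name).startswith("pca_"))


# Hypothetical usage: if most of the top 10 are embedding dimensions, consider reducing the
# number of PCA components or re-weighting the standardized features.
# count_embedding_components_in_top(java_type_anomaly_detection_importances_series)
```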
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b97f299", + "metadata": {}, + "outputs": [], + "source": [ + "java_type_anomaly_detection_importances = get_feature_importances(java_type_anomaly_detection_features, java_type_anomaly_detection_features_prepared)\n", + "java_type_anomaly_detection_importances_series = pd.Series(java_type_anomaly_detection_importances, index=java_type_anomaly_detection_feature_names).sort_values(ascending=False)\n", + "#display(java_type_anomaly_detection_importances_series.head(10))\n", + "\n", + "plot_feature_importances(java_type_anomaly_detection_importances_series, title_prefix='Java Types')" + ] + }, + { + "cell_type": "markdown", + "id": "68a00628", + "metadata": {}, + "source": [ + "### 2.5. Plot anomalies\n", + "\n", + "Plots clustered nodes and highlights anomalies." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ecc9fb4", + "metadata": {}, + "outputs": [], + "source": [ + "plot_anomalies(java_type_anomaly_detection_features, title_prefix=\"Java Type Anomalies\")" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "JohT" + } + ], + "code_graph_analysis_pipeline_data_validation": "ValidateAlwaysFalse", + "kernelspec": { + "display_name": "codegraph", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + }, + "title": "Anomaly Detection - Manual Exploration" + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/domains/anomaly-detection/features/AnomalyDetectionClusteringResultsReset.cypher b/domains/anomaly-detection/features/AnomalyDetectionClusteringResultsReset.cypher new file mode 100644 index 000000000..414b5d2da --- /dev/null +++ b/domains/anomaly-detection/features/AnomalyDetectionClusteringResultsReset.cypher @@ -0,0 +1,9 @@ +// Reset all clustering results related to anomaly detection for code units to force a clean recalculation + + MATCH (codeUnit) + WHERE $projection_node_label IN labels(codeUnit) + REMOVE codeUnit.clusteringHDBSCANLabel + ,codeUnit.clusteringHDBSCANProbability + ,codeUnit.clusteringHDBSCANNoise + ,codeUnit.clusteringHDBSCANMedoid + ,codeUnit.clusteringHDBSCANSize \ No newline at end of file diff --git a/domains/anomaly-detection/features/AnomalyDetectionFeature-ArticleRank-Exists.cypher b/domains/anomaly-detection/features/AnomalyDetectionFeature-ArticleRank-Exists.cypher new file mode 100644 index 000000000..3c0ebc854 --- /dev/null +++ b/domains/anomaly-detection/features/AnomalyDetectionFeature-ArticleRank-Exists.cypher @@ -0,0 +1,9 @@ +// Return the first node with a centralityArticleRank if it exists + + MATCH (codeUnit) + WHERE $projection_node_label IN labels(codeUnit) + AND codeUnit.centralityArticleRank IS NOT NULL + RETURN codeUnit.name AS shortCodeUnitName + ,elementId(codeUnit) AS nodeElementId + ,codeUnit.centralityArticleRank AS articleRank + LIMIT 1 \ No newline at end of file diff --git a/domains/anomaly-detection/features/AnomalyDetectionFeature-ArticleRank-Write.cypher b/domains/anomaly-detection/features/AnomalyDetectionFeature-ArticleRank-Write.cypher new file mode 100644 index 000000000..39b6df461 --- /dev/null +++ b/domains/anomaly-detection/features/AnomalyDetectionFeature-ArticleRank-Write.cypher @@ -0,0 +1,10 @@ +// Calculates and writes the Article Rank centrality score for anomaly detection 
+ +CALL gds.articleRank.write( + $projection_name + '-directed-cleaned', { + maxIterations: 50 + ,relationshipWeightProperty: $projection_weight_property + ,writeProperty: 'centralityArticleRank' +}) + YIELD nodePropertiesWritten, ranIterations, didConverge, preProcessingMillis, computeMillis, postProcessingMillis, writeMillis +RETURN nodePropertiesWritten, ranIterations, didConverge, preProcessingMillis, computeMillis, postProcessingMillis, writeMillis \ No newline at end of file diff --git a/domains/anomaly-detection/features/AnomalyDetectionFeature-Betweenness-Exists.cypher b/domains/anomaly-detection/features/AnomalyDetectionFeature-Betweenness-Exists.cypher new file mode 100644 index 000000000..c0b1e80dd --- /dev/null +++ b/domains/anomaly-detection/features/AnomalyDetectionFeature-Betweenness-Exists.cypher @@ -0,0 +1,9 @@ +// Return the first node with a centralityBetweenness if it exists + + MATCH (codeUnit) + WHERE $projection_node_label IN labels(codeUnit) + AND codeUnit.centralityBetweenness IS NOT NULL + RETURN codeUnit.name AS shortCodeUnitName + ,elementId(codeUnit) AS nodeElementId + ,codeUnit.centralityBetweenness AS betweenness + LIMIT 1 \ No newline at end of file diff --git a/domains/anomaly-detection/features/AnomalyDetectionFeature-Betweenness-Write.cypher b/domains/anomaly-detection/features/AnomalyDetectionFeature-Betweenness-Write.cypher new file mode 100644 index 000000000..64084ec5c --- /dev/null +++ b/domains/anomaly-detection/features/AnomalyDetectionFeature-Betweenness-Write.cypher @@ -0,0 +1,9 @@ +// Calculates and writes the Betweenness centrality score for anomaly detection + +CALL gds.betweenness.write( + $projection_name + '-directed-cleaned', { + relationshipWeightProperty: $projection_weight_property + ,writeProperty: 'centralityBetweenness' +}) + YIELD nodePropertiesWritten, preProcessingMillis, computeMillis, postProcessingMillis, writeMillis +RETURN nodePropertiesWritten, preProcessingMillis, computeMillis, postProcessingMillis, writeMillis \ No newline at end of file diff --git a/domains/anomaly-detection/features/AnomalyDetectionFeature-LocalClusteringCoefficient-Exists.cypher b/domains/anomaly-detection/features/AnomalyDetectionFeature-LocalClusteringCoefficient-Exists.cypher new file mode 100644 index 000000000..e4e75e76b --- /dev/null +++ b/domains/anomaly-detection/features/AnomalyDetectionFeature-LocalClusteringCoefficient-Exists.cypher @@ -0,0 +1,9 @@ +// Return the first node with a clusteringCoefficient if it exists + + MATCH (codeUnit) + WHERE $projection_node_label IN labels(codeUnit) + AND codeUnit.communityLocalClusteringCoefficient IS NOT NULL + RETURN codeUnit.name AS shortCodeUnitName + ,elementId(codeUnit) AS nodeElementId + ,codeUnit.communityLocalClusteringCoefficient AS clusteringCoefficient + LIMIT 1 \ No newline at end of file diff --git a/domains/anomaly-detection/features/AnomalyDetectionFeature-LocalClusteringCoefficient-Write.cypher b/domains/anomaly-detection/features/AnomalyDetectionFeature-LocalClusteringCoefficient-Write.cypher new file mode 100644 index 000000000..7dffea307 --- /dev/null +++ b/domains/anomaly-detection/features/AnomalyDetectionFeature-LocalClusteringCoefficient-Write.cypher @@ -0,0 +1,8 @@ +// Calculates and writes the local clustering coefficient for anomaly detection + +CALL gds.localClusteringCoefficient.write( + $projection_name + '-cleaned', { + writeProperty: 'communityLocalClusteringCoefficient' +}) + YIELD averageClusteringCoefficient, nodeCount, nodePropertiesWritten, preProcessingMillis, 
computeMillis, postProcessingMillis, writeMillis +RETURN averageClusteringCoefficient, nodeCount, nodePropertiesWritten, preProcessingMillis, computeMillis, postProcessingMillis, writeMillis \ No newline at end of file diff --git a/domains/anomaly-detection/features/AnomalyDetectionFeature-PageRank-Exists.cypher b/domains/anomaly-detection/features/AnomalyDetectionFeature-PageRank-Exists.cypher new file mode 100644 index 000000000..5063b0bdd --- /dev/null +++ b/domains/anomaly-detection/features/AnomalyDetectionFeature-PageRank-Exists.cypher @@ -0,0 +1,9 @@ +// Return the first node with a centralityPageRank if it exists + + MATCH (codeUnit) + WHERE $projection_node_label IN labels(codeUnit) + AND codeUnit.centralityPageRank IS NOT NULL + RETURN codeUnit.name AS shortCodeUnitName + ,elementId(codeUnit) AS nodeElementId + ,codeUnit.centralityPageRank AS pageRank + LIMIT 1 \ No newline at end of file diff --git a/domains/anomaly-detection/features/AnomalyDetectionFeature-PageRank-Write.cypher b/domains/anomaly-detection/features/AnomalyDetectionFeature-PageRank-Write.cypher new file mode 100644 index 000000000..e88618a7c --- /dev/null +++ b/domains/anomaly-detection/features/AnomalyDetectionFeature-PageRank-Write.cypher @@ -0,0 +1,10 @@ +// Calculates and writes the Page Rank centrality score for anomaly detection + +CALL gds.pageRank.write( + $projection_name + '-directed-cleaned', { + maxIterations: 50 + ,relationshipWeightProperty: $projection_weight_property + ,writeProperty: 'centralityPageRank' +}) + YIELD nodePropertiesWritten, ranIterations, didConverge, preProcessingMillis, computeMillis, postProcessingMillis, writeMillis +RETURN nodePropertiesWritten, ranIterations, didConverge, preProcessingMillis, computeMillis, postProcessingMillis, writeMillis \ No newline at end of file diff --git a/domains/anomaly-detection/features/AnomalyDetectionFeatures-Reset.cypher b/domains/anomaly-detection/features/AnomalyDetectionFeatures-Reset.cypher new file mode 100644 index 000000000..1bf66c68b --- /dev/null +++ b/domains/anomaly-detection/features/AnomalyDetectionFeatures-Reset.cypher @@ -0,0 +1,8 @@ +// Reset all features related to anomaly detection for code units to force a recalculation + + MATCH (codeUnit) + WHERE $projection_node_label IN labels(codeUnit) + REMOVE codeUnit.communityLocalClusteringCoefficient + ,codeUnit.centralityArticleRank + ,codeUnit.centralityPageRank + ,codeUnit.centralityBetweenness \ No newline at end of file diff --git a/domains/anomaly-detection/features/AnomalyDetectionFeatures.cypher b/domains/anomaly-detection/features/AnomalyDetectionFeatures.cypher new file mode 100644 index 000000000..bd6a6c696 --- /dev/null +++ b/domains/anomaly-detection/features/AnomalyDetectionFeatures.cypher @@ -0,0 +1,49 @@ +// Query code unit nodes with their anomaly detection features + + MATCH (codeUnit) + WHERE $projection_node_label IN labels(codeUnit) + AND codeUnit[$community_property] IS NOT NULL + AND codeUnit.incomingDependencies IS NOT NULL + AND codeUnit.outgoingDependencies IS NOT NULL + AND codeUnit.communityLocalClusteringCoefficient IS NOT NULL + AND codeUnit.centralityArticleRank IS NOT NULL + AND codeUnit.centralityPageRank IS NOT NULL + AND codeUnit.centralityBetweenness IS NOT NULL + AND codeUnit.clusteringHDBSCANLabel IS NOT NULL + AND codeUnit.clusteringHDBSCANProbability IS NOT NULL + AND codeUnit.clusteringHDBSCANNoise IS NOT NULL + AND codeUnit.clusteringHDBSCANMedoid IS NOT NULL + AND codeUnit.clusteringHDBSCANSize IS NOT NULL + AND 
codeUnit.clusteringHDBSCANRadiusMax IS NOT NULL + AND codeUnit.clusteringHDBSCANRadiusAverage IS NOT NULL + AND codeUnit.clusteringHDBSCANNormalizedDistanceToMedoid IS NOT NULL + AND codeUnit.embeddingFastRandomProjectionVisualizationX IS NOT NULL + AND codeUnit.embeddingFastRandomProjectionVisualizationY IS NOT NULL +OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit) + WITH *, artifact.name AS artifactName +OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit) + WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName + RETURN DISTINCT + coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName + ,codeUnit.name AS shortCodeUnitName + ,elementId(codeUnit) AS nodeElementId + ,coalesce(artifactName, projectName) AS projectName + ,codeUnit.incomingDependencies AS incomingDependencies + ,codeUnit.outgoingDependencies AS outgoingDependencies + ,codeUnit[$community_property] AS communityId + ,codeUnit.communityLocalClusteringCoefficient AS clusteringCoefficient + ,codeUnit.centralityArticleRank AS articleRank + ,codeUnit.centralityPageRank AS pageRank + ,codeUnit.centralityPageRank - codeUnit.centralityArticleRank AS pageToArticleRankDifference + ,codeUnit.centralityBetweenness AS betweenness + ,codeUnit.clusteringHDBSCANLabel AS clusteringLabel + ,codeUnit.clusteringHDBSCANProbability AS clusteringProbability + ,codeUnit.clusteringHDBSCANNoise AS clusteringIsNoise + ,codeUnit.clusteringHDBSCANMedoid AS clusteringIsMedoid + ,codeUnit.clusteringHDBSCANSize AS clusteringSize + ,codeUnit.clusteringHDBSCANRadiusMax AS clusteringRadiusMax + ,codeUnit.clusteringHDBSCANRadiusAverage AS clusteringRadiusAverage + ,codeUnit.clusteringHDBSCANNormalizedDistanceToMedoid AS clusteringNormalizedDistanceToMedoid + ,codeUnit.embeddingFastRandomProjectionVisualizationX AS visualizationX + ,codeUnit.embeddingFastRandomProjectionVisualizationY AS visualizationY + ,coalesce(codeUnit.centralityPageRank, 0.00001) AS centrality \ No newline at end of file diff --git a/domains/anomaly-detection/features/Set_Parameters_Manual.cypher b/domains/anomaly-detection/features/Set_Parameters_Manual.cypher new file mode 100644 index 000000000..6467e7a88 --- /dev/null +++ b/domains/anomaly-detection/features/Set_Parameters_Manual.cypher @@ -0,0 +1,8 @@ +// Example of how to set the parameters for anomaly detection + +:params { + "projection_name": "package-anomaly-detection", + "projection_node_label": "Package", + "projection_weight_property": "weight25PercentInterfaces", + "community_property": "communityLeidenIdTuned" +} \ No newline at end of file diff --git a/domains/anomaly-detection/queries/AnomalyDetectionDependencyHungryOrchestrators.cypher b/domains/anomaly-detection/queries/AnomalyDetectionDependencyHungryOrchestrators.cypher new file mode 100644 index 000000000..e3fb4a543 --- /dev/null +++ b/domains/anomaly-detection/queries/AnomalyDetectionDependencyHungryOrchestrators.cypher @@ -0,0 +1,31 @@ +// Anomaly Detection Query: Find dependency-hungry orchestrators by listing the top 20 entries with the highest Article Rank >= 90% percentile and a Betweenness centrality >= 90% percentile. +// Shows key code that depends on many others and also controls flow — likely orchestrators or managers. 
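The Cypher below computes both thresholds with `percentileDisc` and filters in one pass. Roughly the same selection can be reproduced in pandas on the feature DataFrames loaded in the notebooks above (a sketch assuming `articleRank` and `betweenness` columns are present; pandas' `quantile` interpolates, so results can differ slightly from the discrete `percentileDisc`):

```python
# Sketch: pandas equivalent of the 90th-percentile filter used by this and the following queries.
import pandas as pd


def dependency_hungry_orchestrators(features: pd.DataFrame, top_n: int = 20) -> pd.DataFrame:
    article_rank_threshold = features["articleRank"].quantile(0.90)
    betweenness_threshold = features["betweenness"].quantile(0.90)
    candidates = features[(features["articleRank"] >= article_rank_threshold) &
                          (features["betweenness"] >= betweenness_threshold)]
    return candidates.sort_values(by=["articleRank", "betweenness"], ascending=False).head(top_n)
```

The fragile-structural-bridges, hidden-bridge and over-referenced-utilities queries that follow use the same pattern with different feature pairs and percentile directions.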
+ + MATCH (codeUnit) + WHERE $projection_node_label IN labels(codeUnit) + AND codeUnit.centralityBetweenness IS NOT NULL + AND codeUnit.centralityArticleRank IS NOT NULL + AND codeUnit.incomingDependencies IS NOT NULL + AND codeUnit.outgoingDependencies IS NOT NULL + WITH collect(codeUnit) AS codeUnits + ,percentileDisc(codeUnit.centralityArticleRank, 0.90) AS articleRank90Percentile + ,percentileDisc(codeUnit.centralityBetweenness, 0.90) AS betweenness90Percentile + UNWIND codeUnits AS codeUnit + WITH *, codeUnit.incomingDependencies + codeUnit.outgoingDependencies AS degree + WHERE codeUnit.centralityArticleRank >= articleRank90Percentile + AND codeUnit.centralityBetweenness >= betweenness90Percentile +OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit) + WITH *, artifact.name AS artifactName +OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit) + WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName + RETURN DISTINCT + coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName + ,codeUnit.name AS shortCodeUnitName + ,coalesce(artifactName, projectName) AS projectName + ,codeUnit.centralityBetweenness AS betweenness + ,codeUnit.centralityArticleRank AS articleRank + ,degree + ,codeUnit.incomingDependencies AS incomingDependencies + ,codeUnit.outgoingDependencies AS outgoingDependencies + ORDER BY articleRank DESC, betweenness DESC + LIMIT 20 \ No newline at end of file diff --git a/domains/anomaly-detection/queries/AnomalyDetectionFragileStructuralBridges.cypher b/domains/anomaly-detection/queries/AnomalyDetectionFragileStructuralBridges.cypher new file mode 100644 index 000000000..f9766c6d3 --- /dev/null +++ b/domains/anomaly-detection/queries/AnomalyDetectionFragileStructuralBridges.cypher @@ -0,0 +1,31 @@ +// Anomaly Detection Query: Find fragile structural bridges, potential boundary-spanning modules and cohesion violations by listing the top 20 entries with the highest Betweenness centrality >= 90% percentile and a local clustering coefficient <= 10% percentile. +// Shows code that connects otherwise unrelated parts of the graph — potential architectural risks. 
+ + MATCH (codeUnit) + WHERE $projection_node_label IN labels(codeUnit) + AND codeUnit.centralityBetweenness IS NOT NULL + AND codeUnit.communityLocalClusteringCoefficient IS NOT NULL + AND codeUnit.incomingDependencies IS NOT NULL + AND codeUnit.outgoingDependencies IS NOT NULL + WITH collect(codeUnit) AS codeUnits + ,percentileDisc(codeUnit.communityLocalClusteringCoefficient, 0.10) AS localClusteringCoefficient10Percentile + ,percentileDisc(codeUnit.centralityBetweenness, 0.90) AS betweenness90Percentile + UNWIND codeUnits AS codeUnit + WITH *, codeUnit.incomingDependencies + codeUnit.outgoingDependencies AS degree + WHERE codeUnit.communityLocalClusteringCoefficient <= localClusteringCoefficient10Percentile + AND codeUnit.centralityBetweenness >= betweenness90Percentile +OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit) + WITH *, artifact.name AS artifactName +OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit) + WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName + RETURN DISTINCT + coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName + ,codeUnit.name AS shortCodeUnitName + ,coalesce(artifactName, projectName) AS projectName + ,codeUnit.centralityBetweenness AS betweenness + ,codeUnit.communityLocalClusteringCoefficient AS localClusteringCoefficient + ,degree + ,codeUnit.incomingDependencies AS incomingDependencies + ,codeUnit.outgoingDependencies AS outgoingDependencies + ORDER BY betweenness DESC, localClusteringCoefficient ASC + LIMIT 20 \ No newline at end of file diff --git a/domains/anomaly-detection/queries/AnomalyDetectionHiddenBridgeNodes.cypher b/domains/anomaly-detection/queries/AnomalyDetectionHiddenBridgeNodes.cypher new file mode 100644 index 000000000..e93b8d1a0 --- /dev/null +++ b/domains/anomaly-detection/queries/AnomalyDetectionHiddenBridgeNodes.cypher @@ -0,0 +1,31 @@ +// Anomaly Detection Query: Find hidden bridge code or misplaced responsibilities by listing the top 20 entries with the highest Betweeenness centrality >= 90% percentile and a Page Rank <= 10% percentile. +// Shows code that mediates flow, but isn’t highly depended on — structural surprise. 
+ + MATCH (codeUnit) + WHERE $projection_node_label IN labels(codeUnit) + AND codeUnit.centralityBetweenness IS NOT NULL + AND codeUnit.centralityPageRank IS NOT NULL + AND codeUnit.incomingDependencies IS NOT NULL + AND codeUnit.outgoingDependencies IS NOT NULL + WITH collect(codeUnit) AS codeUnits + ,percentileDisc(codeUnit.centralityPageRank, 0.10) AS pageRank10Percentile + ,percentileDisc(codeUnit.centralityBetweenness, 0.90) AS betweenness90Percentile + UNWIND codeUnits AS codeUnit + WITH *, codeUnit.incomingDependencies + codeUnit.outgoingDependencies AS degree + WHERE codeUnit.centralityPageRank <= pageRank10Percentile + AND codeUnit.centralityBetweenness >= betweenness90Percentile +OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit) + WITH *, artifact.name AS artifactName +OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit) + WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName + RETURN DISTINCT + coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName + ,codeUnit.name AS shortCodeUnitName + ,coalesce(artifactName, projectName) AS projectName + ,codeUnit.centralityBetweenness AS betweenness + ,codeUnit.centralityPageRank AS pageRank + ,degree + ,codeUnit.incomingDependencies AS incomingDependencies + ,codeUnit.outgoingDependencies AS outgoingDependencies + ORDER BY betweenness DESC, pageRank ASC + LIMIT 20 \ No newline at end of file diff --git a/domains/anomaly-detection/queries/AnomalyDetectionOverReferencesUtilities.cypher b/domains/anomaly-detection/queries/AnomalyDetectionOverReferencesUtilities.cypher new file mode 100644 index 000000000..876ba74e9 --- /dev/null +++ b/domains/anomaly-detection/queries/AnomalyDetectionOverReferencesUtilities.cypher @@ -0,0 +1,31 @@ +// Anomaly Detection Query: Find over-referenced utility code by listing the top 20 entries with the highest Page Rank >= 90% percentile and a low local clustering coefficient below the 10% percentile. +// Shows code that is widely referenced, but loosely coupled in neighborhood — could be over-generalized or abused. 
+ + MATCH (codeUnit) + WHERE $projection_node_label IN labels(codeUnit) + AND codeUnit.communityLocalClusteringCoefficient IS NOT NULL + AND codeUnit.centralityPageRank IS NOT NULL + AND codeUnit.incomingDependencies IS NOT NULL + AND codeUnit.outgoingDependencies IS NOT NULL + WITH collect(codeUnit) AS codeUnits + ,percentileDisc(codeUnit.communityLocalClusteringCoefficient, 0.10) AS localClusteringCoefficient10PercentPercentile + ,percentileDisc(codeUnit.centralityPageRank, 0.90) AS pageRank90PercentPercentile + UNWIND codeUnits AS codeUnit + WITH *, codeUnit.incomingDependencies + codeUnit.outgoingDependencies AS degree + WHERE codeUnit.communityLocalClusteringCoefficient <= localClusteringCoefficient10PercentPercentile + AND codeUnit.centralityPageRank >= pageRank90PercentPercentile +OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit) + WITH *, artifact.name AS artifactName +OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit) + WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName + RETURN DISTINCT + coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName + ,codeUnit.name AS shortCodeUnitName + ,coalesce(artifactName, projectName) AS projectName + ,codeUnit.communityLocalClusteringCoefficient AS localClusteringCoefficient + ,codeUnit.centralityPageRank AS pageRank + ,degree + ,codeUnit.incomingDependencies AS incomingDependencies + ,codeUnit.outgoingDependencies AS outgoingDependencies + ORDER BY pageRank DESC, localClusteringCoefficient ASC + LIMIT 20 \ No newline at end of file diff --git a/domains/anomaly-detection/queries/AnomalyDetectionPopularBottlenecks.cypher b/domains/anomaly-detection/queries/AnomalyDetectionPopularBottlenecks.cypher new file mode 100644 index 000000000..9b66b27c6 --- /dev/null +++ b/domains/anomaly-detection/queries/AnomalyDetectionPopularBottlenecks.cypher @@ -0,0 +1,31 @@ +// Anomaly Detection Query: Find popular bottlenecks by listing the top 20 entries with the highest Betweeenness centrality >= 90% percentile and a Page Rank >= 90% percentile. +// Shows key code that is both heavily depended on and control flow — critical hubs. 
+ + MATCH (codeUnit) + WHERE $projection_node_label IN labels(codeUnit) + AND codeUnit.centralityBetweenness IS NOT NULL + AND codeUnit.centralityPageRank IS NOT NULL + AND codeUnit.incomingDependencies IS NOT NULL + AND codeUnit.outgoingDependencies IS NOT NULL + WITH collect(codeUnit) AS codeUnits + ,percentileDisc(codeUnit.centralityPageRank, 0.90) AS pageRank90Percentile + ,percentileDisc(codeUnit.centralityBetweenness, 0.90) AS betweenness90Percentile + UNWIND codeUnits AS codeUnit + WITH *, codeUnit.incomingDependencies + codeUnit.outgoingDependencies AS degree + WHERE codeUnit.centralityPageRank >= pageRank90Percentile + AND codeUnit.centralityBetweenness >= betweenness90Percentile +OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit) + WITH *, artifact.name AS artifactName +OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit) + WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName + RETURN DISTINCT + coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName + ,codeUnit.name AS shortCodeUnitName + ,coalesce(artifactName, projectName) AS projectName + ,codeUnit.centralityBetweenness AS betweenness + ,codeUnit.centralityPageRank AS pageRank + ,degree + ,codeUnit.incomingDependencies AS incomingDependencies + ,codeUnit.outgoingDependencies AS outgoingDependencies + ORDER BY pageRank DESC, betweenness DESC + LIMIT 20 \ No newline at end of file diff --git a/domains/anomaly-detection/queries/AnomalyDetectionPotentialImbalancedRoles.cypher b/domains/anomaly-detection/queries/AnomalyDetectionPotentialImbalancedRoles.cypher new file mode 100644 index 000000000..fd7cd6a30 --- /dev/null +++ b/domains/anomaly-detection/queries/AnomalyDetectionPotentialImbalancedRoles.cypher @@ -0,0 +1,34 @@ +// Anomaly Detection Query: Find potential imbalanced roles in the codebase by listing the top 40 most significant Page Rank to Article Rank differences. 
+ + MATCH (codeUnit) + WHERE $projection_node_label IN labels(codeUnit) + AND codeUnit.centralityPageRank IS NOT NULL + AND codeUnit.centralityArticleRank IS NOT NULL + WITH collect(codeUnit) AS codeUnits + ,avg (codeUnit.centralityPageRank - codeUnit.centralityArticleRank) AS pageToArticleRankDifferenceMean + ,stDev(codeUnit.centralityPageRank - codeUnit.centralityArticleRank) AS pageToArticleRankDifferenceStandardDeviation + WHERE pageToArticleRankDifferenceStandardDeviation <> 0 + UNWIND codeUnits AS codeUnit + WITH *, codeUnit.centralityPageRank - codeUnit.centralityArticleRank AS pageToArticleRankDifference + WITH *, (pageToArticleRankDifference - pageToArticleRankDifferenceMean) / + pageToArticleRankDifferenceStandardDeviation AS pageToArticleRankDifferenceZScore +// Only include code units with a PageRank vs ArticleRank difference more than 2 (z-score) standard deviations from the mean +WHERE abs(pageToArticleRankDifferenceZScore) > 2.0 +OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit) + WITH *, artifact.name AS artifactName +OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit) + WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName + RETURN DISTINCT + coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName + ,codeUnit.name AS shortCodeUnitName + ,coalesce(artifactName, projectName) AS projectName + ,sign(pageToArticleRankDifference) AS pageToArticleRankSign + ,pageToArticleRankDifferenceZScore + ,pageToArticleRankDifference + ,codeUnit.centralityPageRank AS pageRank + ,codeUnit.centralityArticleRank AS articleRank + //For Debugging + //,pageToArticleRankDifferenceMean + //,pageToArticleRankDifferenceStandardDeviation + ORDER BY abs(pageToArticleRankDifferenceZScore) DESC + LIMIT 40 \ No newline at end of file diff --git a/domains/anomaly-detection/queries/AnomalyDetectionPotentialOverEngineerOrIsolated.cypher b/domains/anomaly-detection/queries/AnomalyDetectionPotentialOverEngineerOrIsolated.cypher new file mode 100644 index 000000000..74e9f886e --- /dev/null +++ b/domains/anomaly-detection/queries/AnomalyDetectionPotentialOverEngineerOrIsolated.cypher @@ -0,0 +1,30 @@ +// Anomaly Detection Query: Find potential over-engineered or isolated code unit by listing the top 20 entries with the highest local clustering coefficient and a Page Rank below the 5% percentile. 
+ + MATCH (codeUnit) + WHERE $projection_node_label IN labels(codeUnit) + AND codeUnit.communityLocalClusteringCoefficient IS NOT NULL + AND codeUnit.centralityPageRank IS NOT NULL + AND codeUnit.incomingDependencies IS NOT NULL + AND codeUnit.outgoingDependencies IS NOT NULL + WITH collect(codeUnit) AS codeUnits + ,percentileDisc(codeUnit.centralityPageRank, 0.10) AS pageRank10PercentPercentile + ,percentileDisc(codeUnit.communityLocalClusteringCoefficient, 0.90) AS localClusteringCoefficient90PercentPercentile + UNWIND codeUnits AS codeUnit + WITH *, codeUnit.incomingDependencies + codeUnit.outgoingDependencies AS degree + WHERE codeUnit.centralityPageRank <= pageRank10PercentPercentile + AND codeUnit.communityLocalClusteringCoefficient >= localClusteringCoefficient90PercentPercentile +OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit) + WITH *, artifact.name AS artifactName +OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit) + WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName + RETURN DISTINCT + coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName + ,codeUnit.name AS shortCodeUnitName + ,coalesce(artifactName, projectName) AS projectName + ,codeUnit.communityLocalClusteringCoefficient AS localClusteringCoefficient + ,codeUnit.centralityPageRank AS pageRank + ,degree + ,codeUnit.incomingDependencies AS incomingDependencies + ,codeUnit.outgoingDependencies AS outgoingDependencies + ORDER BY localClusteringCoefficient DESC, pageRank ASC + LIMIT 20 \ No newline at end of file diff --git a/domains/anomaly-detection/queries/AnomalyDetectionSilentCoordinators.cypher b/domains/anomaly-detection/queries/AnomalyDetectionSilentCoordinators.cypher new file mode 100644 index 000000000..945b73e54 --- /dev/null +++ b/domains/anomaly-detection/queries/AnomalyDetectionSilentCoordinators.cypher @@ -0,0 +1,29 @@ +// Anomaly Detection Query: Find silent coordinators by listing the top 20 entries with the highest betweeenness >= 90% percentile and a in-degree <= 10% percentile. 
+// Shows code that controls lots of interactions, yet not many modules depend on it — hidden complexity + + MATCH (codeUnit) + WHERE $projection_node_label IN labels(codeUnit) + AND codeUnit.centralityBetweenness IS NOT NULL + AND codeUnit.incomingDependencies IS NOT NULL + AND codeUnit.outgoingDependencies IS NOT NULL + WITH collect(codeUnit) AS codeUnits + ,percentileDisc(codeUnit.incomingDependencies, 0.10) AS incomingDependencies10Percentile + ,percentileDisc(codeUnit.centralityBetweenness, 0.90) AS betweenness90Percentile + UNWIND codeUnits AS codeUnit + WITH *, codeUnit.incomingDependencies + codeUnit.outgoingDependencies AS degree + WHERE codeUnit.incomingDependencies <= incomingDependencies10Percentile + AND codeUnit.centralityBetweenness <= betweenness90Percentile +OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit) + WITH *, artifact.name AS artifactName +OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit) + WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName + RETURN DISTINCT + coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName + ,codeUnit.name AS shortCodeUnitName + ,coalesce(artifactName, projectName) AS projectName + ,codeUnit.centralityBetweenness AS betweenness + ,degree + ,codeUnit.incomingDependencies AS incomingDependencies + ,codeUnit.outgoingDependencies AS outgoingDependencies + ORDER BY betweenness DESC, codeUnit.incomingDependencies ASC + LIMIT 20 \ No newline at end of file diff --git a/domains/anomaly-detection/queries/AnomalyDetectionUnexpectedCentralNodes.cypher b/domains/anomaly-detection/queries/AnomalyDetectionUnexpectedCentralNodes.cypher new file mode 100644 index 000000000..a1fdd6b5b --- /dev/null +++ b/domains/anomaly-detection/queries/AnomalyDetectionUnexpectedCentralNodes.cypher @@ -0,0 +1,30 @@ +// Anomaly Detection Query: Find hidden bottlenecks or hubs by listing the top 20 entries with the highest betweeenness >= 90% percentile and a degree <= 10% percentile. +// Shows code with high structural importance and only a few incoming and outgoing dependencies — often unexpected. 
+ + MATCH (codeUnit) + WHERE $projection_node_label IN labels(codeUnit) + AND codeUnit.centralityBetweenness IS NOT NULL + AND codeUnit.incomingDependencies IS NOT NULL + AND codeUnit.outgoingDependencies IS NOT NULL + WITH *, codeUnit.incomingDependencies + codeUnit.outgoingDependencies AS degree + WITH collect(codeUnit) AS codeUnits + ,percentileDisc(degree, 0.10) AS degree10Percentile + ,percentileDisc(codeUnit.centralityBetweenness, 0.90) AS betweenness90Percentile + UNWIND codeUnits AS codeUnit + WITH *, codeUnit.incomingDependencies + codeUnit.outgoingDependencies AS degree + WHERE degree <= degree10Percentile + AND codeUnit.centralityBetweenness <= betweenness90Percentile +OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit) + WITH *, artifact.name AS artifactName +OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit) + WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName + RETURN DISTINCT + coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName + ,codeUnit.name AS shortCodeUnitName + ,coalesce(artifactName, projectName) AS projectName + ,codeUnit.centralityBetweenness AS betweenness + ,degree + ,codeUnit.incomingDependencies AS incomingDependencies + ,codeUnit.outgoingDependencies AS outgoingDependencies + ORDER BY betweenness DESC, degree ASC + LIMIT 20 \ No newline at end of file diff --git a/domains/anomaly-detection/tunedLeidenCommunityDetection.py b/domains/anomaly-detection/tunedLeidenCommunityDetection.py new file mode 100755 index 000000000..832a25f13 --- /dev/null +++ b/domains/anomaly-detection/tunedLeidenCommunityDetection.py @@ -0,0 +1,391 @@ +#!/usr/bin/env python + +# This Python script runs the Leiden community detection algorithm +# and applies hyper-parameter tuning to get a overall modularity > 0.3 +# and as many clusters as possible. +# The results with the best parameters are written back into the Graph database. + +# Prerequisite: Provide the password for Neo4j in the environment variable "NEO4J_INITIAL_PASSWORD". 
+ +import typing + +import os +import sys +import argparse +import pprint + +import pandas as pd + +from neo4j import GraphDatabase, Driver + +import optuna +from optuna.samplers import TPESampler +from optuna.importance import get_param_importances, MeanDecreaseImpurityImportanceEvaluator +from optuna.trial import TrialState + + +class Parameters: + required_parameters_ = ["projection_name", "projection_node_label", "projection_weight_property", "community_property"] + + def __init__(self, input_parameters: typing.Dict[str, str], verbose: bool = False, write_results_into_database: bool = True): + self.query_parameters_ = input_parameters.copy() # copy enforces immutability + self.verbose_ = verbose + self.write_results_into_database_ = write_results_into_database + + def __repr__(self): + pretty_dict = pprint.pformat(self.query_parameters_, indent=4) + return f"Parameters: verbose={self.verbose_}, write_results_into_database={self.write_results_into_database_}, query_parameters:\n{pretty_dict}" + + @staticmethod + def log_dependency_versions_() -> None: + print('---------------------------------------') + + print('Python version: {}'.format(sys.version)) + + from pandas import __version__ as pandas_version + print('pandas version: {}'.format(pandas_version)) + + from neo4j import __version__ as neo4j_version + print('neo4j version: {}'.format(neo4j_version)) + + from optuna import __version__ as optuna_version + print('optuna version: {}'.format(optuna_version)) + + print('---------------------------------------') + + @classmethod + def from_input_parameters(cls, input_parameters: typing.Dict[str, str], verbose: bool = False, write_results_into_database: bool = True): + """ + Creates a Parameters instance from a dictionary of input parameters. + The dictionary must contain the following keys: + - "projection_name": The name of the projection. + - "projection_node_label": The node type of the projection. + - "projection_weight_property": The weight property of the projection. + - "community_property": The node property that is written back into the Graph with the result of the community detection algorithm. 
+ """ + missing_parameters = [parameter for parameter in cls.required_parameters_ if parameter not in input_parameters] + if missing_parameters: + raise ValueError("Missing parameters:", missing_parameters) + created_parameters = cls(input_parameters, verbose, write_results_into_database) + if created_parameters.is_verbose(): + print(created_parameters) + cls.log_dependency_versions_() + return created_parameters + + @classmethod + def example(cls): + return cls(dict( + projection_name="java-package-tuned-community", + projection_node_label="Package", + projection_weight_property="weight25PercentInterfaces", + community_property="communityLeidenIdTuned" + )) + + def get_query_parameters(self) -> typing.Dict[str, str]: + return self.query_parameters_.copy() # copy enforces immutability + + def clone_with_projection_name(self, projection_name: str): + updated_parameter = self.get_query_parameters() + updated_parameter.update({"projection_name": projection_name}) + return Parameters(updated_parameter) + + def get_projection_name(self) -> str: + return self.query_parameters_["projection_name"] + + def is_verbose(self) -> bool: + return self.verbose_ + + def is_write_results_into_database(self) -> bool: + return self.write_results_into_database_ + + +def parse_input_parameters() -> Parameters: + # Convert list of "key=value" strings to a dictionary + def parse_key_value_list(param_list: typing.List[str]) -> typing.Dict[str, str]: + param_dict = {} + for item in param_list: + if '=' in item: + key, value = item.split('=', 1) + param_dict[key] = value + return param_dict + + parser = argparse.ArgumentParser(description="Tuned Leiden Community Detection Algorithm for maximized community count and modularity > 0.3.") + parser.add_argument('--verbose', action='store_true', help='Enable verbose mode to log all details') + parser.add_argument('--write-results-into-database', action=argparse.BooleanOptionalAction, help='Write the results back into Neo4j (default) or just mutate the projected graph (prefix no-)') + parser.add_argument('query_parameters', nargs='*', type=str, help='List of key=value Cypher query parameters') + parser.set_defaults(verbose=False, write_results_into_database=True) + args = parser.parse_args() + return Parameters.from_input_parameters(parse_key_value_list(args.query_parameters), args.verbose, args.write_results_into_database) + + +def get_graph_database_driver() -> Driver: + driver = GraphDatabase.driver( + uri="bolt://localhost:7687", + auth=("neo4j", os.environ.get("NEO4J_INITIAL_PASSWORD")) + ) + driver.verify_connectivity() + return driver + + +def query_cypher_to_data_frame(query: typing.LiteralString, parameters: typing.Optional[typing.Dict[str, typing.Any]] = None): + records, summary, keys = driver.execute_query(query, parameters_=parameters) + return pd.DataFrame([record.values() for record in records], columns=keys) + + +def soft_ramp_limited_penalty(score: float, lower_threshold: float, upper_threshold: float, sharpness: int) -> float: + if score <= lower_threshold: + return 1.0 # No penalty + elif score >= upper_threshold: + return 0.0 # Full penalty + else: + # Normalize noise into [0, 1] range for ramp + x = (score - lower_threshold) / (upper_threshold - lower_threshold) + return max(0.0, 1 - x**sharpness) + + +class TuneableLeidenCommunityDetection: # (sklearn.BaseEstimator): + """ + Encapsulates Leiden community detection algorithm for hyper-parameter tuning. + """ + # Extend from sklearn BaseEstimator to use e.g. GridSearchCV for hyperparameter tuning. 
+ # The implementation is sklearn compatible and follows its schema (e.g. fit and score method). + + cypher_query_for_leiden_community_statistics_: typing.LiteralString = """ + CALL gds.leiden.stats( + $projection_name + '-cleaned', { + gamma: toFloat($leiden_gamma), + theta: toFloat($leiden_theta), + maxLevels: toInteger($leiden_max_levels), + tolerance: 0.0000001, + consecutiveIds: true, + relationshipWeightProperty: $projection_weight_property + }) + """ + cypher_query_to_mutate_leiden_communities_: typing.LiteralString = """ + CALL gds.leiden.mutate( + $projection_name + '-cleaned', { + gamma: toFloat($leiden_gamma), + theta: toFloat($leiden_theta), + maxLevels: toInteger($leiden_max_levels), + tolerance: 0.0000001, + consecutiveIds: true, + relationshipWeightProperty: $projection_weight_property, + mutateProperty: $community_property + }) + """ + cypher_query_to_delete_leiden_communities_: typing.LiteralString = """ + CALL gds.graph.nodeProperties.drop( + $projection_name + '-cleaned' + ,$community_property + ,{ failIfMissing: false } + ) + """ + cypher_query_to_write_leiden_communities_: typing.LiteralString = """ + CALL gds.leiden.write( + $projection_name + '-cleaned', { + gamma: toFloat($leiden_gamma), + theta: toFloat($leiden_theta), + maxLevels: toInteger($leiden_max_levels), + tolerance: 0.0000001, + consecutiveIds: true, + relationshipWeightProperty: $projection_weight_property, + writeProperty: $community_property + }) + """ + cypher_query_to_write_mutated_leiden_communities_: typing.LiteralString = """ + CALL gds.graph.nodeProperties.write( + $projection_name + '-cleaned' + ,[$community_property] + ) + """ + + def __init__(self, + parameters: Parameters = Parameters.example(), + # Tuneable algorithm parameters + gamma: float = 1.0, + theta: float = 0.001, + max_levels: int = 10, + ): + self.parameters = parameters + self.verbose = parameters.is_verbose() + + self.gamma = gamma + self.theta = theta + self.max_levels = max_levels + + self.mutation_finished_ = False + + def __to_algorithm_parameters(self) -> typing.Dict['str', 'str']: + return { + "leiden_gamma": str(self.gamma), + "leiden_theta": str(self.theta), + "leiden_max_levels": str(self.max_levels), + **self.parameters.get_query_parameters() + } + + def __run_algorithm(self) -> pd.DataFrame: + algorithm_parameters = self.__to_algorithm_parameters() + # For Debugging: + # print("Calculating Leiden communities using Neo4j Graph Data Science with the following parameters: " + str(algorithm_parameters)) + return query_cypher_to_data_frame(self.cypher_query_for_leiden_community_statistics_, parameters=algorithm_parameters) + + def __check_fitted(self) -> None: + """ + Checks if the model has been fitted by checking if the embeddings_ attribute exists. + Raises a ValueError if the model has not been fitted yet. + """ + if not hasattr(self, 'community_statistics_'): + raise ValueError("The model has not been fitted yet. Please call the fit method before.") + + def fit(self, X=None, y=None) -> typing.Self: + """ + Fits the model by calculating Leiden communities and their statistics. + """ + self.community_statistics_ = self.__run_algorithm() + return self + + def score(self, X=None, y=None) -> float: + """ + The returned score is high for community detection results with high modularity and high community count. + A penalty assures that a modularity lower than 0.3 (*1) will result in a score of zero ("worst"). + The community count is normalized by dividing it through the number of nodes in the projected Graph. 
+ To give the relative community count more weight, it is multiplied by 100. + + (*1) Mane, Prachita; Shanbhag, Sunanda; Kamath, Tanmayee; Mackey, Patrick; and Springer, John, + "Analysis of Community Detection Algorithms for Large Scale Cyber Networks" (2016) + """ + soft_ramped_modularity = 1.0 - soft_ramp_limited_penalty(self.get_modularity(), 0.30, 0.35, sharpness=1) + score = float(self.get_community_count() * 100) / float(self.get_node_count_()) * soft_ramped_modularity + # - For debugging purposes: + # print(f"Score {score:.4f}= community count {self.get_community_count()} x soft_ramped {soft_ramped_modularity:.4f} modularity {self.get_modularity():.04f}") + return score + + def mutate_communities(self) -> typing.Self: + """ + Calculate Leiden communities and add them to (mutate) the projected in-memory Graph. + This is useful for further processing or analysis. + """ + algorithm_parameters = self.__to_algorithm_parameters() + if self.verbose: + print("") + print("Mutate communities to the projected Graph with the following parameters: " + str(algorithm_parameters)) + print("") + + query_cypher_to_data_frame(self.cypher_query_to_delete_leiden_communities_, parameters=algorithm_parameters) + query_cypher_to_data_frame(self.cypher_query_to_mutate_leiden_communities_, parameters=algorithm_parameters) + + self.mutation_finished_ = True + print(f"Best Leiden Community Detection results saved in projected graph (mutate).") + return self + + def write_communities(self) -> typing.Self: + """ + Writes the calculated communities to the Neo4j database. + This is useful for further processing or analysis. + """ + algorithm_parameters = self.__to_algorithm_parameters() + if self.verbose: + print("") + print("Writing communities to Neo4j with the following parameters: " + str(algorithm_parameters)) + print("") + + if self.mutation_finished_: + query_cypher_to_data_frame(self.cypher_query_to_write_mutated_leiden_communities_, parameters=algorithm_parameters) + else: + query_cypher_to_data_frame(self.cypher_query_to_write_leiden_communities_, parameters=algorithm_parameters) + + print(f"Best Leiden Community Detection results written back into Neo4j.") + return self + + def get_modularity(self) -> float: + """ + Returns the modularity (global/overall) of the community statistics + """ + self.__check_fitted() + return float(self.community_statistics_['modularity'].iloc[0]) + + def get_community_count(self) -> int: + """ + Returns the number of detected communities + """ + self.__check_fitted() + return int(self.community_statistics_['communityCount'].iloc[0]) + + def get_node_count_(self) -> int: + """ + Returns the number of nodes in the projected Graph + """ + self.__check_fitted() + return int(self.community_statistics_['nodeCount'].iloc[0]) + + +def output_detailed_optuna_tuning_results(optimized_study: optuna.Study) -> None: + name_of_the_optimized_algorithm = 'Leiden Community Detection' + + print("") + print(f"Best {name_of_the_optimized_algorithm} score:", optimized_study.best_value) + print("") + print(f"Best {name_of_the_optimized_algorithm} parameter influence:", get_param_importances(optimized_study, evaluator=MeanDecreaseImpurityImportanceEvaluator())) + + valid_trials = [trial for trial in optimized_study.trials if trial.value is not None and trial.state == TrialState.COMPLETE] + top_trials = sorted(valid_trials, key=lambda t: typing.cast(float, t.value), reverse=True)[:10] + for i, trial in enumerate(top_trials): + print(f"Best {name_of_the_optimized_algorithm} parameter rank: {i+1}, 
trial: {trial.number}, Value = {trial.value:.6f}, Params: {trial.params}") + print("") + + +def get_tuned_leiden_community_detection_algorithm(parameters: Parameters) -> TuneableLeidenCommunityDetection: + if not parameters.is_verbose(): + optuna.logging.set_verbosity(optuna.logging.WARNING) + + def objective(trial): + # Suggest values for each hyperparameter + tuneable_parameters = dict( + gamma=trial.suggest_float("gamma", low=0.7, high=1.3, step=0.01), + theta=trial.suggest_float("theta", 0.0001, 0.01, log=True), + # Fixed max_levels = 10 (default) since experiments showed only minor differences in the results + # max_levels = trial.suggest_int("max_levels", 8, 12) + ) + tuneable_leiden_community_detection = TuneableLeidenCommunityDetection(parameters, **tuneable_parameters).fit() + return tuneable_leiden_community_detection.score() + + study_name = "LeidenCommunityDetection - " + parameters.get_projection_name() + # For in-depth analysis of the tuning, add the following two parameters: + # , storage=f"sqlite:///optuna_study_node_embeddings_java.db", load_if_exists=True) + study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=42), study_name=study_name) + + # Try (enqueue) specific settings first that led to good results in initial experiments + study.enqueue_trial({'gamma': 1.0, 'theta': 0.001, 'max_levels': 10}) # default values + study.enqueue_trial({'gamma': 1.14, 'theta': 0.001, 'max_levels': 10}) + + # Execute the hyperparameter tuning + study.optimize(objective, n_trials=20, timeout=30) + + # Output tuning results + print(f"Best Leiden Community Detection parameters for {parameters.get_projection_name()} (Optuna):", study.best_params) + if parameters.is_verbose(): + output_detailed_optuna_tuning_results(study) + + # Run the node embeddings algorithm again again with the best parameters + tuned_leiden_community_detection = TuneableLeidenCommunityDetection(parameters, **study.best_params).fit() + + print("Best Leiden Community Detection Modularity: ", tuned_leiden_community_detection.get_modularity()) + print("Best Leiden Community Detection Community Count: ", tuned_leiden_community_detection.get_community_count()) + + return tuned_leiden_community_detection + +# ------------------------------------------------------------------------------------------------------------ +# MAIN +# ------------------------------------------------------------------------------------------------------------ + + +parameters = parse_input_parameters() +driver = get_graph_database_driver() + +tuned_leiden_community_detection = get_tuned_leiden_community_detection_algorithm(parameters) +if parameters.is_write_results_into_database(): + tuned_leiden_community_detection.write_communities() +else: + tuned_leiden_community_detection.mutate_communities() + +driver.close() \ No newline at end of file diff --git a/domains/anomaly-detection/tunedNodeEmbeddingClustering.py b/domains/anomaly-detection/tunedNodeEmbeddingClustering.py new file mode 100755 index 000000000..2aae7e7cd --- /dev/null +++ b/domains/anomaly-detection/tunedNodeEmbeddingClustering.py @@ -0,0 +1,755 @@ +#!/usr/bin/env python + +# This Python script performs unsupervised clustering to automatically assign meaningful labels to code units — such as Java packages and Java types — and their dependencies, based on how structurally similar they are within a software system. 
+# This is useful for understanding code structure, detecting modular boundaries, and identifying anomalies or outliers in large software systems without requiring manual labeling. +# It takes the code structure as a graph in Neo4j and generates node embeddings using Fast Random Projection (FastRP). +# These embeddings capture structural similarity and are clustered using HDBSCAN to assign labels or detect noise. +# All results - including embeddings, cluster labels, and 2D coordinates — are written back to Neo4j for further use. + +# Prerequisite: +# - Already existing Graph with analyzed code units (like Java Packages) and their dependencies. +# - Provide the password for Neo4j in the environment variable "NEO4J_INITIAL_PASSWORD". + +import typing +import numpy.typing as numpy_typing + +import os +import sys +import argparse +import pprint +import contextlib + +import pandas as pd +import numpy as np + +from neo4j import GraphDatabase, Driver + +# from sklearn.base import BaseEstimator # Extend from sklearn BaseEstimator to use e.g. GridSearchCV for hyperparameter tuning. +from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, normalized_mutual_info_score +from sklearn.cluster import HDBSCAN # type: ignore + +import optuna +from optuna.samplers import TPESampler +from optuna.importance import get_param_importances, MeanDecreaseImpurityImportanceEvaluator +from optuna.trial import TrialState + + +class Parameters: + required_parameters_ = ["projection_name", "projection_node_label", "projection_weight_property", "community_property", "embedding_property"] + + def __init__(self, input_parameters: typing.Dict[str, str], verbose: bool = False): + self.query_parameters_ = input_parameters.copy() # copy enforces immutability + self.verbose_ = verbose + + def __repr__(self): + pretty_dict = pprint.pformat(self.query_parameters_, indent=4) + return f"Parameters: verbose={self.verbose_}, query_parameters:\n{pretty_dict}" + + @staticmethod + def log_dependency_versions_() -> None: + print('---------------------------------------') + + print('Python version: {}'.format(sys.version)) + + from numpy import __version__ as numpy_version + print('numpy version: {}'.format(numpy_version)) + + from pandas import __version__ as pandas_version + print('pandas version: {}'.format(pandas_version)) + + from sklearn import __version__ as sklearn_version + print('scikit-learn version: {}'.format(sklearn_version)) + + from neo4j import __version__ as neo4j_version + print('neo4j version: {}'.format(neo4j_version)) + + from optuna import __version__ as optuna_version + print('optuna version: {}'.format(optuna_version)) + + print('---------------------------------------') + + @classmethod + def from_input_parameters(cls, input_parameters: typing.Dict[str, str], verbose: bool = False): + """ + Creates a Parameters instance from a dictionary of input parameters. + The dictionary must contain the following keys: + - "projection_name": The name of the projection. + - "projection_node_label": The node type of the projection. + - "projection_weight_property": The weight property of the projection. + - "community_property": The node property containing the pre-calculated reference community id. 
+ """ + missing_parameters = [parameter for parameter in cls.required_parameters_ if parameter not in input_parameters] + if missing_parameters: + raise ValueError("Missing parameters:", missing_parameters) + created_parameters = cls(input_parameters, verbose) + if created_parameters.is_verbose(): + print(created_parameters) + cls.log_dependency_versions_() + return created_parameters + + @classmethod + def example(cls): + return cls(dict( + projection_name="java-package-embeddings-clustering", + projection_node_label="Package", + projection_weight_property="weight25PercentInterfaces", + community_property="communityLeidenIdTuned", + )) + + def get_query_parameters(self) -> typing.Dict[str, str]: + return self.query_parameters_.copy() # copy enforces immutability + + def clone_with_projection_name(self, projection_name: str): + updated_parameter = self.get_query_parameters() + updated_parameter.update({"projection_name": projection_name}) + return Parameters(updated_parameter) + + def get_projection_name(self) -> str: + return self.query_parameters_["projection_name"] + + def get_projection_node_label(self) -> str: + return self.query_parameters_["projection_node_label"] + + def get_embedding_property(self) -> str: + return self.query_parameters_["embedding_property"] + + def is_verbose(self) -> bool: + return self.verbose_ + + +def parse_input_parameters() -> Parameters: + # Convert list of "key=value" strings to a dictionary + def parse_key_value_list(param_list: typing.List[str]) -> typing.Dict[str, str]: + param_dict = {} + for item in param_list: + if '=' in item: + key, value = item.split('=', 1) + param_dict[key] = value + return param_dict + + parser = argparse.ArgumentParser( + description="Unsupervised clustering to assign labels to code units (Java packages, types,...) and their dependencies based on how structurally similar they are within a software system.") + parser.add_argument('--verbose', action='store_true', help='Enable verbose mode to log all details') + parser.add_argument('query_parameters', nargs='*', type=str, help='List of key=value Cypher query parameters') + parser.set_defaults(verbose=False) + args = parser.parse_args() + return Parameters.from_input_parameters(parse_key_value_list(args.query_parameters), args.verbose) + + +def get_graph_database_driver() -> Driver: + driver = GraphDatabase.driver( + uri="bolt://localhost:7687", + auth=("neo4j", os.environ.get("NEO4J_INITIAL_PASSWORD")) + ) + driver.verify_connectivity() + return driver + + +def query_cypher_to_data_frame(query: typing.LiteralString, parameters: typing.Optional[typing.Dict[str, typing.Any]] = None): + records, summary, keys = driver.execute_query(query, parameters_=parameters) + return pd.DataFrame([record.values() for record in records], columns=keys) + + +def query_cypher_to_data_frame_suppress_warnings(query: typing.LiteralString, parameters: typing.Optional[typing.Dict[str, typing.Any]] = None): + """ + Executes the Cypher query in the given file and returns the result as a pandas DataFrame. + This function suppresses any warnings or error messages that would normally be printed to stderr. + This is useful when you want to run a query without cluttering the output with warnings. + Parameters: + - filename: The name of the file containing the Cypher query. + - parameters: Optional dictionary of parameters to pass to the Cypher query. + Returns: + - A pandas DataFrame containing the results of the Cypher query. 
+ """ + with open(os.devnull, 'w') as devnull, contextlib.redirect_stderr(devnull): + return query_cypher_to_data_frame(query, parameters) + + +def write_batch_data_into_database(dataframe: pd.DataFrame, node_label: str, id_column: str = "nodeElementId", batch_size: int = 1000, verbose: bool = False) -> None: + """ + Writes the given dataframe to the Neo4j database using a batch write operation. + + Parameters: + - dataframe: The pandas DataFrame to write. + - label: The label to use for the nodes in the Neo4j database. + - id_column: The name of the column in the DataFrame that contains the node IDs. + - cypher_query_file: The file containing the Cypher query for writing the data. + - batch_size: The number of rows to write in each batch. + """ + def prepare_rows(dataframe): + rows = [] + for _, row in dataframe.iterrows(): + properties_without_id = row.drop(labels=[id_column]).to_dict() + rows.append({ + "nodeId": row[id_column], + "properties": properties_without_id + }) + return rows + + def update_batch(transaction, rows): + query = """ + UNWIND $rows AS row + MATCH (codeUnit) + WHERE elementId(codeUnit) = row.nodeId + AND $node_label IN labels(codeUnit) + SET codeUnit += row.properties + """ + transaction.run(query, rows=rows, node_label=node_label) + + with driver.session() as session: + for start in range(0, len(dataframe), batch_size): + batch_dataframe = dataframe.iloc[start:start + batch_size] + batch_rows = prepare_rows(batch_dataframe) + if verbose: + print(f"Writing data from {start} to {start + batch_size} resulting in length {len(batch_rows)}") + session.execute_write(update_batch, batch_rows) + + +def get_noise_ratio(clustering_results: numpy_typing.NDArray) -> float: + """ + Returns the ratio of noise points in the clustering results. + Noise points are labeled as -1 in HDBSCAN. + + Parameters: + - clustering_results: NDArray containing the clustering results. + + Returns: + - A float representing the noise ratio. + """ + return np.sum(clustering_results == -1) / len(clustering_results) + + +def adjusted_mutual_info_score_without_noise_penalty(clustering_results: numpy_typing.NDArray, reference_communities: numpy_typing.NDArray) -> float: + mask_noise = clustering_results != -1 # Exclude noise points from the comparison + return float(adjusted_mutual_info_score(reference_communities[mask_noise], clustering_results[mask_noise])) + + +def soft_ramp_limited_penalty(score: float, lower_threshold: float, upper_threshold: float, sharpness: int) -> float: + if score <= lower_threshold: + return 1.0 # No penalty + elif score >= upper_threshold: + return 0.0 # Full penalty + else: + # Normalize noise into [0, 1] range for ramp + x = (score - lower_threshold) / (upper_threshold - lower_threshold) + return max(0.0, 1 - x**sharpness) + + +def adjusted_mutual_info_score_with_soft_ramp_noise_penalty(clustering_results: numpy_typing.NDArray, reference_communities: numpy_typing.NDArray) -> float: + """ + Computes the adjusted mutual information score with a custom noise penalty based on a soft ramp function. + + Parameters: + - clustering_results: NDArray containing the clustering results. + - reference_communities: NDArray containing the reference communities for comparison. + - kwargs: Additional keyword arguments for the noise penalty function (e.g., sharpness). + + Returns: + - A float representing the adjusted mutual information score with noise penalty. 
+ """ + score = adjusted_mutual_info_score_without_noise_penalty(reference_communities, clustering_results) + penalty = soft_ramp_limited_penalty(get_noise_ratio(clustering_results), lower_threshold=0.6, upper_threshold=0.8, sharpness=2) + return float(score) * penalty + + +def output_detailed_optuna_tuning_results(optimized_study: optuna.Study, name_of_the_optimized_algorithm: str): + + print(f"Best {name_of_the_optimized_algorithm} score with penalty:", optimized_study.best_value) + print(f"Best {name_of_the_optimized_algorithm} parameter influence:", get_param_importances(optimized_study, evaluator=MeanDecreaseImpurityImportanceEvaluator())) + + valid_trials = [trial for trial in optimized_study.trials if trial.value is not None and trial.state == TrialState.COMPLETE] + top_trials = sorted(valid_trials, key=lambda t: typing.cast(float, t.value), reverse=True)[:10] + for i, trial in enumerate(top_trials): + print(f"Best {name_of_the_optimized_algorithm} parameter rank: {i+1}, trial: {trial.number}, Value = {trial.value:.6f}, Params: {trial.params}") + + +class TunedClusteringResult: + def __init__(self, labels: numpy_typing.NDArray, probabilities: numpy_typing.NDArray, medoids: numpy_typing.NDArray): + self.labels = labels + self.probabilities = probabilities + self.medoids = medoids + self.cluster_count = len(set(labels)) - (1 if -1 in labels else 0) + self.noise_count = np.sum(labels == -1) + self.noise_ratio = self.noise_count / len(labels) if len(labels) > 0 else 0 + + def __repr__(self): + return f"TunedClusteringResult(cluster_count={self.cluster_count}, noise_count={self.noise_count}, noise_ratio={self.noise_ratio}, labels=[...], probabilities=[...], medoids=[...])" + + +def tuned_hierarchical_density_based_spatial_clustering(embeddings: numpy_typing.NDArray, reference_community_ids: numpy_typing.NDArray) -> TunedClusteringResult: + """ + Applies the optimized hierarchical density-based spatial clustering algorithm (HDBSCAN) to the given node embeddings. + The parameters are tuned to get results similar to the ones of the community detection algorithm. + The result is a list of cluster ids for each node embedding. 
+ """ + base_clustering_parameter = dict( + metric='manhattan', # Turned out to be the best option in most of the initial experiments + allow_single_cluster=False + ) + + def objective(trial): + max_node_count = embeddings.shape[0] + clusterer = HDBSCAN( + **base_clustering_parameter, + # Restrict node count dependent parameters to the max overall node count for small graphs using "min" + min_cluster_size=trial.suggest_int("min_cluster_size", 4, min(max_node_count, 50)), + min_samples=trial.suggest_int("min_samples", 2, min(max_node_count, 30)) + ) + labels = clusterer.fit_predict(embeddings) + return adjusted_mutual_info_score_with_soft_ramp_noise_penalty(labels, reference_community_ids) + + # For in-depth analysis of the tuning, add the following two parameters: + # , storage=f"sqlite:///optuna_study_node_embeddings_java.db", load_if_exists=True) + study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=42), study_name="HDBSCAN") + + # Try (enqueue) two specific settings first that led to good results in initial experiments + study.enqueue_trial({"min_cluster_size": 4, "min_samples": 2}) + study.enqueue_trial({"min_cluster_size": 5, "min_samples": 2}) + + # Start the hyperparameter tuning + study.optimize(objective, n_trials=20, timeout=10) + print(f"Best HDBSCAN parameters (Optuna):", study.best_params) + if parameters.is_verbose(): + output_detailed_optuna_tuning_results(study, 'HDBSCAN') + + # Run the clustering again with the best parameters + cluster_algorithm = HDBSCAN(**base_clustering_parameter, **study.best_params, n_jobs=-1, store_centers='medoid') + best_model = cluster_algorithm.fit(embeddings) + + return TunedClusteringResult(best_model.labels_, best_model.probabilities_, best_model.medoids_) + + +class CommunityComparingScores: + def __init__(self, adjusted_mutual_info_score: float, adjusted_rand_index: float, normalized_mutual_information: float): + self.adjusted_mutual_info_score = adjusted_mutual_info_score + self.adjusted_rand_index = adjusted_rand_index + self.normalized_mutual_information = normalized_mutual_information + self.scores = { + "Adjusted Mutual Info Score": adjusted_mutual_info_score, + "Adjusted Rand Index": adjusted_rand_index, + "Normalized Mutual Information": normalized_mutual_information + } + + def __repr__(self): + return f"CommunityComparingScores(adjusted_mutual_info_score={self.adjusted_mutual_info_score}, adjusted_rand_index={self.adjusted_rand_index}, normalized_mutual_information={self.normalized_mutual_information})" + + +def get_community_comparing_scores(cluster_labels: numpy_typing.NDArray, reference_community_ids: numpy_typing.NDArray) -> CommunityComparingScores: + """ + Returns a DataFrame with the scores of the clustering algorithm compared to the community detection algorithm. + The scores are calculated using the adjusted rand index (ARI) and the normalized mutual information (NMI). + """ + + # Create a mask to filter out noise points. 
In HDBSCAN, noise points are labeled as -1 + mask = cluster_labels != -1 + ami = float(adjusted_mutual_info_score(reference_community_ids[mask], cluster_labels[mask])) + ari = adjusted_rand_score(reference_community_ids[mask], cluster_labels[mask]) + nmi = float(normalized_mutual_info_score(reference_community_ids[mask], cluster_labels[mask])) + + return CommunityComparingScores(ami, ari, nmi) + + +def add_cluster_medoids_to_embeddings(embeddings: pd.DataFrame, clustering_result: TunedClusteringResult) -> pd.DataFrame: + """ + Adds the column 'clusteringTunedHDBSCANIsMedoid' that marks the center (medoid) of a cluster with 1 and all other entries with 0. + """ + assigned_labels = [] + + def is_medoid(row): + """ Checks if the embedding of the given row is a medoid (=center node of the cluster that may act as a representative).""" + for medoid in clustering_result.medoids: + if row['clusteringTunedHDBSCANLabel'] in assigned_labels: + return 0 # The cluster with this label already has a medoid assigned + if np.array_equal(row['embedding'], medoid): + assigned_labels.append(row['clusteringTunedHDBSCANLabel']) + return 1 + return 0 + + embeddings['clusteringTunedHDBSCANIsMedoid'] = embeddings.apply(is_medoid, axis=1) + return embeddings + + +def add_center_distances(embeddings: pd.DataFrame) -> pd.DataFrame: + """ + Adds the column 'clusteringTunedHDBSCANNormalizedDistanceToMedoid': + - Manhattan distance (L1 norm) to the cluster medoid, normalized per cluster. + - Noise points (label == -1) receive 0.0. + Adds the column 'clusteringTunedHDBSCANClusterMaxRadius': + - Maximum Manhattan distance (L1 norm) of all points inside the same cluster (same label) to the medoid of that cluster + - Noise points (label == -1) receive 0.0. + Adds the column 'clusteringTunedHDBSCANClusterAverageRadius': + - Average Manhattan distance (L1 norm) of all points inside the same cluster (same label) to the medoid of that cluster + - Noise points (label == -1) receive 0.0. + Assumes: + - 'embedding' column contains vectors. + - 'clusteringTunedHDBSCANLabel' exists. + - 'clusteringTunedHDBSCANIsMedoid' is already set (1 for medoid, 0 otherwise) by function "add_cluster_medoids_to_embeddings". 
+ """ + # Map cluster label -> medoid embedding from existing dataframe + medoids_by_label = {} + for _, row in embeddings[embeddings['clusteringTunedHDBSCANIsMedoid'] == 1].iterrows(): + medoids_by_label[row['clusteringTunedHDBSCANLabel']] = row['embedding'] + + # Prepare array to store normalized distances + normalized_distances = np.full(len(embeddings), 0.0) + + cluster_max_distance_to_medoid = {} + cluster_average_distance_to_medoid = {} + + # Group by cluster and compute normalized Manhattan distance to medoid + for label, group in embeddings.groupby('clusteringTunedHDBSCANLabel'): + if label == -1 or label not in medoids_by_label: + cluster_max_distance_to_medoid[-1] = 0.0 + cluster_average_distance_to_medoid[-1] = 0.0 + continue # Skip noise or missing medoids + + medoid = medoids_by_label[label] + group_indices = group.index + group_embeddings = np.stack(group['embedding'].apply(np.array).tolist()) + + # Compute Manhattan distances + distances = np.sum(np.abs(group_embeddings - medoid), axis=1) + + # Normalize per cluster + max_distance = np.max(distances) + cluster_max_distance_to_medoid[label] = max_distance + cluster_average_distance_to_medoid[label] = np.average(distances) + + if max_distance == 0: + normalized = np.zeros_like(distances) + else: + normalized = distances / max_distance + + normalized_distances[group_indices] = normalized + + embeddings['clusteringTunedHDBSCANNormalizedDistanceToMedoid'] = normalized_distances + embeddings['clusteringTunedHDBSCANClusterMaxRadius'] = embeddings['clusteringTunedHDBSCANLabel'].map(cluster_max_distance_to_medoid) + embeddings['clusteringTunedHDBSCANClusterAverageRadius'] = embeddings['clusteringTunedHDBSCANLabel'].map(cluster_average_distance_to_medoid) + + return embeddings + + +def add_clustering_results_to_embeddings(embeddings: pd.DataFrame, clustering_result: TunedClusteringResult) -> pd.DataFrame: + """ + Adds the clustering results to the embeddings DataFrame. + """ + embeddings['clusteringTunedHDBSCANLabel'] = clustering_result.labels + embeddings['clusteringTunedHDBSCANProbability'] = clustering_result.probabilities + + # Add the cluster size + cluster_sizes = embeddings['clusteringTunedHDBSCANLabel'].value_counts() + embeddings['clusteringTunedHDBSCANClusterSize'] = embeddings['clusteringTunedHDBSCANLabel'].map(cluster_sizes) + + embeddings = add_cluster_medoids_to_embeddings(embeddings, clustering_result) + embeddings = add_center_distances(embeddings) # requires medoids + return embeddings + + +def get_labels_by_cluster_count_descending(embeddings: pd.DataFrame) -> pd.DataFrame: + """ + Returns the clustering results distribution for the given clustering name. 
+ """ + return embeddings.groupby('clusteringTunedHDBSCANLabel').aggregate( + probability=('clusteringTunedHDBSCANProbability', 'mean'), + count=('codeUnitName', 'count'), + communityIds=('communityId', lambda x: list(set(x))), + codeUnitNames=('codeUnitName', lambda x: list(set(x))), + ).reset_index().sort_values(by='count', ascending=False) + + +class TunedHierarchicalDensityBasedSpatialClusteringResult: + def __init__(self, embeddings: pd.DataFrame, clustering_result: TunedClusteringResult, community_comparing_scores: CommunityComparingScores, clustering_results_distribution: pd.DataFrame): + self.embeddings = embeddings + self.clustering_result = clustering_result + self.community_comparing_scores = community_comparing_scores + self.clustering_results_distribution = clustering_results_distribution + + def __repr__(self): + return f"TunedHierarchicalDensityBasedSpatialClusteringResult(embeddings={self.embeddings}, clustering_result={self.clustering_result}, community_comparing_scores={self.community_comparing_scores}, clustering_results_distribution={self.clustering_results_distribution})" + + +def coordinate_tuned_hierarchical_density_based_spatial_clustering(embeddings: pd.DataFrame) -> TunedHierarchicalDensityBasedSpatialClusteringResult: + """ + Applies the tuned hierarchical density-based spatial clustering algorithm (HDBSCAN) to the given node embeddings. + The parameters are tuned to get results similar to the ones of the community detection algorithm. + The result is the input DataFrame with the clustering results added. + """ + + # Apply the tuned HDBSCAN clustering algorithm + embeddings_values = np.array(embeddings.embedding.tolist()) + community_reference_ids = np.array(embeddings.communityId.tolist()) + + clustering_result = tuned_hierarchical_density_based_spatial_clustering(embeddings_values, community_reference_ids) + print(clustering_result) + + community_comparing_scores = get_community_comparing_scores(clustering_result.labels, community_reference_ids) + print(community_comparing_scores) + + # Add the clustering results to the embeddings DataFrame + embeddings_with_clusters = add_clustering_results_to_embeddings(embeddings, clustering_result) + + # Get the clustering results distribution + clustering_results_distribution = get_labels_by_cluster_count_descending(embeddings_with_clusters) + + return TunedHierarchicalDensityBasedSpatialClusteringResult(embeddings_with_clusters, clustering_result, community_comparing_scores, clustering_results_distribution) + + +class HierarchicalDensityClusteringScores: + + def __init__(self, embedding_dimension: int, adjusted_mutual_info_score: float, confidence_score: float, noise_ratio: float, cluster_count: int): + self.embedding_dimension = embedding_dimension + self.adjusted_mutual_info_score = adjusted_mutual_info_score + self.confidence_score = confidence_score + self.noise_ratio = noise_ratio + self.cluster_count = cluster_count + + def __repr__(self): + return f"HierarchicalDensityClusteringScores(embedding_dimension={self.embedding_dimension}, adjusted_mutual_info_score={self.adjusted_mutual_info_score}, confidence_score={self.confidence_score}, noise_ratio={self.noise_ratio}, cluster_count={self.cluster_count})" + + @classmethod + def cluster_embeddings_with_references(cls, embedding_column: pd.Series, reference_community_id_column: pd.Series) -> 'HierarchicalDensityClusteringScores': + """ + Clusters the embeddings with the reference community ids and returns the clustering scores. 
+ + Parameters + ---------- + embedding_column : pd.Series + The column containing the embeddings to be clustered. + reference_community_id_column : pd.Series + The column containing the reference community ids to compare the clustering results against. + + Returns + ------- + HierarchicalDensityClusteringScores + An instance of HierarchicalDensityClusteringScores containing the clustering scores. + """ + hierarchical_density_based_spatial_clustering = HDBSCAN( + cluster_selection_method='eom', + metric='manhattan', + min_samples=2, + min_cluster_size=5, + allow_single_cluster=False, + n_jobs=-1 + ) + embeddings = np.array(embedding_column.tolist()) + clustering_result = hierarchical_density_based_spatial_clustering.fit(embeddings) + + reference_community_ids = np.array(reference_community_id_column.tolist()) + adjusted_mutual_info_score_value = adjusted_mutual_info_score_with_soft_ramp_noise_penalty(clustering_result.labels_, reference_community_ids) + + confidence_score = np.mean(clustering_result.probabilities_[clustering_result.labels_ != -1]) + noise_count = np.sum(clustering_result.labels_ == -1) + noise_ratio = noise_count / len(clustering_result.labels_) + cluster_count = len(set(clustering_result.labels_)) - (1 if -1 in clustering_result.labels_ else 0) + return cls(len(embeddings[0]), adjusted_mutual_info_score_value, confidence_score, noise_ratio, cluster_count) + + +class TuneableFastRandomProjectionNodeEmbeddings: # (sklearn.BaseEstimator): + """ + Encapsulates Fast Random Projection (FRP, FastRP) for hyper-parameter tuning. + """ + # Extend from sklearn BaseEstimator to use e.g. GridSearchCV for hyperparameter tuning. + # The implementation is sklearn compatible and follows its schema (e.g. fit and score method). + + cypher_query_for_generating_embeddings_: typing.LiteralString = """ + CALL gds.fastRP.stream( + $projection_name + '-cleaned', { + embeddingDimension: toInteger($embedding_dimension) + ,randomSeed: toInteger($embedding_random_seed) + ,normalizationStrength: toFloat($normalization_strength) + ,iterationWeights: [0.0, 0.0, 1.0, toFloat($forth_iteration_weight)] + ,relationshipWeightProperty: $projection_weight_property + } + ) + YIELD nodeId, embedding + WITH gds.util.asNode(nodeId) AS codeUnit + ,embedding + OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit) + WITH *, artifact.name AS artifactName + OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit) + WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName + RETURN DISTINCT + coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName + ,codeUnit.name AS shortCodeUnitName + ,elementId(codeUnit) AS nodeElementId + ,coalesce(artifactName, projectName) AS projectName + ,codeUnit[$community_property] AS communityId + ,embedding + """ + + cypher_query_for_writing_embeddings_: typing.LiteralString = """ + CALL gds.fastRP.write( + $projection_name + '-cleaned', { + embeddingDimension: toInteger($embedding_dimension) + ,randomSeed: 30 + ,relationshipWeightProperty: $projection_weight_property + ,writeProperty: $write_property + } + ) + """ + cypher_file_for_read_ = "../cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Tuneable_Stream.cypher" + cypher_file_for_write_ = "../cypher/Node_Embeddings/Node_Embeddings_1e_Fast_Random_Projection_Tuneable_Write.cypher" + + def __init__(self, + parameters: Parameters = Parameters.example(), + # Tuneable algorithm parameters + embedding_dimension: 
int = 64, + random_seed: int = 42, + normalization_strength: float = 0.3, + forth_iteration_weight: float = 1.0, + ): + self.parameters_ = parameters + self.verbose_ = parameters.is_verbose() + self.write_property_ = parameters.get_embedding_property() + + self.embedding_dimension = embedding_dimension + self.random_seed = random_seed + self.normalization_strength = normalization_strength + self.forth_iteration_weight = forth_iteration_weight + + def __to_algorithm_parameters(self) -> typing.Dict['str', 'str']: + return { + "embedding_dimension": str(self.embedding_dimension), + "normalization_strength": str(self.normalization_strength), + "forth_iteration_weight": str(self.forth_iteration_weight), + "embedding_random_seed": str(self.random_seed), + "write_property": str(self.write_property_), + **self.parameters_.get_query_parameters() + } + + def __run_algorithm(self) -> pd.DataFrame: + algorithm_parameters = self.__to_algorithm_parameters() + # For Debugging: + # print("Generating embeddings using Neo4j Graph Data Science with the following parameters: " + str(algorithm_parameters)) + if self.verbose_: + return query_cypher_to_data_frame(self.cypher_query_for_generating_embeddings_, parameters=algorithm_parameters) + + return query_cypher_to_data_frame_suppress_warnings(self.cypher_query_for_generating_embeddings_, parameters=algorithm_parameters) + + def __check_fitted(self) -> None: + """ + Checks if the model has been fitted by checking if the embeddings_ attribute exists. + Raises a ValueError if the model has not been fitted yet. + """ + if not hasattr(self, 'embeddings_') or not hasattr(self, 'clustering_scores_'): + raise ValueError("The model has not been fitted yet. Please call the fit method before.") + + def fit(self, X=None, y=None) -> typing.Self: + """ + Fits the model by generating node embeddings and calculating the Hopkins statistic. + """ + self.embeddings_ = self.__run_algorithm() + self.clustering_scores_ = HierarchicalDensityClusteringScores.cluster_embeddings_with_references(self.embeddings_.embedding, self.embeddings_.communityId) + return self + + def score(self, X=None, y=None) -> float: + """ + Returns the score of the model based on the adjusted mutual info score comparing the clusters with pre calculated Leiden communities. + """ + self.__check_fitted() + return self.clustering_scores_.adjusted_mutual_info_score + + def write_embeddings(self) -> typing.Self: + """ + Writes the generated embeddings to the Neo4j database. + This is useful for further processing or analysis of the embeddings. + """ + algorithm_parameters = self.__to_algorithm_parameters() + if self.verbose_: + print("") + print("Writing embeddings to Neo4j with the following parameters: " + str(algorithm_parameters)) + print("") + + if self.verbose_: + query_cypher_to_data_frame(self.cypher_query_for_writing_embeddings_, parameters=algorithm_parameters) + else: + query_cypher_to_data_frame_suppress_warnings(self.cypher_query_for_writing_embeddings_, parameters=algorithm_parameters) + print(f"Best Fast Random Projection results written back into Neo4j.") + return self + + def get_embeddings(self) -> pd.DataFrame: + """ + Returns the generated embeddings + """ + self.__check_fitted() + return self.embeddings_ + + def get_clustering_scores(self) -> HierarchicalDensityClusteringScores: + """ + Returns the clustering scores, which include the adjusted mutual info score, confidence score, noise ratio, and cluster count. 
+ """ + self.__check_fitted() + return self.clustering_scores_ + + +def get_tuned_fast_random_projection_node_embeddings(parameters: Parameters) -> TuneableFastRandomProjectionNodeEmbeddings: + if not parameters.is_verbose(): + optuna.logging.set_verbosity(optuna.logging.WARNING) + + def objective(trial): + # Suggest values for each hyperparameter + tuneable_parameters = { + "embedding_dimension": trial.suggest_categorical("embedding_dimension", [64, 128, 256]), + "normalization_strength": trial.suggest_float("normalization_strength", low=-1.0, high=1.0, step=0.1), + "forth_iteration_weight": trial.suggest_float("forth_iteration_weight", low=0.0, high=2.0, step=0.1), + } + # Note: Fast Random Projection is intentionally applied to the whole Graph without sampling. + # It scales well for larger Graphs and it is beneficial for the quality of the downstream clustering. + tuneable_fast_random_projection = TuneableFastRandomProjectionNodeEmbeddings(parameters, **tuneable_parameters) + tuneable_fast_random_projection.fit() + return tuneable_fast_random_projection.score() + + study_name = "FastRandomProjection - " + parameters.get_projection_name() + # For in-depth analysis of the tuning, add the following two parameters: + # , storage=f"sqlite:///optuna_study_node_embeddings_java.db", load_if_exists=True) + study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=42), study_name=study_name) # , storage=f"sqlite:///optuna_study_node_embeddings_java.db", load_if_exists=True) + + # Try (enqueue) two specific settings first that led to good results in initial experiments + study.enqueue_trial({'embedding_dimension': 128, 'forth_iteration_weight': 0.5, 'normalization_strength': 0.3}) + study.enqueue_trial({'embedding_dimension': 128, 'forth_iteration_weight': 1.0, 'normalization_strength': 0.5}) + study.enqueue_trial({'embedding_dimension': 256, 'forth_iteration_weight': 0.5, 'normalization_strength': 0.3}) + study.enqueue_trial({'embedding_dimension': 256, 'forth_iteration_weight': 1.0, 'normalization_strength': 0.3}) + + # Start the hyperparameter tuning + study.optimize(objective, n_trials=80, timeout=40) + print(f"Best Fast Random Projection (FastRP) parameters for {parameters.get_projection_name()} (Optuna):", study.best_params) + if parameters.is_verbose(): + output_detailed_optuna_tuning_results(study, 'Fast Random Projection (FastRP)') + + # Run the node embeddings algorithm again with the best parameters and return it + return TuneableFastRandomProjectionNodeEmbeddings(parameters, **study.best_params).fit() + + +# ------------------------------------------------------------------------------------------------------------ +# MAIN +# ------------------------------------------------------------------------------------------------------------ + + +parameters = parse_input_parameters() +driver = get_graph_database_driver() + +tuned_fast_random_projection = get_tuned_fast_random_projection_node_embeddings(parameters) +embeddings = tuned_fast_random_projection.get_embeddings() + +clustering_results = coordinate_tuned_hierarchical_density_based_spatial_clustering(embeddings) +embeddings = clustering_results.embeddings + +if parameters.is_verbose(): + print("HDBSCAN clustered labels by their size descending (top 10):", clustering_results.clustering_results_distribution.head(10)) + print("HDBSCAN clustered labels by their probability descending (top 10):", clustering_results.clustering_results_distribution.sort_values(by='probability', ascending=False).head(10)) + 
+tuned_fast_random_projection.write_embeddings() +data_to_write = pd.DataFrame(data={ + 'nodeElementId': embeddings["nodeElementId"], + 'clusteringHDBSCANLabel': embeddings['clusteringTunedHDBSCANLabel'], + 'clusteringHDBSCANProbability': embeddings['clusteringTunedHDBSCANProbability'], + 'clusteringHDBSCANNoise': (embeddings['clusteringTunedHDBSCANLabel'] == -1).astype(int), + 'clusteringHDBSCANMedoid': embeddings['clusteringTunedHDBSCANIsMedoid'].astype(int), + 'clusteringHDBSCANSize': embeddings['clusteringTunedHDBSCANClusterSize'].astype(int), + 'clusteringHDBSCANRadiusAverage': embeddings['clusteringTunedHDBSCANClusterAverageRadius'], + 'clusteringHDBSCANRadiusMax': embeddings['clusteringTunedHDBSCANClusterMaxRadius'], + 'clusteringHDBSCANNormalizedDistanceToMedoid': embeddings['clusteringTunedHDBSCANNormalizedDistanceToMedoid'], +}) +write_batch_data_into_database(data_to_write, parameters.get_projection_node_label(), verbose=parameters.is_verbose()) + +driver.close() \ No newline at end of file diff --git a/domains/anomaly-detection/umap2dNodeEmbeddings.py b/domains/anomaly-detection/umap2dNodeEmbeddings.py new file mode 100755 index 000000000..f8ec49308 --- /dev/null +++ b/domains/anomaly-detection/umap2dNodeEmbeddings.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python + +# This Python script uses UMAP (https://umap-learn.readthedocs.io) to reduce the dimensionality of node embeddings to two dimensions for visualization purposes. +# This is useful to get a visual intuition about the structure of the code units (like Java packages) and their dependencies. +# The resulting 2D coordinates are written back to Neo4j for further use. + +# Prerequisite: +# - Provide the password for Neo4j in the environment variable "NEO4J_INITIAL_PASSWORD". +# - Already existing Graph with analyzed code units (like Java Packages) and their dependencies. +# - Already existing node embeddings for the code units, e.g. generated by Fast Random Projection (FastRP) or other algorithms. + +import typing + +import os +import sys +import argparse +import pprint + +import pandas as pd +import numpy as np + +from neo4j import GraphDatabase, Driver +import umap + + +class Parameters: + required_parameters_ = ["projection_node_label", "embedding_property"] + + def __init__(self, input_parameters: typing.Dict[str, str], verbose: bool = False): + self.query_parameters_ = input_parameters.copy() # copy enforces immutability + self.verbose_ = verbose + + def __repr__(self): + pretty_dict = pprint.pformat(self.query_parameters_, indent=4) + return f"Parameters: verbose={self.verbose_}, query_parameters:\n{pretty_dict}" + + @staticmethod + def log_dependency_versions_() -> None: + print('---------------------------------------') + + print('Python version: {}'.format(sys.version)) + + from numpy import __version__ as numpy_version + print('numpy version: {}'.format(numpy_version)) + + from pandas import __version__ as pandas_version + print('pandas version: {}'.format(pandas_version)) + + from neo4j import __version__ as neo4j_version + print('neo4j version: {}'.format(neo4j_version)) + + from umap import __version__ as umap_version + print('umap version: {}'.format(umap_version)) + + print('---------------------------------------') + + @classmethod + def from_input_parameters(cls, input_parameters: typing.Dict[str, str], verbose: bool = False): + """ + Creates a Parameters instance from a dictionary of input parameters. + The dictionary must contain the following keys: + - "projection_node_label": The node type of the projection. 
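+ - "embedding_property": The name of the node property that contains the previously generated node embedding.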
+ """ + missing_parameters = [parameter for parameter in cls.required_parameters_ if parameter not in input_parameters] + if missing_parameters: + raise ValueError("Missing parameters:", missing_parameters) + created_parameters = cls(input_parameters, verbose) + if created_parameters.is_verbose(): + print(created_parameters) + cls.log_dependency_versions_() + return created_parameters + + @classmethod + def example(cls): + return cls(dict(projection_node_label="Package")) + + def get_query_parameters(self) -> typing.Dict[str, str]: + return self.query_parameters_.copy() # copy enforces immutability + + def clone_with_projection_name(self, projection_name: str): + updated_parameter = self.get_query_parameters() + updated_parameter.update({"projection_name": projection_name}) + return Parameters(updated_parameter) + + def get_projection_node_label(self) -> str: + return self.query_parameters_["projection_node_label"] + + def get_embedding_property(self) -> str: + return self.query_parameters_["embedding_property"] + + def is_verbose(self) -> bool: + return self.verbose_ + + +def parse_input_parameters() -> Parameters: + # Convert list of "key=value" strings to a dictionary + def parse_key_value_list(param_list: typing.List[str]) -> typing.Dict[str, str]: + param_dict = {} + for item in param_list: + if '=' in item: + key, value = item.split('=', 1) + param_dict[key] = value + return param_dict + + parser = argparse.ArgumentParser( + description="Unsupervised clustering to assign labels to code units (Java packages, types,...) and their dependencies based on how structurally similar they are within a software system.") + parser.add_argument('--verbose', action='store_true', help='Enable verbose mode to log all details') + parser.add_argument('query_parameters', nargs='*', type=str, help='List of key=value Cypher query parameters') + parser.set_defaults(verbose=False) + args = parser.parse_args() + return Parameters.from_input_parameters(parse_key_value_list(args.query_parameters), args.verbose) + + +def get_graph_database_driver() -> Driver: + driver = GraphDatabase.driver( + uri="bolt://localhost:7687", + auth=("neo4j", os.environ.get("NEO4J_INITIAL_PASSWORD")) + ) + driver.verify_connectivity() + return driver + + +def query_cypher_to_data_frame(query: typing.LiteralString, parameters: typing.Optional[typing.Dict[str, typing.Any]] = None): + records, summary, keys = driver.execute_query(query, parameters_=parameters) + return pd.DataFrame([record.values() for record in records], columns=keys) + + +def write_batch_data_into_database(dataframe: pd.DataFrame, node_label: str, id_column: str = "nodeElementId", batch_size: int = 1000, verbose: bool = False) -> None: + """ + Writes the given dataframe to the Neo4j database using a batch write operation. + + Parameters: + - dataframe: The pandas DataFrame to write. + - label: The label to use for the nodes in the Neo4j database. + - id_column: The name of the column in the DataFrame that contains the node IDs. + - cypher_query_file: The file containing the Cypher query for writing the data. + - batch_size: The number of rows to write in each batch. 
+ """ + def prepare_rows(dataframe): + rows = [] + for _, row in dataframe.iterrows(): + properties_without_id = row.drop(labels=[id_column]).to_dict() + rows.append({ + "nodeId": row[id_column], + "properties": properties_without_id + }) + return rows + + def update_batch(transaction, rows): + query = """ + UNWIND $rows AS row + MATCH (codeUnit) + WHERE elementId(codeUnit) = row.nodeId + AND $node_label IN labels(codeUnit) + SET codeUnit += row.properties + """ + transaction.run(query, rows=rows, node_label=node_label) + + with driver.session() as session: + for start in range(0, len(dataframe), batch_size): + batch_dataframe = dataframe.iloc[start:start + batch_size] + batch_rows = prepare_rows(batch_dataframe) + if verbose: + print(f"Writing data from {start} to {start + batch_size} resulting in length {len(batch_rows)}") + session.execute_write(update_batch, batch_rows) + + +def prepare_node_embeddings_for_2d_visualization(embeddings: pd.DataFrame) -> pd.DataFrame: + """ + Reduces the dimensionality of the node embeddings (e.g. 64 floating point numbers in an array) + to two dimensions for 2D visualization using UMAP. + see https://umap-learn.readthedocs.io + """ + + if embeddings.empty: + print("No projected data for node embeddings dimensionality reduction available") + return embeddings + + # Convert the list of embeddings to a numpy array + embeddings_as_numpy_array = np.array(embeddings.embedding.to_list()) + + # Use UMAP to reduce the dimensionality to 2D for visualization + umap_reducer = umap.UMAP(n_components=2, min_dist=0.3, random_state=47, n_jobs=1, verbose=parameters.is_verbose()) + two_dimension_node_embeddings = umap_reducer.fit_transform(embeddings_as_numpy_array) + + # Convert to dense numpy array (works for both sparse and dense input) + two_dimension_node_embeddings = np.asarray(two_dimension_node_embeddings) + + # Add the 2D coordinates to the DataFrame + embeddings['embeddingVisualizationX'] = two_dimension_node_embeddings[:, 0] + embeddings['embeddingVisualizationY'] = two_dimension_node_embeddings[:, 1] + + return embeddings + + +# ------------------------------------------------------------------------------------------------------------ +# MAIN +# ------------------------------------------------------------------------------------------------------------ + +parameters = parse_input_parameters() +driver = get_graph_database_driver() + +cypher_query_embeddings_: typing.LiteralString = """ + MATCH (codeUnit) + WHERE $projection_node_label IN labels(codeUnit) + AND codeUnit[$embedding_property] IS NOT NULL + RETURN elementId(codeUnit) AS nodeElementId + ,codeUnit[$embedding_property] AS embedding + """ + +embeddings = query_cypher_to_data_frame(cypher_query_embeddings_, parameters.get_query_parameters()) +if embeddings.empty: + print(f"No node embeddings found for projection node label '{parameters.get_projection_node_label()}' and embedding property '{parameters.get_embedding_property()}'.") + sys.exit(1) + +embeddings = prepare_node_embeddings_for_2d_visualization(embeddings) + +data_to_write = pd.DataFrame(data={ + 'nodeElementId': embeddings["nodeElementId"], + parameters.get_embedding_property() + 'VisualizationX': embeddings["embeddingVisualizationX"], + parameters.get_embedding_property() + 'VisualizationY': embeddings["embeddingVisualizationY"], +}) +write_batch_data_into_database(data_to_write, parameters.get_projection_node_label()) + +driver.close() \ No newline at end of file diff --git a/jupyter/NodeEmbeddingsJavaHyperparameterTuning.ipynb 
b/jupyter/NodeEmbeddingsJavaHyperparameterTuning.ipynb
new file mode 100644
index 000000000..ad4123f85
--- /dev/null
+++ b/jupyter/NodeEmbeddingsJavaHyperparameterTuning.ipynb
@@ -0,0 +1,3155 @@
+{
+ "cells": [
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "2f0eabc4",
+ "metadata": {},
+ "source": [
+ "# Hyperparameter tuning of Java Node Embeddings\n",
+ "\n",
+ "This notebook demonstrates how to tune different methods for node embeddings and how to further reduce their dimensionality to be able to visualize them in a 2D plot. \n",
+ "\n",
+ "Node embeddings are essentially an array of floating point numbers (length = embedding dimension) that can be used as \"features\" in machine learning. These numbers approximate the relationship and similarity information of each node and can also be seen as a way to encode the topology of the graph.\n",
+ "\n",
+ "## Considerations\n",
+ "\n",
+ "Due to dimensionality reduction some information gets lost, especially when visualizing node data in two dimensions. Nevertheless, it helps to get an intuition on what node embeddings are and how much of the similarity and neighborhood information is retained. The latter can be observed by how well nodes of the same color and therefore the same community are placed together and how much bigger nodes with a high centrality score influence them. \n",
+ "\n",
+ "If the visualization doesn't show a reasonably clear separation between the communities (colors), here are some ideas for tuning: \n",
+ "- Clean the data, e.g. filter out very few nodes with extremely high degree that aren't actually that important\n",
+ "- Try directed vs. undirected projections\n",
+ "- Tune the embedding algorithm, e.g. use a higher dimensionality\n",
+ "- Tune t-SNE, which is used to reduce the node embedding dimension to two dimensions for visualization. \n",
+ "\n",
+ "It could also be the case that the node embeddings are good enough as they are for the downstream task (like node classification or link prediction), despite their visualization. In that case it makes sense to see how the whole pipeline performs before tuning the node embeddings in detail. \n",
+ "\n",
+ "## Note about data dependencies\n",
+ "\n",
+ "PageRank centrality and Leiden community are also fetched from the Graph and need to be calculated first.\n",
+ "This makes it easier to see if the embeddings approximate the structural information of the graph in the plot.\n",
+ "If these properties are missing you will only see black dots all of the same size.\n",
+ "\n",
+ "
\n", + "\n", + "### References\n", + "- [jqassistant](https://jqassistant.org)\n", + "- [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver/current)\n", + "- [Tutorial: Applied Graph Embeddings](https://neo4j.com/developer/graph-data-science/applied-graph-embeddings)\n", + "- [Visualizing the embeddings in 2D](https://github.com/openai/openai-cookbook/blob/main/examples/Visualizing_embeddings_in_2D.ipynb)\n", + "- [scikit-learn TSNE](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html#sklearn.manifold.TSNE)\n", + "- [AttributeError: 'list' object has no attribute 'shape'](https://bobbyhadz.com/blog/python-attributeerror-list-object-has-no-attribute-shape)\n", + "- [Fast Random Projection (neo4j)](https://neo4j.com/docs/graph-data-science/current/machine-learning/node-embeddings/fastrp)\n", + "- [HashGNN (neo4j)](https://neo4j.com/docs/graph-data-science/2.6/machine-learning/node-embeddings/hashgnn)\n", + "- [node2vec (neo4j)](https://neo4j.com/docs/graph-data-science/current/machine-learning/node-embeddings/node2vec) computes a vector representation of a node based on second order random walks in the graph. \n", + "- [Complete guide to understanding Node2Vec algorithm](https://towardsdatascience.com/complete-guide-to-understanding-node2vec-algorithm-4e9a35e5d147)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4191f259", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import contextlib\n", + "\n", + "from IPython.display import display\n", + "import pandas as pd\n", + "import typing as typ\n", + "import numpy as np\n", + "from openTSNE.sklearn import TSNE\n", + "\n", + "from sklearn.base import BaseEstimator\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.neighbors import NearestNeighbors\n", + "from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, normalized_mutual_info_score\n", + "from sklearn.cluster import HDBSCAN\n", + "\n", + "import matplotlib.pyplot as plot\n", + "import seaborn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0676813", + "metadata": {}, + "outputs": [], + "source": [ + "#The following cell uses the build-in %html \"magic\" to override the CSS style for tables to a much smaller size.\n", + "#This is especially needed for PDF export of tables with multiple columns." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebac1bb9", + "metadata": {}, + "outputs": [], + "source": [ + "%%html\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07319282", + "metadata": {}, + "outputs": [], + "source": [ + "# Main Colormap\n", + "# main_color_map = 'nipy_spectral'\n", + "main_color_map = 'viridis'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8ef41ff", + "metadata": {}, + "outputs": [], + "source": [ + "from matplotlib import __version__ as matplotlib_version\n", + "print('matplotlib version: {}'.format(matplotlib_version))\n", + "\n", + "from numpy import __version__ as numpy_version\n", + "print('numpy version: {}'.format(numpy_version))\n", + "\n", + "from openTSNE import __version__ as openTSNE_version\n", + "print('openTSNE version: {}'.format(openTSNE_version))\n", + "\n", + "from pandas import __version__ as pandas_version\n", + "print('pandas version: {}'.format(pandas_version))\n", + "\n", + "from sklearn import __version__ as sklearn_version\n", + "print('scikit-learn version: {}'.format(sklearn_version))\n", + "\n", + "from seaborn import __version__ as seaborn_version # type: ignore\n", + "print('seaborn version: {}'.format(seaborn_version))\n", + "\n", + "from optuna import __version__ as optuna_version\n", + "print('optuna version: {}'.format(optuna_version))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c5dab37", + "metadata": {}, + "outputs": [], + "source": [ + "# Please set the environment variable \"NEO4J_INITIAL_PASSWORD\" in your shell \n", + "# before starting jupyter notebook to provide the password for the user \"neo4j\". \n", + "# It is not recommended to hardcode the password into jupyter notebook for security reasons.\n", + "from neo4j import GraphDatabase\n", + "\n", + "driver = GraphDatabase.driver(\n", + " uri=\"bolt://localhost:7687\", \n", + " auth=(\"neo4j\", os.environ.get(\"NEO4J_INITIAL_PASSWORD\"))\n", + ")\n", + "driver.verify_connectivity()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1db254b", + "metadata": {}, + "outputs": [], + "source": [ + "def get_cypher_query_from_file(filename) -> str:\n", + " with open(filename) as file:\n", + " return ' '.join(file.readlines())\n", + " \n", + "\n", + "def query_cypher_to_data_frame(filename, parameters: typ.Optional[typ.Dict[str, typ.Any]] = None):\n", + " records, summary, keys = driver.execute_query(query_=get_cypher_query_from_file(filename), parameters_=parameters)\n", + " return pd.DataFrame([r.values() for r in records], columns=keys)\n", + "\n", + "\n", + "def query_cypher_to_data_frame_suppress_warnings(filename, parameters: typ.Optional[typ.Dict[str, typ.Any]] = None):\n", + " \"\"\"\n", + " Executes the Cypher query in the given file and returns the result as a pandas DataFrame.\n", + " This function suppresses any warnings or error messages that would normally be printed to stderr.\n", + " This is useful when you want to run a query without cluttering the output with warnings.\n", + " Parameters:\n", + " - filename: The name of the file containing the Cypher query.\n", + " - parameters: Optional dictionary of parameters to pass to the Cypher query.\n", + " Returns:\n", + " - A pandas DataFrame containing the results of the Cypher query.\n", + " \"\"\"\n", + " import contextlib\n", + " with open(os.devnull, 'w') as devnull, contextlib.redirect_stderr(devnull):\n", + " return query_cypher_to_data_frame(filename, parameters)\n", + "\n", 
+ "def query_cypher_to_data_frame_for_verbosity(verbose: bool) -> typ.Callable:\n", + " \"\"\"\n", + " Returns a function that executes a Cypher query from a file and returns the result as a pandas DataFrame.\n", + " If verbose is True, it returns a function that prints warnings and errors to stderr.\n", + " If verbose is False, it returns a function that suppresses warnings and errors.\n", + " Parameters:\n", + " - verbose: A boolean indicating whether to print warnings and errors.\n", + " Returns:\n", + " - A function that takes a filename and optional parameters, and returns a pandas DataFrame.\n", + " \"\"\"\n", + " return query_cypher_to_data_frame if verbose else query_cypher_to_data_frame_suppress_warnings\n", + "\n", + "def query_first_non_empty_cypher_to_data_frame(*filenames : str, parameters: typ.Optional[typ.Dict[str, typ.Any]] = None):\n", + " \"\"\"\n", + " Executes the Cypher queries of the given files and returns the first result that is not empty.\n", + " If all given file names result in empty results, the last (empty) result will be returned.\n", + " By additionally specifying \"limit=\" the \"LIMIT\" keyword will appended to query so that only the first results get returned.\n", + " \"\"\"\n", + " result=pd.DataFrame()\n", + " for filename in filenames:\n", + " result=query_cypher_to_data_frame(filename, parameters)\n", + " if not result.empty:\n", + " print(\"The results have been provided by the query filename: \" + filename)\n", + " return result\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d011fe71", + "metadata": {}, + "outputs": [], + "source": [ + "def write_batch_data_into_database(dataframe: pd.DataFrame, node_label: str, id_column: str = \"nodeElementId\", cypher_query_file: str = \"../cypher/Dependencies_Projection/Dependencies_14_Write_Batch_Data.cypher\", batch_size: int = 1000):\n", + " \"\"\"\n", + " Writes the given dataframe to the Neo4j database using a batch write operation.\n", + " \n", + " Parameters:\n", + " - dataframe: The pandas DataFrame to write.\n", + " - label: The label to use for the nodes in the Neo4j database.\n", + " - id_column: The name of the column in the DataFrame that contains the node IDs.\n", + " - cypher_query_file: The file containing the Cypher query for writing the data.\n", + " - batch_size: The number of rows to write in each batch.\n", + " \"\"\"\n", + " def prepare_rows(dataframe):\n", + " rows = []\n", + " for _, row in dataframe.iterrows():\n", + " properties_without_id = row.drop(labels=[id_column]).to_dict()\n", + " rows.append({\n", + " \"nodeId\": row[id_column],\n", + " \"properties\": properties_without_id\n", + " })\n", + " return rows\n", + "\n", + " def update_batch(transaction, rows):\n", + " query = get_cypher_query_from_file(cypher_query_file)\n", + " transaction.run(query, dependencies_projection_rows=rows, dependencies_projection_node=node_label)\n", + "\n", + " with driver.session() as session:\n", + " for start in range(0, len(dataframe), batch_size):\n", + " batch_dataframe = dataframe.iloc[start:start + batch_size]\n", + " batch_rows = prepare_rows(batch_dataframe)\n", + " return session.execute_write(update_batch, batch_rows)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d2e62d6", + "metadata": {}, + "outputs": [], + "source": [ + "# TODO option to choose between directed and undirected projection\n", + "\n", + "def create_undirected_projection(parameters: dict) -> bool: \n", + " \"\"\"\n", + " Creates an undirected 
homogenous in-memory Graph projection for/with Neo4j Graph Data Science Plugin.\n", + " It returns True if there is data available for the given parameter and False otherwise.\n", + " Parameters\n", + " ----------\n", + " dependencies_projection : str\n", + " The name prefix for the in-memory projection for dependencies. Example: \"java-package-embeddings-notebook\"\n", + " dependencies_projection_node : str\n", + " The label of the nodes that will be used for the projection. Example: \"Package\"\n", + " dependencies_projection_weight_property : str\n", + " The name of the node property that contains the dependency weight. Example: \"weight25PercentInterfaces\"\n", + " \"\"\"\n", + " \n", + " is_data_missing=query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_0_Check_Projectable.cypher\", parameters).empty\n", + " if is_data_missing: return False\n", + "\n", + " query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_1_Delete_Projection.cypher\", parameters)\n", + " query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_2_Delete_Subgraph.cypher\", parameters)\n", + " query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_1_Delete_Projection.cypher\", dict(dependencies_projection=parameters[\"dependencies_projection\"] + '-cleaned-sampled'))\n", + " # To include the direction of the relationships use the following line to create the projection:\n", + " # query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_3_Create_Projection.cypher\", parameters)\n", + " node_count : int = 0\n", + " if parameters[\"dependencies_projection_node\"] == \"Type\":\n", + " results=query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_4c_Create_Undirected_Java_Type_Projection.cypher\", parameters)\n", + " node_count=results[\"nodeCount\"].values[0]\n", + " else:\n", + " query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_4_Create_Undirected_Projection.cypher\", parameters)\n", + " results=query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_5_Create_Subgraph.cypher\", parameters)\n", + " node_count=results[\"nodeCount\"].values[0]\n", + " \n", + " print(\"The number of nodes in the original projection is: \" + str(node_count))\n", + "\n", + " return True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47df5481", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy.typing as numpy_typing\n", + "import numpy as np\n", + "\n", + "def get_projected_graph_information(projection_name: str) -> pd.DataFrame:\n", + " \"\"\"\n", + " Returns the projection information for the given parameters.\n", + " Parameters\n", + " ----------\n", + " projection_name : str\n", + " The name prefix for the in-memory projection for dependencies. Example: \"java-package-embeddings-notebook\"\n", + " \"\"\"\n", + "\n", + " parameters = dict(\n", + " dependencies_projection=projection_name,\n", + " )\n", + " return query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_12_Get_Projection_Statistics.cypher\", parameters)\n", + "\n", + "\n", + "def get_projected_graph_node_count(projection_name: str) -> int:\n", + " \"\"\"\n", + " Returns the number of nodes in the projected graph.\n", + " Parameters\n", + " ----------\n", + " projection_name : str\n", + " The name prefix for the in-memory projection for dependencies. 
Example: \"java-package-embeddings-notebook\"\n", + " \"\"\"\n", + " \n", + " graph_information = get_projected_graph_information(projection_name)\n", + " if graph_information.empty:\n", + " return 0\n", + " return graph_information[\"nodeCount\"].values[0]\n", + "\n", + "\n", + "def get_all_data_without_slicing_cross_validator_for_node_count(node_count: int) -> typ.List[typ.Tuple[np.ndarray, np.ndarray]]:\n", + " \"\"\"\n", + " Returns a list with a single tuple containing the node indices for cross-validation so that all data is used for training and testing.\n", + " This is useful for the case when no slicing is applied, i.e., all data is used for training and testing.\n", + "\n", + " Parameters\n", + " ----------\n", + " node_count : int\n", + " The number of nodes in the projected graph.\n", + " \"\"\"\n", + " node_indices = np.arange(node_count)\n", + " all_data_without_slicing_cross_validator = [(node_indices, node_indices)]\n", + " return all_data_without_slicing_cross_validator\n", + "\n", + "\n", + "def get_initial_dummy_data_for_hyperparameter_tuning(\n", + " node_count: int, \n", + ") -> numpy_typing.NDArray:\n", + " \"\"\"\n", + " Returns a list with a single tuple containing the node indices as dummy data for hyperparameter tuning.\n", + " \n", + " Parameters\n", + " ----------\n", + " node_count : int\n", + " The number of nodes in the projected graph.\n", + " \"\"\"\n", + " \n", + " node_indices = np.arange(node_count)\n", + " return node_indices.reshape(-1, 1) # Reshape to fit the model's shape requirements" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5068985", + "metadata": {}, + "outputs": [], + "source": [ + "class GraphSamplingResult:\n", + " \"\"\"\n", + " A class to represent the result of a graph sampling operation.\n", + " \"\"\"\n", + "\n", + " # TODO Make the sampling threshold configurable by environment variable \n", + " # The choses default is very low to favor performance over tuning quality.\n", + " # The reason is that experiments showed that the non-sampled Fast Random Projection provides the best results.\n", + " # Sampled node2vec and HashGNN results are only for comparison / experimentation. Its ok to limit their resource consumption.\n", + " default_graph_sampling_threshold = 256 \n", + " \n", + " # Private static (or class?) 
method that cant be access from the outside and that converts the parameters to the sampled graph:\n", + " def __parameters_for_sampled_graph(self, parameters: dict) -> dict:\n", + " \"\"\"\n", + " Converts the parameters to the sampled graph by adapting dependencies_projection to match the name of the sampled graph.\n", + " \"\"\"\n", + " parameters_for_sampled_graph = parameters.copy()\n", + " parameters_for_sampled_graph[\"dependencies_projection\"] = parameters_for_sampled_graph[\"dependencies_projection\"] + '-cleaned-sampled'\n", + " return parameters_for_sampled_graph\n", + "\n", + "\n", + " def __init__(self, is_sampled: bool, node_count: int, parameters: dict):\n", + " \"\"\"\n", + " Initializes the GraphSamplingResult with the sampled status and node count.\n", + "\n", + " Parameters\n", + " ----------\n", + " is_sampled : bool\n", + " Indicates whether the graph was sampled or not.\n", + " node_count : int\n", + " The number of nodes in the sampled graph (or the original in case it wasn't sampled).\n", + " parameters : dict\n", + " The updated parameters for the sampled graph or the copied original parameters.\n", + " \"\"\"\n", + " \n", + " # Check if the parameters dictionary contains the key \"dependencies_projection\"\n", + " if \"dependencies_projection\" not in parameters:\n", + " raise ValueError(\"The parameters dictionary must contain the key 'dependencies_projection'.\")\n", + " \n", + " self.is_sampled = is_sampled\n", + " self.node_count = node_count\n", + " self.updated_parameters = self.__parameters_for_sampled_graph(parameters) if is_sampled else parameters\n", + " \n", + " @classmethod\n", + " def not_sampled(this_class, parameters: dict):\n", + " \"\"\"\n", + " Creates a GraphSamplingResult instance indicating that the graph was not sampled.\n", + " \"\"\"\n", + " node_count = get_projected_graph_node_count(parameters[\"dependencies_projection\"])\n", + " return this_class(False, node_count, parameters)\n", + "\n", + " def __repr__(self):\n", + " return f\"GraphSamplingResult(is_sampled={self.is_sampled}, node_count={self.node_count}, updated_parameters={self.updated_parameters})\"\n", + "\n", + "\n", + "def sample_graph_if_size_exceeds_limit(parameters: dict, graph_sampling_threshold: int = GraphSamplingResult.default_graph_sampling_threshold) -> GraphSamplingResult:\n", + " \"\"\"\n", + " Samples the graph if the number of nodes exceeds the node count limit.\n", + " Sampling takes a random subset of the graph to reduce the size of the graph for further processing.\n", + " It returns True if the graph was sampled and False otherwise.\n", + "\n", + " Parameters\n", + " ----------\n", + " parameters : dict\n", + " dependencies_projection : str\n", + " The name prefix for the in-memory projection for dependencies. Example: \"java-package-embeddings-notebook\"\n", + " \"\"\"\n", + " if graph_sampling_threshold is None or graph_sampling_threshold <= 0:\n", + " print(f\"Graph size limit is not set: {graph_sampling_threshold}. Sampling is not performed.\")\n", + " return GraphSamplingResult.not_sampled(parameters)\n", + "\n", + " graph_information=get_projected_graph_information(parameters[\"dependencies_projection\"])\n", + " node_count = graph_information[\"nodeCount\"].values[0]\n", + " if node_count <= graph_sampling_threshold:\n", + " print(f\"The number of nodes in the projection is: {node_count} and is below the limit of {graph_sampling_threshold}. 
Sampling is not performed.\")\n", + " return GraphSamplingResult.not_sampled(parameters)\n", + " \n", + " # Delete sampled graph projection if it already exists\n", + " query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_1_Delete_Projection.cypher\", dict(dependencies_projection=parameters[\"dependencies_projection\"] + '-cleaned-sampled-cleaned'))\n", + "\n", + " sampling_parameters = dict(\n", + " dependencies_projection = parameters[\"dependencies_projection\"] + '-cleaned',\n", + " dependencies_projection_sampling_ratio = graph_sampling_threshold / node_count\n", + " )\n", + " results=query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_13_Sample_Projected_Graph.cypher\", sampling_parameters)\n", + " node_count=results[\"nodeCount\"].values[0]\n", + " print(\"The number of nodes in the sampled projection is: \" + str(node_count))\n", + " \n", + " return GraphSamplingResult(True, node_count, parameters)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6ca184ef", + "metadata": {}, + "outputs": [], + "source": [ + "# Inspired by (but rewritten): https://github.com/prathmachowksey/Hopkins-Statistic-Clustering-Tendency/blob/master/Hopkins-Statistic-Clustering-Tendency.ipynb\n", + "def hopkins_statistic(\n", + " data,\n", + " sample_ratio: float = 0.05,\n", + " n_trials: int = 1,\n", + " random_state=None,\n", + " distance_metric='euclidean'\n", + "):\n", + " \"\"\"\n", + " Computes the Hopkins statistic to assess the cluster tendency of a dataset.\n", + "\n", + " Parameters:\n", + " data (array-like or DataFrame): Input data matrix of shape (n_samples, n_features).\n", + " sample_ratio (float): Proportion of samples to draw (default: 0.05).\n", + " n_trials (int): Number of repeated trials for averaging (default: 1).\n", + " random_state (int or None): Seed for reproducibility.\n", + " distance_metric (str): Distance metric to use for nearest neighbors (default: 'euclidean').\n", + "\n", + " Returns:\n", + " float: Mean Hopkins statistic over n_trials (range: 0 to 1).\n", + " \n", + " References:\n", + " Richard G. Lawson, Peter C. Jurs (1990). 
New index for clustering tendency and its application to chemical problems.\n", + " https://pubs.acs.org/doi/abs/10.1021/ci00065a010\n", + " \"\"\"\n", + "\n", + " import numpy as np\n", + " import pandas as pd\n", + " from sklearn.neighbors import NearestNeighbors\n", + "\n", + " if data is None:\n", + " return 0\n", + " \n", + " if isinstance(data, pd.DataFrame):\n", + " print(\"Warning: Converting DataFrame\")\n", + " data = data.values\n", + "\n", + " np.random.seed(random_state)\n", + " num_points, num_features = data.shape\n", + " sample_size = max(1, int(sample_ratio * num_points))\n", + "\n", + " hopkins_values = []\n", + "\n", + " for _ in range(n_trials):\n", + " # Sample points from the dataset\n", + " random_number_generator = np.random.default_rng(random_state)\n", + " sampled_indices = random_number_generator.choice(num_points, size=sample_size, replace=False)\n", + " real_sample = data[sampled_indices]\n", + "\n", + " # Generate uniformly distributed random points within the data bounds\n", + " data_min = data.min(axis=0)\n", + " data_max = data.max(axis=0)\n", + "\n", + " uniform_sample = random_number_generator.uniform(data_min, data_max, size=(sample_size, num_features))\n", + "\n", + " # Fit NearestNeighbors on the full dataset\n", + " nearest_neighbors = NearestNeighbors(n_neighbors=2, metric=distance_metric)\n", + " nearest_neighbors.fit(data)\n", + "\n", + " # Distance from uniform points to their nearest neighbor in real data\n", + " uniform_distances, _ = nearest_neighbors.kneighbors(uniform_sample)\n", + " uniform_nearest_distances = uniform_distances[:, 0]\n", + "\n", + " # Distance from sampled real points to their second nearest neighbor (to skip self)\n", + " real_distances, _ = nearest_neighbors.kneighbors(real_sample)\n", + " real_nearest_distances = real_distances[:, 1]\n", + "\n", + " # Hopkins statistic for this trial\n", + " total_uniform_distance = np.sum(uniform_nearest_distances)\n", + " total_real_distance = np.sum(real_nearest_distances)\n", + " \n", + " total_distance = total_uniform_distance + total_real_distance\n", + " if np.isclose(total_distance, 0.0, rtol=1e-09, atol=1e-09) or np.isnan(total_distance):\n", + " print(f\"Warning: Zero distance: total_uniform_distance={total_uniform_distance}, total_real_distance={total_real_distance}, data_min={min(data_min)}, data_max={max(data_max)}, sample_size={sample_size}, num_points={num_points}, num_features={num_features}\")\n", + " hopkins_score = 0.0\n", + " else:\n", + " hopkins_score = total_uniform_distance / total_distance\n", + "\n", + " hopkins_values.append(hopkins_score)\n", + "\n", + " return np.mean(hopkins_values) if n_trials > 1 else hopkins_values[0]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59cf2659", + "metadata": {}, + "outputs": [], + "source": [ + "from numpy.typing import NDArray\n", + "\n", + "def get_noise_ratio(clustering_results: NDArray) -> float:\n", + " \"\"\"\n", + " Returns the ratio of noise points in the clustering results.\n", + " Noise points are labeled as -1 in HDBSCAN.\n", + " \n", + " Parameters:\n", + " - clustering_results: NDArray containing the clustering results.\n", + " \n", + " Returns:\n", + " - A float representing the noise ratio.\n", + " \"\"\"\n", + " return np.sum(clustering_results == -1) / len(clustering_results)\n", + "\n", + "def adjusted_mutual_info_score_without_noise_penalty(clustering_results: NDArray, reference_communities: NDArray) -> float:\n", + " from sklearn.metrics import adjusted_mutual_info_score\n", + " 
\n", + " mask_noise = clustering_results != -1 # Exclude noise points from the comparison\n", + " return float(adjusted_mutual_info_score(reference_communities[mask_noise], clustering_results[mask_noise]))\n", + "\n", + "def soft_ramp_limited_penalty(score, lower_threshold=0.6, upper_threshold=0.8, sharpness=2) -> float:\n", + " if score <= lower_threshold:\n", + " return 1.0 # No penalty\n", + " elif score >= upper_threshold:\n", + " return 0.0 # Full penalty\n", + " else:\n", + " # Normalize noise into [0, 1] range for ramp\n", + " x = (score - lower_threshold) / (upper_threshold - lower_threshold)\n", + " return max(0.0, 1 - x**sharpness)\n", + "\n", + "\n", + "def adjusted_mutual_info_score_with_soft_ramp_noise_penalty(clustering_results: NDArray, reference_communities: NDArray, **kwargs) -> float:\n", + " \"\"\"\n", + " Computes the adjusted mutual information score with a custom noise penalty based on a soft ramp function.\n", + " \n", + " Parameters:\n", + " - clustering_results: NDArray containing the clustering results.\n", + " - reference_communities: NDArray containing the reference communities for comparison.\n", + " - kwargs: Additional keyword arguments for the noise penalty function (e.g., sharpness).\n", + " \n", + " Returns:\n", + " - A float representing the adjusted mutual information score with noise penalty.\n", + " \"\"\"\n", + " score = adjusted_mutual_info_score_without_noise_penalty(reference_communities, clustering_results)\n", + " penalty = soft_ramp_limited_penalty(get_noise_ratio(clustering_results), **kwargs)\n", + " return float(score) * penalty\n", + "\n", + "#For debugging/explanation purposes\n", + "# def plot_soft_ramp_limited_score():\n", + "# \"\"\"\n", + "# Plots the noise penalty curve for the custom soft ramp function.\n", + "# The curve shows how the penalty decreases as noise increases, with a sharpness parameter.\n", + "# \"\"\"\n", + "# import numpy as np\n", + "# import matplotlib.pyplot as plot\n", + "\n", + "# noise = np.linspace(0, 1, 200)\n", + "# penalty_2 = [soft_ramp_limited_score(n, sharpness=2) for n in noise]\n", + "# penalty_3 = [soft_ramp_limited_score(n, sharpness=3) for n in noise]\n", + "# penalty_4 = [soft_ramp_limited_score(n, sharpness=4) for n in noise]\n", + "\n", + "# plot.plot(noise, penalty_2, label='Soft Ramp Penalty (sharpness=2)')\n", + "# plot.plot(noise, penalty_3, label='Soft Ramp Penalty (sharpness=3)')\n", + "# plot.plot(noise, penalty_4, label='Soft Ramp Penalty (sharpness=4)')\n", + "# plot.axvline(0.4, color='gray', linestyle='--', label='Ramp Start (0.4)')\n", + "# plot.axvline(0.6, color='red', linestyle='--', label='Ramp End (0.6)')\n", + "# plot.xlabel(\"Noise Ratio\")\n", + "# plot.ylabel(\"Penalty\")\n", + "# plot.title(\"Custom Noise Penalty Function\")\n", + "# plot.legend()\n", + "# plot.grid(True)\n", + "# plot.show()\n", + "\n", + "# plot_soft_ramp_limited_score()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "881bacf4", + "metadata": {}, + "outputs": [], + "source": [ + "from numpy.typing import NDArray\n", + "\n", + "class TunedClusteringResult:\n", + " def __init__(self, labels: NDArray, probabilities : NDArray):\n", + " self.labels = labels\n", + " self.probabilities = probabilities\n", + " self.cluster_count = len(set(labels)) - (1 if -1 in labels else 0)\n", + " self.noise_count = np.sum(labels == -1)\n", + " self.noise_ratio = self.noise_count / len(labels) if len(labels) > 0 else 0\n", + " def __repr__(self):\n", + " return 
f\"TunedClusteringResult(cluster_count={self.cluster_count}, noise_count={self.noise_count}, noise_ratio={self.noise_ratio}, labels=[...], probabilities=[...], )\"\n", + "\n", + "def tuned_hierarchical_density_based_spatial_clustering(embeddings: NDArray, reference_community_ids: NDArray) -> TunedClusteringResult:\n", + " \"\"\"\n", + " Applies the optimized hierarchical density-based spatial clustering algorithm (HDBSCAN) to the given node embeddings.\n", + " The parameters are tuned to get results similar to the ones of the community detection algorithm.\n", + " The result is a list of cluster ids for each node embedding.\n", + " \"\"\"\n", + " from sklearn.model_selection import GridSearchCV\n", + " from sklearn.cluster import HDBSCAN\n", + " import numpy as np\n", + "\n", + " # specify parameters and distributions to sample from\n", + " hyper_parameter_distributions = {\n", + " \"min_samples\": [2, 3, 4, 5, 7, 10],\n", + " \"min_cluster_size\": [4, 5, 7, 10],\n", + " # Since the \"eom\" method is the default for HDBSCAN and it seems to work well for most cases, we use it as the default method.\n", + " \"cluster_selection_method\": [\"eom\"], #[\"eom\", \"leaf\"],\n", + " # Since \"manhattan\" seems to get selected most of the time, and has an advantage for high-dimensional data, we use it as the default metric.\n", + " \"metric\": [\"manhattan\"], # [\"euclidean\", \"manhattan\"], \n", + " }\n", + " \n", + " def adjusted_mutual_info_score_with_noise_penalty_for_community_references(community_references):\n", + " \"\"\"\n", + " Creates a custom scoring function based on the Adjusted Rand Index (ARI) that penalizes for high noise ratio in clustering.\n", + " Input:\n", + " - community_references: The true labels of the communities for the data points.\n", + " Output:\n", + " - A scoring function that can directly be used for e.g. GridSearchCV/RandomizedSearchCV and that takes an estimator and data (embeddings) and returns the ARI score with a penalty for noise ratio.\n", + " \"\"\"\n", + " def adjusted_mutual_info_scorer_with_noise_penalty(estimator, embeddings):\n", + " clustering_result = estimator.fit_predict(embeddings)\n", + " return adjusted_mutual_info_score_with_soft_ramp_noise_penalty(clustering_result, community_references)\n", + "\n", + " return adjusted_mutual_info_scorer_with_noise_penalty\n", + "\n", + "\n", + " # Use custom CV that feeds all data to each fold (no slicing)\n", + " all_data_without_slicing_cross_validator = [(np.arange(len(embeddings)), np.arange(len(embeddings)))]\n", + "\n", + " tuned_hdbscan = GridSearchCV(\n", + " estimator=HDBSCAN(),\n", + " refit=False, # Without refit, the estimator doesn't need to implement the 'predict' method. 
Drawback: Only the best parameters are returned, not the best model.\n", + " param_grid=hyper_parameter_distributions,\n", + " n_jobs=-1,\n", + " scoring=adjusted_mutual_info_score_with_noise_penalty_for_community_references(reference_community_ids),\n", + " cv=all_data_without_slicing_cross_validator,\n", + " verbose=1\n", + " )\n", + "\n", + " tuned_hdbscan.fit(embeddings)\n", + "\n", + " #print(\"Best adjusted rand score with noise penalty:\", tuned_hdbscan.best_score_)\n", + " print(\"Tuned HDBSCAN parameters:\", tuned_hdbscan.best_params_)\n", + "\n", + " # Run the clustering again with the best parameters\n", + " cluster_algorithm = HDBSCAN(**tuned_hdbscan.best_params_, n_jobs=-1, allow_single_cluster=False)\n", + " best_model = cluster_algorithm.fit(embeddings)\n", + "\n", + " results = TunedClusteringResult(best_model.labels_, best_model.probabilities_)\n", + " print(f\"Number of HDBSCAN clusters (excluding noise): {results.cluster_count:.0f}\")\n", + " return results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7848d625", + "metadata": {}, + "outputs": [], + "source": [ + "import optuna\n", + "\n", + "def output_optuna_tuning_results(optimized_study: optuna.Study, name_of_the_optimized_algorithm: str):\n", + " from typing import cast\n", + " from optuna.importance import get_param_importances, MeanDecreaseImpurityImportanceEvaluator\n", + " from optuna.trial import TrialState\n", + "\n", + " print(f\"Best {name_of_the_optimized_algorithm} parameters (Optuna):\", optimized_study.best_params)\n", + " print(f\"Best {name_of_the_optimized_algorithm} score with penalty :\", optimized_study.best_value)\n", + " print(f\"Best {name_of_the_optimized_algorithm} parameter influence:\", get_param_importances(optimized_study, evaluator=MeanDecreaseImpurityImportanceEvaluator()))\n", + " \n", + " valid_trials = [trial for trial in optimized_study.trials if trial.value is not None and trial.state == TrialState.COMPLETE]\n", + " top_trials = sorted(valid_trials, key=lambda t: cast(float, t.value), reverse=True)[:10]\n", + " for i, trial in enumerate(top_trials):\n", + " print(f\"Best {name_of_the_optimized_algorithm} parameter rank: {i+1}, trial: {trial.number}, Value = {trial.value:.6f}, Params: {trial.params}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93090818", + "metadata": {}, + "outputs": [], + "source": [ + "from numpy.typing import NDArray\n", + "\n", + "# TODO keep either this (additional optuna dependency) or the implementation above (no additional dependency but not as efficient)\n", + "def tuned_hierarchical_density_based_spatial_clustering_optuna(embeddings: NDArray, reference_community_ids: NDArray) -> TunedClusteringResult:\n", + " \"\"\"\n", + " Applies the optimized hierarchical density-based spatial clustering algorithm (HDBSCAN) to the given node embeddings.\n", + " The parameters are tuned to get results similar to the ones of the community detection algorithm.\n", + " The result is a list of cluster ids for each node embedding.\n", + " \"\"\"\n", + " import optuna\n", + " from optuna.samplers import TPESampler\n", + " from optuna.importance import get_param_importances\n", + " from sklearn.cluster import HDBSCAN # type: ignore\n", + " import numpy as np\n", + "\n", + " base_clustering_parameter = dict(\n", + " metric='manhattan', # Turned out to be the best option in most of the initial experiments\n", + " allow_single_cluster=False\n", + " )\n", + "\n", + " def objective(trial):\n", + " min_cluster_size = 
trial.suggest_int(\"min_cluster_size\", 4, 50)\n", + " min_samples = trial.suggest_int(\"min_samples\", 2, 30)\n", + "\n", + " clusterer = HDBSCAN(\n", + " **base_clustering_parameter,\n", + " min_cluster_size=min_cluster_size,\n", + " min_samples=min_samples\n", + " )\n", + " labels = clusterer.fit_predict(embeddings)\n", + " return adjusted_mutual_info_score_with_soft_ramp_noise_penalty(labels, reference_community_ids)\n", + "\n", + " # TODO create study with db?\n", + " study = optuna.create_study(direction=\"maximize\", sampler=TPESampler(seed=42), study_name=\"HDBSCAN\")#, storage=f\"sqlite:///optuna_study_node_embeddings_java.db\", load_if_exists=True)\n", + " \n", + " # Try (enqueue) two specific settings first that led to good results in initial experiments\n", + " study.enqueue_trial({\"min_cluster_size\": 4, \"min_samples\": 2})\n", + " study.enqueue_trial({\"min_cluster_size\": 5, \"min_samples\": 2})\n", + " \n", + " # Start the hyperparameter tuning\n", + " study.optimize(objective, n_trials=20, timeout=10)\n", + " output_optuna_tuning_results(study, 'HDBSCAN')\n", + "\n", + " # Run the clustering again with the best parameters\n", + " cluster_algorithm = HDBSCAN(**base_clustering_parameter, **study.best_params, n_jobs=-1)\n", + " best_model = cluster_algorithm.fit(embeddings)\n", + "\n", + " return TunedClusteringResult(best_model.labels_, best_model.probabilities_)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fca5ab3f", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy.typing as numpy_typing\n", + "\n", + "class CommunityComparingScores:\n", + " def __init__(self, adjusted_mutual_info_score: float, adjusted_rand_index: float, normalized_mutual_information: float):\n", + " self.adjusted_mutual_info_score = adjusted_mutual_info_score\n", + " self.adjusted_rand_index = adjusted_rand_index\n", + " self.normalized_mutual_information = normalized_mutual_information\n", + " self.scores = {\n", + " \"Adjusted Mutual Info Score\": adjusted_mutual_info_score,\n", + " \"Adjusted Rand Index\": adjusted_rand_index,\n", + " \"Normalized Mutual Information\": normalized_mutual_information\n", + " }\n", + " def __repr__(self):\n", + " return f\"CommunityComparingScores(adjusted_mutual_info_score={self.adjusted_mutual_info_score}, adjusted_rand_index={self.adjusted_rand_index}, normalized_mutual_information={self.normalized_mutual_information})\"\n", + "\n", + "def get_community_comparing_scores(cluster_labels: numpy_typing.NDArray, reference_community_ids: numpy_typing.NDArray) -> CommunityComparingScores:\n", + " \"\"\"\n", + " Returns a DataFrame with the scores of the clustering algorithm compared to the community detection algorithm.\n", + " The scores are calculated using the adjusted rand index (ARI) and the normalized mutual information (NMI).\n", + " \"\"\"\n", + " from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, normalized_mutual_info_score\n", + "\n", + " # Create a mask to filter out noise points. 
In HDBSCAN, noise points are labeled as -1\n", + " mask = cluster_labels != -1\n", + " ami = float(adjusted_mutual_info_score(reference_community_ids[mask], cluster_labels[mask]))\n", + " ari = adjusted_rand_score(reference_community_ids[mask], cluster_labels[mask])\n", + " nmi = float(normalized_mutual_info_score(reference_community_ids[mask], cluster_labels[mask]))\n", + "\n", + " return CommunityComparingScores(ami, ari, nmi)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1cd6fc6", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Literal\n", + "import pandas as pd\n", + "\n", + "def get_clustering_property_name(clustering_property_type: Literal['Label', 'Probability'] = 'Label', clustering_name: str = \"TunedHDBSCAN\"):\n", + " \"\"\"\n", + " Assembles the property name for clustering results.\n", + " This helps to have a uniform schema.\n", + " \"\"\"\n", + " return 'clustering' + clustering_name + clustering_property_type\n", + "\n", + "def add_clustering_results_to_embeddings(embeddings: pd.DataFrame, clustering_result: TunedClusteringResult, clustering_name: str = \"TunedHDBSCAN\") -> pd.DataFrame:\n", + " \"\"\"\n", + " Adds the clustering results to the embeddings DataFrame.\n", + " \"\"\"\n", + " embeddings[get_clustering_property_name('Label', clustering_name)] = clustering_result.labels\n", + " embeddings[get_clustering_property_name('Probability', clustering_name)] = clustering_result.probabilities\n", + " return embeddings\n", + "\n", + "def get_clustering_results_distribution(embeddings: pd.DataFrame, clustering_name: str = \"TunedHDBSCAN\") -> pd.DataFrame:\n", + " \"\"\"\n", + " Returns the clustering results distribution for the given clustering name.\n", + " \"\"\"\n", + " return embeddings.groupby(get_clustering_property_name('Label', clustering_name)).aggregate(\n", + " probability=(get_clustering_property_name('Probability', clustering_name), 'mean'),\n", + " count=('codeUnitName', 'count'),\n", + " communityIds=('communityId', lambda x: list(set(x))),\n", + " codeUnitNames=('codeUnitName', lambda x: list(set(x))),\n", + " ).reset_index().sort_values(by='count', ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "046d61fd", + "metadata": {}, + "outputs": [], + "source": [ + "class TunedHierarchicalDensityBasedSpatialClusteringResult:\n", + " def __init__(self, embeddings: pd.DataFrame, clustering_result: TunedClusteringResult, community_comparing_scores: CommunityComparingScores, clustering_results_distribution: pd.DataFrame):\n", + " self.embeddings = embeddings\n", + " self.clustering_result = clustering_result\n", + " self.community_comparing_scores = community_comparing_scores\n", + " self.clustering_results_distribution = clustering_results_distribution\n", + " def __repr__(self):\n", + " return f\"TunedHierarchicalDensityBasedSpatialClusteringResult(embeddings={self.embeddings}, clustering_result={self.clustering_result}, community_comparing_scores={self.community_comparing_scores}, clustering_results_distribution={self.clustering_results_distribution})\"\n", + "\n", + "\n", + "def add_tuned_hierarchical_density_based_spatial_clustering(embeddings: pd.DataFrame, clustering_name: str = \"TunedHDBSCAN\") -> TunedHierarchicalDensityBasedSpatialClusteringResult:\n", + " \"\"\"\n", + " Applies the tuned hierarchical density-based spatial clustering algorithm (HDBSCAN) to the given node embeddings.\n", + " The parameters are tuned to get results similar to the ones of the community 
detection algorithm.\n", + " The result is the input DataFrame with the clustering results added.\n", + " \"\"\"\n", + " import time\n", + "\n", + " # Apply the tuned HDBSCAN clustering algorithm\n", + " embeddings_values = np.array(embeddings.embedding.tolist())\n", + " community_reference_ids = np.array(embeddings.communityId.tolist())\n", + " \n", + " # TODO keep only one implementation\n", + " grid_search_hdbscan_start = time.time()\n", + " clustering_result = tuned_hierarchical_density_based_spatial_clustering(embeddings_values, community_reference_ids)\n", + " grid_search_hdbscan_end = time.time()\n", + " print(clustering_result)\n", + " \n", + " community_comparing_scores = get_community_comparing_scores(clustering_result.labels, community_reference_ids)\n", + " print(community_comparing_scores)\n", + " \n", + " # ----------\n", + "\n", + " optuna_start = time.time()\n", + " clustering_result = tuned_hierarchical_density_based_spatial_clustering_optuna(embeddings_values, community_reference_ids)\n", + " optuna_end = time.time()\n", + " print(clustering_result)\n", + " \n", + " community_comparing_scores = get_community_comparing_scores(clustering_result.labels, community_reference_ids)\n", + " print(community_comparing_scores)\n", + "\n", + " # ----------\n", + " print(f\"Grid Search tuning time: {grid_search_hdbscan_end - grid_search_hdbscan_start:.2f} seconds\")\n", + " print(f\"Optuna tuning time: {optuna_end - optuna_start:.2f} seconds\")\n", + " # ----------\n", + "\n", + " # Add the clustering results to the embeddings DataFrame\n", + " embeddings = add_clustering_results_to_embeddings(embeddings, clustering_result, clustering_name)\n", + " \n", + " # Get the clustering results distribution\n", + " clustering_results_distribution = get_clustering_results_distribution(embeddings, clustering_name)\n", + " \n", + " # Display the clustering results distribution\n", + " display(clustering_results_distribution)\n", + " \n", + " return TunedHierarchicalDensityBasedSpatialClusteringResult(embeddings, clustering_result, community_comparing_scores, clustering_results_distribution)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e4b8901", + "metadata": {}, + "outputs": [], + "source": [ + "node_embedding_tuning_scores = []\n", + "\n", + "def reset_node_embedding_tuning_scores():\n", + " \"\"\"\n", + " Resets the collected node embedding scores\n", + " This is useful to start a new evaluation run without old results.\n", + " \"\"\"\n", + " global node_embedding_tuning_scores\n", + " node_embedding_tuning_scores = []\n", + "\n", + "\n", + "def add_node_embedding_tuning_scores(embedding_dimension: int,\n", + " adjusted_mutual_info_score: float,\n", + " confidence_score: float,\n", + " clustering_noise_ratio: float,\n", + " cluster_count: int):\n", + " \"\"\"\n", + " Collects node embedding scores for later analysis.\n", + " \"\"\"\n", + "\n", + " global node_embedding_tuning_scores\n", + " node_embedding_tuning_scores.append(dict(\n", + " embedding_dimension = embedding_dimension,\n", + " adjusted_mutual_info_score = adjusted_mutual_info_score,\n", + " confidence_score = confidence_score,\n", + " clustering_noise_ratio = clustering_noise_ratio,\n", + " cluster_count = cluster_count\n", + " ))\n", + "\n", + "\n", + "def plot_node_embedding_tuning_scores():\n", + " \"\"\"\n", + " Plots the clustering noise ratio and cluster count against the adjusted mutual info score for the Fast Random Projection node embeddings.\n", + " This function uses matplotlib to create two 
horizontally arranged subplots:\n", + " - Left: clustering noise ratio vs. adjusted mutual info score\n", + " - Right: cluster count vs. adjusted mutual info score\n", + " The color of the points represents the embedding dimension.\n", + " \"\"\"\n", + " import matplotlib.pyplot as plot\n", + " import pandas as pd\n", + "\n", + " tuning_scores = pd.DataFrame(node_embedding_tuning_scores)\n", + "\n", + " figure, axes = plot.subplots(1, 2, figsize=(16, 6), sharey=True)\n", + " figure.subplots_adjust(wspace=0.1)\n", + "\n", + " noise_ratio_plot = axes[0].scatter(\n", + " tuning_scores['clustering_noise_ratio'],\n", + " tuning_scores['adjusted_mutual_info_score'],\n", + " c=tuning_scores['embedding_dimension'],\n", + " cmap='viridis',\n", + " alpha=0.7\n", + " )\n", + " axes[0].set_xlabel('Clustering Noise Ratio')\n", + " axes[0].set_ylabel('Adjusted Mutual Info Score')\n", + " axes[0].set_title('Clustering Noise Ratio vs. Adjusted Mutual Info Score')\n", + "\n", + " cluster_count_plot = axes[1].scatter(\n", + " tuning_scores['cluster_count'],\n", + " tuning_scores['adjusted_mutual_info_score'],\n", + " c=tuning_scores['embedding_dimension'],\n", + " cmap='viridis',\n", + " alpha=0.7\n", + " )\n", + " axes[1].set_xlabel('Cluster Count')\n", + " axes[1].set_title('Cluster Count vs. Adjusted Mutual Info Score')\n", + "\n", + " # Place a single colorbar between the two subplots\n", + " colorbar = figure.colorbar(\n", + " cluster_count_plot,\n", + " ax=axes,\n", + " fraction=0.05,\n", + " aspect=30,\n", + " location='right',\n", + " )\n", + " colorbar.set_label('Embedding Dimension')\n", + "\n", + " plot.show()\n", + "\n", + "\n", + "def output_node_embedding_tuning_scores():\n", + " \"\"\"\n", + " Returns the DataFrame with the results of the Fast Random Projection node embeddings.\n", + " \"\"\"\n", + " node_embeddings_score_results_dataframe = pd.DataFrame(node_embedding_tuning_scores)\n", + " print(\"Min noise ratio:\", node_embeddings_score_results_dataframe.clustering_noise_ratio.min())\n", + " print(\"Max noise ratio:\", node_embeddings_score_results_dataframe.clustering_noise_ratio.max())\n", + " print(\"Min adjusted mutual info score:\", node_embeddings_score_results_dataframe.adjusted_mutual_info_score.min())\n", + " print(\"Max adjusted mutual info score:\", node_embeddings_score_results_dataframe.adjusted_mutual_info_score.max())\n", + " print(\"Min cluster count:\", node_embeddings_score_results_dataframe.cluster_count.min())\n", + " print(\"Max cluster count:\", node_embeddings_score_results_dataframe.cluster_count.max())\n", + " \n", + " plot_node_embedding_tuning_scores()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2bbdf6ef", + "metadata": {}, + "outputs": [], + "source": [ + "class HierarchicalDensityClusteringScores:\n", + "\n", + " def __init__(self, embedding_dimension: int, adjusted_mutual_info_score: float, confidence_score: float, noise_ratio: float, cluster_count: int):\n", + " self.embedding_dimension = embedding_dimension\n", + " self.adjusted_mutual_info_score = adjusted_mutual_info_score\n", + " self.confidence_score = confidence_score\n", + " self.noise_ratio = noise_ratio\n", + " self.cluster_count = cluster_count\n", + "\n", + " def __repr__(self):\n", + " return f\"HierarchicalDensityClusteringScores(embedding_dimension={self.embedding_dimension}, adjusted_mutual_info_score={self.adjusted_mutual_info_score}, confidence_score={self.confidence_score}, noise_ratio={self.noise_ratio}, cluster_count={self.cluster_count})\"\n", + " \n", + " 
def append_to_tuning_scores(self):\n", + " add_node_embedding_tuning_scores(\n", + " self.embedding_dimension,\n", + " self.adjusted_mutual_info_score, \n", + " self.confidence_score, \n", + " self.noise_ratio,\n", + " self.cluster_count\n", + " )\n", + " return self\n", + "\n", + " @classmethod\n", + " def cluster_embeddings_with_references(cls, embedding_column: pd.Series, reference_community_id_column: pd.Series) -> 'HierarchicalDensityClusteringScores':\n", + " \"\"\"\n", + " Clusters the embeddings with the reference community ids and returns the clustering scores.\n", + " \n", + " Parameters\n", + " ----------\n", + " embedding_column : pd.Series\n", + " The column containing the embeddings to be clustered.\n", + " reference_community_id_column : pd.Series\n", + " The column containing the reference community ids to compare the clustering results against.\n", + " \n", + " Returns\n", + " -------\n", + " HierarchicalDensityClusteringScores\n", + " An instance of HierarchicalDensityClusteringScores containing the clustering scores.\n", + " \"\"\"\n", + " import numpy as np\n", + " from sklearn.cluster import HDBSCAN\n", + " \n", + " hierarchical_density_based_spatial_clustering = HDBSCAN(\n", + " cluster_selection_method='eom',\n", + " metric='manhattan',\n", + " min_samples=2,\n", + " min_cluster_size=5,\n", + " allow_single_cluster=False,\n", + " n_jobs=-1\n", + " )\n", + " embeddings = np.array(embedding_column.tolist())\n", + " clustering_result = hierarchical_density_based_spatial_clustering.fit(embeddings)\n", + " \n", + " reference_community_ids = np.array(reference_community_id_column.tolist())\n", + " adjusted_mutual_info_score_value = adjusted_mutual_info_score_with_soft_ramp_noise_penalty(clustering_result.labels_, reference_community_ids)\n", + " \n", + " confidence_score = np.mean(clustering_result.probabilities_[clustering_result.labels_ != -1])\n", + " noise_count = np.sum(clustering_result.labels_ == -1)\n", + " noise_ratio = noise_count / len(clustering_result.labels_)\n", + " cluster_count = len(set(clustering_result.labels_)) - (1 if -1 in clustering_result.labels_ else 0)\n", + " return cls(len(embeddings[0]), adjusted_mutual_info_score_value, confidence_score, noise_ratio, cluster_count)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f9cb820", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.base import BaseEstimator\n", + "import typing as typ\n", + "\n", + "class DependencyProjectionParameters:\n", + " def __init__(self, \n", + " projection_name: str = \"java-type-embeddings-notebook\",\n", + " projection_node: str = \"Type\",\n", + " projection_weight_property: str = \"weight\"\n", + " ):\n", + " self.projection_name = projection_name\n", + " self.projection_node = projection_node\n", + " self.projection_weight_property = projection_weight_property\n", + "\n", + " @classmethod\n", + " def from_projection_parameters(cls, projection_parameters: dict):\n", + " \"\"\"\n", + " Creates a DependencyProjectionParameters instance from a dictionary of projection parameters.\n", + " The dictionary must contain the following keys:\n", + " - \"dependencies_projection\": The name of the projection.\n", + " - \"dependencies_projection_node\": The node type of the projection.\n", + " - \"dependencies_projection_weight_property\": The weight property of the projection.\n", + " \"\"\"\n", + " if not all(key in projection_parameters for key in [\"dependencies_projection\", \"dependencies_projection_node\", 
\"dependencies_projection_weight_property\"]):\n", + " raise ValueError(\"The projection parameters must contain the keys: 'dependencies_projection', 'dependencies_projection_node', 'dependencies_projection_weight_property'.\")\n", + " return cls(\n", + " projection_name=projection_parameters[\"dependencies_projection\"],\n", + " projection_node=projection_parameters[\"dependencies_projection_node\"],\n", + " projection_weight_property=projection_parameters[\"dependencies_projection_weight_property\"]\n", + " )\n", + "\n", + " def get_cypher_parameters(self):\n", + " return {\n", + " \"dependencies_projection\": self.projection_name,\n", + " \"dependencies_projection_node\": self.projection_node,\n", + " \"dependencies_projection_weight_property\": self.projection_weight_property,\n", + " }\n", + " \n", + " def clone_with_projection_name(self, projection_name: str):\n", + " return DependencyProjectionParameters(\n", + " projection_name=projection_name,\n", + " projection_node=self.projection_node,\n", + " projection_weight_property=self.projection_weight_property\n", + " )\n", + "\n", + "def create_tuneable(class_to_create: typ.Type, verbose: bool = False) -> typ.Any:\n", + " if not hasattr(class_to_create, '__init__'):\n", + " raise ValueError(f\"The class {class_to_create.__name__} does not have an __init__ method. It cannot be created.\")\n", + " if not callable(class_to_create.__init__):\n", + " raise ValueError(f\"The class {class_to_create.__name__} has an __init__ method, but it is not callable. It cannot be created.\")\n", + " if not issubclass(class_to_create, BaseEstimator):\n", + " raise ValueError(f\"The class {class_to_create.__name__} does not inherit from BaseEstimator. It cannot be created.\")\n", + "\n", + " # print(f\"Creating a tuneable estimator for the class {class_to_create.__name__}...\")\n", + "\n", + " class TuneableEstimator():\n", + " def __init__(self):\n", + " self.class_to_create_ = class_to_create\n", + " self.verbose = verbose\n", + "\n", + " def with_projection_parameters(self, projection_parameters: dict) -> typ.Any:\n", + " \"\"\"\n", + " Creates an instance of the given class (using its constructor) with projection parameters from a dict.\n", + " The dict must contain the following keys: \n", + " - \"dependencies_projection\"\n", + " - \"dependencies_projection_node\"\n", + " - \"dependencies_projection_weight_property\".\n", + " \"\"\"\n", + " \n", + " #print(f\"...with projection parameters: {projection_parameters}\")\n", + " return self.class_to_create_(\n", + " dependency_projection = DependencyProjectionParameters.from_projection_parameters(projection_parameters), # type: ignore\n", + " verbose=self.verbose # type: ignore\n", + " )\n", + " return TuneableEstimator()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a94d6254", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.base import BaseEstimator\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "class TuneableFastRandomProjectionNodeEmbeddings(BaseEstimator):\n", + " \"\"\"\n", + " Can be used with GridSearchCV or RandomizedSearchCV to tune the parameters of the Fast Random Projection node embeddings.\n", + " \"\"\"\n", + "\n", + " cypher_file_for_read_ = \"../cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Tuneable_Stream.cypher\" \n", + " cypher_file_for_write_ = \"../cypher/Node_Embeddings/Node_Embeddings_1e_Fast_Random_Projection_Tuneable_Write.cypher\" \n", + " \n", + " def __init__(self, \n", + " 
dependency_projection: DependencyProjectionParameters = DependencyProjectionParameters(),\n", + " verbose: bool = False,\n", + " # Tuneable algorithm parameters\n", + " embedding_dimension: int = 64, \n", + " random_seed: int = 42,\n", + " fast_random_projection_normalization_strength: float = 0.3,\n", + " fast_random_projection_forth_iteration_weight: float = 1.0,\n", + " ):\n", + " self.dependency_projection = dependency_projection\n", + " self.verbose = verbose\n", + " \n", + " self.embedding_dimension = embedding_dimension\n", + " self.random_seed = random_seed\n", + " self.fast_random_projection_normalization_strength = fast_random_projection_normalization_strength\n", + " self.fast_random_projection_forth_iteration_weight = fast_random_projection_forth_iteration_weight\n", + "\n", + "\n", + " def __to_embedding_parameters(self):\n", + " return {\n", + " \"dependencies_projection_embedding_dimension\": str(self.embedding_dimension),\n", + " \"dependencies_projection_fast_random_projection_normalization_strength\": str(self.fast_random_projection_normalization_strength),\n", + " \"dependencies_projection_fast_random_projection_forth_iteration_weight\": str(self.fast_random_projection_forth_iteration_weight),\n", + " \"dependencies_projection_embedding_random_seed\": str(self.random_seed),\n", + " \"dependencies_projection_write_property\": \"embeddingsFastRandomProjectionForClustering\",\n", + " **self.dependency_projection.get_cypher_parameters()\n", + " } \n", + " \n", + "\n", + " def __generate_embeddings(self):\n", + " node_embedding_parameters = self.__to_embedding_parameters()\n", + " if self.verbose:\n", + " print(\"Generating embeddings using Neo4j Graph Data Science with the following parameters: \" + str(node_embedding_parameters))\n", + " return query_cypher_to_data_frame_for_verbosity(self.verbose)(self.cypher_file_for_read_, parameters=node_embedding_parameters)\n", + "\n", + "\n", + " def __check_fitted(self):\n", + " \"\"\"\n", + " Checks if the model has been fitted by checking if the embeddings_ attribute exists.\n", + " Raises a ValueError if the model has not been fitted yet.\n", + " \"\"\"\n", + " if not hasattr(self, 'embeddings_') or not hasattr(self, 'clustering_scores_'):\n", + " raise ValueError(\"The model has not been fitted yet. 
Please call the fit method before.\")\n", + "\n", + "\n", + " def fit(self, X=None, y=None):\n", + " \"\"\"\n", + " Fits the model by generating node embeddings and calculating the Hopkins statistic.\n", + " \"\"\"\n", + " self.embeddings_ = self.__generate_embeddings()\n", + " self.clustering_scores_ = HierarchicalDensityClusteringScores.cluster_embeddings_with_references(self.embeddings_.embedding, self.embeddings_.communityId).append_to_tuning_scores()\n", + " return self\n", + "\n", + " \n", + " def score(self, X=None, y=None):\n", + " \"\"\"\n", + " Returns the score of the model based on the adjusted mutual info score comparing the clusters with pre calculated Leiden communities.\n", + " \"\"\"\n", + " self.__check_fitted()\n", + " return self.clustering_scores_.adjusted_mutual_info_score\n", + "\n", + "\n", + " def write_embeddings(self):\n", + " \"\"\"\n", + " Writes the generated embeddings to the Neo4j database.\n", + " This is useful for further processing or analysis of the embeddings.\n", + " \"\"\"\n", + " node_embedding_parameters = self.__to_embedding_parameters()\n", + " print(\"Writing embeddings to Neo4j with the following parameters: \" + str(node_embedding_parameters))\n", + " query_cypher_to_data_frame_for_verbosity(self.verbose)(self.cypher_file_for_write_, parameters=node_embedding_parameters)\n", + " return self\n", + "\n", + " \n", + " def get_embeddings(self):\n", + " \"\"\"\n", + " Returns the generated embeddings\n", + " \"\"\"\n", + " self.__check_fitted()\n", + " return self.embeddings_\n", + "\n", + "\n", + " def get_clustering_scores(self) -> HierarchicalDensityClusteringScores:\n", + " \"\"\"\n", + " Returns the clustering scores, which include the adjusted mutual info score, confidence score, noise ratio, and cluster count.\n", + " \"\"\"\n", + " self.__check_fitted()\n", + " return self.clustering_scores_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd93448a", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.base import BaseEstimator\n", + "import numpy as np\n", + "\n", + "class TuneableNode2VecNodeEmbeddings(BaseEstimator):\n", + " \"\"\"\n", + " Can be used with GridSearchCV or RandomizedSearchCV to tune the parameters of node embeddings with node2vec.\n", + " \"\"\"\n", + "\n", + " cypher_file_name_ = \"../cypher/Node_Embeddings/Node_Embeddings_3d_Node2Vec_Tuneable_Stream.cypher\" \n", + " \n", + " def __init__(self, \n", + " dependency_projection: DependencyProjectionParameters = DependencyProjectionParameters(),\n", + " verbose: bool = False,\n", + " # Tuneable algorithm parameters\n", + " embedding_dimension: int = 64, \n", + " random_seed: int = 42,\n", + " node2vec_in_out_factor: float = 1.0,\n", + " node2vec_return_factor: float = 1.0,\n", + " node2vec_window_size: int = 10,\n", + " node2vec_walk_length: int = 80,\n", + " node2vec_walks_per_node: int = 10,\n", + " node2vec_iterations: int = 1,\n", + " node2vec_negative_sampling_rate: int = 5,\n", + " node2vec_positive_sampling_factor: float = 0.001,\n", + " ):\n", + " self.dependency_projection = dependency_projection\n", + " self.verbose = verbose\n", + "\n", + " self.embedding_dimension = embedding_dimension\n", + " self.random_seed = random_seed\n", + " self.node2vec_in_out_factor = node2vec_in_out_factor\n", + " self.node2vec_return_factor = node2vec_return_factor\n", + " self.node2vec_window_size = node2vec_window_size\n", + " self.node2vec_walk_length = node2vec_walk_length\n", + " self.node2vec_walks_per_node = node2vec_walks_per_node\n", + " 
self.node2vec_iterations = node2vec_iterations\n", + " self.node2vec_negative_sampling_rate = node2vec_negative_sampling_rate\n", + " self.node2vec_positive_sampling_factor = node2vec_positive_sampling_factor\n", + "\n", + "\n", + " def __to_embedding_parameters(self):\n", + " return {\n", + " \"dependencies_projection_embedding_dimension\": str(self.embedding_dimension),\n", + " \"dependencies_projection_embedding_random_seed\": str(self.random_seed),\n", + " \"dependencies_projection_node2vec_in_out_factor\": str(self.node2vec_in_out_factor),\n", + " \"dependencies_projection_node2vec_return_factor\": str(self.node2vec_return_factor),\n", + " \"dependencies_projection_node2vec_window_size\": str(self.node2vec_window_size),\n", + " \"dependencies_projection_node2vec_walk_length\": str(self.node2vec_walk_length),\n", + " \"dependencies_projection_node2vec_walks_per_node\": str(self.node2vec_walks_per_node),\n", + " \"dependencies_projection_node2vec_iterations\": str(self.node2vec_iterations),\n", + " \"dependencies_projection_node2vec_negative_sampling_rate\": str(self.node2vec_negative_sampling_rate),\n", + " \"dependencies_projection_node2vec_positive_sampling_factor\": str(self.node2vec_positive_sampling_factor),\n", + " **self.dependency_projection.get_cypher_parameters()\n", + " } \n", + " \n", + "\n", + " def __generate_embeddings(self):\n", + " node_embedding_parameters = self.__to_embedding_parameters()\n", + " if self.verbose:\n", + " print(\"Generating embeddings using Neo4j Graph Data Science with the following parameters: \" + str(node_embedding_parameters))\n", + " return query_cypher_to_data_frame_for_verbosity(self.verbose)(self.cypher_file_name_, parameters=node_embedding_parameters)\n", + "\n", + "\n", + " def __check_fitted(self):\n", + " \"\"\"\n", + " Checks if the model has been fitted by checking if the embeddings_ attribute exists.\n", + " Raises a ValueError if the model has not been fitted yet.\n", + " \"\"\"\n", + " if not hasattr(self, 'embeddings_') or not hasattr(self, 'clustering_scores_'):\n", + " raise ValueError(\"The model has not been fitted yet. Please call the fit method before.\")\n", + "\n", + "\n", + " def fit(self, X=None, y=None):\n", + " \"\"\"\n", + " Fits the model by generating node embeddings and calculating the Hopkins statistic.\n", + " \"\"\"\n", + " self.embeddings_ = self.__generate_embeddings()\n", + " self.clustering_scores_ = HierarchicalDensityClusteringScores.cluster_embeddings_with_references(self.embeddings_.embedding, self.embeddings_.communityId).append_to_tuning_scores()\n", + " return self\n", + " \n", + "\n", + " def refit_with_projection(self, projection_name: str):\n", + " \"\"\"\n", + " Re-fits the model for the given projection name.\n", + " This is useful for tuning the model with different projections (sampled/original).\n", + " \"\"\"\n", + " if projection_name == self.dependency_projection.projection_name:\n", + " print(f\"Projection name '{projection_name}' is the same as the current one. 
No re-fitting needed.\")\n", + " return self\n", + " \n", + " self.dependency_projection = self.dependency_projection.clone_with_projection_name(projection_name)\n", + " print(f\"Re-fitting the model with the following parameters: \" + str(self.__to_embedding_parameters()))\n", + " return self.fit()\n", + "\n", + "\n", + " def score(self, X=None, y=None):\n", + " \"\"\"\n", + " Returns the score of the model based on the adjusted mutual info score comparing the clusters with pre calculated Leiden communities.\n", + " \"\"\"\n", + " self.__check_fitted()\n", + " return self.clustering_scores_.adjusted_mutual_info_score\n", + " \n", + "\n", + " def get_embeddings(self):\n", + " \"\"\"\n", + " Returns the generated embeddings\n", + " \"\"\"\n", + " self.__check_fitted()\n", + " return self.embeddings_\n", + "\n", + "\n", + " def get_clustering_scores(self) -> HierarchicalDensityClusteringScores:\n", + " \"\"\"\n", + " Returns the clustering scores, which include the adjusted mutual info score, confidence score, noise ratio, and cluster count.\n", + " \"\"\"\n", + " self.__check_fitted()\n", + " return self.clustering_scores_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "651bede8", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.base import BaseEstimator\n", + "import numpy as np\n", + "\n", + "class TuneableHashGNNNodeEmbeddings(BaseEstimator):\n", + " \"\"\"\n", + " Can be used with GridSearchCV or RandomizedSearchCV to tune the parameters of node embeddings with HashGNN.\n", + " \"\"\"\n", + "\n", + " cypher_file_name_ = \"../cypher/Node_Embeddings/Node_Embeddings_2d_Hash_GNN_Tuneable_Stream.cypher\" \n", + " \n", + " def __init__(self, \n", + " dependency_projection: DependencyProjectionParameters = DependencyProjectionParameters(),\n", + " verbose: bool = False,\n", + " # Tuneable algorithm parameters\n", + " embedding_dimension: int = 64, \n", + " random_seed: int = 42,\n", + " hashgnn_iterations: int = 2,\n", + " hashgnn_density_level: int = 2,\n", + " hashgnn_neighbor_influence: float = 1.0,\n", + " hashgnn_dimension_multiplier: int = 2,\n", + " ):\n", + " self.dependency_projection = dependency_projection\n", + " self.verbose = verbose\n", + "\n", + " self.embedding_dimension = embedding_dimension\n", + " self.random_seed = random_seed\n", + " self.hashgnn_iterations = hashgnn_iterations\n", + " self.hashgnn_density_level = hashgnn_density_level\n", + " self.hashgnn_neighbor_influence = hashgnn_neighbor_influence\n", + " self.hashgnn_dimension_multiplier = hashgnn_dimension_multiplier\n", + "\n", + "\n", + " def __to_embedding_parameters(self):\n", + " return {\n", + " \"dependencies_projection_embedding_dimension\": str(self.embedding_dimension),\n", + " \"dependencies_projection_embedding_random_seed\": str(self.random_seed),\n", + " \"dependencies_projection_hashgnn_iterations\": str(self.hashgnn_iterations),\n", + " \"dependencies_projection_hashgnn_density_level\": str(self.hashgnn_density_level),\n", + " \"dependencies_projection_hashgnn_neighbor_influence\": str(self.hashgnn_neighbor_influence),\n", + " \"dependencies_projection_hashgnn_dimension_multiplier\": str(self.hashgnn_dimension_multiplier),\n", + " **self.dependency_projection.get_cypher_parameters()\n", + " } \n", + " \n", + "\n", + " def __generate_embeddings(self):\n", + " node_embedding_parameters = self.__to_embedding_parameters()\n", + " if self.verbose:\n", + " print(\"Generating embeddings using Neo4j Graph Data Science with the following parameters: \" + 
str(node_embedding_parameters))\n", + " return query_cypher_to_data_frame_for_verbosity(self.verbose)(self.cypher_file_name_, parameters=node_embedding_parameters)\n", + "\n", + "\n", + " def __check_fitted(self):\n", + " \"\"\"\n", + " Checks if the model has been fitted by checking if the embeddings_ attribute exists.\n", + " Raises a ValueError if the model has not been fitted yet.\n", + " \"\"\"\n", + " if not hasattr(self, 'embeddings_') or not hasattr(self, 'clustering_scores_'):\n", + " raise ValueError(\"The model has not been fitted yet. Please call the fit method before.\")\n", + "\n", + "\n", + " def fit(self, X=None, y=None):\n", + " \"\"\"\n", + " Fits the model by generating node embeddings and calculating the Hopkins statistic.\n", + " \"\"\"\n", + " self.embeddings_ = self.__generate_embeddings()\n", + " self.clustering_scores_ = HierarchicalDensityClusteringScores.cluster_embeddings_with_references(self.embeddings_.embedding, self.embeddings_.communityId).append_to_tuning_scores()\n", + " return self\n", + "\n", + "\n", + " def refit_with_projection(self, projection_name: str):\n", + " \"\"\"\n", + " Re-fits the model for the given projection name.\n", + " This is useful for tuning the model with different projections (sampled/original).\n", + " \"\"\"\n", + " if projection_name == self.dependency_projection.projection_name:\n", + " print(f\"Projection name '{projection_name}' is the same as the current one. No re-fitting needed.\")\n", + " return self\n", + " \n", + " self.dependency_projection = self.dependency_projection.clone_with_projection_name(projection_name)\n", + " print(f\"Re-fitting the model with the following parameters: \" + str(self.__to_embedding_parameters()))\n", + " return self.fit()\n", + "\n", + "\n", + " def score(self, X=None, y=None):\n", + " \"\"\"\n", + " Returns the score of the model based on the adjusted mutual info score comparing the clusters with pre calculated Leiden communities.\n", + " \"\"\"\n", + " self.__check_fitted()\n", + " return self.clustering_scores_.adjusted_mutual_info_score\n", + " \n", + "\n", + " def get_embeddings(self):\n", + " \"\"\"\n", + " Returns the generated embeddings\n", + " \"\"\"\n", + " self.__check_fitted()\n", + " return self.embeddings_\n", + " \n", + "\n", + " def get_clustering_scores(self) -> HierarchicalDensityClusteringScores:\n", + " \"\"\"\n", + " Returns the clustering scores, which include the adjusted mutual info score, confidence score, noise ratio, and cluster count.\n", + " \"\"\"\n", + " self.__check_fitted()\n", + " return self.clustering_scores_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30064d60", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.base import BaseEstimator\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "class TuneableLeidenCommunityDetection(BaseEstimator):\n", + " \"\"\"\n", + " Can be used with GridSearchCV or RandomizedSearchCV to tune the parameters of the Leiden community detection algorithm.\n", + " \"\"\"\n", + "\n", + " cypher_file_for_statistics_ = \"../cypher/Community_Detection/Community_Detection_2b_Leiden_Tuneable_Statistics.cypher\" \n", + " cypher_file_for_write_ = \"../cypher/Community_Detection/Community_Detection_2d_Leiden_Tuneable_Write.cypher\" \n", + " \n", + " def __init__(self, \n", + " dependency_projection: DependencyProjectionParameters = DependencyProjectionParameters(),\n", + " verbose: bool = False,\n", + " # Tuneable algorithm parameters\n", + " gamma: float = 1.0,\n", + " theta: 
float = 0.001,\n", + " max_levels: int = 10,\n", + " ):\n", + " self.dependency_projection = dependency_projection\n", + " self.verbose = verbose\n", + "\n", + " self.gamma = gamma\n", + " self.theta = theta\n", + " self.max_levels = max_levels\n", + "\n", + "\n", + " def __to_algorithm_parameters(self):\n", + " return {\n", + " \"dependencies_leiden_gamma\": str(self.gamma),\n", + " \"dependencies_leiden_theta\": str(self.theta),\n", + " \"dependencies_leiden_max_levels\": str(self.max_levels),\n", + " \"dependencies_projection_write_property\": \"communityLeidenIdTuned\",\n", + " **self.dependency_projection.get_cypher_parameters()\n", + " } \n", + " \n", + "\n", + " def __run_algorithm(self):\n", + " algorithm_parameters = self.__to_algorithm_parameters()\n", + " if self.verbose:\n", + " print(\"Calculating Leiden communities using Neo4j Graph Data Science with the following parameters: \" + str(algorithm_parameters))\n", + " return query_cypher_to_data_frame_for_verbosity(self.verbose)(self.cypher_file_for_statistics_, parameters=algorithm_parameters)\n", + "\n", + "\n", + " def __check_fitted(self):\n", + " \"\"\"\n", + " Checks if the model has been fitted by checking if the embeddings_ attribute exists.\n", + " Raises a ValueError if the model has not been fitted yet.\n", + " \"\"\"\n", + " if not hasattr(self, 'community_statistics_'):\n", + " raise ValueError(\"The model has not been fitted yet. Please call the fit method before.\")\n", + "\n", + "\n", + " def fit(self, X=None, y=None):\n", + " \"\"\"\n", + " Fits the model by calculating Leiden communities and their statistics.\n", + " \"\"\"\n", + " self.community_statistics_ = self.__run_algorithm()\n", + " return self\n", + "\n", + " \n", + " def score(self, X=None, y=None):\n", + " \"\"\"\n", + " The returned score is high for community detection results with high modularity and high community count.\n", + " A penalty assures that a modularity lower than 0.3 (*1) will result in a score of zero (\"worst\").\n", + " The community count is normalized by dividing it through the number of nodes in the projected Graph.\n", + " To give the relative community count more weight, it is multiplied by 100. 
\n", + " \n", + " (*1) Mane, Prachita; Shanbhag, Sunanda; Kamath, Tanmayee; Mackey, Patrick; and Springer, John, \n", + " \"Analysis of Community Detection Algorithms for Large Scale Cyber Networks\" (2016)\n", + " \"\"\"\n", + " soft_ramped_modularity = 1.0 - soft_ramp_limited_penalty(self.get_modularity(), 0.30, 0.35, sharpness=1)\n", + " score = float(self.get_community_count() * 100) / float(self.get_node_count_()) * soft_ramped_modularity\n", + " # - For debugging purposes:\n", + " # print(f\"Score {score:.4f}= community count {self.get_community_count()} x soft_ramped {soft_ramped_modularity:.4f} modularity {self.get_modularity():.04f}\")\n", + " return score\n", + "\n", + "\n", + " def write_communities(self):\n", + " \"\"\"\n", + " Writes the calculated communities to the Neo4j database.\n", + " This is useful for further processing or analysis.\n", + " \"\"\"\n", + " algorithm_parameters = self.__to_algorithm_parameters()\n", + " print(\"Writing communities to Neo4j with the following parameters: \" + str(algorithm_parameters))\n", + " query_cypher_to_data_frame_for_verbosity(self.verbose)(self.cypher_file_for_write_, parameters=algorithm_parameters)\n", + " return self\n", + "\n", + "\n", + " def get_modularity(self) -> float:\n", + " \"\"\"\n", + " Returns the modularity (global/overall) of the community statistics\n", + " \"\"\"\n", + " self.__check_fitted()\n", + " return float(self.community_statistics_['modularity'].iloc[0])\n", + " \n", + " def get_community_count(self) -> int:\n", + " \"\"\"\n", + " Returns the number of detected communities\n", + " \"\"\"\n", + " self.__check_fitted()\n", + " return int(self.community_statistics_['communityCount'].iloc[0])\n", + " \n", + " def get_node_count_(self) -> int:\n", + " \"\"\"\n", + " Returns the number of nodes in the projected Graph\n", + " \"\"\"\n", + " self.__check_fitted()\n", + " return int(self.community_statistics_['nodeCount'].iloc[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09af6396", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_grid_search_hyperparameter_tuning_results(cv_results):\n", + " \"\"\"\n", + " Plots the results of the hyperparameter tuning from GridSearchCV.\n", + " Uses the first parameter (alphabetically) as the horizontal axis and each of the other parameters as vertical axes.\n", + " The mean test score is plotted against the parameter values.\n", + "\n", + " Parameters\n", + " ----------\n", + " cv_results : dict\n", + " The cv_results_ attribute from a fitted GridSearchCV object.\n", + " \"\"\"\n", + " import matplotlib.pyplot as plot\n", + " import pandas as pd\n", + " \n", + " tuning_statistics = pd.DataFrame(cv_results)\n", + "\n", + " # Extract parameter names\n", + " parameter_names = list(tuning_statistics['params'][0].keys())\n", + "\n", + " # Create subplots for the first parameter (horizontal) and each other parameter (vertical)\n", + " row_parameter = parameter_names[0]\n", + "\n", + " # filter out the first parameter (name) from parameter_names to get the other parameters as list\n", + " other_parameters = [name for name in parameter_names if name != row_parameter]\n", + " unique_row_parameter_values = sorted(tuning_statistics['param_' + row_parameter].unique())\n", + " row_count = len(other_parameters)\n", + " column_count = len(unique_row_parameter_values)\n", + "\n", + " import matplotlib.pyplot as plot\n", + "\n", + " figure, axes = plot.subplots(row_count, column_count, figsize=(6 * column_count, 5 * row_count))#, sharey='row')\n", + " 
if row_count == 1:\n", + " axes = np.expand_dims(axes, axis=0)\n", + " if column_count == 1:\n", + " axes = np.expand_dims(axes, axis=1)\n", + "\n", + " for column_index, row_parameter_value in enumerate(unique_row_parameter_values):\n", + " subset = tuning_statistics[tuning_statistics['param_' + row_parameter] == row_parameter_value]\n", + " for row_index, parameter_name in enumerate(other_parameters):\n", + " axis = axes[row_index, column_index]\n", + " x = subset['param_' + parameter_name]\n", + " y = subset['mean_test_score']\n", + " axis.plot(x, y, marker='o', linestyle='-')\n", + " axis.set_title(f\"{row_parameter}: {row_parameter_value}\\n{parameter_name}\", fontsize=12)\n", + " axis.set_xlabel(parameter_name)\n", + " if column_index == 0:\n", + " axis.set_ylabel(\"Mean Test Score\")\n", + " axis.grid(True)\n", + "\n", + " figure.suptitle(f'GridSearchCV Hyperparameter Tuning Results by {row_parameter}', fontsize=16)\n", + " plot.tight_layout(rect=(0.0, 0.03, 1.0, 0.95))\n", + " plot.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4866d320", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_parameter_importance_from_grid_search(raw_tuning_results):\n", + " \"\"\"\n", + " Plots the importance of each hyperparameter based on how much variance in the score it explains.\n", + " The parameter with the highest variance in mean_test_score across its values is considered most important.\n", + "\n", + " Parameters\n", + " ----------\n", + " cv_results : dict\n", + " The cv_results_ attribute from a fitted GridSearchCV object.\n", + " \"\"\"\n", + " import matplotlib.pyplot as plot\n", + " import pandas as pd\n", + "\n", + " tuning_results = pd.DataFrame(raw_tuning_results)\n", + " parameter_columns = [column for column in tuning_results.columns if column.startswith('param_')]\n", + "\n", + " # Calculate variance in mean_test_score for each parameter\n", + " importances = {}\n", + " for parameter in parameter_columns:\n", + " grouped = tuning_results.groupby(parameter)['mean_test_score'].mean()\n", + " importances[parameter.replace('param_', '')] = grouped.var()\n", + "\n", + " # Sort parameters by importance\n", + " sorted_importances = sorted(importances.items(), key=lambda x: x[1], reverse=True)\n", + "\n", + " # Plot as horizontal bars\n", + " plot.figure(figsize=(10, 2))\n", + " plot.barh(\n", + " [parameter_name for parameter_name, _ in reversed(sorted_importances)],\n", + " [parameter_variance for _, parameter_variance in reversed(sorted_importances)]\n", + " )\n", + " plot.xlabel('Variance in Mean Test Score')\n", + " plot.ylabel('Parameter')\n", + " plot.xscale('log') # Use logarithmic scale for better visibility\n", + " plot.yticks(fontsize=8)\n", + " plot.xticks(fontsize=8, rotation=45)\n", + " plot.title('Parameter Importance (higher = more influence on score)')\n", + " plot.tight_layout()\n", + " plot.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "923d7db2", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_grid_search_scores(raw_tuning_results):\n", + " \"\"\"\n", + " Plots the scores from GridSearchCV results.\n", + "\n", + " Parameters\n", + " ----------\n", + " cv_results : dict\n", + " The cv_results_ attribute from a fitted GridSearchCV object.\n", + " \"\"\"\n", + " import matplotlib.pyplot as plot\n", + " import pandas as pd\n", + "\n", + " results = pd.DataFrame(raw_tuning_results)\n", + " plot.figure(figsize=(10, 4))\n", + " plot.plot(results['mean_test_score'], label='Score', 
marker='o')\n", + " plot.xlabel('Parameter Combination Index')\n", + " plot.ylabel('Score')\n", + " plot.title('Grid Search Scores')\n", + " plot.legend()\n", + " plot.grid(True)\n", + " plot.tight_layout()\n", + " plot.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df91775d", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_grid_search_timings(raw_tuning_results):\n", + " \"\"\"\n", + " Plots the fit times from GridSearchCV results.\n", + "\n", + " Parameters\n", + " ----------\n", + " cv_results : dict\n", + " The cv_results_ attribute from a fitted GridSearchCV object.\n", + " \"\"\"\n", + " import matplotlib.pyplot as plot\n", + " import pandas as pd\n", + "\n", + " results = pd.DataFrame(raw_tuning_results)\n", + " plot.figure(figsize=(10, 4))\n", + " plot.plot(results['mean_fit_time'], label='Mean Fit Time (s)', marker='o')\n", + " plot.xlabel('Parameter Combination Index')\n", + " plot.ylabel('Time (seconds)')\n", + " plot.title('Grid Search Timings')\n", + " plot.legend()\n", + " plot.grid(True)\n", + " plot.tight_layout()\n", + " plot.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e608d8cb", + "metadata": {}, + "outputs": [], + "source": [ + "def list_top10_parameters(raw_tuning_results):\n", + " import pandas as pd\n", + "\n", + " # Convert cv_results_ to DataFrame and sort by mean_test_score descending\n", + " tuning_results = pd.DataFrame(raw_tuning_results)\n", + " parameter_columns = [column for column in tuning_results.columns if column.startswith('param_')]\n", + "\n", + " top10 = tuning_results.sort_values(by=\"mean_test_score\", ascending=False).head(10)\n", + "\n", + " # Display only the parameter columns and the score\n", + " print(top10[[\"mean_test_score\", *parameter_columns]].to_string(index=False))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe1df20b", + "metadata": {}, + "outputs": [], + "source": [ + "def output_tuning_details(tuning_results, title: str = ''):\n", + " \"\"\"\n", + " Outputs the tuning details of the GridSearchCV results.\n", + " Prints the best parameters, best score, and the number of evaluated parameter combinations.\n", + " \n", + " Parameters\n", + " ----------\n", + " tuning_results_ : GridSearchCV or dict\n", + " The fitted GridSearchCV object or its cv_results_ attribute.\n", + " \"\"\"\n", + " embeddings_array = np.array(tuning_results.best_estimator_.get_embeddings().embedding.tolist())\n", + " \n", + " print(title + \" - Best Parameters:\", tuning_results.best_params_)\n", + " print(title + \" - Best Score:\", tuning_results.best_score_)\n", + " print(title + \" - Evaluated Combinations:\", len(tuning_results.cv_results_['params']))\n", + " print(title + \" - Hopkins Statistic:\", hopkins_statistic(embeddings_array))\n", + " print(title + \" -\", tuning_results.best_estimator_.get_clustering_scores())\n", + "\n", + " plot_grid_search_hyperparameter_tuning_results(tuning_results.cv_results_)\n", + " plot_parameter_importance_from_grid_search(tuning_results.cv_results_)\n", + " plot_grid_search_scores(tuning_results.cv_results_)\n", + " plot_grid_search_timings(tuning_results.cv_results_)\n", + " list_top10_parameters(tuning_results.cv_results_)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e712775f", + "metadata": {}, + "outputs": [], + "source": [ + "class NodeEmbeddingsCreationResult:\n", + " def __init__(self, embeddings: pd.DataFrame, is_sampled_graph: bool = False):\n", + " self.embeddings = 
embeddings\n", + " self.is_sampled_graph = is_sampled_graph\n", + " def __repr__(self):\n", + " return f\"NodeEmbeddingsCreationResult(embeddings={self.embeddings}, is_sampled_graph={self.is_sampled_graph})\"\n", + "\n", + "# Feature ideas\n", + "# TODO deprecated?\n", + "# TODO option to choose between directed and undirected projection\n", + "# TODO run a community detection algorithm co-located in here when \"communityId\" is missing\n", + "# TODO run a centrality algorithm co-located in here when \"centrality\" score is missing\n", + "# TODO this function suffers from excessive parameters. Modularize it into smaller functions\n", + "def create_node_embeddings(cypher_file_name: str, parameters: dict, ignore_existing: bool = True, create_graph_projection: bool = True, graph_sampling_threshold: int = GraphSamplingResult.default_graph_sampling_threshold) -> NodeEmbeddingsCreationResult:\n", + " \"\"\"\n", + " Creates an in-memory Graph projection by calling \"create_undirected_projection\", \n", + " runs the cypher Query given as cypherFileName parameter to calculate and stream the node embeddings\n", + " and returns a DataFrame with the results.\n", + " \n", + " cypher_file_name\n", + " ----------\n", + " Name of the file containing the Cypher query that executes node embeddings procedure.\n", + "\n", + " parameters\n", + " ----------\n", + " dependencies_projection : str\n", + " The name prefix for the in-memory projection for dependencies. Example: \"java-package-embeddings-notebook\"\n", + " dependencies_projection_node : str\n", + " The label of the nodes that will be used for the projection. Example: \"Package\"\n", + " dependencies_projection_weight_property : str\n", + " The name of the node property that contains the dependency weight. Example: \"weight25PercentInterfaces\"\n", + " dependencies_projection_embedding_dimension : str\n", + " The number of the dimensions and therefore size of the resulting array of floating point numbers\n", + " \"\"\"\n", + " \n", + " if create_graph_projection:\n", + " print(\"Create projection\")\n", + " is_data_available=create_undirected_projection(parameters)\n", + " \n", + " if not is_data_available:\n", + " print(\"No projected data for node embeddings calculation available\")\n", + " empty_result = pd.DataFrame(columns=[\"codeUnitName\", 'projectName', 'nodeElementId', 'communityId', 'centrality', 'embedding'])\n", + " return NodeEmbeddingsCreationResult(empty_result)\n", + " else:\n", + " print(\"Skip projection creation\")\n", + " \n", + " # Check if the graph has to be sampled because of its size\n", + " sampling_result=sample_graph_if_size_exceeds_limit(parameters, graph_sampling_threshold)\n", + " \n", + " node_embeddings_parameters = parameters.copy()\n", + " if ignore_existing:\n", + " embeddings = query_cypher_to_data_frame(cypher_file_name, parameters=sampling_result.updated_parameters)\n", + " else: \n", + " existing_embeddings_query_filename=\"../cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher\"\n", + " embeddings = query_first_non_empty_cypher_to_data_frame(existing_embeddings_query_filename, cypher_file_name, parameters=node_embeddings_parameters)\n", + " \n", + " display(embeddings.head()) # Display the first entries of the table\n", + " hopkins_statistic_value = hopkins_statistic(np.array(embeddings.embedding.tolist()))\n", + " print(f\"Hopkins statistic value: {hopkins_statistic_value}\")\n", + " \n", + " return NodeEmbeddingsCreationResult(embeddings, sampling_result.is_sampled)" + ] + }, + { + "cell_type": 
"markdown", + "id": "f6ec6a9b", + "metadata": {}, + "source": [ + "### Dimensionality reduction with t-distributed stochastic neighbor embedding (t-SNE)\n", + "\n", + "The following function takes the original node embeddings with a higher dimensionality, e.g. 64 floating point numbers, and reduces them into a two dimensional array for visualization. \n", + "\n", + "> It converts similarities between data points to joint probabilities and tries to minimize the Kullback-Leibler divergence between the joint probabilities of the low-dimensional embedding and the high-dimensional data.\n", + "\n", + "(see https://opentsne.readthedocs.io)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "720aebd3", + "metadata": {}, + "outputs": [], + "source": [ + "def prepare_node_embeddings_for_2d_visualization_tsne(embeddings: pd.DataFrame) -> pd.DataFrame:\n", + " \"\"\"\n", + " Reduces the dimensionality of the node embeddings (e.g. 64 floating point numbers in an array)\n", + " to two dimensions for 2D visualization.\n", + " see https://opentsne.readthedocs.io\n", + " \"\"\"\n", + "\n", + " if embeddings.empty: \n", + " print(\"No projected data for node embeddings dimensionality reduction available\")\n", + " return embeddings\n", + " \n", + " # Calling the fit_transform method just with a list doesn't seem to work (anymore?). \n", + " # It leads to an error with the following message: 'list' object has no attribute 'shape'\n", + " # This can be solved by converting the list to a numpy array using np.array(..).\n", + " # See https://bobbyhadz.com/blog/python-attributeerror-list-object-has-no-attribute-shape\n", + " embeddings_as_numpy_array = np.array(embeddings.embedding.to_list())\n", + "\n", + " # Use t-distributed stochastic neighbor embedding (t-SNE) to reduce the dimensionality \n", + " # of the previously calculated node embeddings to 2 dimensions for visualization\n", + " t_distributed_stochastic_neighbor_embedding = TSNE(n_components=2, verbose=False, random_state=47)\n", + " two_dimension_node_embeddings = t_distributed_stochastic_neighbor_embedding.fit_transform(embeddings_as_numpy_array)\n", + " # display(two_dimension_node_embeddings.shape) # Display the shape of the t-SNE result\n", + "\n", + " # Create a new DataFrame with the results of the 2 dimensional node embeddings\n", + " # and the code unit and artifact name of the query above as preparation for the plot\n", + " embeddings['embeddingVisualizationX'] = [value[0] for value in two_dimension_node_embeddings]\n", + " embeddings['embeddingVisualizationY'] = [value[1] for value in two_dimension_node_embeddings]\n", + "\n", + " # display(embeddings.head(10)) # Display the first line of the results\n", + " return embeddings\n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "dd9b83c2", + "metadata": {}, + "source": [ + "### Dimensionality reduction with Uniform Manifold Approximation and Projection (UMAP)\n", + "\n", + "The following function takes the original node embeddings with a higher dimensionality, e.g. 
64 floating point numbers, and reduces them into a two dimensional array for visualization using UMAP.\n", + "\n", + "> UMAP is a non-linear dimensionality reduction technique that preserves both local and global structure of the data, making it well-suited for visualizing high-dimensional embeddings in 2D.\n", + "\n", + "(see https://umap-learn.readthedocs.io)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f021ee6b", + "metadata": {}, + "outputs": [], + "source": [ + "import umap\n", + "\n", + "def prepare_node_embeddings_for_2d_visualization_umap(embeddings: pd.DataFrame) -> pd.DataFrame:\n", + " \"\"\"\n", + " Reduces the dimensionality of the node embeddings (e.g. 64 floating point numbers in an array)\n", + " to two dimensions for 2D visualization using UMAP.\n", + " see https://umap-learn.readthedocs.io\n", + " \"\"\"\n", + "\n", + " if embeddings.empty: \n", + " print(\"No projected data for node embeddings dimensionality reduction available\")\n", + " return embeddings\n", + "\n", + " # Convert the list of embeddings to a numpy array\n", + " embeddings_as_numpy_array = np.array(embeddings.embedding.to_list())\n", + "\n", + " # Use UMAP to reduce the dimensionality to 2D for visualization\n", + " # umap_reducer = umap.UMAP(min_dist=0.3, n_neighbors=15, n_components=2, metric='manhattan', random_state=47)\n", + " umap_reducer = umap.UMAP(n_components=2, min_dist=0.3, random_state=47)\n", + " two_dimension_node_embeddings = umap_reducer.fit_transform(embeddings_as_numpy_array)\n", + "\n", + " # Add the 2D coordinates to the DataFrame\n", + " embeddings['embeddingUMAPVisualizationX'] = two_dimension_node_embeddings[:, 0]\n", + " embeddings['embeddingUMAPVisualizationY'] = two_dimension_node_embeddings[:, 1]\n", + "\n", + " return embeddings\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8870d939", + "metadata": {}, + "outputs": [], + "source": [ + "def prepare_node_embeddings_for_2d_visualization(embeddings: pd.DataFrame) -> pd.DataFrame:\n", + " embeddings = prepare_node_embeddings_for_2d_visualization_tsne(embeddings)\n", + " embeddings = prepare_node_embeddings_for_2d_visualization_umap(embeddings)\n", + " return embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d937e26e", + "metadata": {}, + "outputs": [], + "source": [ + "# TODO delete if not used anymore\n", + "def plot_2d_node_embeddings_old(node_embeddings_for_visualization: pd.DataFrame, title: str, clustering_name: str = \"TunedHDBSCAN\", main_color_map: str = \"tab20\") -> None:\n", + " if node_embeddings_for_visualization.empty:\n", + " print(\"No projected data to plot available\")\n", + " return\n", + " \n", + " figure, (top, bottom) = plot.subplots(nrows=2, ncols=1, figsize=(8, 10))\n", + " figure.suptitle(title)\n", + " figure.subplots_adjust(top=0.92, left=0.01, right=0.99, bottom=0.01, hspace=0.2)\n", + "\n", + " node_embeddings_non_noise_cluster = node_embeddings_for_visualization[node_embeddings_for_visualization[get_clustering_property_name('Label', clustering_name)] != -1]\n", + " node_embeddings_noise_cluster = node_embeddings_for_visualization[node_embeddings_for_visualization[get_clustering_property_name('Label', clustering_name)] == -1]\n", + "\n", + " # Print the graph communities as a reference in the top plot\n", + " top.set_title(\"Leiden Community Detection\")\n", + " top.scatter(\n", + " x=node_embeddings_for_visualization.embeddingVisualizationX,\n", + " 
y=node_embeddings_for_visualization.embeddingVisualizationY,\n", + " s=node_embeddings_for_visualization.centrality * 300,\n", + " c=node_embeddings_for_visualization.communityId,\n", + " cmap=main_color_map,\n", + " )\n", + "\n", + " # Print the clustering results based on the node embeddings in the bottom plot\n", + " bottom.set_title(\"HDBSCAN Clustering\")\n", + " bottom.scatter(\n", + " x=node_embeddings_non_noise_cluster.embeddingVisualizationX,\n", + " y=node_embeddings_non_noise_cluster.embeddingVisualizationY,\n", + " s=node_embeddings_non_noise_cluster.centrality * 300,\n", + " c=node_embeddings_non_noise_cluster[get_clustering_property_name('Label', clustering_name)],\n", + " cmap=main_color_map,\n", + " )\n", + " bottom.scatter(\n", + " x=node_embeddings_noise_cluster.embeddingVisualizationX,\n", + " y=node_embeddings_noise_cluster.embeddingVisualizationY,\n", + " s=node_embeddings_noise_cluster.centrality * 300,\n", + " c='lightgrey'\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80968112", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plot\n", + "import seaborn\n", + "import numpy as np\n", + "\n", + "def plot_2d_node_embeddings(\n", + " node_embeddings_for_visualization: pd.DataFrame,\n", + " title: str,\n", + " clustering_name: str = \"TunedHDBSCAN\",\n", + " main_color_map: str = \"tab20\",\n", + " x_position_column = 'embeddingVisualizationX',\n", + " y_position_column = 'embeddingVisualizationY'\n", + ") -> None:\n", + " if node_embeddings_for_visualization.empty:\n", + " print(\"No projected data to plot available\")\n", + " return\n", + "\n", + " # Create figure and subplots\n", + " figure, (leiden_subplot, hdbscan_subplot) = plot.subplots(nrows=2, ncols=1, figsize=(10, 12))\n", + " figure.suptitle(title)\n", + " figure.subplots_adjust(top=0.94, left=0.05, right=0.95, bottom=0.04, hspace=0.25)\n", + "\n", + " # Setup columns\n", + " cluster_label_column_name = get_clustering_property_name('Label', clustering_name)\n", + " node_size_column = 'centrality'\n", + "\n", + " # Separate HDBSCAN non-noise and noise nodes\n", + " node_embeddings_without_noise = node_embeddings_for_visualization[node_embeddings_for_visualization[cluster_label_column_name] != -1]\n", + " node_embeddings_noise_only = node_embeddings_for_visualization[node_embeddings_for_visualization[cluster_label_column_name] == -1]\n", + "\n", + " # ------------------------------------------\n", + " # Top subplot: Leiden Communities with KDE\n", + " # ------------------------------------------\n", + " leiden_subplot.set_title(\"Leiden Community Detection\")\n", + "\n", + " unique_community_ids = node_embeddings_for_visualization[\"communityId\"].unique()\n", + " leiden_color_palette = seaborn.color_palette(main_color_map, len(unique_community_ids))\n", + " leiden_community_to_color = dict(zip(unique_community_ids, leiden_color_palette))\n", + "\n", + " for community_id in unique_community_ids:\n", + " community_nodes = node_embeddings_for_visualization[\n", + " node_embeddings_for_visualization[\"communityId\"] == community_id\n", + " ]\n", + "\n", + " # KDE cloud shape\n", + " seaborn.kdeplot(\n", + " x=community_nodes[x_position_column],\n", + " y=community_nodes[y_position_column],\n", + " fill=True,\n", + " alpha=0.12,\n", + " levels=3,\n", + " color=leiden_community_to_color[community_id],\n", + " ax=leiden_subplot,\n", + " )\n", + "\n", + " # Node scatter points\n", + " leiden_subplot.scatter(\n", + " 
x=community_nodes[x_position_column],\n", + " y=community_nodes[y_position_column],\n", + " s=community_nodes[node_size_column] * 300,\n", + " color=leiden_community_to_color[community_id],\n", + " alpha=0.7,\n", + " label=f\"Community {community_id}\"\n", + " )\n", + "\n", + " leiden_subplot.legend(title=\"Leiden Communities\", loc=\"best\", prop={'size': 6})\n", + "\n", + " # ------------------------------------------\n", + " # Bottom subplot: HDBSCAN Clustering with KDE\n", + " # ------------------------------------------\n", + " hdbscan_subplot.set_title(\"HDBSCAN Clustering\")\n", + "\n", + " unique_cluster_labels = node_embeddings_without_noise[cluster_label_column_name].unique()\n", + " hdbscan_color_palette = seaborn.color_palette(main_color_map, len(unique_cluster_labels))\n", + " hdbscan_cluster_to_color = dict(zip(unique_cluster_labels, hdbscan_color_palette))\n", + "\n", + " for cluster_label in unique_cluster_labels:\n", + " cluster_nodes = node_embeddings_without_noise[\n", + " node_embeddings_without_noise[cluster_label_column_name] == cluster_label\n", + " ]\n", + "\n", + " # KDE cloud shape\n", + " seaborn.kdeplot(\n", + " x=cluster_nodes[x_position_column],\n", + " y=cluster_nodes[y_position_column],\n", + " fill=True,\n", + " alpha=0.05,\n", + " levels=2,\n", + " color=hdbscan_cluster_to_color[cluster_label],\n", + " ax=hdbscan_subplot,\n", + " # linewidths=0\n", + " )\n", + "\n", + " # Node scatter points\n", + " hdbscan_subplot.scatter(\n", + " x=cluster_nodes[x_position_column],\n", + " y=cluster_nodes[y_position_column],\n", + " s=cluster_nodes[node_size_column] * 300,\n", + " color=hdbscan_cluster_to_color[cluster_label],\n", + " alpha=0.9,\n", + " label=f\"Cluster {cluster_label}\"\n", + " )\n", + "\n", + " # Plot noise points in gray\n", + " hdbscan_subplot.scatter(\n", + " x=node_embeddings_noise_only[x_position_column],\n", + " y=node_embeddings_noise_only[y_position_column],\n", + " s=node_embeddings_noise_only[node_size_column] * 300,\n", + " color='lightgrey',\n", + " alpha=0.4,\n", + " label=\"Noise\"\n", + " )\n", + "\n", + " hdbscan_subplot.legend(title=\"HDBSCAN Clusters\", loc=\"best\", prop={'size': 6})\n" + ] + }, + { + "cell_type": "markdown", + "id": "0c68aa20", + "metadata": {}, + "source": [ + "## 1. Java Packages" + ] + }, + { + "cell_type": "markdown", + "id": "20b6cac2", + "metadata": {}, + "source": [ + "### 1.1 Create Graph Projection\n", + "\n", + "To be able to run Graph algorithms efficiently and to focus on specific parts of the Graph, e.g. dependencies between code units, an in-memory \"projection\" is created containing the selected part of the Graph." 
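, + "\n", + "As a minimal sketch (not the notebook's own `create_undirected_projection` helper; the relationship type `DEPENDS_ON`, the connection settings and the exact procedure call are assumptions for illustration), such an undirected projection could look roughly like this using the Neo4j Graph Data Science library from Python:\n", + "\n", + "```python\n", + "# Hypothetical sketch: create an undirected in-memory GDS projection of Java package dependencies.\n", + "from neo4j import GraphDatabase\n", + "\n", + "driver = GraphDatabase.driver('bolt://localhost:7687', auth=('neo4j', 'password'))  # assumed connection settings\n", + "create_projection_query = '''\n", + "    CALL gds.graph.project(\n", + "        $projection_name,\n", + "        'Package',\n", + "        {DEPENDS_ON: {orientation: 'UNDIRECTED', properties: $weight_property}}\n", + "    )\n", + "    YIELD graphName, nodeCount, relationshipCount\n", + "    RETURN graphName, nodeCount, relationshipCount\n", + "'''\n", + "with driver.session() as session:\n", + "    record = session.run(\n", + "        create_projection_query,\n", + "        projection_name='java-package-embeddings-notebook',\n", + "        weight_property='weight25PercentInterfaces',\n", + "    ).single()\n", + "    print(record)\n", + "```"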
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6dad335e", + "metadata": {}, + "outputs": [], + "source": [ + "java_package_projection_parameters={\n", + " \"dependencies_projection\": \"java-package-embeddings-notebook\",\n", + " \"dependencies_projection_node\": \"Package\",\n", + " \"dependencies_projection_weight_property\": \"weight25PercentInterfaces\",\n", + "}\n", + "# Create an undirected graph projection for the Java Package nodes\n", + "java_package_data_available = create_undirected_projection(java_package_projection_parameters)\n", + "if java_package_data_available:\n", + " # Sample the graph (take a smaller subgraph of it) if it exceeds the size limit\n", + " # The updated parameters and node_count contain the original values if no sampling was necessary\n", + " java_package_sampling_result = sample_graph_if_size_exceeds_limit(java_package_projection_parameters)\n", + " java_package_sampled_projection_parameters = java_package_sampling_result.updated_parameters\n", + " java_package_node_count = java_package_sampling_result.node_count\n", + "else:\n", + " print(\"No projected data for Java Package node embeddings calculation available.\")" + ] + }, + { + "cell_type": "markdown", + "id": "f82ea9fe", + "metadata": {}, + "source": [ + "### 1.2 Use Leiden Community Detection Algorithm results as reference\n", + "\n", + "Before we create node embeddings, we will run the Leiden Community Detection algorithm to get modularity-optimized community ids that we will use later as a \"gold standard\" to tune the results of the node embedding clustering.\n", + "\n", + "The intuition behind this is that code units that are coupled together should end up close to each other (e.g. measured by Manhattan distance) in the vector space of the node embeddings. Density-based clustering of course works differently and leads to different insights about the structural features of the code units, so it will (and also should) not match the Leiden communities perfectly.\n",
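+ "\n", + "A minimal sketch (assuming a hypothetical `clusterLabel` column holding the density-based clustering result next to the written `communityId` Leiden column, with `-1` marking noise) of how such a comparison could be scored with scikit-learn's adjusted mutual information, related to the 'adjusted mutual info score with noise penalty' reported by the tuning below:\n", + "\n", + "```python\n", + "# Minimal sketch (not part of this notebook's pipeline): score how well a density-based\n", + "# clustering of the node embeddings agrees with the Leiden communities.\n", + "# The column name 'clusterLabel' is a placeholder for the HDBSCAN label column; -1 marks noise.\n", + "from sklearn.metrics import adjusted_mutual_info_score\n", + "\n", + "def score_clustering_against_leiden_communities(embeddings) -> float:\n", + "    non_noise = embeddings[embeddings['clusterLabel'] != -1]\n", + "    if non_noise.empty:\n", + "        return 0.0\n", + "    # 1.0 means identical grouping, values near 0.0 mean no agreement beyond chance\n", + "    return adjusted_mutual_info_score(non_noise['communityId'], non_noise['clusterLabel'])\n", + "```"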
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1681d911", + "metadata": {}, + "outputs": [], + "source": [ + "def get_tuned_leiden_community_detection_algorithm(projection_parameters: dict) -> TuneableLeidenCommunityDetection:\n", + " import optuna\n", + " from optuna.samplers import TPESampler\n", + "\n", + " def objective(trial):\n", + " # Here we intentionally use the original projection parameters, not the sampled ones,\n", + " # since the sampling is not necessary for the Leiden community detection.\n", + " tuneable_leiden_community_detection = create_tuneable(TuneableLeidenCommunityDetection).with_projection_parameters(projection_parameters)\n", + " # Suggest values for each hyperparameter\n", + " tuneable_leiden_community_detection.set_params(\n", + " gamma=trial.suggest_float(\"gamma\", low=0.7, high=1.3, step=0.01),\n", + " theta = trial.suggest_float(\"theta\", 0.0001, 0.01, log=True),\n", + " # Fixed max_levels = 10 (default) since experiments showed only minor differences in the results\n", + " # max_levels = trial.suggest_int(\"max_levels\", 8, 12)\n", + " )\n", + " tuneable_leiden_community_detection.fit()\n", + " return tuneable_leiden_community_detection.score()\n", + "\n", + " # TODO create study with db?\n", + " study_name = \"LeidenCommunityDetection4Java\" + projection_parameters[\"dependencies_projection_node\"]\n", + " study = optuna.create_study(direction=\"maximize\", sampler=TPESampler(seed=42), study_name=study_name)#, storage=f\"sqlite:///optuna_study_node_embeddings_java.db\", load_if_exists=True)\n", + " \n", + " # Try (enqueue) specific settings first that led to good results in initial experiments\n", + " study.enqueue_trial({'gamma': 1.0, 'theta': 0.001, 'max_levels': 10}) # default values\n", + " study.enqueue_trial({'gamma': 1.14, 'theta': 0.001, 'max_levels': 10})\n", + "\n", + " # Start the hyperparameter tuning\n", + " study.optimize(objective, n_trials=20, timeout=20)\n", + " output_optuna_tuning_results(study, 'Leiden Community Detection')\n", + " \n", + " # Run the Leiden community detection algorithm again with the best parameters\n", + " tuned_leiden_community_detection = create_tuneable(TuneableLeidenCommunityDetection).with_projection_parameters(projection_parameters)\n", + " tuned_leiden_community_detection.set_params(**study.best_params)\n", + " tuned_leiden_community_detection.fit()\n", + "\n", + " print(\"Best Leiden Community Detection Modularity\", tuned_leiden_community_detection.get_modularity())\n", + " print(\"Best Leiden Community Detection Community Count\", tuned_leiden_community_detection.get_community_count())\n", + " \n", + " return tuned_leiden_community_detection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "48962d7b", + "metadata": {}, + "outputs": [], + "source": [ + "if java_package_node_count > 0:\n", + " tuned_leiden_community_detection = get_tuned_leiden_community_detection_algorithm(java_package_projection_parameters)\n", + " tuned_leiden_community_detection.write_communities()" + ] + }, + { + "cell_type": "markdown", + "id": "2d474706", + "metadata": {}, + "source": [ + "### 1.3 Generate Node Embeddings using Fast Random Projection (Fast RP) for Java Packages\n", + "\n", + "[Fast Random Projection](https://neo4j.com/docs/graph-data-science/current/machine-learning/node-embeddings/fastrp) is used to reduce the dimensionality of the node feature space while preserving most of the distance information. 
Nodes with similar neighborhoods result in node embeddings with similar vectors." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18b27778", + "metadata": {}, + "outputs": [], + "source": [ + "# Fast Random Projection tuned with Optuna\n", + "\n", + "def get_tuned_fast_random_projection_node_embeddings(projection_parameters: dict) -> TuneableFastRandomProjectionNodeEmbeddings:\n", + " import optuna\n", + " from optuna.samplers import TPESampler\n", + "\n", + " def objective(trial):\n", + " # Here we intentionally use the original projection parameters, not the sampled ones,\n", + " # since the sampling is not necessary for Fast Random Projection embeddings.\n", + " tuneable_fast_random_projection = create_tuneable(TuneableFastRandomProjectionNodeEmbeddings).with_projection_parameters(projection_parameters)\n", + " # Suggest values for each hyperparameter\n", + " tuneable_fast_random_projection.set_params(\n", + " embedding_dimension=trial.suggest_categorical(\"embedding_dimension\", [64, 128, 256]),\n", + " fast_random_projection_normalization_strength=trial.suggest_float(\"fast_random_projection_normalization_strength\", low=-1.0, high=1.0, step=0.1),\n", + " fast_random_projection_forth_iteration_weight=trial.suggest_float(\"fast_random_projection_forth_iteration_weight\", low=0.0, high=2.0, step=0.1),\n", + " )\n", + " tuneable_fast_random_projection.fit()\n", + " return tuneable_fast_random_projection.score()\n", + "\n", + " # TODO create study with db?\n", + " study_name = \"FastRandomProjection4Java\" + projection_parameters[\"dependencies_projection_node\"]\n", + " study = optuna.create_study(direction=\"maximize\", sampler=TPESampler(seed=42), study_name=study_name)#, storage=f\"sqlite:///optuna_study_node_embeddings_java.db\", load_if_exists=True)\n", + " \n", + " # Try (enqueue) specific settings first that led to good results in initial experiments\n", + " study.enqueue_trial({'embedding_dimension': 128, 'fast_random_projection_forth_iteration_weight': 0.5, 'fast_random_projection_normalization_strength': 0.3})\n", + " study.enqueue_trial({'embedding_dimension': 128, 'fast_random_projection_forth_iteration_weight': 1.0, 'fast_random_projection_normalization_strength': 0.5})\n", + " study.enqueue_trial({'embedding_dimension': 256, 'fast_random_projection_forth_iteration_weight': 0.5, 'fast_random_projection_normalization_strength': 0.3})\n", + " study.enqueue_trial({'embedding_dimension': 256, 'fast_random_projection_forth_iteration_weight': 1.0, 'fast_random_projection_normalization_strength': 0.3})\n", + " \n", + " # Start the hyperparameter tuning\n", + " study.optimize(objective, n_trials=80, timeout=40)\n", + " output_optuna_tuning_results(study, 'Fast Random Projection (FastRP)')\n", + "\n", + " # Run the node embeddings algorithm again with the best parameters\n", + " tuned_fast_random_projection = create_tuneable(TuneableFastRandomProjectionNodeEmbeddings).with_projection_parameters(projection_parameters)\n", + " tuned_fast_random_projection.set_params(**study.best_params)\n", + " return tuned_fast_random_projection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fdc72914", + "metadata": {}, + "outputs": [], + "source": [ + "# TODO Keep solution (either Optuna or classic)\n", + "if java_package_node_count > 0:\n", + " tuned_fast_random_projection = get_tuned_fast_random_projection_node_embeddings(java_package_projection_parameters)\n", + " # TODO Write the results back into the Neo4j database\n", + " 
#tuned_fast_random_projection.best_estimator_.write_embeddings()\n", + " embeddings = tuned_fast_random_projection.fit().get_embeddings()\n", + " embeddings = add_tuned_hierarchical_density_based_spatial_clustering(embeddings).embeddings\n", + " display(embeddings.head())\n", + "# ------\n", + "tuneable_fast_random_projection_parameter_grid = {\n", + " \"embedding_dimension\": [64, 128, 256],\n", + " \"random_seed\": [42], # Fixed random seed since experiments showed only minor differences in the results\n", + " \"fast_random_projection_normalization_strength\": [-0.9, -0.5, -0.4, -0.3, -0.2, 0.0, 0.2, 0.3, 0.4, 0.5],\n", + " \"fast_random_projection_forth_iteration_weight\": [0.5, 1.0],\n", + "}\n", + "\n", + "# Here we intentionally use the original projection parameters, not the sampled ones,\n", + "# since the sampling is not necessary for Fast Random Projection embeddings.\n", + "tuneable_fast_random_projection = create_tuneable(TuneableFastRandomProjectionNodeEmbeddings).with_projection_parameters(java_package_projection_parameters)\n", + "\n", + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "hyperparameter_tuning = GridSearchCV(\n", + " estimator=tuneable_fast_random_projection,\n", + " param_grid=tuneable_fast_random_projection_parameter_grid,\n", + " cv=get_all_data_without_slicing_cross_validator_for_node_count(java_package_node_count),\n", + " verbose=1\n", + ")\n", + "\n", + "if java_package_node_count > 0:\n", + " reset_node_embedding_tuning_scores()\n", + " tuned_fast_random_projection = hyperparameter_tuning.fit(get_initial_dummy_data_for_hyperparameter_tuning(java_package_node_count))\n", + " output_tuning_details(tuned_fast_random_projection, 'Tuned Fast Random Projection for Java Packages')\n", + " output_node_embedding_tuning_scores()\n", + "\n", + " embeddings = tuned_fast_random_projection.best_estimator_.get_embeddings()\n", + " embeddings = add_tuned_hierarchical_density_based_spatial_clustering(embeddings).embeddings\n", + " display(embeddings.head())\n", + "\n", + " # Write the results back into the Neo4j database\n", + " tuned_fast_random_projection.best_estimator_.write_embeddings()" + ] + }, + { + "cell_type": "markdown", + "id": "76d8bca1", + "metadata": {}, + "source": [ + "#### Dimensionality reduction with t-distributed stochastic neighbor embedding (t-SNE)\n", + "\n", + "This step takes the original node embeddings with a higher dimensionality, e.g. 64 floating point numbers, and reduces them into a two dimensional array for visualization. For more details look up the function declaration for \"prepare_node_embeddings_for_2d_visualization\"." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "031abacc", + "metadata": {}, + "outputs": [], + "source": [ + "if java_package_data_available:\n", + " node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)" + ] + }, + { + "cell_type": "markdown", + "id": "f908c47f", + "metadata": {}, + "source": [ + "#### Visualization of the node embeddings reduced to two dimensions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "459a819c", + "metadata": {}, + "outputs": [], + "source": [ + "if java_package_data_available:\n", + " plot_2d_node_embeddings(\n", + " node_embeddings_for_visualization, \n", + " \"Java Package positioned by their dependency relationships (FastRP node embeddings + t-SNE)\"\n", + " )\n", + " plot_2d_node_embeddings(\n", + " node_embeddings_for_visualization, \n", + " \"Java Package positioned by their dependency relationships (FastRP node embeddings + UMAP)\",\n", + " x_position_column='embeddingUMAPVisualizationX',\n", + " y_position_column='embeddingUMAPVisualizationY'\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "326e67fe", + "metadata": {}, + "source": [ + "#### Write the results (clustering, 2d embedding for visualization) back into the Neo4j database" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0908046", + "metadata": {}, + "outputs": [], + "source": [ + "data_to_write = pd.DataFrame(data = {\n", + " 'nodeElementId': embeddings[\"nodeElementId\"],\n", + " 'clusteringHDBSCANLabel': embeddings[get_clustering_property_name('Label')],\n", + " 'clusteringHDBSCANProbability': embeddings[get_clustering_property_name('Probability')],\n", + " 'embeddingFastRandomProjectionVisualizationX': embeddings[\"embeddingVisualizationX\"],\n", + " 'embeddingFastRandomProjectionVisualizationY': embeddings[\"embeddingVisualizationY\"],\n", + " })\n", + "write_batch_data_into_database(data_to_write, 'Package')" + ] + }, + { + "cell_type": "markdown", + "id": "b690b9a7", + "metadata": {}, + "source": [ + "### 1.4 Node Embeddings for Java Packages using HashGNN\n", + "\n", + "[HashGNN](https://neo4j.com/docs/graph-data-science/2.6/machine-learning/node-embeddings/hashgnn) resembles Graph Neural Networks (GNN) but does not include a model or require training. It combines ideas of GNNs and fast randomized algorithms. For more details see [HashGNN](https://neo4j.com/docs/graph-data-science/2.6/machine-learning/node-embeddings/hashgnn). In this section we combine all previously separately explained steps into one." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d9b75b6", + "metadata": {}, + "outputs": [], + "source": [ + "def get_tuned_hashgnn_node_embeddings(projection_parameters: dict) -> TuneableHashGNNNodeEmbeddings:\n", + " import optuna\n", + " from optuna.samplers import TPESampler\n", + " from optuna.importance import get_param_importances\n", + "\n", + " def objective(trial):\n", + " tuneable_hashgnn = create_tuneable(TuneableHashGNNNodeEmbeddings).with_projection_parameters(projection_parameters)\n", + " # Suggest values for each hyperparameter\n", + " tuneable_hashgnn.set_params(\n", + " embedding_dimension=trial.suggest_categorical(\"embedding_dimension\", [64, 128, 256]),\n", + " hashgnn_density_level=trial.suggest_categorical(\"hashgnn_density_level\", [1, 2]),\n", + " hashgnn_dimension_multiplier=trial.suggest_categorical(\"hashgnn_dimension_multiplier\", [1, 2]),\n", + " hashgnn_iterations=trial.suggest_categorical(\"hashgnn_iterations\", [2, 4]),\n", + " hashgnn_neighbor_influence=trial.suggest_categorical(\"hashgnn_neighbor_influence\", [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 1.0, 2.0, 5.0, 10.0]),\n", + " random_seed=42, #trial.suggest_categorical(\"random_seed\", [42, 2025]),\n", + " )\n", + " tuneable_hashgnn.fit()\n", + " return tuneable_hashgnn.score()\n", + "\n", + " # TODO create study with db?\n", + " study_name = \"HashGNN4Java\" + projection_parameters[\"dependencies_projection_node\"]\n", + " study = optuna.create_study(direction=\"maximize\", sampler=TPESampler(seed=42), study_name=study_name)#, storage=f\"sqlite:///optuna_study_node_embeddings_java.db\", load_if_exists=True)\n", + " # Try (enqueue) specific settings first that led to good results in initial experiments\n", + " study.enqueue_trial({'embedding_dimension': 128, 'hashgnn_density_level': 2, 'hashgnn_dimension_multiplier': 1, 'hashgnn_iterations': 2, 'hashgnn_neighbor_influence': 1.0})\n", + " study.enqueue_trial({'embedding_dimension': 256, 'hashgnn_density_level': 2, 'hashgnn_dimension_multiplier': 1, 'hashgnn_iterations': 2, 'hashgnn_neighbor_influence': 0.7})\n", + " study.enqueue_trial({'embedding_dimension': 256, 'hashgnn_density_level': 2, 'hashgnn_dimension_multiplier': 1, 'hashgnn_iterations': 4, 'hashgnn_neighbor_influence': 1.0})\n", + " # Start the hyperparameter tuning\n", + " study.optimize(objective, n_trials=80, timeout=40)\n", + " output_optuna_tuning_results(study, 'HashGNN')\n", + "\n", + " print(\"Best HashGNN parameters (Optuna):\", study.best_params)\n", + " print(\"Best HashGNN adjusted mutual info score with noise penalty:\", study.best_value)\n", + " print(\"Best HashGNN parameter influence:\", get_param_importances(study))\n", + "\n", + " # Run the node embeddings algorithm again with the best parameters\n", + " tuned_hashgnn = create_tuneable(TuneableHashGNNNodeEmbeddings).with_projection_parameters(projection_parameters)\n", + " tuned_hashgnn.set_params(**study.best_params)\n", + " return tuned_hashgnn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19426811", + "metadata": {}, + "outputs": [], + "source": [ + "# TODO Keep one solution (Optuna vs. 
GridSearch) \n", + "if java_package_node_count > 0:\n", + " tuned_hashgnn = get_tuned_hashgnn_node_embeddings(java_package_sampled_projection_parameters)\n", + "\n", + " if java_package_sampling_result.is_sampled:\n", + " tuned_hashgnn.refit_with_projection(java_package_projection_parameters[\"dependencies_projection\"])\n", + " else:\n", + " tuned_hashgnn.fit()\n", + " \n", + " embeddings = tuned_hashgnn.get_embeddings()\n", + " embeddings = add_tuned_hierarchical_density_based_spatial_clustering(embeddings).embeddings\n", + " display(embeddings.head())\n", + "\n", + " node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n", + " plot_2d_node_embeddings(\n", + " node_embeddings_for_visualization,\n", + " \"Java Packages positioned by their dependency relationships (HashGNN + t-SNE)\"\n", + " )\n", + " plot_2d_node_embeddings(\n", + " node_embeddings_for_visualization, \n", + " \"Java Packages positioned by their dependency relationships (HashGNN + UMAP)\",\n", + " x_position_column='embeddingUMAPVisualizationX',\n", + " y_position_column='embeddingUMAPVisualizationY'\n", + " )\n", + "# -------\n", + "tuneable_hashgnn_parameter_grid = {\n", + " \"embedding_dimension\": [64, 128, 256],\n", + " # \"random_seed\": [42, 2023], # Fixed random seed since experiments showed only minor differences in the results\n", + " \"hashgnn_iterations\": [2, 4],\n", + " \"hashgnn_density_level\": [1, 2],\n", + " \"hashgnn_neighbor_influence\": [0.7, 1.0, 5.0, 10.0], # [0.1, 0.7, 1.0, 5.0, 10.0],\n", + " \"hashgnn_dimension_multiplier\": [1, 2],\n", + "}\n", + "\n", + "tuneable_hashgnn = create_tuneable(TuneableHashGNNNodeEmbeddings).with_projection_parameters(java_package_sampled_projection_parameters)\n", + "\n", + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "hyperparameter_tuning = GridSearchCV(\n", + " estimator=tuneable_hashgnn,\n", + " param_grid=tuneable_hashgnn_parameter_grid,\n", + " cv=get_all_data_without_slicing_cross_validator_for_node_count(java_package_node_count),\n", + " verbose=1\n", + ")\n", + "\n", + "if java_package_node_count > 0:\n", + " reset_node_embedding_tuning_scores()\n", + " tuned_hashgnn = hyperparameter_tuning.fit(get_initial_dummy_data_for_hyperparameter_tuning(java_package_node_count))\n", + " output_tuning_details(tuned_hashgnn, 'Tuned HashGNN for Java Packages')\n", + " output_node_embedding_tuning_scores()\n", + "\n", + " if java_package_sampling_result.is_sampled:\n", + " tuned_hashgnn.best_estimator_.refit_with_projection(java_package_projection_parameters[\"dependencies_projection\"])\n", + "\n", + " embeddings = tuned_hashgnn.best_estimator_.get_embeddings()\n", + " embeddings = add_tuned_hierarchical_density_based_spatial_clustering(embeddings).embeddings\n", + " display(embeddings.head())\n", + "\n", + " plot_2d_node_embeddings(\n", + " prepare_node_embeddings_for_2d_visualization(embeddings),\n", + " \"Java Packages positioned by their dependency relationships (HashGNN + t-SNE)\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "248d88b4", + "metadata": {}, + "source": [ + "### 1.5 Node Embeddings for Java Packages using node2vec" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1248226", + "metadata": {}, + "outputs": [], + "source": [ + "def get_tuned_node2vec_node_embeddings(projection_parameters: dict) -> TuneableNode2VecNodeEmbeddings:\n", + " from typing import cast\n", + " import optuna\n", + " from optuna.samplers import TPESampler\n", + " from optuna.importance 
import MeanDecreaseImpurityImportanceEvaluator\n", + " from optuna.importance import get_param_importances\n", + "\n", + " def objective(trial):\n", + " tuneable_nod2vec = create_tuneable(TuneableNode2VecNodeEmbeddings).with_projection_parameters(projection_parameters)\n", + " # Suggest values for each hyperparameter\n", + " tuneable_nod2vec.set_params(\n", + " embedding_dimension=trial.suggest_categorical(\"embedding_dimension\", [32, 64, 128, 256]),\n", + " node2vec_in_out_factor=trial.suggest_float(\"node2vec_in_out_factor\", low=0.25, high=2.0, step=0.25),\n", + " node2vec_return_factor=trial.suggest_float(\"node2vec_return_factor\", low=0.25, high=2.5, step=0.25),\n", + " node2vec_window_size=trial.suggest_categorical(\"node2vec_window_size\", [5, 10]),\n", + " )\n", + " tuneable_nod2vec.fit()\n", + " return tuneable_nod2vec.score()\n", + "\n", + " # TODO create study with db?\n", + " study_name = \"Node2Vec4Java\" + projection_parameters[\"dependencies_projection_node\"]\n", + " study = optuna.create_study(direction=\"maximize\", sampler=TPESampler(seed=42), study_name=study_name)#, storage=f\"sqlite:///optuna_study_node_embeddings_java.db\", load_if_exists=True)\n", + " # Try (enqueue) specific settings first which led to good results in local experiments\n", + " study.enqueue_trial({'embedding_dimension': 32, 'node2vec_in_out_factor': 1.25, 'node2vec_return_factor': 1.5, 'node2vec_window_size': 10})\n", + " study.enqueue_trial({'embedding_dimension': 32, 'node2vec_in_out_factor': 1.25, 'node2vec_return_factor': 1.75, 'node2vec_window_size': 10})\n", + " study.enqueue_trial({'embedding_dimension': 32, 'node2vec_in_out_factor': 1.75, 'node2vec_return_factor': 1.5, 'node2vec_window_size': 10})\n", + " study.enqueue_trial({'embedding_dimension': 64, 'node2vec_in_out_factor': 0.5, 'node2vec_return_factor': 2.0, 'node2vec_window_size': 5})\n", + " study.enqueue_trial({'embedding_dimension': 64, 'node2vec_in_out_factor': 0.75, 'node2vec_return_factor': 0.75, 'node2vec_window_size': 10})\n", + " study.enqueue_trial({'embedding_dimension': 64, 'node2vec_in_out_factor': 0.75, 'node2vec_return_factor': 2.5, 'node2vec_window_size': 5})\n", + " study.enqueue_trial({'embedding_dimension': 64, 'node2vec_in_out_factor': 1.0, 'node2vec_return_factor': 1.0, 'node2vec_window_size': 5})\n", + " study.enqueue_trial({'embedding_dimension': 64, 'node2vec_in_out_factor': 1.25, 'node2vec_return_factor': 1.5, 'node2vec_window_size': 10})\n", + " study.enqueue_trial({'embedding_dimension': 128, 'node2vec_in_out_factor': 0.5, 'node2vec_return_factor': 2.0, 'node2vec_window_size': 10})\n", + " study.enqueue_trial({'embedding_dimension': 128, 'node2vec_in_out_factor': 0.5, 'node2vec_return_factor': 2.25, 'node2vec_window_size': 5})\n", + " study.enqueue_trial({'embedding_dimension': 128, 'node2vec_in_out_factor': 1.25, 'node2vec_return_factor': 1.75, 'node2vec_window_size': 10})\n", + " study.enqueue_trial({'embedding_dimension': 256, 'node2vec_in_out_factor': 0.5, 'node2vec_return_factor': 1.75, 'node2vec_window_size': 5})\n", + " study.enqueue_trial({'embedding_dimension': 256, 'node2vec_in_out_factor': 0.5, 'node2vec_return_factor': 2.0, 'node2vec_window_size': 5})\n", + " study.enqueue_trial({'embedding_dimension': 256, 'node2vec_in_out_factor': 1.25, 'node2vec_return_factor': 1.5, 'node2vec_window_size': 10})\n", + " study.enqueue_trial({'embedding_dimension': 256, 'node2vec_in_out_factor': 1.25, 'node2vec_return_factor': 1.75, 'node2vec_window_size': 10})\n", + " # Start the hyperparameter tuning\n", + " 
study.optimize(objective, n_trials=80, timeout=40)\n", + " output_optuna_tuning_results(study, 'node2vec')\n", + "\n", + " # Run the node embeddings algorithm again with the best parameters\n", + " tuned_node2vec = create_tuneable(TuneableNode2VecNodeEmbeddings).with_projection_parameters(projection_parameters)\n", + " tuned_node2vec.set_params(**study.best_params)\n", + " return tuned_node2vec" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58c23828", + "metadata": {}, + "outputs": [], + "source": [ + "# TODO Keep one solution (Optuna vs. GridSearch) \n", + "if java_package_node_count > 0:\n", + " tuned_node2vec = get_tuned_node2vec_node_embeddings(java_package_sampled_projection_parameters)\n", + " \n", + " if java_package_sampling_result.is_sampled:\n", + " tuned_node2vec.refit_with_projection(java_package_projection_parameters[\"dependencies_projection\"])\n", + " else:\n", + " tuned_node2vec.fit()\n", + " \n", + " embeddings = add_tuned_hierarchical_density_based_spatial_clustering(tuned_node2vec.get_embeddings()).embeddings\n", + " display(embeddings.head())\n", + " \n", + " node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n", + " plot_2d_node_embeddings(\n", + " node_embeddings_for_visualization,\n", + " \"Java Packages positioned by their dependency relationships (node2vec + t-SNE)\"\n", + " )\n", + " plot_2d_node_embeddings(\n", + " node_embeddings_for_visualization, \n", + " \"Java Packages positioned by their dependency relationships (node2vec + UMAP)\",\n", + " x_position_column='embeddingUMAPVisualizationX',\n", + " y_position_column='embeddingUMAPVisualizationY'\n", + " )\n", + "# -------\n", + " \n", + "tuneable_node2vec_parameter_grid = {\n", + " \"embedding_dimension\": [32, 64, 128], # 256 rarely improves the results, but increases the computation time\n", + " \"node2vec_in_out_factor\": [0.25, 0.5, 1.0, 2.0], # [0.25, 0.5, 1.0, 2.0, 4.0]\n", + " \"node2vec_return_factor\": [0.25, 0.5, 1.0, 2.0, 4.0], # [0.25, 0.5, 1.0, 2.0, 4.0]\n", + " \"node2vec_negative_sampling_rate\": [5, 10],\n", + " # \"node2vec_window_size\": [5, 10],\n", + " # \"random_seed\": [42], # Fixed random seed since experiments showed only minor differences in the results\n", + " # \"node2vec_walk_length\": [80], # [40, 80, 160],\n", + " # \"node2vec_walks_per_node\": [10], # [5, 10],\n", + " # \"node2vec_iterations\": [1],\n", + " # \"node2vec_positive_sampling_factor\": [0.001],\n", + "}\n", + "\n", + "tuneable_node2vec = create_tuneable(TuneableNode2VecNodeEmbeddings).with_projection_parameters(java_package_sampled_projection_parameters)\n", + "\n", + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "hyperparameter_tuning = GridSearchCV(\n", + " estimator=tuneable_node2vec,\n", + " param_grid=tuneable_node2vec_parameter_grid,\n", + " cv=get_all_data_without_slicing_cross_validator_for_node_count(java_package_node_count),\n", + " verbose=1\n", + ")\n", + "\n", + "if java_package_node_count > 0:\n", + " reset_node_embedding_tuning_scores()\n", + " tuned_node2vec = hyperparameter_tuning.fit(get_initial_dummy_data_for_hyperparameter_tuning(java_package_node_count))\n", + " output_tuning_details(tuned_node2vec, 'Tuned node2vec for Java Packages')\n", + " output_node_embedding_tuning_scores()\n", + "\n", + " if java_package_sampling_result.is_sampled:\n", + " tuned_node2vec.best_estimator_.refit_with_projection(java_package_projection_parameters[\"dependencies_projection\"])\n", + "\n", + " embeddings = 
tuned_node2vec.best_estimator_.get_embeddings()\n", + " embeddings = add_tuned_hierarchical_density_based_spatial_clustering(embeddings).embeddings\n", + " display(embeddings.head())\n", + "\n", + " plot_2d_node_embeddings(\n", + " prepare_node_embeddings_for_2d_visualization(embeddings),\n", + " \"Java Packages positioned by their dependency relationships (node2vec + t-SNE)\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "5682bb64", + "metadata": {}, + "source": [ + "## 2. Java Types" + ] + }, + { + "cell_type": "markdown", + "id": "25370d7f", + "metadata": {}, + "source": [ + "### 2.1 Create Graph Projection\n", + "\n", + "To be able to run Graph algorithms efficiently and to focus on specific parts of the Graph, e.g. dependencies between code units, an in-memory \"projection\" is created containing the selected part of the Graph." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2c0af00", + "metadata": {}, + "outputs": [], + "source": [ + "java_type_projection_parameters={\n", + " \"dependencies_projection\": \"java-type-embeddings-notebook\",\n", + " \"dependencies_projection_node\": \"Type\",\n", + " \"dependencies_projection_weight_property\": \"weight\",\n", + "}\n", + "# Create an undirected graph projection for the Java Type nodes\n", + "java_type_data_available = create_undirected_projection(java_type_projection_parameters)\n", + "if java_type_data_available:\n", + " # Sample the graph (take a smaller subgraph of it) if it exceeds the size limit\n", + " # The updated parameters and node_count contain the original values if no sampling was necessary\n", + " java_type_sampling_result = sample_graph_if_size_exceeds_limit(java_type_projection_parameters)\n", + " java_type_sampled_projection_parameters = java_type_sampling_result.updated_parameters\n", + " java_type_node_count = java_type_sampling_result.node_count\n", + "else: \n", + " print(\"No projected data for Java Type node embeddings calculation available.\")" + ] + }, + { + "cell_type": "markdown", + "id": "806e55ae", + "metadata": {}, + "source": [ + "### 2.2 Use Leiden Community Detection Algorithm results as reference\n", + "\n", + "Before we create node embeddings, we will run the Leiden Community Detection algorithm to get modularity-optimized community ids that we will use later as a \"gold standard\" to tune the results of the node embedding clustering.\n", + "\n", + "The intuition behind this is that code units that are coupled together should end up close to each other (e.g. measured by Manhattan distance) in the vector space of the node embeddings. Density-based clustering of course works differently and leads to different insights about the structural features of the code units, so it will (and also should) not match the Leiden communities perfectly."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "516052c9", + "metadata": {}, + "outputs": [], + "source": [ + "if java_type_node_count > 0:\n", + " tuned_leiden_community_detection = get_tuned_leiden_community_detection_algorithm(java_type_projection_parameters)\n", + " tuned_leiden_community_detection.write_communities()" + ] + }, + { + "cell_type": "markdown", + "id": "bf8acd30", + "metadata": {}, + "source": [ + "### 2.3 Node Embeddings for Java Types using Fast Random Projection (Fast RP)\n", + "\n", + "[Fast Random Projection](https://neo4j.com/docs/graph-data-science/current/machine-learning/node-embeddings/fastrp) is used to reduce the dimensionality of the node feature space while preserving most of the distance information. Nodes with similar neighborhood result in node embedding with similar vectors." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db22fb8f", + "metadata": {}, + "outputs": [], + "source": [ + "# TODO Keep solution (either Optuna or classic)\n", + "if java_type_node_count > 0:\n", + " tuned_fast_random_projection = get_tuned_fast_random_projection_node_embeddings(java_type_projection_parameters)\n", + " # TODO Write the results back into the Neo4j database\n", + " #tuned_fast_random_projection.best_estimator_.write_embeddings()\n", + " embeddings = tuned_fast_random_projection.fit().get_embeddings()\n", + " embeddings = add_tuned_hierarchical_density_based_spatial_clustering(embeddings).embeddings\n", + " display(embeddings.head())\n", + "\n", + "tuneable_fast_random_projection_parameter_grid = {\n", + " \"embedding_dimension\": [64, 128, 256],\n", + " \"random_seed\": [42], # Fixed random seed since experiments showed only minor differences in the results\n", + " \"fast_random_projection_normalization_strength\": [-0.9, -0.5, -0.4, -0.3, -0.2, 0.0, 0.2, 0.3, 0.4, 0.5],\n", + " \"fast_random_projection_forth_iteration_weight\": [0.5, 1.0],\n", + "}\n", + "\n", + "# Here we intentionally use the original projection parameters, not the sampled ones,\n", + "# since the sampling is not necessary for Fast Random Projection embeddings.\n", + "tuneable_fast_random_projection = create_tuneable(TuneableFastRandomProjectionNodeEmbeddings).with_projection_parameters(java_type_projection_parameters)\n", + "\n", + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "hyperparameter_tuning = GridSearchCV(\n", + " estimator=tuneable_fast_random_projection,\n", + " param_grid=tuneable_fast_random_projection_parameter_grid,\n", + " cv=get_all_data_without_slicing_cross_validator_for_node_count(java_type_node_count),\n", + " verbose=1\n", + ")\n", + "\n", + "if java_type_node_count > 0:\n", + " reset_node_embedding_tuning_scores() # Reset the DataFrame to store the results\n", + " tuned_fast_random_projection = hyperparameter_tuning.fit(get_initial_dummy_data_for_hyperparameter_tuning(java_type_node_count))\n", + " output_tuning_details(tuned_fast_random_projection, 'Tuned Fast Random Projection for Java Types')\n", + " output_node_embedding_tuning_scores()\n", + "\n", + " embeddings = tuned_fast_random_projection.best_estimator_.get_embeddings()\n", + " embeddings = add_tuned_hierarchical_density_based_spatial_clustering(embeddings).embeddings\n", + " display(embeddings.head())\n", + "\n", + " # Write the results back into the Neo4j database\n", + " tuned_fast_random_projection.best_estimator_.write_embeddings()\n", + "\n", + " node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n", 
+ " plot_2d_node_embeddings(\n", + " node_embeddings_for_visualization, \n", + " \"Java Types positioned by their dependency relationships (Fast Random Projection + t-SNE)\"\n", + " )\n", + " plot_2d_node_embeddings(\n", + " node_embeddings_for_visualization, \n", + " \"Java Types positioned by their dependency relationships (Fast Random Projection + UMAP)\",\n", + " x_position_column='embeddingUMAPVisualizationX',\n", + " y_position_column='embeddingUMAPVisualizationY'\n", + " )\n", + "\n", + " data_to_write = pd.DataFrame(data = {\n", + " 'nodeElementId': embeddings[\"nodeElementId\"],\n", + " 'clusteringHDBSCANLabel': embeddings[get_clustering_property_name('Label')],\n", + " 'clusteringHDBSCANProbability': embeddings[get_clustering_property_name('Probability')],\n", + " 'embeddingFastRandomProjectionVisualizationX': embeddings[\"embeddingVisualizationX\"],\n", + " 'embeddingFastRandomProjectionVisualizationY': embeddings[\"embeddingVisualizationY\"],\n", + " })\n", + " write_batch_data_into_database(data_to_write, 'Type')" + ] + }, + { + "cell_type": "markdown", + "id": "bdb5c1dd", + "metadata": {}, + "source": [ + "### 2.4 Node Embeddings for Java Types using HashGNN\n", + "\n", + "[HashGNN](https://neo4j.com/docs/graph-data-science/2.6/machine-learning/node-embeddings/hashgnn) resembles Graph Neural Networks (GNN) but does not include a model or require training. It combines ideas of GNNs and fast randomized algorithms. For more details see [HashGNN](https://neo4j.com/docs/graph-data-science/2.6/machine-learning/node-embeddings/hashgnn)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7b3c3a1c", + "metadata": {}, + "outputs": [], + "source": [ + "# TODO Keep one solution (Optuna vs. GridSearch) \n", + "if java_type_node_count > 0:\n", + " tuned_hashgnn = get_tuned_hashgnn_node_embeddings(java_type_sampled_projection_parameters)\n", + "\n", + " if java_type_sampling_result.is_sampled:\n", + " tuned_hashgnn.refit_with_projection(java_type_projection_parameters[\"dependencies_projection\"])\n", + " else:\n", + " tuned_hashgnn.fit()\n", + " \n", + " embeddings = tuned_hashgnn.get_embeddings()\n", + " embeddings = add_tuned_hierarchical_density_based_spatial_clustering(embeddings).embeddings\n", + " display(embeddings.head())\n", + " \n", + " node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n", + " plot_2d_node_embeddings(\n", + " node_embeddings_for_visualization,\n", + " \"Java Types positioned by their dependency relationships (HashGNN + t-SNE)\"\n", + " )\n", + " plot_2d_node_embeddings(\n", + " node_embeddings_for_visualization, \n", + " \"Java Types positioned by their dependency relationships (HashGNN + UMAP)\",\n", + " x_position_column='embeddingUMAPVisualizationX',\n", + " y_position_column='embeddingUMAPVisualizationY'\n", + " )\n", + "\n", + "# -------\n", + "\n", + "tuneable_hashgnn_parameter_grid = {\n", + " \"embedding_dimension\": [64, 128, 256],\n", + " # \"random_seed\": [42, 2023], # Fixed random seed since experiments showed only minor differences in the results\n", + " \"hashgnn_iterations\": [2, 4],\n", + " \"hashgnn_density_level\": [1, 2],\n", + " \"hashgnn_neighbor_influence\": [0.7, 1.0, 5.0, 10.0], # [0.1, 0.7, 1.0, 5.0, 10.0],\n", + " \"hashgnn_dimension_multiplier\": [1, 2],\n", + "}\n", + "\n", + "tuneable_hashgnn = create_tuneable(TuneableHashGNNNodeEmbeddings).with_projection_parameters(java_type_sampled_projection_parameters)\n", + "\n", + "from sklearn.model_selection import 
GridSearchCV\n", + "\n", + "hyperparameter_tuning = GridSearchCV(\n", + " estimator=tuneable_hashgnn,\n", + " param_grid=tuneable_hashgnn_parameter_grid,\n", + " cv=get_all_data_without_slicing_cross_validator_for_node_count(java_type_node_count),\n", + " verbose=1\n", + ")\n", + "\n", + "if java_type_node_count > 0:\n", + " reset_node_embedding_tuning_scores() # Reset the DataFrame to store the results\n", + " tuned_hashgnn = hyperparameter_tuning.fit(get_initial_dummy_data_for_hyperparameter_tuning(java_type_node_count))\n", + " output_tuning_details(tuned_hashgnn, 'Tuned HashGNN for Java Types')\n", + " output_node_embedding_tuning_scores()\n", + "\n", + " if java_type_sampling_result.is_sampled:\n", + " tuned_hashgnn.best_estimator_.refit_with_projection(java_type_projection_parameters[\"dependencies_projection\"])\n", + "\n", + " embeddings = tuned_hashgnn.best_estimator_.get_embeddings()\n", + " embeddings = add_tuned_hierarchical_density_based_spatial_clustering(embeddings).embeddings\n", + " display(embeddings.head())\n", + "\n", + " plot_2d_node_embeddings(\n", + " prepare_node_embeddings_for_2d_visualization(embeddings),\n", + " \"Java Types positioned by their dependency relationships (HashGNN + t-SNE)\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "73874003", + "metadata": {}, + "source": [ + "### 2.5 Node Embeddings for Java Types using node2vec" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad7d60bf", + "metadata": {}, + "outputs": [], + "source": [ + "# TODO Keep one solution (Optuna vs. GridSearch) \n", + "if java_type_node_count > 0:\n", + " tuned_node2vec = get_tuned_node2vec_node_embeddings(java_type_sampled_projection_parameters)\n", + " \n", + " if java_package_sampling_result.is_sampled:\n", + " tuned_node2vec.refit_with_projection(java_type_projection_parameters[\"dependencies_projection\"])\n", + " else:\n", + " tuned_node2vec.fit()\n", + " \n", + " embeddings = add_tuned_hierarchical_density_based_spatial_clustering(tuned_node2vec.get_embeddings()).embeddings\n", + " display(embeddings.head())\n", + " \n", + " node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n", + " plot_2d_node_embeddings(\n", + " node_embeddings_for_visualization,\n", + " \"Java Types positioned by their dependency relationships (node2vec + t-SNE)\"\n", + " )\n", + " plot_2d_node_embeddings(\n", + " node_embeddings_for_visualization, \n", + " \"Java Types positioned by their dependency relationships (node2vec + UMAP)\",\n", + " x_position_column='embeddingUMAPVisualizationX',\n", + " y_position_column='embeddingUMAPVisualizationY'\n", + " )\n", + "# -------\n", + "tuneable_node2vec_parameter_grid = {\n", + " \"embedding_dimension\": [32, 64, 128], # 256 rarely improves the results, but increases the computation time\n", + " \"node2vec_in_out_factor\": [0.25, 0.5, 1.0, 2.0], # [0.25, 0.5, 1.0, 2.0, 4.0]\n", + " \"node2vec_return_factor\": [0.25, 0.5, 1.0, 2.0, 4.0], # [0.25, 0.5, 1.0, 2.0, 4.0]\n", + " # \"node2vec_negative_sampling_rate\": [5, 10],\n", + " # \"node2vec_window_size\": [5, 10],\n", + " # \"random_seed\": [42], # Fixed random seed since experiments showed only minor differences in the results\n", + " # \"node2vec_walk_length\": [80], # [40, 80, 160],\n", + " # \"node2vec_walks_per_node\": [10], # [5, 10],\n", + " # \"node2vec_iterations\": [1],\n", + " # \"node2vec_positive_sampling_factor\": [0.001],\n", + "}\n", + "\n", + "tuneable_node2vec = 
create_tuneable(TuneableNode2VecNodeEmbeddings).with_projection_parameters(java_type_sampled_projection_parameters)\n", + "\n", + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "hyperparameter_tuning = GridSearchCV(\n", + " estimator=tuneable_node2vec,\n", + " param_grid=tuneable_node2vec_parameter_grid,\n", + " cv=get_all_data_without_slicing_cross_validator_for_node_count(java_type_node_count),\n", + " verbose=1\n", + ")\n", + "\n", + "if java_type_node_count > 0:\n", + " reset_node_embedding_tuning_scores()\n", + " tuned_node2vec = hyperparameter_tuning.fit(get_initial_dummy_data_for_hyperparameter_tuning(java_type_node_count))\n", + " output_tuning_details(tuned_node2vec, 'Tuned node2vec for Java Types')\n", + " output_node_embedding_tuning_scores()\n", + "\n", + " if java_type_sampling_result.is_sampled:\n", + " tuned_node2vec.best_estimator_.refit_with_projection(java_type_projection_parameters[\"dependencies_projection\"])\n", + "\n", + " embeddings = tuned_node2vec.best_estimator_.get_embeddings()\n", + " embeddings = add_tuned_hierarchical_density_based_spatial_clustering(embeddings).embeddings\n", + " display(embeddings.head())\n", + "\n", + " plot_2d_node_embeddings(\n", + " prepare_node_embeddings_for_2d_visualization(embeddings),\n", + " \"Java Types positioned by their dependency relationships (node2vec + t-SNE)\"\n", + " )" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "JohT" + } + ], + "code_graph_analysis_pipeline_data_validation": "ValidateAlwaysFalse", + "kernelspec": { + "display_name": "codegraph", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + }, + "title": "Hyperparameter tuning of Java Node Embeddings" + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/jupyter/environment.yml b/jupyter/environment.yml index e0ce499d9..0d3ce2c82 100644 --- a/jupyter/environment.yml +++ b/jupyter/environment.yml @@ -20,5 +20,9 @@ dependencies: - monotonic=1.* - plotly=6.0.* - python-kaleido=0.2.* # To render plotly plots. Static image export for web-based visualization libraries. + - scikit-learn=1.6.* # To try out this HDBSCAN implementation + - seaborn=0.13 # To visualize clustering results + - optuna=4.3.* + - umap-learn=0.5.* # to visualize node embeddings in 2D (UMAP dimensionality reduction) - pip: - neo4j==5.23.* \ No newline at end of file diff --git a/scripts/analysis/analyze.sh b/scripts/analysis/analyze.sh index 07238bca2..31dff6765 100755 --- a/scripts/analysis/analyze.sh +++ b/scripts/analysis/analyze.sh @@ -44,7 +44,7 @@ LOG_GROUP_END=${LOG_GROUP_END:-"::endgroup::"} # Prefix to end a log group. Defa # Function to display script usage usage() { - echo "Usage: $0 [--report ] [--profile ] [--explore]" + echo "Usage: $0 [--report ] [--profile ] [--explore]" exit 1 } diff --git a/scripts/reports/compilations/AllReports.sh b/scripts/reports/compilations/AllReports.sh index 508e182fb..d195c9de7 100755 --- a/scripts/reports/compilations/AllReports.sh +++ b/scripts/reports/compilations/AllReports.sh @@ -25,4 +25,5 @@ echo "AllReports: REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DI # then hard to parallelize them. So if coupling can be prevented, it still should. 
source "${REPORT_COMPILATIONS_SCRIPT_DIR}/CsvReports.sh" source "${REPORT_COMPILATIONS_SCRIPT_DIR}/JupyterReports.sh" +source "${REPORT_COMPILATIONS_SCRIPT_DIR}/PythonReports.sh" source "${REPORT_COMPILATIONS_SCRIPT_DIR}/VisualizationReports.sh" \ No newline at end of file diff --git a/scripts/reports/compilations/CsvReports.sh b/scripts/reports/compilations/CsvReports.sh index cd4112fb4..66324bd81 100755 --- a/scripts/reports/compilations/CsvReports.sh +++ b/scripts/reports/compilations/CsvReports.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash # Runs all CSV report scripts (no Python and Chromium required). -# It only consideres scripts in the "reports" directory (overridable with REPORTS_SCRIPT_DIR) one directory above this one. +# It only considers scripts in the "reports" and "domains" directories and their sub directories (overridable with REPORTS_SCRIPT_DIR and DOMAINS_DIRECTORY). # Requires reports/*.sh @@ -20,16 +20,28 @@ REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR:-$( CDPATH=. cd REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR:-$(dirname -- "${REPORT_COMPILATIONS_SCRIPT_DIR}")} echo "CsvReports: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}" -# Run all report scripts -for report_script_file in "${REPORTS_SCRIPT_DIR}"/*Csv.sh; do - report_script_filename=$(basename -- "${report_script_file}"); - report_script_filename="${report_script_filename%.*}" # Remove file extension +# Get the "domains" directory that contains analysis and report scripts by functionality. +DOMAINS_DIRECTORY=${DOMAINS_DIRECTORY:-"${REPORTS_SCRIPT_DIR}/../../domains"} +echo "CsvReports: DOMAINS_DIRECTORY=${DOMAINS_DIRECTORY}" - echo "${LOG_GROUP_START}Create CSV Report ${report_script_filename}"; - echo "CsvReports: $(date +'%Y-%m-%dT%H:%M:%S%z') Starting ${report_script_filename}..."; +# Run all CSV report scripts (filename ending with Csv.sh) in the REPORTS_SCRIPT_DIR and DOMAINS_DIRECTORY directories. +for directory in "${REPORTS_SCRIPT_DIR}" "${DOMAINS_DIRECTORY}"; do + if [ ! -d "${directory}" ]; then + echo "CsvReports: Error: Directory ${directory} does not exist. Please check your REPORTS_SCRIPT_DIR and DOMAIN_DIRECTORY settings." + exit 1 + fi - source "${report_script_file}" + # Run all CSV report scripts for the selected directory. + find "${directory}" -type f -name "*Csv.sh" | sort | while read -r report_script_file; do + report_script_filename=$(basename -- "${report_script_file}"); + report_script_filename="${report_script_filename%.*}" # Remove file extension - echo "CsvReports: $(date +'%Y-%m-%dT%H:%M:%S%z') Finished ${report_script_filename}"; - echo "${LOG_GROUP_END}"; -done \ No newline at end of file + echo "${LOG_GROUP_START}Create CSV Report ${report_script_filename}"; + echo "CsvReports: $(date +'%Y-%m-%dT%H:%M:%S%z') Starting ${report_script_filename}..."; + + source "${report_script_file}" + + echo "CsvReports: $(date +'%Y-%m-%dT%H:%M:%S%z') Finished ${report_script_filename}"; + echo "${LOG_GROUP_END}"; + done +done diff --git a/scripts/reports/compilations/PythonReports.sh b/scripts/reports/compilations/PythonReports.sh new file mode 100755 index 000000000..b46e89bf5 --- /dev/null +++ b/scripts/reports/compilations/PythonReports.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash + +# Runs all Python report scripts (no Chromium required). +# It only considers scripts in the "reports" and "domains" directories and their sub directories (overridable with REPORTS_SCRIPT_DIR and DOMAINS_DIRECTORY). 
+ +# Requires reports/*.sh + +# Fail on any error ("-e" = exit on first error, "-o pipefail" = exit on errors within piped commands) +set -o errexit -o pipefail + +# Overrideable Constants (defaults also defined in sub scripts) +LOG_GROUP_START=${LOG_GROUP_START:-"::group::"} # Prefix to start a log group. Defaults to GitHub Actions log group start command. +LOG_GROUP_END=${LOG_GROUP_END:-"::endgroup::"} # Prefix to end a log group. Defaults to GitHub Actions log group end command. + +## Get this "scripts/reports/compilations" directory if not already set. +# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution. +# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes. +# This way non-standard tools like readlink aren't needed. +REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} +echo "PythonReports: REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR}" + +REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR:-$(dirname -- "${REPORT_COMPILATIONS_SCRIPT_DIR}")} +echo "PythonReports: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}" + +# Get the "domains" directory that contains analysis and report scripts by functionality. +DOMAINS_DIRECTORY=${DOMAINS_DIRECTORY:-"${REPORTS_SCRIPT_DIR}/../../domains"} +echo "PythonReports: DOMAINS_DIRECTORY=${DOMAINS_DIRECTORY}" + +# Run all Python report scripts (filename ending with Python.sh or Python.py) in the REPORTS_SCRIPT_DIR and DOMAINS_DIRECTORY directories. +for directory in "${REPORTS_SCRIPT_DIR}" "${DOMAINS_DIRECTORY}"; do + if [ ! -d "${directory}" ]; then + echo "PythonReports: Error: Directory ${directory} does not exist. Please check your REPORTS_SCRIPT_DIR and DOMAINS_DIRECTORY settings." + exit 1 + fi + + # Run all Python report scripts for the selected directory. + find "${directory}" -type f \( -name "*Python.sh" -o -name "*Python.py" \) | sort | while read -r report_script_file; do + report_script_filename=$(basename -- "${report_script_file}"); + report_script_filename="${report_script_filename%.*}" # Remove file extension + + echo "${LOG_GROUP_START}Create Python Report ${report_script_filename}"; + echo "PythonReports: $(date +'%Y-%m-%dT%H:%M:%S%z') Starting ${report_script_filename}..."; + + source "${report_script_file}" + + echo "PythonReports: $(date +'%Y-%m-%dT%H:%M:%S%z') Finished ${report_script_filename}"; + echo "${LOG_GROUP_END}"; + done +done