Anomaly Detection #398


Draft: wants to merge 9 commits into base: main
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -94,4 +94,7 @@ coverage/
*.nbconvert*

# Python environments
.conda
.conda

# Optuna (and other) Database data
*.db
9 changes: 9 additions & 0 deletions COMMANDS.md
@@ -8,6 +8,7 @@
- [Examples](#examples)
- [Start an analysis with CSV reports only](#start-an-analysis-with-csv-reports-only)
- [Start an analysis with Jupyter reports only](#start-an-analysis-with-jupyter-reports-only)
- [Start an analysis with Python reports only](#start-an-analysis-with-python-reports-only)
- [Start an analysis with PDF generation](#start-an-analysis-with-pdf-generation)
- [Start an analysis without importing git log data](#start-an-analysis-without-importing-git-log-data)
- [Only run setup and explore the Graph manually](#only-run-setup-and-explore-the-graph-manually)
@@ -102,6 +103,14 @@ If only the Jupyter reports are needed e.g. when the CSV reports had already been
./../../scripts/analysis/analyze.sh --report Jupyter
```

#### Start an analysis with Python reports only

If you only need Python reports, e.g. to skip the Chromium Browser dependency, this can be done with:

```shell
./../../scripts/analysis/analyze.sh --report Python
```

#### Start an analysis with PDF generation

Note: Generating a PDF from a Jupyter notebook using [nbconvert](https://nbconvert.readthedocs.io) takes some time and might even fail due to a timeout error.
10 changes: 8 additions & 2 deletions GETTING_STARTED.md
@@ -84,16 +84,22 @@ Use these optional command line options as needed:
./../../scripts/analysis/analyze.sh --report Csv
```

- Jupyter notebook reports when Python and Conda are installed:
- Jupyter notebook reports when Python and Conda are installed (and Chromium Browser for PDF generation):

```shell
./../../scripts/analysis/analyze.sh --report Jupyter
```

- Python reports when Python and Conda are installed (without Chromium Browser for PDF generation):

```shell
./../../scripts/analysis/analyze.sh --report Python
```

- Graph visualizations when Node.js and npm are installed:

```shell
./../../scripts/analysis/analyze.sh --report Jupyter
./../../scripts/analysis/analyze.sh --report Visualization
```

- All reports with Python, Conda, Node.js and npm installed:
3 changes: 3 additions & 0 deletions README.md
@@ -148,6 +148,9 @@ The [Code Structure Analysis Pipeline](./.github/workflows/internal-java-code-an
- [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver)
- [openTSNE](https://github.com/pavlin-policar/openTSNE)
- [wordcloud](https://github.com/amueller/word_cloud)
- [umap](https://umap-learn.readthedocs.io)
- [scikit-learn](https://scikit-learn.org)
- [optuna](https://optuna.org)
- [Graph Visualization](./graph-visualization/README.md) uses [node.js](https://nodejs.org/de) and the dependencies listed in [package.json](./graph-visualization/package.json).
- [HPCC-Systems (High Performance Computing Cluster) Web-Assembly (JavaScript)](https://github.com/hpcc-systems/hpcc-js-wasm) containing a wrapper for GraphViz to visualize graph structures.
- [GraphViz](https://gitlab.com/graphviz/graphviz) for CLI Graph Visualization
@@ -0,0 +1,31 @@
// Community Detection Leiden Statistics

CALL gds.leiden.stats(
$dependencies_projection + '-cleaned', {
gamma: toFloat($dependencies_leiden_gamma),
theta: toFloat($dependencies_leiden_theta),
maxLevels: toInteger($dependencies_leiden_max_levels),
tolerance: 0.0000001,
consecutiveIds: true,
relationshipWeightProperty: $dependencies_projection_weight_property
})
YIELD nodeCount
,communityCount
,ranLevels
,modularity
,modularities
,communityDistribution
RETURN nodeCount
,communityCount
,ranLevels
,modularity
,modularities
,communityDistribution.min
,communityDistribution.mean
,communityDistribution.max
,communityDistribution.p50
,communityDistribution.p75
,communityDistribution.p90
,communityDistribution.p95
,communityDistribution.p99
,communityDistribution.p999
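The `communityDistribution` map yielded above summarizes community sizes with min, mean, max, and percentiles. A minimal pure-Python sketch of such a summary, for illustration only — this is not GDS's internal implementation, and the nearest-rank percentile method is an assumption:

```python
def community_distribution(community_sizes):
    """Summarize community sizes with min/mean/max and nearest-rank percentiles."""
    sizes = sorted(community_sizes)

    def percentile(p):
        # nearest-rank method: take the element at the ceiling of p% of n
        index = max(0, -(-len(sizes) * p // 100) - 1)
        return sizes[int(index)]

    return {
        "min": sizes[0],
        "mean": sum(sizes) / len(sizes),
        "max": sizes[-1],
        "p50": percentile(50),
        "p75": percentile(75),
        "p90": percentile(90),
        "p95": percentile(95),
        "p99": percentile(99),
    }

summary = community_distribution([3, 5, 8, 13, 21])
```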
@@ -0,0 +1,40 @@
// Community Detection Leiden Write property communityLeidenId

CALL gds.leiden.write(
$dependencies_projection + '-cleaned', {
gamma: toFloat($dependencies_leiden_gamma),
theta: toFloat($dependencies_leiden_theta),
maxLevels: toInteger($dependencies_leiden_max_levels),
tolerance: 0.0000001,
consecutiveIds: true,
relationshipWeightProperty: $dependencies_projection_weight_property,
writeProperty: $dependencies_projection_write_property
})
YIELD preProcessingMillis
,computeMillis
,writeMillis
,postProcessingMillis
,nodePropertiesWritten
,communityCount
,ranLevels
,modularity
,modularities
,communityDistribution
RETURN preProcessingMillis
,computeMillis
,writeMillis
,postProcessingMillis
,nodePropertiesWritten
,communityCount
,ranLevels
,modularity
,communityDistribution.min
,communityDistribution.mean
,communityDistribution.max
,communityDistribution.p50
,communityDistribution.p75
,communityDistribution.p90
,communityDistribution.p95
,communityDistribution.p99
,communityDistribution.p999
,modularities
@@ -0,0 +1,11 @@
// Creates a smaller projection by sampling the original graph using "Common Neighbour Aware Random Walk"

CALL gds.graph.sample.cnarw(
$dependencies_projection + '-sampled-cleaned',
$dependencies_projection,
{
samplingRatio: toFloat($dependencies_projection_sampling_ratio)
}
)
YIELD graphName, fromGraphName, nodeCount, relationshipCount, startNodeCount, projectMillis
RETURN graphName, fromGraphName, nodeCount, relationshipCount, startNodeCount, projectMillis
@@ -0,0 +1,7 @@
// Writes batch data back into the database for code units when working with a dependencies projection. Variables: dependencies_projection_rows, dependencies_projection_node

UNWIND $dependencies_projection_rows AS row
MATCH (codeUnit)
WHERE elementId(codeUnit) = row.nodeId
AND $dependencies_projection_node IN labels(codeUnit)
SET codeUnit += row.properties
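The query above expects `dependencies_projection_rows` as a list of maps, each carrying a node's element id under `nodeId` and the properties to write under `properties`. A hedged sketch of how such a batch might be assembled on the client side — the `(element_id, properties)` tuple input and the example ids are hypothetical, not taken from this repository:

```python
def to_projection_rows(tuned_results):
    """Convert (node_element_id, property_dict) pairs into the row shape
    consumed by the UNWIND ... SET codeUnit += row.properties query above."""
    return [
        {"nodeId": node_element_id, "properties": dict(properties)}
        for node_element_id, properties in tuned_results
    ]

rows = to_projection_rows([
    ("4:abc:0", {"communityLeidenIdTuned": 7}),
    ("4:abc:1", {"communityLeidenIdTuned": 7, "centralityPageRank": 0.02}),
])
```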
@@ -16,8 +16,9 @@ OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS
WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName
RETURN DISTINCT
coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName
,codeUnit.name AS shortCodeUnitName
,coalesce(artifactName, projectName) AS projectName
,coalesce(codeUnit.communityLeidenId, 0) AS communityId
,codeUnit.name AS shortCodeUnitName
,elementId(codeUnit) AS nodeElementId
,coalesce(artifactName, projectName) AS projectName
,coalesce(codeUnit.communityLeidenId, 0) AS communityId
,coalesce(codeUnit.centralityPageRank, 0.01) AS centrality
,embedding
@@ -0,0 +1,26 @@
// Node Embeddings 1d using Fast Random Projection: Stream for Hyper-Parameter tuning. Requires "Add_file_name and_extension.cypher".

CALL gds.fastRP.stream(
$dependencies_projection + '-cleaned', {
embeddingDimension: toInteger($dependencies_projection_embedding_dimension)
,randomSeed: toInteger($dependencies_projection_embedding_random_seed)
,normalizationStrength: toFloat($dependencies_projection_fast_random_projection_normalization_strength)
,iterationWeights: [0.0, 0.0, 1.0, toFloat($dependencies_projection_fast_random_projection_forth_iteration_weight)]
,relationshipWeightProperty: $dependencies_projection_weight_property
}
)
YIELD nodeId, embedding
WITH gds.util.asNode(nodeId) AS codeUnit
,embedding
OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit)
WITH *, artifact.name AS artifactName
OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit)
WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName
RETURN DISTINCT
coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName
,codeUnit.name AS shortCodeUnitName
,elementId(codeUnit) AS nodeElementId
,coalesce(artifactName, projectName) AS projectName
,coalesce(codeUnit.communityLeidenIdTuned, codeUnit.communityLeidenId, 0) AS communityId
,coalesce(codeUnit.centralityPageRank, 0.01) AS centrality
,embedding
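This stream variant exists for hyper-parameter tuning, presumably driven by optuna, which this change adds to the dependency list. A deterministic, stdlib-only sketch of the tuning idea — sample a candidate `normalizationStrength` per trial, score the resulting embeddings, keep the best. The scoring callable is a stand-in; the real pipeline would run the stream query above and score its embeddings (e.g. by clustering quality):

```python
import random

def tune_normalization_strength(score_embedding_run, trials=20, seed=42):
    """Random-search sketch: sample a candidate normalizationStrength per
    trial and keep the best-scoring one. score_embedding_run stands in for
    executing the fastRP stream query and scoring its embeddings."""
    rng = random.Random(seed)
    best_value, best_score = None, float("-inf")
    for _ in range(trials):
        candidate = rng.uniform(-1.0, 1.0)
        score = score_embedding_run(candidate)
        if score > best_score:
            best_value, best_score = candidate, score
    return best_value, best_score

# Toy objective: pretend strengths near 0.3 score best.
best_value, best_score = tune_normalization_strength(lambda s: -(s - 0.3) ** 2)
```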
@@ -0,0 +1,14 @@
// Node Embeddings 1e using Fast Random Projection: Write for tuned hyper-parameters.

CALL gds.fastRP.write(
$dependencies_projection + '-cleaned', {
embeddingDimension: toInteger($dependencies_projection_embedding_dimension)
,randomSeed: toInteger($dependencies_projection_embedding_random_seed)
,normalizationStrength: toFloat($dependencies_projection_fast_random_projection_normalization_strength)
,iterationWeights: [0.0, 0.0, 1.0, toFloat($dependencies_projection_fast_random_projection_forth_iteration_weight)]
,relationshipWeightProperty: $dependencies_projection_weight_property
,writeProperty: $dependencies_projection_write_property
}
)
YIELD nodeCount, nodePropertiesWritten, preProcessingMillis, computeMillis, writeMillis
RETURN nodeCount, nodePropertiesWritten, preProcessingMillis, computeMillis, writeMillis
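The write call above takes its settings from `$`-prefixed query parameters; note also that `iterationWeights` zeroes out the first two iterations and leaves only the fourth weight tunable. A sketch of assembling the parameter map on the client side — the keys mirror the placeholders in the Cypher, while the example values (projection name, dimension, property names) are purely illustrative:

```python
def fast_rp_write_parameters(projection, dimension, seed, normalization_strength,
                             fourth_iteration_weight, weight_property, write_property):
    """Assemble the query parameters for the gds.fastRP.write call above.
    Keys mirror the $-placeholders in the Cypher query."""
    return {
        "dependencies_projection": projection,
        "dependencies_projection_embedding_dimension": int(dimension),
        "dependencies_projection_embedding_random_seed": int(seed),
        "dependencies_projection_fast_random_projection_normalization_strength": float(normalization_strength),
        "dependencies_projection_fast_random_projection_forth_iteration_weight": float(fourth_iteration_weight),
        "dependencies_projection_weight_property": weight_property,
        "dependencies_projection_write_property": write_property,
    }

parameters = fast_rp_write_parameters(
    "java-package-dependencies", 64, 30, 0.3, 1.0,
    "weight", "embeddingsFastRandomProjection",
)
```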
11 changes: 6 additions & 5 deletions cypher/Node_Embeddings/Node_Embeddings_2d_Hash_GNN_Stream.cypher
@@ -22,8 +22,9 @@ OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS
WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName
RETURN DISTINCT
coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName
,codeUnit.name AS shortCodeUnitName
,coalesce(artifactName, projectName) AS projectName
,coalesce(codeUnit.communityLeidenId, 0) AS communityId
,coalesce(codeUnit.centralityPageRank, 0.01) AS centrality
,embedding
,codeUnit.name AS shortCodeUnitName
,elementId(codeUnit) AS nodeElementId
,coalesce(artifactName, projectName) AS projectName
,coalesce(codeUnit.communityLeidenId, 0) AS communityId
,coalesce(codeUnit.centralityPageRank, 0.01) AS centrality
,embedding
@@ -0,0 +1,30 @@
// Node Embeddings 2c using Hash GNN (Graph Neural Networks): Stream. Requires "Add_file_name and_extension.cypher".

CALL gds.beta.hashgnn.stream(
$dependencies_projection + '-cleaned', {
embeddingDensity: toInteger($dependencies_projection_embedding_dimension) * 2 * toInteger($dependencies_projection_hashgnn_dimension_multiplier)
,randomSeed: toInteger($dependencies_projection_embedding_random_seed)
,iterations: toInteger($dependencies_projection_hashgnn_iterations)
,generateFeatures: {
dimension: toInteger($dependencies_projection_embedding_dimension) * 4 * toInteger($dependencies_projection_hashgnn_dimension_multiplier)
,densityLevel: toInteger($dependencies_projection_hashgnn_density_level)
}
,outputDimension: toInteger($dependencies_projection_embedding_dimension)
,neighborInfluence: toFloat($dependencies_projection_hashgnn_neighbor_influence)
}
)
YIELD nodeId, embedding
WITH gds.util.asNode(nodeId) AS codeUnit
,embedding
OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit)
WITH *, artifact.name AS artifactName
OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit)
WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName
RETURN DISTINCT
coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName
,codeUnit.name AS shortCodeUnitName
,elementId(codeUnit) AS nodeElementId
,coalesce(artifactName, projectName) AS projectName
,coalesce(codeUnit.communityLeidenIdTuned, codeUnit.communityLeidenId, 0) AS communityId
,coalesce(codeUnit.centralityPageRank, 0.01) AS centrality
,embedding
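The HashGNN call derives two values from the target embedding dimension: `embeddingDensity` is the dimension times 2 times the multiplier, and the generated feature dimension is the dimension times 4 times the same multiplier. A small sketch of that arithmetic, mirroring the expressions in the query (example values are illustrative):

```python
def hashgnn_dimensions(embedding_dimension, dimension_multiplier):
    """Derive embeddingDensity and generateFeatures.dimension as computed
    in the gds.beta.hashgnn.stream call above."""
    return {
        "embeddingDensity": embedding_dimension * 2 * dimension_multiplier,
        "generatedFeatureDimension": embedding_dimension * 4 * dimension_multiplier,
        "outputDimension": embedding_dimension,
    }

dims = hashgnn_dimensions(embedding_dimension=64, dimension_multiplier=2)
```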
11 changes: 6 additions & 5 deletions cypher/Node_Embeddings/Node_Embeddings_3d_Node2Vec_Stream.cypher
@@ -17,8 +17,9 @@ OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS
WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName
RETURN DISTINCT
coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName
,codeUnit.name AS shortCodeUnitName
,coalesce(artifactName, projectName) AS projectName
,coalesce(codeUnit.communityLeidenId, 0) AS communityId
,coalesce(codeUnit.centralityPageRank, 0.01) AS centrality
,embedding
,codeUnit.name AS shortCodeUnitName
,elementId(codeUnit) AS nodeElementId
,coalesce(artifactName, projectName) AS projectName
,coalesce(codeUnit.communityLeidenId, 0) AS communityId
,coalesce(codeUnit.centralityPageRank, 0.01) AS centrality
,embedding
@@ -0,0 +1,32 @@
// Node Embeddings 3c using Node2Vec: Stream. Requires "Add_file_name and_extension.cypher".

CALL gds.node2vec.stream(
$dependencies_projection + '-cleaned', {
embeddingDimension: toInteger($dependencies_projection_embedding_dimension)
,randomSeed: toInteger($dependencies_projection_embedding_random_seed)
,iterations: toInteger($dependencies_projection_node2vec_iterations)
,inOutFactor: toFloat($dependencies_projection_node2vec_in_out_factor)
,returnFactor: toFloat($dependencies_projection_node2vec_return_factor)
,windowSize: toInteger($dependencies_projection_node2vec_window_size)
,walksPerNode: toInteger($dependencies_projection_node2vec_walks_per_node)
,walkLength: toInteger($dependencies_projection_node2vec_walk_length)
,negativeSamplingRate: toInteger($dependencies_projection_node2vec_negative_sampling_rate)
,positiveSamplingFactor: toFloat($dependencies_projection_node2vec_positive_sampling_factor)
,relationshipWeightProperty: $dependencies_projection_weight_property
}
)
YIELD nodeId, embedding
WITH gds.util.asNode(nodeId) AS codeUnit
,embedding
OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit)
WITH *, artifact.name AS artifactName
OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit)
WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName
RETURN DISTINCT
coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName
,codeUnit.name AS shortCodeUnitName
,elementId(codeUnit) AS nodeElementId
,coalesce(artifactName, projectName) AS projectName
,coalesce(codeUnit.communityLeidenIdTuned, codeUnit.communityLeidenId, 0) AS communityId
,coalesce(codeUnit.centralityPageRank, 0.01) AS centrality
,embedding
3 changes: 3 additions & 0 deletions cypher/Validation/ValidateAlwaysFalse.cypher
@@ -0,0 +1,3 @@
// Will never return any results so that the validation will always fail. This is helpful for Jupyter Notebooks that should not be executed automatically.

MATCH (nothing) RETURN nothing LIMIT 0
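Because of `LIMIT 0`, this query yields zero rows by construction, so any validation step that requires at least one row always fails for it, which is how automatic execution of a notebook can be blocked. A hedged sketch of such a check (the function name is illustrative, not from this repository):

```python
def validation_passes(result_rows):
    """A validation query passes only if it returns at least one row.
    ValidateAlwaysFalse returns zero rows by construction (LIMIT 0),
    so this check always fails for it."""
    return len(list(result_rows)) > 0

always_false_result = []  # what MATCH (nothing) RETURN nothing LIMIT 0 yields
passed = validation_passes(always_false_result)
```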