#!/usr/bin/env bash

# Pipeline that coordinates anomaly detection using the Graph Data Science Library of Neo4j.
# It requires an already running Neo4j graph database with already scanned and analyzed artifacts.
# The results will be written into the sub directory reports/anomaly-detection.

# Note that "scripts/prepareAnalysis.sh" is required to run prior to this script.

# Requires executeQueryFunctions.sh, projectionFunctions.sh, cleanupAfterReportGeneration.sh

# Fail on any error ("errexit" exits on the first error, "pipefail" exits on errors within piped commands)
set -o errexit -o pipefail

# Overridable constants (defaults are also defined in sub scripts).
# Each value can be preset through an environment variable of the same name.
REPORTS_DIRECTORY=${REPORTS_DIRECTORY:-"reports"}

## Get the directory that contains this script if not already set.
# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution.
# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
# This way non-standard tools like readlink aren't needed.
ANOMALY_DETECTION_SCRIPT_DIR=${ANOMALY_DETECTION_SCRIPT_DIR:-$(CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)}
echo "anomalyDetectionPipeline: ANOMALY_DETECTION_SCRIPT_DIR=${ANOMALY_DETECTION_SCRIPT_DIR}"
# Get the "scripts" directory by taking the path of this script and going two directories up.
SCRIPTS_DIR=${SCRIPTS_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/../../scripts"} # Repository directory containing the shell scripts
# Get the "cypher" query directories: "features" for gathering features, "queries" for the anomaly queries.
ANOMALY_DETECTION_FEATURE_CYPHER_DIR=${ANOMALY_DETECTION_FEATURE_CYPHER_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/features"}
ANOMALY_DETECTION_QUERY_CYPHER_DIR=${ANOMALY_DETECTION_QUERY_CYPHER_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/queries"}

# Displays the script usage on stderr and terminates the script with exit code 1.
#
# Globals read:
# - COLOR_ERROR, COLOR_DEFAULT
#   Printed before/after the usage line with escape sequences interpreted ("echo -e").
#   They are not defined in this file; when unset, empty lines are printed instead.
usage() {
  {
    echo -e "${COLOR_ERROR}"
    echo "Usage: $0 [--verbose]"
    echo -e "${COLOR_DEFAULT}"
  } >&2
  exit 1
}

# Default values
verboseMode="" # either "" or "--verbose"

# Parse command line arguments.
# The only supported option is "--verbose". Anything else prints an error and the usage, then exits.
while [[ $# -gt 0 ]]; do
  key="$1"

  case "${key}" in
    --verbose)
      verboseMode="--verbose"
      ;;
    *)
      echo -e "${COLOR_ERROR}anomalyDetectionPipeline: Error: Unknown option: ${key}${COLOR_DEFAULT}" >&2
      usage
      ;;
  esac
  shift # always succeeds here since the loop condition guarantees at least one argument
done

# Define functions to execute a cypher query from within a given file, like "execute_cypher"
# and "execute_cypher_queries_until_results".
source "${SCRIPTS_DIR}/executeQueryFunctions.sh"

# Define functions to create and delete Graph Projections like "createUndirectedDependencyProjection"
source "${SCRIPTS_DIR}/projectionFunctions.sh"

# Query or recalculate features.
#
# For every feature, the "-Exists" query is passed first and the matching "-Write" query second
# to "execute_cypher_queries_until_results" (defined in executeQueryFunctions.sh).
# Presumably the write query only runs when the exists query returns no results - confirm in that script.
# The file names are derived from one feature name, so an "Exists" query can't be paired
# with the "Write" query of a different feature.
#
# NOTE(review): A function with the same name is defined again further down in this file.
# The later definition is the one that takes effect at runtime.
#
# Required Parameters:
# - projection_name=...
#   Name prefix for the in-memory projection name. Example: "package-anomaly-detection"
# - projection_node_label=...
#   Label of the nodes that will be used for the projection. Example: "Package"
# - projection_weight_property=...
#   Name of the node property that contains the dependency weight. Example: "weight"
anomaly_detection_features() {
  local feature
  # Betweenness centrality, local clustering coefficient, page rank and article rank (in that order)
  for feature in Betweenness LocalClusteringCoefficient PageRank ArticleRank; do
    execute_cypher_queries_until_results \
      "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-${feature}-Exists.cypher" \
      "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-${feature}-Write.cypher" "${@}"
  done
}
# Run queries to find anomalies in the graph.
#
# Writes the output of the "AnomalyDetectionPageToArticleRankDifference.cypher" query into
# "<FULL_REPORT_DIRECTORY>/<node label>AnomalyDetection_ArticleVsPageRankDifference.csv".
#
# Globals read: ANOMALY_DETECTION_QUERY_CYPHER_DIR, FULL_REPORT_DIRECTORY
#
# Required Parameters:
# - projection_node_label=...
#   Label of the nodes that will be used for the projection. Example: "Package"
anomaly_detection_queries() {
  # Declared separately from the assignment so that a failing extraction isn't masked by "local".
  local nodeLabel
  nodeLabel=$(extractQueryParameter "projection_node_label" "${@}")
  # Report the difference between page rank and article rank as CSV
  execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPageToArticleRankDifference.cypher" "${@}" \
    > "${FULL_REPORT_DIRECTORY}/${nodeLabel}AnomalyDetection_ArticleVsPageRankDifference.csv"
}

# Run the anomaly detection pipeline.
#
# Steps (each one timed with "time"):
# 1. anomaly_detection_features, 2. anomaly_detection_queries,
# 3. tunedLeidenCommunityDetection.py, 4. tunedNodeEmbeddingClustering.py, 5. umap2dNodeEmbeddings.py
# Finally, the output of "AnomalyDetectionFeatures.cypher" is written to
# "<FULL_REPORT_DIRECTORY>/<node label>AnomalyDetectionFeatures.csv".
#
# Globals read: ANOMALY_DETECTION_SCRIPT_DIR, ANOMALY_DETECTION_FEATURE_CYPHER_DIR,
#               FULL_REPORT_DIRECTORY, verboseMode
#
# NOTE(review): A function with the same name is defined again further down in this file.
# The later definition is the one that takes effect at runtime.
#
# Required Parameters:
# - projection_name=...
#   Name prefix for the in-memory projection name. Example: "package-anomaly-detection"
# - projection_node_label=...
#   Label of the nodes that will be used for the projection. Example: "Package"
# - projection_weight_property=...
#   Name of the node property that contains the dependency weight. Example: "weight"
anomaly_detection_pipeline() {
  time anomaly_detection_features "${@}"
  time anomaly_detection_queries "${@}"
  # "${verboseMode:+...}" adds the option only when it is non-empty, so no empty argument gets passed.
  # Get tuned Leiden communities as a reference to tune clustering
  time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedLeidenCommunityDetection.py" "${@}" ${verboseMode:+"${verboseMode}"}
  # Tuned Fast Random Projection and tuned HDBSCAN clustering
  time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedNodeEmbeddingClustering.py" "${@}" ${verboseMode:+"${verboseMode}"}
  # Reduce the dimensionality of the node embeddings down to 2D for visualization using UMAP
  time "${ANOMALY_DETECTION_SCRIPT_DIR}/umap2dNodeEmbeddings.py" "${@}" ${verboseMode:+"${verboseMode}"}

  # Query Results: Output all collected features into a CSV file.
  local nodeLabel
  nodeLabel=$(extractQueryParameter "projection_node_label" "${@}")
  execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeatures.cypher" "${@}" \
    > "${FULL_REPORT_DIRECTORY}/${nodeLabel}AnomalyDetectionFeatures.csv"
}

# Create report directory
REPORT_NAME="anomaly-detection"
# Falls back to "reports" (same default as at the top of this script) in case REPORTS_DIRECTORY is empty,
# so that the report directory is never created directly below the file system root.
FULL_REPORT_DIRECTORY="${REPORTS_DIRECTORY:-reports}/${REPORT_NAME}"
mkdir -p "${FULL_REPORT_DIRECTORY}"

# Query parameter keys. Each pair names the same setting twice:
# PROJECTION_* is the key passed to the projection functions, ALGORITHM_* the key passed to the pipeline.
PROJECTION_NAME="dependencies_projection"
ALGORITHM_PROJECTION="projection_name"

PROJECTION_NODE="dependencies_projection_node"
ALGORITHM_NODE="projection_node_label"

PROJECTION_WEIGHT="dependencies_projection_weight_property"
ALGORITHM_WEIGHT="projection_weight_property"

# Code independent algorithm parameters (complete "key=value" pair)
COMMUNITY_PROPERTY="community_property=communityLeidenIdTuned"

# Query (or recalculate) features.
#
# Every call passes an "-Exists" query followed by the "-Write" query of the same feature to
# "execute_cypher_queries_until_results" (defined in executeQueryFunctions.sh), together with
# all parameters given to this function.
# Presumably the write query only runs when the exists query returns no results - confirm in that script.
#
# NOTE(review): This redefines a function of the same name from further up in this file.
# Being the later definition, this is the one that takes effect at runtime.
#
# Required Parameters:
# - projection_name=...
#   Name prefix for the in-memory projection name. Example: "package-anomaly-detection"
# - projection_node_label=...
#   Label of the nodes that will be used for the projection. Example: "Package"
# - projection_weight_property=...
#   Name of the node property that contains the dependency weight. Example: "weight"
anomaly_detection_features() {
  local featureQueryPrefix="${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature"

  # Query Feature: Determine the Betweenness centrality (with the directed graph projection) if not already done
  execute_cypher_queries_until_results "${featureQueryPrefix}-Betweenness-Exists.cypher" \
    "${featureQueryPrefix}-Betweenness-Write.cypher" "${@}"
  # Query Feature: Determine the local clustering coefficient if not already done
  execute_cypher_queries_until_results "${featureQueryPrefix}-LocalClusteringCoefficient-Exists.cypher" \
    "${featureQueryPrefix}-LocalClusteringCoefficient-Write.cypher" "${@}"
  # Query Feature: Determine the page rank if not already done
  execute_cypher_queries_until_results "${featureQueryPrefix}-PageRank-Exists.cypher" \
    "${featureQueryPrefix}-PageRank-Write.cypher" "${@}"
  # Query Feature: Determine the article rank if not already done
  execute_cypher_queries_until_results "${featureQueryPrefix}-ArticleRank-Exists.cypher" \
    "${featureQueryPrefix}-ArticleRank-Write.cypher" "${@}"
}

# Run the anomaly detection pipeline.
#
# Steps (each one timed with "time"):
# 1. anomaly_detection_features, 2. tunedLeidenCommunityDetection.py, 3. tunedNodeEmbeddingClustering.py
# Finally, the output of "AnomalyDetectionFeatures.cypher" is written to
# "<FULL_REPORT_DIRECTORY>/<node label>AnomalyDetection.csv".
#
# Globals read: ANOMALY_DETECTION_SCRIPT_DIR, ANOMALY_DETECTION_FEATURE_CYPHER_DIR,
#               FULL_REPORT_DIRECTORY, verboseMode
#
# NOTE(review): This redefines a function of the same name from further up in this file.
# Being the later definition, this is the one that takes effect at runtime. Unlike the earlier
# definition it neither calls "anomaly_detection_queries" nor "umap2dNodeEmbeddings.py" - confirm that this is intended.
#
# Required Parameters:
# - projection_name=...
#   Name prefix for the in-memory projection name. Example: "package-anomaly-detection"
# - projection_node_label=...
#   Label of the nodes that will be used for the projection. Example: "Package"
# - projection_weight_property=...
#   Name of the node property that contains the dependency weight. Example: "weight"
anomaly_detection_pipeline() {
  time anomaly_detection_features "${@}"
  # "${verboseMode:+...}" adds the option only when it is non-empty, so no empty argument gets passed.
  # Run Python: Get tuned Leiden communities as a reference to tune clustering
  time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedLeidenCommunityDetection.py" "${@}" ${verboseMode:+"${verboseMode}"}
  # Run Python: Tuned Fast Random Projection and tuned HDBSCAN clustering
  time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedNodeEmbeddingClustering.py" "${@}" ${verboseMode:+"${verboseMode}"}

  # Query Results: Output all collected features into a CSV file.
  local nodeLabel
  nodeLabel=$(extractQueryParameter "projection_node_label" "${@}")
  execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeatures.cypher" "${@}" \
    > "${FULL_REPORT_DIRECTORY}/${nodeLabel}AnomalyDetection.csv"
}

# -- Java Artifact Node Embeddings -------------------------------

# The directed projection and the pipeline only run when the undirected projection returned successfully.
if createUndirectedDependencyProjection "${PROJECTION_NAME}=artifact-anomaly-detection" "${PROJECTION_NODE}=Artifact" "${PROJECTION_WEIGHT}=weight"; then
  createDirectedDependencyProjection "${PROJECTION_NAME}=artifact-anomaly-detection-directed" "${PROJECTION_NODE}=Artifact" "${PROJECTION_WEIGHT}=weight"
  anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=artifact-anomaly-detection" "${ALGORITHM_NODE}=Artifact" "${ALGORITHM_WEIGHT}=weight" "${COMMUNITY_PROPERTY}"
fi

# -- Java Package Node Embeddings --------------------------------

# The directed projection and the pipeline only run when the undirected projection returned successfully.
if createUndirectedDependencyProjection "${PROJECTION_NAME}=package-anomaly-detection" "${PROJECTION_NODE}=Package" "${PROJECTION_WEIGHT}=weight25PercentInterfaces"; then
  createDirectedDependencyProjection "${PROJECTION_NAME}=package-anomaly-detection-directed" "${PROJECTION_NODE}=Package" "${PROJECTION_WEIGHT}=weight25PercentInterfaces"
  anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=package-anomaly-detection" "${ALGORITHM_NODE}=Package" "${ALGORITHM_WEIGHT}=weight25PercentInterfaces" "${COMMUNITY_PROPERTY}"
fi

# -- Java Type Node Embeddings -----------------------------------

# The directed projection and the pipeline only run when the undirected projection returned successfully.
# EMBEDDING_PROPERTY isn't defined in this script. It is only passed on when it is set (e.g. by the
# environment or a sourced script) and non-empty, so that no empty argument reaches the pipeline.
if createUndirectedJavaTypeDependencyProjection "${PROJECTION_NAME}=type-anomaly-detection"; then
  createDirectedJavaTypeDependencyProjection "${PROJECTION_NAME}=type-anomaly-detection-directed"
  anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=type-anomaly-detection" "${ALGORITHM_NODE}=Type" "${ALGORITHM_WEIGHT}=weight" "${COMMUNITY_PROPERTY}" ${EMBEDDING_PROPERTY:+"${EMBEDDING_PROPERTY}"}
fi

# -- Typescript Module Node Embeddings ---------------------------

# The directed projection and the pipeline only run when the undirected projection returned successfully.
# EMBEDDING_PROPERTY isn't defined in this script. It is only passed on when it is set (e.g. by the
# environment or a sourced script) and non-empty, so that no empty argument reaches the pipeline.
if createUndirectedDependencyProjection "${PROJECTION_NAME}=typescript-module-embedding" "${PROJECTION_NODE}=Module" "${PROJECTION_WEIGHT}=lowCouplingElement25PercentWeight"; then
  createDirectedDependencyProjection "${PROJECTION_NAME}=typescript-module-embedding-directed" "${PROJECTION_NODE}=Module" "${PROJECTION_WEIGHT}=lowCouplingElement25PercentWeight"
  anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=typescript-module-embedding" "${ALGORITHM_NODE}=Module" "${ALGORITHM_WEIGHT}=lowCouplingElement25PercentWeight" "${COMMUNITY_PROPERTY}" ${EMBEDDING_PROPERTY:+"${EMBEDDING_PROPERTY}"}
fi

# ---------------------------------------------------------------

# Clean-up after report generation. Empty reports will be deleted.
# The report directory is passed as the first positional argument to the sourced script.
source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${FULL_REPORT_DIRECTORY}"

# Timestamp format: ISO 8601 like date, time and numeric timezone offset, e.g. 2025-01-31T12:34:56+0100
echo "anomalyDetectionPipeline: $(date +'%Y-%m-%dT%H:%M:%S%z') Successfully finished."