Skip to content

Commit ea1478a

Browse files
committed
Introduce anomaly detection pipeline with tuned clustering
1 parent 83dceec commit ea1478a

26 files changed

+3785
-0
lines changed
Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
#!/usr/bin/env bash
2+
3+
# Pipeline that coordinates anomaly detection using the Graph Data Science Library of Neo4j.
4+
# It requires an already running Neo4j graph database with already scanned and analyzed artifacts.
5+
# The results will be written into the sub directory reports/anomaly-detection.
6+
7+
# Note that "scripts/prepareAnalysis.sh" is required to run prior to this script.
8+
9+
# Requires executeQueryFunctions.sh, projectionFunctions.sh, cleanupAfterReportGeneration.sh
10+
11+
# Fail on any error ("-e" = exit on first error, "-o pipefail" exits on errors within piped commands)
set -o errexit -o pipefail

# Overrideable Constants (defaults also defined in sub scripts)
: "${REPORTS_DIRECTORY:=reports}"

## Get the directory this script is located in, if not already set
# BASH_SOURCE is used to locate the script file itself.
# CDPATH is set explicitly for the cd command to prevent unintended directory changes.
# "cd" followed by "pwd -P" resolves the physical path, so non-standard tools like readlink aren't needed.
if [[ -z "${ANOMALY_DETECTION_SCRIPT_DIR}" ]]; then
    ANOMALY_DETECTION_SCRIPT_DIR=$(CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)
fi
echo "anomalyDetectionPipeline: ANOMALY_DETECTION_SCRIPT_DIR=${ANOMALY_DETECTION_SCRIPT_DIR}"

# Get the "scripts" directory by taking the path of this script and going two directories up.
# It is the repository directory containing the shell scripts.
: "${SCRIPTS_DIR:=${ANOMALY_DETECTION_SCRIPT_DIR}/../../scripts}"

# Get the "cypher" query directories for gathering features and for the anomaly queries.
: "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR:=${ANOMALY_DETECTION_SCRIPT_DIR}/features}"
: "${ANOMALY_DETECTION_QUERY_CYPHER_DIR:=${ANOMALY_DETECTION_SCRIPT_DIR}/queries}"
28+
29+
# Function to display script usage
30+
usage() {
    # Prints the supported command line options to stderr and terminates the script with exit code 1.
    # COLOR_ERROR and COLOR_DEFAULT are not set in this function; they expand to nothing if unset.
    {
        echo -e "${COLOR_ERROR}"
        echo "Usage: $0 [--usePython] [--verbose]"
        echo -e "${COLOR_DEFAULT}"
    } >&2
    exit 1
}
36+
37+
# Default values
38+
usePython="false" # "true" additionally runs the Python scripts for anomaly detection
verboseMode=""    # either "" or "--verbose"

# Parse command line arguments.
# Every option is shifted away after it has been handled, so no positional parameters remain afterwards.
while (( $# > 0 )); do
    key="$1"
    value="${2}" # assigned for every option but not read within this block

    if [[ "${key}" == "--verbose" ]]; then
        verboseMode="--verbose"
    elif [[ "${key}" == "--usePython" ]]; then
        usePython="true"
    else
        echo -e "${COLOR_ERROR}anomalyDetectionPipeline: Error: Unknown option: ${key}${COLOR_DEFAULT}" >&2
        usage
    fi

    shift || true # ignore error when there are no more arguments
done
60+
61+
# Define functions to execute a cypher query from within a given file (first and only argument) like "execute_cypher"
62+
source "${SCRIPTS_DIR}/executeQueryFunctions.sh"
63+
64+
# Define functions to create and delete Graph Projections like "createUndirectedDependencyProjection"
65+
source "${SCRIPTS_DIR}/projectionFunctions.sh"
66+
67+
# Query or recalculate features.
68+
#
69+
# Required Parameters:
70+
# - projection_name=...
71+
# Name prefix for the in-memory projection name. Example: "package-anomaly-detection"
72+
# - projection_node_label=...
73+
# Label of the nodes that will be used for the projection. Example: "Package"
74+
# - projection_weight_property=...
75+
# Name of the node property that contains the dependency weight. Example: "weight"
76+
anomaly_detection_features() {
    # The node label is only needed for the log message below.
    # It has to be extracted here, since it is not set by the caller.
    local nodeLabel
    nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )

    echo "anomalyDetectionPipeline: $(date +'%Y-%m-%dT%H:%M:%S%z') Collecting features for ${nodeLabel} nodes..."

    # Every feature is handled by a pair of queries: "-Exists" is passed first, "-Write" second.
    # All query parameters of this function are passed on unchanged.

    # Determine the Betweenness centrality (with the directed graph projection) if not already done
    execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Betweenness-Exists.cypher" \
                                         "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Betweenness-Write.cypher" "${@}"
    # Determine the local clustering coefficient if not already done
    execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-LocalClusteringCoefficient-Exists.cypher" \
                                         "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-LocalClusteringCoefficient-Write.cypher" "${@}"
    # Determine the page rank if not already done
    execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageRank-Exists.cypher" \
                                         "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageRank-Write.cypher" "${@}"
    # Determine the article rank if not already done.
    # The write query needs to be the ArticleRank one (not PageRank), otherwise the article rank never gets written.
    # NOTE(review): assumes AnomalyDetectionFeature-ArticleRank-Write.cypher exists next to the "-Exists" query - confirm.
    execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-ArticleRank-Exists.cypher" \
                                         "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-ArticleRank-Write.cypher" "${@}"
}
92+
# Run queries to find anomalies in the graph.
93+
#
94+
# Required Parameters:
95+
# - projection_node_label=...
96+
# Label of the nodes that will be used for the projection. Example: "Package"
97+
anomaly_detection_queries() {
    # The node label is used for the log message and as prefix of every report file name.
    local nodeLabel
    nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )

    echo "anomalyDetectionPipeline: $(date +'%Y-%m-%dT%H:%M:%S%z') Executing Queries for ${nodeLabel} nodes..."

    # Each entry names one anomaly query. It is part of the Cypher file name as well as of the CSV report file name.
    # The queries are executed in the listed order and get all query parameters of this function passed on.
    local anomalyQueryName
    for anomalyQueryName in \
        "PotentialImbalancedRoles" \
        "PotentialOverEngineerOrIsolated" \
        "HiddenBridgeNodes" \
        "PopularBottlenecks" \
        "SilentCoordinators" \
        "OverReferencesUtilities" \
        "FragileStructuralBridges" \
        "DependencyHungryOrchestrators" \
        "UnexpectedCentralNodes"; do
        execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetection${anomalyQueryName}.cypher" "${@}" \
            > "${FULL_REPORT_DIRECTORY}/${nodeLabel}AnomalyDetection_${anomalyQueryName}.csv"
    done
}
113+
114+
# Execute the Python scripts for anomaly detection.
115+
#
116+
# Required Parameters:
117+
# - projection_name=...
118+
# Name prefix for the in-memory projection name. Example: "package-anomaly-detection"
119+
# - projection_node_label=...
120+
# Label of the nodes that will be used for the projection. Example: "Package"
121+
# - projection_weight_property=...
122+
# Name of the node property that contains the dependency weight. Example: "weight"
123+
anomaly_detection_using_python() {
    # The node label needs to be extracted first, since it is used in the log message
    # as well as in the name of the features report file at the end.
    local nodeLabel
    nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )

    echo "anomalyDetectionPipeline: $(date +'%Y-%m-%dT%H:%M:%S%z') Executing Python scripts for ${nodeLabel} nodes..."

    # Note: ${verboseMode} is intentionally unquoted below.
    # It is either "" (then no argument is passed at all) or "--verbose".

    # Get tuned Leiden communities as a reference to tune clustering
    time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedLeidenCommunityDetection.py" "${@}" ${verboseMode}
    # Tuned Fast Random Projection and tuned HDBSCAN clustering
    time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedNodeEmbeddingClustering.py" "${@}" ${verboseMode}
    # Plots get the report directory passed in addition to the query parameters
    time "${ANOMALY_DETECTION_SCRIPT_DIR}/anomalyDetectionPlots.py" "${@}" "--report_directory" "${FULL_REPORT_DIRECTORY}" ${verboseMode}

    # Query Results: Output all collected features into a CSV file.
    execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeatures.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${nodeLabel}AnomalyDetectionFeatures.csv"
}
137+
138+
# Run the anomaly detection pipeline.
139+
#
140+
# Required Parameters:
141+
# - projection_name=...
142+
# Name prefix for the in-memory projection name. Example: "package-anomaly-detection"
143+
# - projection_node_label=...
144+
# Label of the nodes that will be used for the projection. Example: "Package"
145+
# - projection_weight_property=...
146+
# Name of the node property that contains the dependency weight. Example: "weight"
147+
anomaly_detection_pipeline() {
    # Features are collected first and then the anomaly queries are run.
    # Both steps are timed and get all query parameters passed on.
    time anomaly_detection_features "${@}"
    time anomaly_detection_queries "${@}"

    # The Python step is optional and only runs when usePython is "true".
    [[ "${usePython}" == "true" ]] || return 0
    anomaly_detection_using_python "${@}"
}
154+
155+
# Create report directory
156+
REPORT_NAME="anomaly-detection"
FULL_REPORT_DIRECTORY="${REPORTS_DIRECTORY}/${REPORT_NAME}"
mkdir -p "${FULL_REPORT_DIRECTORY}"

# Query parameter keys used when creating the projections
PROJECTION_NAME="dependencies_projection"
PROJECTION_NODE="dependencies_projection_node"
PROJECTION_WEIGHT="dependencies_projection_weight_property"

# Query parameter keys used when running the anomaly detection pipeline
ALGORITHM_PROJECTION="projection_name"
ALGORITHM_NODE="projection_node_label"
ALGORITHM_WEIGHT="projection_weight_property"

# Code independent algorithm parameters
COMMUNITY_PROPERTY="community_property=communityLeidenIdTuned"

# Runs the anomaly detection for one kind of dependency nodes.
# The pipeline only runs when the undirected projection could be created (return status zero).
# The directed projection gets the same name with "-directed" appended.
#
# Required Parameters (positional):
# - 1: Name of the undirected projection. Example: "package-anomaly-detection"
# - 2: Label of the nodes that will be used for the projection. Example: "Package"
# - 3: Name of the property that contains the dependency weight. Example: "weight"
anomaly_detection_for_dependencies() {
    if createUndirectedDependencyProjection "${PROJECTION_NAME}=${1}" "${PROJECTION_NODE}=${2}" "${PROJECTION_WEIGHT}=${3}"; then
        createDirectedDependencyProjection "${PROJECTION_NAME}=${1}-directed" "${PROJECTION_NODE}=${2}" "${PROJECTION_WEIGHT}=${3}"
        anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=${1}" "${ALGORITHM_NODE}=${2}" "${ALGORITHM_WEIGHT}=${3}" "${COMMUNITY_PROPERTY}"
    fi
}

# -- Java Artifact Anomaly Detection -----------------------------

anomaly_detection_for_dependencies "artifact-anomaly-detection" "Artifact" "weight"

# -- Java Package Anomaly Detection ------------------------------

anomaly_detection_for_dependencies "package-anomaly-detection" "Package" "weight25PercentInterfaces"

# -- Java Type Anomaly Detection ---------------------------------
# Java Types use their own projection functions that only take the projection name.

if createUndirectedJavaTypeDependencyProjection "${PROJECTION_NAME}=type-anomaly-detection"; then
    createDirectedJavaTypeDependencyProjection "${PROJECTION_NAME}=type-anomaly-detection-directed"
    anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=type-anomaly-detection" "${ALGORITHM_NODE}=Type" "${ALGORITHM_WEIGHT}=weight" "${COMMUNITY_PROPERTY}"
fi

# -- Typescript Module Anomaly Detection -------------------------

anomaly_detection_for_dependencies "typescript-module-embedding" "Module" "lowCouplingElement25PercentWeight"

# ---------------------------------------------------------------

# Clean-up after report generation. Empty reports will be deleted.
source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${FULL_REPORT_DIRECTORY}"

echo "anomalyDetectionPipeline: $(date +'%Y-%m-%dT%H:%M:%S%z') Successfully finished."

0 commit comments

Comments
 (0)