Skip to content

Commit 4238c6b

Browse files
committed
Use UMAP for node embedding visualization
1 parent ea1478a commit 4238c6b

File tree

5 files changed

+357
-78
lines changed

5 files changed

+357
-78
lines changed

domains/anomaly-detection/anomalyDetectionPipeline.sh

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,8 @@ anomaly_detection_using_python() {
127127
time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedLeidenCommunityDetection.py" "${@}" ${verboseMode}
128128
# Tuned Fast Random Projection and tuned HDBSCAN clustering
129129
time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedNodeEmbeddingClustering.py" "${@}" ${verboseMode}
130+
# Use UMAP to reduce the node embeddings to two dimensions (2D) for visualization
131+
time "${ANOMALY_DETECTION_SCRIPT_DIR}/umap2dNodeEmbeddings.py" "${@}" ${verboseMode}
130132

131133
time "${ANOMALY_DETECTION_SCRIPT_DIR}/anomalyDetectionPlots.py" "${@}" "--report_directory" "${FULL_REPORT_DIRECTORY}" ${verboseMode}
132134
# Query Results: Output all collected features into a CSV file.
@@ -169,33 +171,34 @@ ALGORITHM_WEIGHT="projection_weight_property"
169171

170172
# Code independent algorithm parameters
171173
COMMUNITY_PROPERTY="community_property=communityLeidenIdTuned"
174+
EMBEDDING_PROPERTY="embedding_property=embeddingsFastRandomProjectionTunedForClustering"
172175

173176
# -- Java Artifact Node Embeddings -------------------------------
174177

175178
if createUndirectedDependencyProjection "${PROJECTION_NAME}=artifact-anomaly-detection" "${PROJECTION_NODE}=Artifact" "${PROJECTION_WEIGHT}=weight"; then
176179
createDirectedDependencyProjection "${PROJECTION_NAME}=artifact-anomaly-detection-directed" "${PROJECTION_NODE}=Artifact" "${PROJECTION_WEIGHT}=weight"
177-
anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=artifact-anomaly-detection" "${ALGORITHM_NODE}=Artifact" "${ALGORITHM_WEIGHT}=weight" "${COMMUNITY_PROPERTY}"
180+
anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=artifact-anomaly-detection" "${ALGORITHM_NODE}=Artifact" "${ALGORITHM_WEIGHT}=weight" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}"
178181
fi
179182

180183
# -- Java Package Node Embeddings --------------------------------
181184

182185
if createUndirectedDependencyProjection "${PROJECTION_NAME}=package-anomaly-detection" "${PROJECTION_NODE}=Package" "${PROJECTION_WEIGHT}=weight25PercentInterfaces"; then
183186
createDirectedDependencyProjection "${PROJECTION_NAME}=package-anomaly-detection-directed" "${PROJECTION_NODE}=Package" "${PROJECTION_WEIGHT}=weight25PercentInterfaces"
184-
anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=package-anomaly-detection" "${ALGORITHM_NODE}=Package" "${ALGORITHM_WEIGHT}=weight25PercentInterfaces" "${COMMUNITY_PROPERTY}"
187+
anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=package-anomaly-detection" "${ALGORITHM_NODE}=Package" "${ALGORITHM_WEIGHT}=weight25PercentInterfaces" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}"
185188
fi
186189

187190
# -- Java Type Node Embeddings -----------------------------------
188191

189192
if createUndirectedJavaTypeDependencyProjection "${PROJECTION_NAME}=type-anomaly-detection"; then
190193
createDirectedJavaTypeDependencyProjection "${PROJECTION_NAME}=type-anomaly-detection-directed"
191-
anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=type-anomaly-detection" "${ALGORITHM_NODE}=Type" "${ALGORITHM_WEIGHT}=weight" "${COMMUNITY_PROPERTY}"
194+
anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=type-anomaly-detection" "${ALGORITHM_NODE}=Type" "${ALGORITHM_WEIGHT}=weight" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}"
192195
fi
193196

194197
# -- Typescript Module Node Embeddings ---------------------------
195198

196199
if createUndirectedDependencyProjection "${PROJECTION_NAME}=typescript-module-embedding" "${PROJECTION_NODE}=Module" "${PROJECTION_WEIGHT}=lowCouplingElement25PercentWeight"; then
197200
createDirectedDependencyProjection "${PROJECTION_NAME}=typescript-module-embedding-directed" "${PROJECTION_NODE}=Module" "${PROJECTION_WEIGHT}=lowCouplingElement25PercentWeight"
198-
anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=typescript-module-embedding" "${ALGORITHM_NODE}=Module" "${ALGORITHM_WEIGHT}=lowCouplingElement25PercentWeight" "${COMMUNITY_PROPERTY}"
201+
anomaly_detection_pipeline "${ALGORITHM_PROJECTION}=typescript-module-embedding" "${ALGORITHM_NODE}=Module" "${ALGORITHM_WEIGHT}=lowCouplingElement25PercentWeight" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}"
199202
fi
200203

201204
# ---------------------------------------------------------------

domains/anomaly-detection/tunedNodeEmbeddingClustering.py

Lines changed: 11 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
# This is useful for understanding code structure, detecting modular boundaries, and identifying anomalies or outliers in large software systems without requiring manual labeling.
55
# It takes the code structure as a graph in Neo4j and generates node embeddings using Fast Random Projection (FastRP).
66
# These embeddings capture structural similarity and are clustered using HDBSCAN to assign labels or detect noise.
7-
# For visualization, the embeddings are reduced to 2D using t-SNE.
87
# All results — including embeddings, cluster labels, and 2D coordinates — are written back to Neo4j for further use.
98

109
# Prerequisite:
@@ -25,9 +24,7 @@
2524

2625
from neo4j import GraphDatabase, Driver
2726

28-
from openTSNE.sklearn import TSNE
29-
30-
from sklearn.base import BaseEstimator
27+
# from sklearn.base import BaseEstimator # Extend from sklearn BaseEstimator to use e.g. GridSearchCV for hyperparameter tuning.
3128
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, normalized_mutual_info_score
3229
from sklearn.cluster import HDBSCAN # type: ignore
3330

@@ -38,7 +35,7 @@
3835

3936

4037
class Parameters:
41-
required_parameters_ = ["projection_name", "projection_node_label", "projection_weight_property", "community_property"]
38+
required_parameters_ = ["projection_name", "projection_node_label", "projection_weight_property", "community_property", "embedding_property"]
4239

4340
def __init__(self, input_parameters: typing.Dict[str, str], verbose: bool = False):
4441
self.query_parameters_ = input_parameters.copy() # copy enforces immutability
@@ -63,9 +60,6 @@ def log_dependency_versions_() -> None:
6360
from sklearn import __version__ as sklearn_version
6461
print('scikit-learn version: {}'.format(sklearn_version))
6562

66-
from openTSNE import __version__ as openTSNE_version
67-
print('openTSNE version: {}'.format(openTSNE_version))
68-
6963
from neo4j import __version__ as neo4j_version
7064
print('neo4j version: {}'.format(neo4j_version))
7165

@@ -116,6 +110,9 @@ def get_projection_name(self) -> str:
116110
def get_projection_node_label(self) -> str:
117111
return self.query_parameters_["projection_node_label"]
118112

113+
def get_embedding_property(self) -> str:
114+
return self.query_parameters_["embedding_property"]
115+
119116
def is_verbose(self) -> bool:
120117
return self.verbose_
121118

@@ -581,7 +578,8 @@ def __init__(self,
581578
forth_iteration_weight: float = 1.0,
582579
):
583580
self.parameters_ = parameters
584-
self.verbose = parameters.is_verbose()
581+
self.verbose_ = parameters.is_verbose()
582+
self.write_property_ = parameters.get_embedding_property()
585583

586584
self.embedding_dimension = embedding_dimension
587585
self.random_seed = random_seed
@@ -594,15 +592,15 @@ def __to_algorithm_parameters(self) -> typing.Dict['str', 'str']:
594592
"normalization_strength": str(self.normalization_strength),
595593
"forth_iteration_weight": str(self.forth_iteration_weight),
596594
"embedding_random_seed": str(self.random_seed),
597-
"write_property": "embeddingsFastRandomProjectionForClustering",
595+
"write_property": str(self.write_property_),
598596
**self.parameters_.get_query_parameters()
599597
}
600598

601599
def __run_algorithm(self) -> pd.DataFrame:
602600
algorithm_parameters = self.__to_algorithm_parameters()
603601
# For Debugging:
604602
# print("Generating embeddings using Neo4j Graph Data Science with the following parameters: " + str(algorithm_parameters))
605-
if self.verbose:
603+
if self.verbose_:
606604
return query_cypher_to_data_frame(self.cypher_query_for_generating_embeddings_, parameters=algorithm_parameters)
607605

608606
return query_cypher_to_data_frame_suppress_warnings(self.cypher_query_for_generating_embeddings_, parameters=algorithm_parameters)
@@ -636,12 +634,12 @@ def write_embeddings(self) -> typing.Self:
636634
This is useful for further processing or analysis of the embeddings.
637635
"""
638636
algorithm_parameters = self.__to_algorithm_parameters()
639-
if self.verbose:
637+
if self.verbose_:
640638
print("")
641639
print("Writing embeddings to Neo4j with the following parameters: " + str(algorithm_parameters))
642640
print("")
643641

644-
if self.verbose:
642+
if self.verbose_:
645643
query_cypher_to_data_frame(self.cypher_query_for_writing_embeddings_, parameters=algorithm_parameters)
646644
else:
647645
query_cypher_to_data_frame_suppress_warnings(self.cypher_query_for_writing_embeddings_, parameters=algorithm_parameters)
@@ -701,58 +699,6 @@ def objective(trial):
701699
return TuneableFastRandomProjectionNodeEmbeddings(parameters, **study.best_params).fit()
702700

703701

704-
def prepare_node_embeddings_for_2d_visualization(embeddings: pd.DataFrame) -> pd.DataFrame:
705-
"""
706-
Reduces the dimensionality of the node embeddings (e.g. 64 floating point numbers in an array)
707-
to two dimensions for 2D visualization.
708-
see https://opentsne.readthedocs.io
709-
"""
710-
711-
if embeddings.empty:
712-
print("No projected data for node embeddings dimensionality reduction available")
713-
return embeddings
714-
715-
# Calling the fit_transform method just with a list doesn't work.
716-
# It leads to an error with the following message: 'list' object has no attribute 'shape'
717-
# This can be solved by converting the list to a numpy array using np.array(..).
718-
# See https://bobbyhadz.com/blog/python-attributeerror-list-object-has-no-attribute-shape
719-
embeddings_as_numpy_array = np.array(embeddings.embedding.to_list())
720-
721-
# Use t-distributed Stochastic Neighbor Embedding (t-SNE) to reduce the dimensionality
722-
# of the previously calculated node embeddings to 2 dimensions for visualization
723-
t_distributed_stochastic_neighbor_embedding = TSNE(n_components=2, verbose=False, random_state=47)
724-
two_dimension_node_embeddings = t_distributed_stochastic_neighbor_embedding.fit_transform(embeddings_as_numpy_array)
725-
# display(two_dimension_node_embeddings.shape) # Display the shape of the t-SNE result
726-
727-
# Create a new DataFrame with the results of the 2 dimensional node embeddings
728-
# and the code unit and artifact name of the query above as preparation for the plot
729-
embeddings['embeddingVisualizationX'] = [value[0] for value in two_dimension_node_embeddings]
730-
embeddings['embeddingVisualizationY'] = [value[1] for value in two_dimension_node_embeddings]
731-
732-
return embeddings
733-
734-
735-
def execute_tuned_node_embeddings_clustering(parameters: Parameters) -> None:
736-
tuned_fast_random_projection = get_tuned_fast_random_projection_node_embeddings(parameters)
737-
embeddings = tuned_fast_random_projection.get_embeddings()
738-
clustering_results = coordinate_tuned_hierarchical_density_based_spatial_clustering(embeddings)
739-
if parameters.is_verbose():
740-
print("HDBSCAN clustered labels by their size descending (top 10):", clustering_results.clustering_results_distribution.head(10))
741-
print("HDBSCAN clustered labels by their probability descending (top 10):", clustering_results.clustering_results_distribution.sort_values(by='probability', ascending=False).head(10))
742-
743-
embeddings = prepare_node_embeddings_for_2d_visualization(clustering_results.embeddings)
744-
745-
tuned_fast_random_projection.write_embeddings()
746-
data_to_write = pd.DataFrame(data={
747-
'nodeElementId': embeddings["nodeElementId"],
748-
'clusteringHDBSCANLabel': embeddings['clusteringTunedHDBSCANLabel'],
749-
'clusteringHDBSCANProbability': embeddings['clusteringTunedHDBSCANProbability'],
750-
'clusteringHDBSCANNoise': (embeddings['clusteringTunedHDBSCANLabel'] == -1).astype(int),
751-
'embeddingFastRandomProjectionVisualizationX': embeddings["embeddingVisualizationX"],
752-
'embeddingFastRandomProjectionVisualizationY': embeddings["embeddingVisualizationY"],
753-
})
754-
write_batch_data_into_database(data_to_write, parameters.get_projection_node_label())
755-
756702
# ------------------------------------------------------------------------------------------------------------
757703
# MAIN
758704
# ------------------------------------------------------------------------------------------------------------

0 commit comments

Comments
 (0)