Skip to content

Commit 13c4dfd

Browse files
committed
Add lists and plots for most normal non-anomalies
1 parent fecd491 commit 13c4dfd

File tree

1 file changed

+231
-11
lines changed

1 file changed

+231
-11
lines changed

domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb

Lines changed: 231 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -679,7 +679,7 @@
679679
"id": "b2cfcc56",
680680
"metadata": {},
681681
"source": [
682-
"#### 1.3.b List the top 10 anomalies solely based on embeddings"
682+
"#### 1.3b List the top 10 anomalies solely based on embeddings"
683683
]
684684
},
685685
{
@@ -689,7 +689,7 @@
689689
"metadata": {},
690690
"outputs": [],
691691
"source": [
692-
"java_package_embedding_anomaly_detection_features = java_package_anomaly_detection_features[features_for_visualization_excluded_from_training + ['embedding', 'pageRank']].copy()\n",
692+
"java_package_embedding_anomaly_detection_features = java_package_anomaly_detection_features[features_for_visualization_excluded_from_training + ['embedding', 'pageRank', 'articleRank']].copy()\n",
693693
"java_package_embedding_anomaly_detection_input = reduce_dimensionality_of_node_embeddings(java_package_embedding_anomaly_detection_features, max_dimensions=60, target_variance=0.95)\n",
694694
"java_package_embedding_anomaly_detection_feature_names = embedding_feature_names = [f'pca_{i}' for i in range(java_package_embedding_anomaly_detection_input.shape[1])]\n",
695695
"java_package_embedding_anomaly_detection_result = tune_anomaly_detection_models(java_package_embedding_anomaly_detection_input, contamination=\"auto\")\n",
@@ -698,6 +698,135 @@
698698
"display(get_top_10_anomalies(java_package_embedding_anomaly_detection_features, anomaly_label_column='anomalyOfEmbeddingLabel', anomaly_score_column='anomalyOfEmbeddingScore').reset_index(drop=True))"
699699
]
700700
},
701+
{
702+
"cell_type": "markdown",
703+
"id": "b3f6ea49",
704+
"metadata": {},
705+
"source": [
706+
"#### 1.3c List the top (most normal) non-anomalies"
707+
]
708+
},
709+
{
710+
"cell_type": "code",
711+
"execution_count": null,
712+
"id": "68621d0a",
713+
"metadata": {},
714+
"outputs": [],
715+
"source": [
716+
"def get_top_10_non_anomalies(\n",
717+
" anomaly_detected_features: pd.DataFrame, \n",
718+
" anomaly_label_column: str = \"anomalyLabel\",\n",
719+
" anomaly_score_column: str = \"anomalyScore\"\n",
720+
") -> pd.DataFrame:\n",
721+
" anomalies = anomaly_detected_features[anomaly_detected_features[anomaly_label_column] != 1]\n",
722+
" return anomalies.sort_values(by=anomaly_score_column, ascending=True).head(10)"
723+
]
724+
},
725+
{
726+
"cell_type": "code",
727+
"execution_count": null,
728+
"id": "a926347f",
729+
"metadata": {},
730+
"outputs": [],
731+
"source": [
732+
"display(get_top_10_non_anomalies(java_package_anomaly_detection_features).reset_index(drop=True))"
733+
]
734+
},
735+
{
736+
"cell_type": "markdown",
737+
"id": "8e083c1f",
738+
"metadata": {},
739+
"source": [
740+
"#### 1.3d List the top (most normal) non-anomalies solely based on embeddings"
741+
]
742+
},
743+
{
744+
"cell_type": "code",
745+
"execution_count": null,
746+
"id": "8aae5671",
747+
"metadata": {},
748+
"outputs": [],
749+
"source": [
750+
"display(get_top_10_non_anomalies(java_package_embedding_anomaly_detection_features, anomaly_label_column='anomalyOfEmbeddingLabel', anomaly_score_column='anomalyOfEmbeddingScore').reset_index(drop=True))"
751+
]
752+
},
753+
{
754+
"cell_type": "markdown",
755+
"id": "15ed28c9",
756+
"metadata": {},
757+
"source": [
758+
"#### 1.3e Plot the distribution of the anomaly scores"
759+
]
760+
},
761+
{
762+
"cell_type": "code",
763+
"execution_count": null,
764+
"id": "5d051b97",
765+
"metadata": {},
766+
"outputs": [],
767+
"source": [
768+
"def plot_anomaly_score_distribution(\n",
769+
" anomaly_detected_features: pd.DataFrame, \n",
770+
" anomaly_label_column: str = \"anomalyLabel\",\n",
771+
" anomaly_score_column: str = \"anomalyScore\",\n",
772+
" title_prefix: str = \"\"\n",
773+
") -> None:\n",
774+
" \"\"\"\n",
775+
" Plots the distribution of anomaly scores in the feature matrix.\n",
776+
"\n",
777+
" Parameters:\n",
778+
" - anomaly_detected_features: pd.DataFrame containing anomaly labels and scores.\n",
779+
" - anomaly_label_column: Name of the column containing anomaly labels.\n",
780+
" - anomaly_score_column: Name of the column containing anomaly scores.\n",
781+
" \"\"\"\n",
782+
" plot.figure(figsize=(12, 6))\n",
783+
" plot.hist(anomaly_detected_features[anomaly_score_column], bins=50, color='blue', alpha=0.7)\n",
784+
" plot.title(f\"{title_prefix} Anomaly Score Distribution\")\n",
785+
" plot.xlabel('Anomaly Score')\n",
786+
" plot.ylabel('Frequency')\n",
787+
" plot.grid(True)\n",
788+
"\n",
789+
" # Add vertical lines for anomaly thresholds\n",
790+
" anomaly_threshold = anomaly_detected_features[anomaly_detected_features[anomaly_label_column] == 1][anomaly_score_column].min()\n",
791+
" plot.axvline(anomaly_threshold, color='red', linestyle='dashed', linewidth=1, label='Anomaly Threshold')\n",
792+
" plot.legend()\n",
793+
"\n",
794+
" plot.show()"
795+
]
796+
},
797+
{
798+
"cell_type": "code",
799+
"execution_count": null,
800+
"id": "e7cebd27",
801+
"metadata": {},
802+
"outputs": [],
803+
"source": [
804+
"plot_anomaly_score_distribution(java_package_anomaly_detection_features, title_prefix=\"Java Package\")"
805+
]
806+
},
807+
{
808+
"cell_type": "markdown",
809+
"id": "f6274214",
810+
"metadata": {},
811+
"source": [
812+
"#### 1.3f Plot the distribution of the anomaly scores solely based on embeddings"
813+
]
814+
},
815+
{
816+
"cell_type": "code",
817+
"execution_count": null,
818+
"id": "fe2beda3",
819+
"metadata": {},
820+
"outputs": [],
821+
"source": [
822+
"plot_anomaly_score_distribution(\n",
823+
" java_package_embedding_anomaly_detection_features, \n",
824+
" anomaly_label_column='anomalyOfEmbeddingLabel',\n",
825+
" anomaly_score_column='anomalyOfEmbeddingScore',\n",
826+
" title_prefix=\"Java Package Embeddings\"\n",
827+
")"
828+
]
829+
},
701830
{
702831
"cell_type": "markdown",
703832
"id": "a3936d79",
@@ -724,7 +853,7 @@
724853
" cluster_size_column: str = \"clusterSize\",\n",
725854
" anomaly_label_column: str = \"anomalyLabel\",\n",
726855
" anomaly_score_column: str = \"anomalyScore\",\n",
727-
" page_rank_column: str = \"pageRank\",\n",
856+
" size_column: str = \"articleRank\",\n",
728857
" x_position_column: str = 'embeddingVisualizationX',\n",
729858
" y_position_column: str = 'embeddingVisualizationY',\n",
730859
") -> None:\n",
@@ -744,13 +873,13 @@
744873
" cluster_non_noise = cluster_without_anomalies[cluster_without_anomalies[cluster_label_column] != -1]\n",
745874
"\n",
746875
" plot.figure(figsize=(10, 10))\n",
747-
" plot.title(title_prefix + ' (size=PageRank, color=ClusterLabel, red=Anomaly)')\n",
876+
" plot.title(f\"{title_prefix} (size={size_column}, main-color=cluster, red=anomaly, green=non-anomaly)\")\n",
748877
"\n",
749-
" # Plot noise\n",
878+
" # Plot noise (from clustering)\n",
750879
" plot.scatter(\n",
751880
" x=cluster_noise[x_position_column],\n",
752881
" y=cluster_noise[y_position_column],\n",
753-
" s=cluster_noise[page_rank_column] * 200 + 4,\n",
882+
" s=cluster_noise[size_column] * 200 + 4,\n",
754883
" color='lightgrey',\n",
755884
" alpha=0.5,\n",
756885
" label='Noise'\n",
@@ -760,7 +889,7 @@
760889
" plot.scatter(\n",
761890
" x=cluster_non_noise[x_position_column],\n",
762891
" y=cluster_non_noise[y_position_column],\n",
763-
" s=cluster_non_noise[page_rank_column] * 200 + 4,\n",
892+
" s=cluster_non_noise[size_column] * 200 + 4,\n",
764893
" c=cluster_non_noise[cluster_label_column],\n",
765894
" cmap='tab20',\n",
766895
" alpha=0.7,\n",
@@ -771,7 +900,7 @@
771900
" plot.scatter(\n",
772901
" x=cluster_anomalies[x_position_column],\n",
773902
" y=cluster_anomalies[y_position_column],\n",
774-
" s=cluster_anomalies[page_rank_column] * 200 + 4,\n",
903+
" s=cluster_anomalies[size_column] * 200 + 4,\n",
775904
" c=cluster_anomalies[anomaly_score_column],\n",
776905
" cmap=\"Reds\",\n",
777906
" alpha=0.9,\n",
@@ -782,18 +911,32 @@
782911
" cluster_medoids = cluster_non_noise[cluster_non_noise[cluster_medoid_column] == 1].sort_values(by=cluster_size_column, ascending=False).head(20)\n",
783912
" for index, row in cluster_medoids.iterrows():\n",
784913
" plot.annotate(\n",
785-
" text=f\"{row[cluster_label_column]}:{truncate(row[code_unit_column], 20)} ({row[anomaly_score_column]:.4f})\",\n",
914+
" text=f\"{truncate(row[code_unit_column], 20)} (cluster {row[cluster_label_column]})\",\n",
786915
" xy=(row[x_position_column], row[y_position_column]),\n",
787916
" xytext=(5, 5),\n",
788917
" alpha=0.4,\n",
789918
" **plot_annotation_style\n",
790919
" )\n",
791920
"\n",
921+
" # Annotate top non-anomalies\n",
922+
" non_anomalies = cluster_without_anomalies.sort_values(by=anomaly_score_column, ascending=True).reset_index(drop=True).head(5)\n",
923+
" for dataframe_index, row in non_anomalies.iterrows():\n",
924+
" index = typing.cast(int, dataframe_index)\n",
925+
" plot.annotate(\n",
926+
" text=f\"#{index + 1}: {truncate(row[code_unit_column], 20)} ({row[anomaly_score_column]:.3f})\",\n",
927+
" xy=(row[x_position_column], row[y_position_column]),\n",
928+
" xytext=(5, 5 + (index % 5) * 10),\n",
929+
" color='green',\n",
930+
" alpha=0.7,\n",
931+
" **plot_annotation_style\n",
932+
" )\n",
933+
"\n",
934+
" # Annotate top anomalies\n",
792935
" anomalies = cluster_anomalies.sort_values(by=anomaly_score_column, ascending=False).reset_index(drop=True).head(10)\n",
793936
" for dataframe_index, row in anomalies.iterrows():\n",
794937
" index = typing.cast(int, dataframe_index)\n",
795938
" plot.annotate(\n",
796-
" text=f\"{row[cluster_label_column]}:{truncate(row[code_unit_column], 20)} ({row[anomaly_score_column]:.4f})\",\n",
939+
" text=f\"#{index + 1}: {truncate(row[code_unit_column], 20)} ({row[anomaly_score_column]:.3f})\",\n",
797940
" xy=(row[x_position_column], row[y_position_column]),\n",
798941
" xytext=(5, 5 + (index % 5) * 10),\n",
799942
" color='red',\n",
@@ -1507,7 +1650,7 @@
15071650
"metadata": {},
15081651
"outputs": [],
15091652
"source": [
1510-
"java_type_embedding_anomaly_detection_features = java_type_anomaly_detection_features[features_for_visualization_excluded_from_training + ['embedding', 'pageRank']].copy()\n",
1653+
"java_type_embedding_anomaly_detection_features = java_type_anomaly_detection_features[features_for_visualization_excluded_from_training + ['embedding', 'pageRank', 'articleRank']].copy()\n",
15111654
"java_type_embedding_anomaly_detection_input = reduce_dimensionality_of_node_embeddings(java_type_embedding_anomaly_detection_features, max_dimensions=60, target_variance=0.95)\n",
15121655
"java_type_embedding_anomaly_detection_feature_names = embedding_feature_names = [f'pca_{i}' for i in range(java_type_embedding_anomaly_detection_input.shape[1])]\n",
15131656
"java_type_embedding_anomaly_detection_result = tune_anomaly_detection_models(java_type_embedding_anomaly_detection_input, contamination=\"auto\")\n",
@@ -1516,6 +1659,83 @@
15161659
"display(get_top_10_anomalies(java_type_embedding_anomaly_detection_features, anomaly_label_column='anomalyOfEmbeddingLabel', anomaly_score_column='anomalyOfEmbeddingScore').reset_index(drop=True))"
15171660
]
15181661
},
1662+
{
1663+
"cell_type": "markdown",
1664+
"id": "1bac51eb",
1665+
"metadata": {},
1666+
"source": [
1667+
"#### 1.3c List the top (most normal) non-anomalies"
1668+
]
1669+
},
1670+
{
1671+
"cell_type": "code",
1672+
"execution_count": null,
1673+
"id": "6005ff1e",
1674+
"metadata": {},
1675+
"outputs": [],
1676+
"source": [
1677+
"display(get_top_10_non_anomalies(java_type_anomaly_detection_features).reset_index(drop=True))"
1678+
]
1679+
},
1680+
{
1681+
"cell_type": "markdown",
1682+
"id": "7e52ffa2",
1683+
"metadata": {},
1684+
"source": [
1685+
"#### 1.3d List the top (most normal) non-anomalies solely based on embeddings"
1686+
]
1687+
},
1688+
{
1689+
"cell_type": "code",
1690+
"execution_count": null,
1691+
"id": "7cfc7d61",
1692+
"metadata": {},
1693+
"outputs": [],
1694+
"source": [
1695+
"display(get_top_10_non_anomalies(java_type_embedding_anomaly_detection_features, anomaly_label_column='anomalyOfEmbeddingLabel', anomaly_score_column='anomalyOfEmbeddingScore').reset_index(drop=True))"
1696+
]
1697+
},
1698+
{
1699+
"cell_type": "markdown",
1700+
"id": "3b635a0e",
1701+
"metadata": {},
1702+
"source": [
1703+
"#### 1.3e Plot the distribution of the anomaly scores"
1704+
]
1705+
},
1706+
{
1707+
"cell_type": "code",
1708+
"execution_count": null,
1709+
"id": "40be411a",
1710+
"metadata": {},
1711+
"outputs": [],
1712+
"source": [
1713+
"plot_anomaly_score_distribution(java_type_anomaly_detection_features, title_prefix=\"Java Type\")"
1714+
]
1715+
},
1716+
{
1717+
"cell_type": "markdown",
1718+
"id": "1269582b",
1719+
"metadata": {},
1720+
"source": [
1721+
"#### 1.3f Plot the distribution of the anomaly scores solely based on embeddings"
1722+
]
1723+
},
1724+
{
1725+
"cell_type": "code",
1726+
"execution_count": null,
1727+
"id": "d676af42",
1728+
"metadata": {},
1729+
"outputs": [],
1730+
"source": [
1731+
"plot_anomaly_score_distribution(\n",
1732+
" java_type_embedding_anomaly_detection_features, \n",
1733+
" anomaly_label_column='anomalyOfEmbeddingLabel',\n",
1734+
" anomaly_score_column='anomalyOfEmbeddingScore',\n",
1735+
" title_prefix=\"Java Type Embeddings\"\n",
1736+
")"
1737+
]
1738+
},
15191739
{
15201740
"cell_type": "markdown",
15211741
"id": "68a00628",

0 commit comments

Comments
 (0)