Add lists and plots for most normal non-anomalies

JohT · JohT · commit 13c4dfde8143 · 2025-07-27T11:03:24.000+02:00
diff --git a/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb b/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb
@@ -679,7 +679,7 @@
    "id": "b2cfcc56",
    "metadata": {},
    "source": [
-    "#### 1.3.b List the top 10 anomalies solely based on embeddings"
+    "#### 1.3b List the top 10 anomalies solely based on embeddings"
    ]
   },
   {
@@ -689,7 +689,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "java_package_embedding_anomaly_detection_features = java_package_anomaly_detection_features[features_for_visualization_excluded_from_training + ['embedding', 'pageRank']].copy()\n",
+    "java_package_embedding_anomaly_detection_features = java_package_anomaly_detection_features[features_for_visualization_excluded_from_training + ['embedding', 'pageRank', 'articleRank']].copy()\n",
     "java_package_embedding_anomaly_detection_input = reduce_dimensionality_of_node_embeddings(java_package_embedding_anomaly_detection_features, max_dimensions=60, target_variance=0.95)\n",
     "java_package_embedding_anomaly_detection_feature_names = embedding_feature_names = [f'pca_{i}' for i in range(java_package_embedding_anomaly_detection_input.shape[1])]\n",
     "java_package_embedding_anomaly_detection_result = tune_anomaly_detection_models(java_package_embedding_anomaly_detection_input, contamination=\"auto\")\n",
@@ -698,6 +698,135 @@
     "display(get_top_10_anomalies(java_package_embedding_anomaly_detection_features, anomaly_label_column='anomalyOfEmbeddingLabel', anomaly_score_column='anomalyOfEmbeddingScore').reset_index(drop=True))"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "b3f6ea49",
+   "metadata": {},
+   "source": [
+    "#### 1.3c List the top (most normal) non-anomalies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "68621d0a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_top_10_non_anomalies(\n",
+    "        anomaly_detected_features: pd.DataFrame, \n",
+    "        anomaly_label_column: str = \"anomalyLabel\",\n",
+    "        anomaly_score_column: str = \"anomalyScore\"\n",
+    ") -> pd.DataFrame:\n",
+    "    anomalies = anomaly_detected_features[anomaly_detected_features[anomaly_label_column] != 1]\n",
+    "    return anomalies.sort_values(by=anomaly_score_column, ascending=True).head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a926347f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "display(get_top_10_non_anomalies(java_package_anomaly_detection_features).reset_index(drop=True))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8e083c1f",
+   "metadata": {},
+   "source": [
+    "#### 1.3d List the top (most normal) non-anomalies solely based on embeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8aae5671",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "display(get_top_10_non_anomalies(java_package_embedding_anomaly_detection_features, anomaly_label_column='anomalyOfEmbeddingLabel', anomaly_score_column='anomalyOfEmbeddingScore').reset_index(drop=True))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "15ed28c9",
+   "metadata": {},
+   "source": [
+    "#### 1.3e Plot the distribution of the anomaly scores"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5d051b97",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def plot_anomaly_score_distribution(\n",
+    "        anomaly_detected_features: pd.DataFrame, \n",
+    "        anomaly_label_column: str = \"anomalyLabel\",\n",
+    "        anomaly_score_column: str = \"anomalyScore\",\n",
+    "        title_prefix: str = \"\"\n",
+    ") -> None:\n",
+    "    \"\"\"\n",
+    "    Plots the distribution of anomaly scores in the feature matrix.\n",
+    "\n",
+    "    Parameters:\n",
+    "    - anomaly_detected_features: pd.DataFrame containing anomaly labels and scores.\n",
+    "    - anomaly_label_column: Name of the column containing anomaly labels.\n",
+    "    - anomaly_score_column: Name of the column containing anomaly scores.\n",
+    "    \"\"\"\n",
+    "    plot.figure(figsize=(12, 6))\n",
+    "    plot.hist(anomaly_detected_features[anomaly_score_column], bins=50, color='blue', alpha=0.7)\n",
+    "    plot.title(f\"{title_prefix} Anomaly Score Distribution\")\n",
+    "    plot.xlabel('Anomaly Score')\n",
+    "    plot.ylabel('Frequency')\n",
+    "    plot.grid(True)\n",
+    "\n",
+    "    # Add vertical lines for anomaly thresholds\n",
+    "    anomaly_threshold = anomaly_detected_features[anomaly_detected_features[anomaly_label_column] == 1][anomaly_score_column].min()\n",
+    "    plot.axvline(anomaly_threshold, color='red', linestyle='dashed', linewidth=1, label='Anomaly Threshold')\n",
+    "    plot.legend()\n",
+    "\n",
+    "    plot.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e7cebd27",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_anomaly_score_distribution(java_package_anomaly_detection_features, title_prefix=\"Java Package\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f6274214",
+   "metadata": {},
+   "source": [
+    "#### 1.3f Plot the distribution of the anomaly scores solely based on embeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fe2beda3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_anomaly_score_distribution(\n",
+    "    java_package_embedding_anomaly_detection_features, \n",
+    "    anomaly_label_column='anomalyOfEmbeddingLabel',\n",
+    "    anomaly_score_column='anomalyOfEmbeddingScore',\n",
+    "    title_prefix=\"Java Package Embeddings\"\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "a3936d79",
@@ -724,7 +853,7 @@
     "    cluster_size_column: str = \"clusterSize\",\n",
     "    anomaly_label_column: str = \"anomalyLabel\",\n",
     "    anomaly_score_column: str = \"anomalyScore\",\n",
-    "    page_rank_column: str = \"pageRank\",\n",
+    "    size_column: str = \"articleRank\",\n",
     "    x_position_column: str = 'embeddingVisualizationX',\n",
     "    y_position_column: str = 'embeddingVisualizationY',\n",
     ") -> None:\n",
@@ -744,13 +873,13 @@
     "    cluster_non_noise = cluster_without_anomalies[cluster_without_anomalies[cluster_label_column] != -1]\n",
     "\n",
     "    plot.figure(figsize=(10, 10))\n",
-    "    plot.title(title_prefix + ' (size=PageRank, color=ClusterLabel, red=Anomaly)')\n",
+    "    plot.title(f\"{title_prefix} (size={size_column}, main-color=cluster, red=anomaly, green=non-anomaly)\")\n",
     "\n",
-    "    # Plot noise\n",
+    "    # Plot noise (from clustering)\n",
     "    plot.scatter(\n",
     "        x=cluster_noise[x_position_column],\n",
     "        y=cluster_noise[y_position_column],\n",
-    "        s=cluster_noise[page_rank_column] * 200 + 4,\n",
+    "        s=cluster_noise[size_column] * 200 + 4,\n",
     "        color='lightgrey',\n",
     "        alpha=0.5,\n",
     "        label='Noise'\n",
@@ -760,7 +889,7 @@
     "    plot.scatter(\n",
     "        x=cluster_non_noise[x_position_column],\n",
     "        y=cluster_non_noise[y_position_column],\n",
-    "        s=cluster_non_noise[page_rank_column] * 200 + 4,\n",
+    "        s=cluster_non_noise[size_column] * 200 + 4,\n",
     "        c=cluster_non_noise[cluster_label_column],\n",
     "        cmap='tab20',\n",
     "        alpha=0.7,\n",
@@ -771,7 +900,7 @@
     "    plot.scatter(\n",
     "        x=cluster_anomalies[x_position_column],\n",
     "        y=cluster_anomalies[y_position_column],\n",
-    "        s=cluster_anomalies[page_rank_column] * 200 + 4,\n",
+    "        s=cluster_anomalies[size_column] * 200 + 4,\n",
     "        c=cluster_anomalies[anomaly_score_column],\n",
     "        cmap=\"Reds\",\n",
     "        alpha=0.9,\n",
@@ -782,18 +911,32 @@
     "    cluster_medoids = cluster_non_noise[cluster_non_noise[cluster_medoid_column] == 1].sort_values(by=cluster_size_column, ascending=False).head(20)\n",
     "    for index, row in cluster_medoids.iterrows():\n",
     "        plot.annotate(\n",
-    "            text=f\"{row[cluster_label_column]}:{truncate(row[code_unit_column], 20)} ({row[anomaly_score_column]:.4f})\",\n",
+    "            text=f\"{truncate(row[code_unit_column], 20)} (cluster {row[cluster_label_column]})\",\n",
     "            xy=(row[x_position_column], row[y_position_column]),\n",
     "            xytext=(5, 5),\n",
     "            alpha=0.4,\n",
     "            **plot_annotation_style\n",
     "        )\n",
     "\n",
+    "    # Annotate top non-anomalies\n",
+    "    non_anomalies = cluster_without_anomalies.sort_values(by=anomaly_score_column, ascending=True).reset_index(drop=True).head(5)\n",
+    "    for dataframe_index, row in non_anomalies.iterrows():\n",
+    "        index = typing.cast(int, dataframe_index)\n",
+    "        plot.annotate(\n",
+    "            text=f\"#{index + 1}: {truncate(row[code_unit_column], 20)} ({row[anomaly_score_column]:.3f})\",\n",
+    "            xy=(row[x_position_column], row[y_position_column]),\n",
+    "            xytext=(5, 5 + (index % 5) * 10),\n",
+    "            color='green',\n",
+    "            alpha=0.7,\n",
+    "            **plot_annotation_style\n",
+    "        )\n",
+    "\n",
+    "    # Annotate top anomalies\n",
     "    anomalies = cluster_anomalies.sort_values(by=anomaly_score_column, ascending=False).reset_index(drop=True).head(10)\n",
     "    for dataframe_index, row in anomalies.iterrows():\n",
     "        index = typing.cast(int, dataframe_index)\n",
     "        plot.annotate(\n",
-    "            text=f\"{row[cluster_label_column]}:{truncate(row[code_unit_column], 20)} ({row[anomaly_score_column]:.4f})\",\n",
+    "            text=f\"#{index + 1}: {truncate(row[code_unit_column], 20)} ({row[anomaly_score_column]:.3f})\",\n",
     "            xy=(row[x_position_column], row[y_position_column]),\n",
     "            xytext=(5, 5 + (index % 5) * 10),\n",
     "            color='red',\n",
@@ -1507,7 +1650,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "java_type_embedding_anomaly_detection_features = java_type_anomaly_detection_features[features_for_visualization_excluded_from_training + ['embedding', 'pageRank']].copy()\n",
+    "java_type_embedding_anomaly_detection_features = java_type_anomaly_detection_features[features_for_visualization_excluded_from_training + ['embedding', 'pageRank', 'articleRank']].copy()\n",
     "java_type_embedding_anomaly_detection_input = reduce_dimensionality_of_node_embeddings(java_type_embedding_anomaly_detection_features, max_dimensions=60, target_variance=0.95)\n",
     "java_type_embedding_anomaly_detection_feature_names = embedding_feature_names = [f'pca_{i}' for i in range(java_type_embedding_anomaly_detection_input.shape[1])]\n",
     "java_type_embedding_anomaly_detection_result = tune_anomaly_detection_models(java_type_embedding_anomaly_detection_input, contamination=\"auto\")\n",
@@ -1516,6 +1659,83 @@
     "display(get_top_10_anomalies(java_type_embedding_anomaly_detection_features, anomaly_label_column='anomalyOfEmbeddingLabel', anomaly_score_column='anomalyOfEmbeddingScore').reset_index(drop=True))"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "1bac51eb",
+   "metadata": {},
+   "source": [
+    "#### 1.3c List the top (most normal) non-anomalies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6005ff1e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "display(get_top_10_non_anomalies(java_type_anomaly_detection_features).reset_index(drop=True))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7e52ffa2",
+   "metadata": {},
+   "source": [
+    "#### 1.3d List the top (most normal) non-anomalies solely based on embeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7cfc7d61",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "display(get_top_10_non_anomalies(java_type_embedding_anomaly_detection_features, anomaly_label_column='anomalyOfEmbeddingLabel', anomaly_score_column='anomalyOfEmbeddingScore').reset_index(drop=True))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3b635a0e",
+   "metadata": {},
+   "source": [
+    "#### 1.3e Plot the distribution of the anomaly scores"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "40be411a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_anomaly_score_distribution(java_type_anomaly_detection_features, title_prefix=\"Java Type\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1269582b",
+   "metadata": {},
+   "source": [
+    "#### 1.3f Plot the distribution of the anomaly scores solely based on embeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d676af42",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_anomaly_score_distribution(\n",
+    "    java_type_embedding_anomaly_detection_features, \n",
+    "    anomaly_label_column='anomalyOfEmbeddingLabel',\n",
+    "    anomaly_score_column='anomalyOfEmbeddingScore',\n",
+    "    title_prefix=\"Java Type Embeddings\"\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "68a00628",