|
679 | 679 | "id": "b2cfcc56",
|
680 | 680 | "metadata": {},
|
681 | 681 | "source": [
|
682 |
| - "#### 1.3.b List the top 10 anomalies solely based on embeddings" |
| 682 | + "#### 1.3b List the top 10 anomalies solely based on embeddings" |
683 | 683 | ]
|
684 | 684 | },
|
685 | 685 | {
|
|
689 | 689 | "metadata": {},
|
690 | 690 | "outputs": [],
|
691 | 691 | "source": [
|
692 |
| - "java_package_embedding_anomaly_detection_features = java_package_anomaly_detection_features[features_for_visualization_excluded_from_training + ['embedding', 'pageRank']].copy()\n", |
| 692 | + "java_package_embedding_anomaly_detection_features = java_package_anomaly_detection_features[features_for_visualization_excluded_from_training + ['embedding', 'pageRank', 'articleRank']].copy()\n", |
693 | 693 | "java_package_embedding_anomaly_detection_input = reduce_dimensionality_of_node_embeddings(java_package_embedding_anomaly_detection_features, max_dimensions=60, target_variance=0.95)\n",
|
694 | 694 | "java_package_embedding_anomaly_detection_feature_names = embedding_feature_names = [f'pca_{i}' for i in range(java_package_embedding_anomaly_detection_input.shape[1])]\n",
|
695 | 695 | "java_package_embedding_anomaly_detection_result = tune_anomaly_detection_models(java_package_embedding_anomaly_detection_input, contamination=\"auto\")\n",
|
|
698 | 698 | "display(get_top_10_anomalies(java_package_embedding_anomaly_detection_features, anomaly_label_column='anomalyOfEmbeddingLabel', anomaly_score_column='anomalyOfEmbeddingScore').reset_index(drop=True))"
|
699 | 699 | ]
|
700 | 700 | },
|
| 701 | + { |
| 702 | + "cell_type": "markdown", |
| 703 | + "id": "b3f6ea49", |
| 704 | + "metadata": {}, |
| 705 | + "source": [ |
| 706 | + "#### 1.3c List the top (most normal) non-anomalies" |
| 707 | + ] |
| 708 | + }, |
| 709 | + { |
| 710 | + "cell_type": "code", |
| 711 | + "execution_count": null, |
| 712 | + "id": "68621d0a", |
| 713 | + "metadata": {}, |
| 714 | + "outputs": [], |
| 715 | + "source": [ |
| 716 | + "def get_top_10_non_anomalies(\n", |
| 717 | + " anomaly_detected_features: pd.DataFrame, \n", |
| 718 | + " anomaly_label_column: str = \"anomalyLabel\",\n", |
| 719 | + " anomaly_score_column: str = \"anomalyScore\"\n", |
| 720 | + ") -> pd.DataFrame:\n", |
| 721 | + " anomalies = anomaly_detected_features[anomaly_detected_features[anomaly_label_column] != 1]\n", |
| 722 | + " return anomalies.sort_values(by=anomaly_score_column, ascending=True).head(10)" |
| 723 | + ] |
| 724 | + }, |
| 725 | + { |
| 726 | + "cell_type": "code", |
| 727 | + "execution_count": null, |
| 728 | + "id": "a926347f", |
| 729 | + "metadata": {}, |
| 730 | + "outputs": [], |
| 731 | + "source": [ |
| 732 | + "display(get_top_10_non_anomalies(java_package_anomaly_detection_features).reset_index(drop=True))" |
| 733 | + ] |
| 734 | + }, |
| 735 | + { |
| 736 | + "cell_type": "markdown", |
| 737 | + "id": "8e083c1f", |
| 738 | + "metadata": {}, |
| 739 | + "source": [ |
| 740 | + "#### 1.3d List the top (most normal) non-anomalies solely based on embeddings" |
| 741 | + ] |
| 742 | + }, |
| 743 | + { |
| 744 | + "cell_type": "code", |
| 745 | + "execution_count": null, |
| 746 | + "id": "8aae5671", |
| 747 | + "metadata": {}, |
| 748 | + "outputs": [], |
| 749 | + "source": [ |
| 750 | + "display(get_top_10_non_anomalies(java_package_embedding_anomaly_detection_features, anomaly_label_column='anomalyOfEmbeddingLabel', anomaly_score_column='anomalyOfEmbeddingScore').reset_index(drop=True))" |
| 751 | + ] |
| 752 | + }, |
| 753 | + { |
| 754 | + "cell_type": "markdown", |
| 755 | + "id": "15ed28c9", |
| 756 | + "metadata": {}, |
| 757 | + "source": [ |
| 758 | + "#### 1.3e Plot the distribution of the anomaly scores" |
| 759 | + ] |
| 760 | + }, |
| 761 | + { |
| 762 | + "cell_type": "code", |
| 763 | + "execution_count": null, |
| 764 | + "id": "5d051b97", |
| 765 | + "metadata": {}, |
| 766 | + "outputs": [], |
| 767 | + "source": [ |
| 768 | + "def plot_anomaly_score_distribution(\n", |
| 769 | + " anomaly_detected_features: pd.DataFrame, \n", |
| 770 | + " anomaly_label_column: str = \"anomalyLabel\",\n", |
| 771 | + " anomaly_score_column: str = \"anomalyScore\",\n", |
| 772 | + " title_prefix: str = \"\"\n", |
| 773 | + ") -> None:\n", |
| 774 | + " \"\"\"\n", |
| 775 | + " Plots the distribution of anomaly scores in the feature matrix.\n", |
| 776 | + "\n", |
| 777 | + " Parameters:\n", |
| 778 | + " - anomaly_detected_features: pd.DataFrame containing anomaly labels and scores.\n", |
| 779 | + " - anomaly_label_column: Name of the column containing anomaly labels.\n", |
| 780 | + " - anomaly_score_column: Name of the column containing anomaly scores.\n", |
| 781 | + " \"\"\"\n", |
| 782 | + " plot.figure(figsize=(12, 6))\n", |
| 783 | + " plot.hist(anomaly_detected_features[anomaly_score_column], bins=50, color='blue', alpha=0.7)\n", |
| 784 | + " plot.title(f\"{title_prefix} Anomaly Score Distribution\")\n", |
| 785 | + " plot.xlabel('Anomaly Score')\n", |
| 786 | + " plot.ylabel('Frequency')\n", |
| 787 | + " plot.grid(True)\n", |
| 788 | + "\n", |
| 789 | + " # Add vertical lines for anomaly thresholds\n", |
| 790 | + " anomaly_threshold = anomaly_detected_features[anomaly_detected_features[anomaly_label_column] == 1][anomaly_score_column].min()\n", |
| 791 | + " plot.axvline(anomaly_threshold, color='red', linestyle='dashed', linewidth=1, label='Anomaly Threshold')\n", |
| 792 | + " plot.legend()\n", |
| 793 | + "\n", |
| 794 | + " plot.show()" |
| 795 | + ] |
| 796 | + }, |
| 797 | + { |
| 798 | + "cell_type": "code", |
| 799 | + "execution_count": null, |
| 800 | + "id": "e7cebd27", |
| 801 | + "metadata": {}, |
| 802 | + "outputs": [], |
| 803 | + "source": [ |
| 804 | + "plot_anomaly_score_distribution(java_package_anomaly_detection_features, title_prefix=\"Java Package\")" |
| 805 | + ] |
| 806 | + }, |
| 807 | + { |
| 808 | + "cell_type": "markdown", |
| 809 | + "id": "f6274214", |
| 810 | + "metadata": {}, |
| 811 | + "source": [ |
| 812 | + "#### 1.3f Plot the distribution of the anomaly scores solely based on embeddings" |
| 813 | + ] |
| 814 | + }, |
| 815 | + { |
| 816 | + "cell_type": "code", |
| 817 | + "execution_count": null, |
| 818 | + "id": "fe2beda3", |
| 819 | + "metadata": {}, |
| 820 | + "outputs": [], |
| 821 | + "source": [ |
| 822 | + "plot_anomaly_score_distribution(\n", |
| 823 | + " java_package_embedding_anomaly_detection_features, \n", |
| 824 | + " anomaly_label_column='anomalyOfEmbeddingLabel',\n", |
| 825 | + " anomaly_score_column='anomalyOfEmbeddingScore',\n", |
| 826 | + " title_prefix=\"Java Package Embeddings\"\n", |
| 827 | + ")" |
| 828 | + ] |
| 829 | + }, |
701 | 830 | {
|
702 | 831 | "cell_type": "markdown",
|
703 | 832 | "id": "a3936d79",
|
|
724 | 853 | " cluster_size_column: str = \"clusterSize\",\n",
|
725 | 854 | " anomaly_label_column: str = \"anomalyLabel\",\n",
|
726 | 855 | " anomaly_score_column: str = \"anomalyScore\",\n",
|
727 |
| - " page_rank_column: str = \"pageRank\",\n", |
| 856 | + " size_column: str = \"articleRank\",\n", |
728 | 857 | " x_position_column: str = 'embeddingVisualizationX',\n",
|
729 | 858 | " y_position_column: str = 'embeddingVisualizationY',\n",
|
730 | 859 | ") -> None:\n",
|
|
744 | 873 | " cluster_non_noise = cluster_without_anomalies[cluster_without_anomalies[cluster_label_column] != -1]\n",
|
745 | 874 | "\n",
|
746 | 875 | " plot.figure(figsize=(10, 10))\n",
|
747 |
| - " plot.title(title_prefix + ' (size=PageRank, color=ClusterLabel, red=Anomaly)')\n", |
| 876 | + " plot.title(f\"{title_prefix} (size={size_column}, main-color=cluster, red=anomaly, green=non-anomaly)\")\n", |
748 | 877 | "\n",
|
749 |
| - " # Plot noise\n", |
| 878 | + " # Plot noise (from clustering)\n", |
750 | 879 | " plot.scatter(\n",
|
751 | 880 | " x=cluster_noise[x_position_column],\n",
|
752 | 881 | " y=cluster_noise[y_position_column],\n",
|
753 |
| - " s=cluster_noise[page_rank_column] * 200 + 4,\n", |
| 882 | + " s=cluster_noise[size_column] * 200 + 4,\n", |
754 | 883 | " color='lightgrey',\n",
|
755 | 884 | " alpha=0.5,\n",
|
756 | 885 | " label='Noise'\n",
|
|
760 | 889 | " plot.scatter(\n",
|
761 | 890 | " x=cluster_non_noise[x_position_column],\n",
|
762 | 891 | " y=cluster_non_noise[y_position_column],\n",
|
763 |
| - " s=cluster_non_noise[page_rank_column] * 200 + 4,\n", |
| 892 | + " s=cluster_non_noise[size_column] * 200 + 4,\n", |
764 | 893 | " c=cluster_non_noise[cluster_label_column],\n",
|
765 | 894 | " cmap='tab20',\n",
|
766 | 895 | " alpha=0.7,\n",
|
|
771 | 900 | " plot.scatter(\n",
|
772 | 901 | " x=cluster_anomalies[x_position_column],\n",
|
773 | 902 | " y=cluster_anomalies[y_position_column],\n",
|
774 |
| - " s=cluster_anomalies[page_rank_column] * 200 + 4,\n", |
| 903 | + " s=cluster_anomalies[size_column] * 200 + 4,\n", |
775 | 904 | " c=cluster_anomalies[anomaly_score_column],\n",
|
776 | 905 | " cmap=\"Reds\",\n",
|
777 | 906 | " alpha=0.9,\n",
|
|
782 | 911 | " cluster_medoids = cluster_non_noise[cluster_non_noise[cluster_medoid_column] == 1].sort_values(by=cluster_size_column, ascending=False).head(20)\n",
|
783 | 912 | " for index, row in cluster_medoids.iterrows():\n",
|
784 | 913 | " plot.annotate(\n",
|
785 |
| - " text=f\"{row[cluster_label_column]}:{truncate(row[code_unit_column], 20)} ({row[anomaly_score_column]:.4f})\",\n", |
| 914 | + " text=f\"{truncate(row[code_unit_column], 20)} (cluster {row[cluster_label_column]})\",\n", |
786 | 915 | " xy=(row[x_position_column], row[y_position_column]),\n",
|
787 | 916 | " xytext=(5, 5),\n",
|
788 | 917 | " alpha=0.4,\n",
|
789 | 918 | " **plot_annotation_style\n",
|
790 | 919 | " )\n",
|
791 | 920 | "\n",
|
| 921 | + " # Annotate top non-anomalies\n", |
| 922 | + " non_anomalies = cluster_without_anomalies.sort_values(by=anomaly_score_column, ascending=True).reset_index(drop=True).head(5)\n", |
| 923 | + " for dataframe_index, row in non_anomalies.iterrows():\n", |
| 924 | + " index = typing.cast(int, dataframe_index)\n", |
| 925 | + " plot.annotate(\n", |
| 926 | + " text=f\"#{index + 1}: {truncate(row[code_unit_column], 20)} ({row[anomaly_score_column]:.3f})\",\n", |
| 927 | + " xy=(row[x_position_column], row[y_position_column]),\n", |
| 928 | + " xytext=(5, 5 + (index % 5) * 10),\n", |
| 929 | + " color='green',\n", |
| 930 | + " alpha=0.7,\n", |
| 931 | + " **plot_annotation_style\n", |
| 932 | + " )\n", |
| 933 | + "\n", |
| 934 | + " # Annotate top anomalies\n", |
792 | 935 | " anomalies = cluster_anomalies.sort_values(by=anomaly_score_column, ascending=False).reset_index(drop=True).head(10)\n",
|
793 | 936 | " for dataframe_index, row in anomalies.iterrows():\n",
|
794 | 937 | " index = typing.cast(int, dataframe_index)\n",
|
795 | 938 | " plot.annotate(\n",
|
796 |
| - " text=f\"{row[cluster_label_column]}:{truncate(row[code_unit_column], 20)} ({row[anomaly_score_column]:.4f})\",\n", |
| 939 | + " text=f\"#{index + 1}: {truncate(row[code_unit_column], 20)} ({row[anomaly_score_column]:.3f})\",\n", |
797 | 940 | " xy=(row[x_position_column], row[y_position_column]),\n",
|
798 | 941 | " xytext=(5, 5 + (index % 5) * 10),\n",
|
799 | 942 | " color='red',\n",
|
|
1507 | 1650 | "metadata": {},
|
1508 | 1651 | "outputs": [],
|
1509 | 1652 | "source": [
|
1510 |
| - "java_type_embedding_anomaly_detection_features = java_type_anomaly_detection_features[features_for_visualization_excluded_from_training + ['embedding', 'pageRank']].copy()\n", |
| 1653 | + "java_type_embedding_anomaly_detection_features = java_type_anomaly_detection_features[features_for_visualization_excluded_from_training + ['embedding', 'pageRank', 'articleRank']].copy()\n", |
1511 | 1654 | "java_type_embedding_anomaly_detection_input = reduce_dimensionality_of_node_embeddings(java_type_embedding_anomaly_detection_features, max_dimensions=60, target_variance=0.95)\n",
|
1512 | 1655 | "java_type_embedding_anomaly_detection_feature_names = embedding_feature_names = [f'pca_{i}' for i in range(java_type_embedding_anomaly_detection_input.shape[1])]\n",
|
1513 | 1656 | "java_type_embedding_anomaly_detection_result = tune_anomaly_detection_models(java_type_embedding_anomaly_detection_input, contamination=\"auto\")\n",
|
|
1516 | 1659 | "display(get_top_10_anomalies(java_type_embedding_anomaly_detection_features, anomaly_label_column='anomalyOfEmbeddingLabel', anomaly_score_column='anomalyOfEmbeddingScore').reset_index(drop=True))"
|
1517 | 1660 | ]
|
1518 | 1661 | },
|
| 1662 | + { |
| 1663 | + "cell_type": "markdown", |
| 1664 | + "id": "1bac51eb", |
| 1665 | + "metadata": {}, |
| 1666 | + "source": [ |
| 1667 | + "#### 1.3c List the top (most normal) non-anomalies" |
| 1668 | + ] |
| 1669 | + }, |
| 1670 | + { |
| 1671 | + "cell_type": "code", |
| 1672 | + "execution_count": null, |
| 1673 | + "id": "6005ff1e", |
| 1674 | + "metadata": {}, |
| 1675 | + "outputs": [], |
| 1676 | + "source": [ |
| 1677 | + "display(get_top_10_non_anomalies(java_type_anomaly_detection_features).reset_index(drop=True))" |
| 1678 | + ] |
| 1679 | + }, |
| 1680 | + { |
| 1681 | + "cell_type": "markdown", |
| 1682 | + "id": "7e52ffa2", |
| 1683 | + "metadata": {}, |
| 1684 | + "source": [ |
| 1685 | + "#### 1.3d List the top (most normal) non-anomalies solely based on embeddings" |
| 1686 | + ] |
| 1687 | + }, |
| 1688 | + { |
| 1689 | + "cell_type": "code", |
| 1690 | + "execution_count": null, |
| 1691 | + "id": "7cfc7d61", |
| 1692 | + "metadata": {}, |
| 1693 | + "outputs": [], |
| 1694 | + "source": [ |
| 1695 | + "display(get_top_10_non_anomalies(java_type_embedding_anomaly_detection_features, anomaly_label_column='anomalyOfEmbeddingLabel', anomaly_score_column='anomalyOfEmbeddingScore').reset_index(drop=True))" |
| 1696 | + ] |
| 1697 | + }, |
| 1698 | + { |
| 1699 | + "cell_type": "markdown", |
| 1700 | + "id": "3b635a0e", |
| 1701 | + "metadata": {}, |
| 1702 | + "source": [ |
| 1703 | + "#### 1.3e Plot the distribution of the anomaly scores" |
| 1704 | + ] |
| 1705 | + }, |
| 1706 | + { |
| 1707 | + "cell_type": "code", |
| 1708 | + "execution_count": null, |
| 1709 | + "id": "40be411a", |
| 1710 | + "metadata": {}, |
| 1711 | + "outputs": [], |
| 1712 | + "source": [ |
| 1713 | + "plot_anomaly_score_distribution(java_type_anomaly_detection_features, title_prefix=\"Java Type\")" |
| 1714 | + ] |
| 1715 | + }, |
| 1716 | + { |
| 1717 | + "cell_type": "markdown", |
| 1718 | + "id": "1269582b", |
| 1719 | + "metadata": {}, |
| 1720 | + "source": [ |
| 1721 | + "#### 1.3f Plot the distribution of the anomaly scores solely based on embeddings" |
| 1722 | + ] |
| 1723 | + }, |
| 1724 | + { |
| 1725 | + "cell_type": "code", |
| 1726 | + "execution_count": null, |
| 1727 | + "id": "d676af42", |
| 1728 | + "metadata": {}, |
| 1729 | + "outputs": [], |
| 1730 | + "source": [ |
| 1731 | + "plot_anomaly_score_distribution(\n", |
| 1732 | + " java_type_embedding_anomaly_detection_features, \n", |
| 1733 | + " anomaly_label_column='anomalyOfEmbeddingLabel',\n", |
| 1734 | + " anomaly_score_column='anomalyOfEmbeddingScore',\n", |
| 1735 | + " title_prefix=\"Java Type Embeddings\"\n", |
| 1736 | + ")" |
| 1737 | + ] |
| 1738 | + }, |
1519 | 1739 | {
|
1520 | 1740 | "cell_type": "markdown",
|
1521 | 1741 | "id": "68a00628",
|
|
0 commit comments