diff --git a/README.md b/README.md index 9d9c61e..af0e939 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,23 @@ Adey Innovations Inc. seeks to enhance the detection of fraudulent transactions in e-commerce and banking sectors. This project focuses on developing advanced machine learning models to identify fraud with high accuracy by analyzing transaction data, creating sophisticated features, and implementing real-time monitoring systems. By improving fraud detection, Adey Innovations Inc. aims to reduce financial losses, bolster transaction security, and build stronger trust with customers and financial institutions. The project entails data preprocessing, feature engineering, model development, evaluation, and deployment, ensuring a comprehensive approach to combating fraud. -## Model Explainability Using SHAP +## 1. Exploratory Data Analysis (EDA) + +### Univariate analysis + +### Bivariate analysis + +### Feature Engineering + +## 2. Model Building and Training + +After training and testing six models (three per dataset), I selected the models below. + +#### 2.1 Fraud-IP Dataset - XGBoost Model + +#### 2.2 Credit Card Dataset + +## 3. Model Explainability Using SHAP ### Summary Plot @@ -12,12 +28,14 @@ Adey Innovations Inc. seeks to enhance the detection of fraudulent transactions ![forceplot](https://github.com/Daniel-Andarge/AiML-financial-fraud-detection-model/blob/main/assets/shap-lime/forcePlot.png) -### Dependence Plot +## 4. 
Model Deployment and API Development + +### Running the flask app -![depndeplot](https://github.com/Daniel-Andarge/AiML-financial-fraud-detection-model/blob/main/assets/shap-lime/featureImpo.png) +### Testing the api -## Model Explainability Using LIME +### Testing the api from Postman -### Feature Importance Plot +### Building Docker Image -![limeplot](https://github.com/Daniel-Andarge/AiML-financial-fraud-detection-model/blob/main/assets/shap-lime/LIME.png) +### Running Docker Container diff --git a/assets/api-docker/build-docker-image.png b/assets/api-docker/build-docker-image.png new file mode 100644 index 0000000..fc89f15 Binary files /dev/null and b/assets/api-docker/build-docker-image.png differ diff --git a/assets/api-docker/docker-run.png b/assets/api-docker/docker-run.png new file mode 100644 index 0000000..94ed22c Binary files /dev/null and b/assets/api-docker/docker-run.png differ diff --git a/assets/api-docker/postman.png b/assets/api-docker/postman.png new file mode 100644 index 0000000..1510f31 Binary files /dev/null and b/assets/api-docker/postman.png differ diff --git a/assets/api-docker/run-flask.png b/assets/api-docker/run-flask.png new file mode 100644 index 0000000..0eb0548 Binary files /dev/null and b/assets/api-docker/run-flask.png differ diff --git a/assets/api-docker/test-flask.png b/assets/api-docker/test-flask.png new file mode 100644 index 0000000..6599629 Binary files /dev/null and b/assets/api-docker/test-flask.png differ diff --git a/assets/eda/featured_df.png b/assets/eda/featured_df.png new file mode 100644 index 0000000..46b149e Binary files /dev/null and b/assets/eda/featured_df.png differ diff --git a/assets/eda/fetureImpo.png b/assets/eda/fetureImpo.png new file mode 100644 index 0000000..e6a09d9 Binary files /dev/null and b/assets/eda/fetureImpo.png differ diff --git a/assets/eda/his1.png b/assets/eda/his1.png new file mode 100644 index 0000000..620e012 Binary files /dev/null and b/assets/eda/his1.png differ diff --git 
a/assets/model-building/lr1.png b/assets/model-building/lr1.png new file mode 100644 index 0000000..1c2c5dc Binary files /dev/null and b/assets/model-building/lr1.png differ diff --git a/assets/model-building/lr2.png b/assets/model-building/lr2.png new file mode 100644 index 0000000..14f861f Binary files /dev/null and b/assets/model-building/lr2.png differ diff --git a/assets/model-building/xg1.png b/assets/model-building/xg1.png new file mode 100644 index 0000000..fd800c2 Binary files /dev/null and b/assets/model-building/xg1.png differ diff --git a/assets/model-building/xg2.png b/assets/model-building/xg2.png new file mode 100644 index 0000000..18354f4 Binary files /dev/null and b/assets/model-building/xg2.png differ diff --git a/notebooks/model_training.ipynb b/notebooks/model_training.ipynb index 79761d9..eaa9ec1 100644 --- a/notebooks/model_training.ipynb +++ b/notebooks/model_training.ipynb @@ -554,71 +554,6 @@ "X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42, stratify=y_val_test)" ] }, - { - "cell_type": "code", - "execution_count": 8, - "id": "6a2935d4-2c26-41d8-afbd-7591ff09f0b7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Original class distribution:\n", - "class\n", - "0 9781\n", - "1 9781\n", - "Name: count, dtype: int64\n" - ] - }, - { - "ename": "ValueError", - "evalue": "Desired ratio is not achievable with the current dataset size.", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[8], line 18\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[38;5;66;03m# Ensure num_to_remove is non-negative\u001b[39;00m\n\u001b[0;32m 17\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m num_to_remove \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m---> 18\u001b[0m 
\u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDesired ratio is not achievable with the current dataset size.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 20\u001b[0m \u001b[38;5;66;03m# Perform undersampling\u001b[39;00m\n\u001b[0;32m 21\u001b[0m X_resampled, y_resampled \u001b[38;5;241m=\u001b[39m resample(X_train[y_train \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m], y_train[y_train \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m], \n\u001b[0;32m 22\u001b[0m replace\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, n_samples\u001b[38;5;241m=\u001b[39mnum_to_remove, random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m42\u001b[39m)\n", - "\u001b[1;31mValueError\u001b[0m: Desired ratio is not achievable with the current dataset size." - ] - } - ], - "source": [ - "from sklearn.utils import resample\n", - "\n", - "# Assuming X_train and y_train are your original training data\n", - "print(\"Original class distribution:\")\n", - "print(y_train.value_counts())\n", - "\n", - "# Calculate number of samples to remove from class 0 (not fraud)\n", - "desired_ratio = 0.4\n", - "total_samples = len(y_train)\n", - "class_0_count = (y_train == 0).sum()\n", - "class_1_count = (y_train == 1).sum()\n", - "\n", - "# Calculate number of samples to remove from class 0\n", - "num_to_remove = int(class_0_count - (class_1_count / desired_ratio))\n", - "\n", - "# Ensure num_to_remove is non-negative\n", - "if num_to_remove < 0:\n", - " raise ValueError(\"Desired ratio is not achievable with the current dataset size.\")\n", - "\n", - "# Perform undersampling\n", - "X_resampled, y_resampled = resample(X_train[y_train == 0], y_train[y_train == 0], \n", - " replace=False, n_samples=num_to_remove, random_state=42)\n", - "\n", - "# Combine undersampled class 0 with class 1\n", - "X_balanced = pd.concat([X_resampled, X_train[y_train == 1]], axis=0)\n", - "y_balanced = 
pd.concat([y_resampled, y_train[y_train == 1]], axis=0)\n", - "\n", - "# Shuffle to mix the classes\n", - "X_balanced, y_balanced = shuffle(X_balanced, y_balanced, random_state=42)\n", - "\n", - "# Check balanced class distribution\n", - "print(\"Balanced class distribution:\")\n", - "print(y_balanced.value_counts())" - ] - }, { "cell_type": "code", "execution_count": 22, @@ -1842,36 +1777,6 @@ "print(classification_report(y_val, y_val_pred, target_names=['Not Fraud', 'Fraud']))" ] }, - { - "cell_type": "code", - "execution_count": 6, - "id": "55440f29-e184-4315-960a-1753b5a4de23", - "metadata": {}, - "outputs": [ - { - "ename": "AttributeError", - "evalue": "module 'xgboost' has no attribute 'predict'", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[6], line 6\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmetrics\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m classification_report\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m# Assuming xgb is your trained XGBClassifier model\u001b[39;00m\n\u001b[0;32m 4\u001b[0m \n\u001b[0;32m 5\u001b[0m \u001b[38;5;66;03m# Predict on validation data\u001b[39;00m\n\u001b[1;32m----> 6\u001b[0m y_pred \u001b[38;5;241m=\u001b[39m \u001b[43mxgb\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpredict\u001b[49m(X_val)\n\u001b[0;32m 8\u001b[0m \u001b[38;5;66;03m# Print classification report\u001b[39;00m\n\u001b[0;32m 9\u001b[0m \u001b[38;5;28mprint\u001b[39m(classification_report(y_val, y_pred, target_names\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mNot Fraud\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mFraud\u001b[39m\u001b[38;5;124m'\u001b[39m]))\n", - 
"\u001b[1;31mAttributeError\u001b[0m: module 'xgboost' has no attribute 'predict'" - ] - } - ], - "source": [ - "from sklearn.metrics import classification_report\n", - "\n", - "# Assuming xgb is your trained XGBClassifier model\n", - "\n", - "# Predict on validation data\n", - "y_pred = xgbm.predict(X_val)\n", - "\n", - "# Print classification report\n", - "print(classification_report(y_val, y_pred, target_names=['Not Fraud', 'Fraud']))" - ] - }, { "cell_type": "code", "execution_count": null,