@@ -0,0 +1,33 @@
from fastapi import FastAPI
import joblib
import shap
import pandas as pd
from pydantic import BaseModel
from typing import List

app = FastAPI()
model = joblib.load('fraud_model.pkl')
explainer = shap.TreeExplainer(model)
feature_names = pd.read_csv('X_test.csv').columns.tolist()

class TransactionBatch(BaseModel):
    transactions: List[List[float]]

@app.post("/score_batch")
def score_batch(txn: TransactionBatch):
    # Validate feature count before building the DataFrame; pandas raises if the
    # row lengths do not match the expected columns.
    if any(len(row) != len(feature_names) for row in txn.transactions):
        return {"error": f"Expected {len(feature_names)} features per transaction"}
    df = pd.DataFrame(txn.transactions, columns=feature_names)
    scores = model.predict_proba(df)[:, 1]
    explanation = explainer(df)
    results = []
    for i, (score, shap_val) in enumerate(zip(scores, explanation)):
        # Rank features by absolute SHAP value and keep the top three contributors
        top_contribs = sorted(
            zip(feature_names, shap_val.values),
            key=lambda x: abs(x[1]),
            reverse=True
        )[:3]
        summary = f"Txn {i+1}: Top contributors: " + ", ".join(f"{feat} ({val:.2f})" for feat, val in top_contribs)
        action = "Escalate" if score > 0.5 else "Clear"
        # Cast NumPy floats to plain floats so the response is JSON-serializable
        results.append({"txn_id": i + 1, "score": float(score), "explanation": summary, "action": action})
    return {"results": results}
@@ -0,0 +1,32 @@
import joblib
import shap
import pandas as pd
import numpy as np

model = joblib.load('fraud_model.pkl')
explainer = shap.TreeExplainer(model)

def decide_actions(txn_df):
    scores = model.predict_proba(txn_df)[:, 1]
    explanation = explainer(txn_df)  # SHAP Explanation object, one row per transaction
    actions = []
    for i, score in enumerate(scores):
        top_contrib = max(explanation[i].values, key=abs)  # signed SHAP value of the highest-impact feature
        if score >= 0.9:
            actions.append(f"Txn {i+1}: Auto-block")
        elif 0.6 <= score < 0.9:
            if top_contrib < 0.2:
                actions.append(f"Txn {i+1}: Soft-action (2FA)")
            else:
                actions.append(f"Txn {i+1}: Escalate to analyst")
        elif 0.3 <= score < 0.6:
            actions.append(f"Txn {i+1}: Low-priority review")
        else:
            actions.append(f"Txn {i+1}: Auto-clear")
    return actions, scores, explanation

# Test
sample_txn = pd.read_csv('X_test.csv').iloc[:5]
actions, scores, explanation = decide_actions(sample_txn)
for action, score in zip(actions, scores):
    print(f"{action}, Score: {score:.2%}")
@@ -0,0 +1,49 @@
import redis
import pandas as pd
import json
import numpy as np

# Redis connection
try:
    r = redis.Redis(host='localhost', port=6379, db=0)
    # Test the connection
    r.ping()
    print("Connected to Redis successfully.")
except redis.ConnectionError as e:
    print(f"Failed to connect to Redis: {e}")
    print("Ensure Redis is running (e.g., 'docker run -d -p 6379:6379 redis')")
    exit(1)

def store_features(user_id, features):
    """Store user features in Redis as a JSON blob keyed by user id"""
    try:
        r.set(f"user:{user_id}", json.dumps(features))
        print(f"Stored features for user {user_id}")
    except redis.RedisError as e:
        print(f"Error storing features for user {user_id}: {e}")

def get_features(user_id):
    """Fetch user features; return an empty dict if the key is missing"""
    try:
        data = r.get(f"user:{user_id}")
        return json.loads(data) if data else {}
    except redis.RedisError as e:
        print(f"Error retrieving features for user {user_id}: {e}")
        return {}

# Example: Store sample features from data
try:
    df = pd.read_csv('creditcard.csv').head(10)  # Sample
    for idx, row in df.iterrows():
        user_id = f"pseudo_{idx}"
        features = {
            'txns_last_1h': row.get('txns_last_1h', 0),  # From engineered data; defaults to 0 for raw rows
            'avg_amount_30d': row.get('avg_amount_30d', 0),
            'LogAmount': np.log1p(row['Amount'])
        }
        store_features(user_id, features)
    print("Features stored in Redis.")
except FileNotFoundError:
    print("Error: 'creditcard.csv' file not found.")
except Exception as e:
    print(f"Unexpected error: {e}")
@@ -0,0 +1 @@
[{"V1":-4.6461577641,"V2":3.6715139557,"V3":-4.1760446584,"V4":1.2884733288,"V5":-4.5454621389,"V6":-2.380136214,"V7":-2.8440829212,"V8":3.8047914845,"V9":-1.0061046703,"V10":-1.1232436847,"V11":0.8177747604,"V12":-0.5900369048,"V13":1.1868321443,"V14":-0.3982361923,"V15":-0.0600207548,"V16":-0.9539136242,"V17":-1.3571307889,"V18":-1.4918861338,"V19":0.2739588822,"V20":1.2975512462,"V21":0.511791034,"V22":-1.8226517963,"V23":-1.0000951617,"V24":0.3616802457,"V25":2.9714849164,"V26":-0.4663104955,"V27":1.1751528315,"V28":0.495456612,"Amount":-0.0292892604,"LogAmount":0.8710423175,"txns_last_1h":-1.220911055,"avg_amount_30d":-0.9780176443}]|0.9997038245201111|Legitimate (False Positive)
Binary file not shown.
@@ -0,0 +1,38 @@
import queue
import threading
import time
import pandas as pd
import json

# Simulate producer-consumer with queue (like Kafka topic)
txn_queue = queue.Queue()

def producer():
    """Simulate streaming transactions from the CSV"""
    df = pd.read_csv('creditcard.csv').head(100)  # Sample data
    for _, txn in df.iterrows():
        txn_queue.put(json.dumps(txn.to_dict()))
        time.sleep(0.1)  # Simulate real-time arrival
    txn_queue.put(None)  # End signal

def consumer(process_func):
    """Process incoming txns until the end signal is seen"""
    while True:
        msg = txn_queue.get()
        if msg is None:
            break
        txn = json.loads(msg)
        process_func(txn)  # e.g., score + explain

# Example process func (integrate with model later)
def dummy_process(txn):
    print(f"Processed txn: {txn['Amount']}")

# Run in threads
prod_thread = threading.Thread(target=producer)
cons_thread = threading.Thread(target=consumer, args=(dummy_process,))
prod_thread.start()
cons_thread.start()
prod_thread.join()
cons_thread.join()
print("Ingestion simulation complete.")
@@ -0,0 +1,32 @@
WARNING:root:Drift detected in 0: p=2.2259668901635153e-09
WARNING:root:Drift detected in 1: p=6.265700415209558e-24
WARNING:root:Drift detected in 2: p=2.192514889493521e-27
WARNING:root:Drift detected in 3: p=1.1118720522654423e-32
WARNING:root:Drift detected in 4: p=8.420075425111453e-10
WARNING:root:Drift detected in 5: p=1.1062631501529071e-20
WARNING:root:Drift detected in 6: p=1.2710110796096432e-26
WARNING:root:Drift detected in 7: p=8.554578488951965e-10
WARNING:root:Drift detected in 8: p=1.3810919419130143e-21
WARNING:root:Drift detected in 9: p=1.3896638405410009e-43
WARNING:root:Drift detected in 10: p=1.8436032445450472e-33
WARNING:root:Drift detected in 11: p=1.0263097879436605e-34
INFO:root:No drift in 12
WARNING:root:Drift detected in 13: p=3.746382080855379e-43
INFO:root:No drift in 14
WARNING:root:Drift detected in 15: p=1.595517457987386e-27
WARNING:root:Drift detected in 16: p=5.911320446824504e-29
WARNING:root:Drift detected in 17: p=2.643551661975532e-13
WARNING:root:Drift detected in 18: p=1.8259266797259267e-08
WARNING:root:Drift detected in 19: p=1.3311929376470102e-09
WARNING:root:Drift detected in 20: p=1.7180612040855814e-15
INFO:root:No drift in 21
WARNING:root:Drift detected in 22: p=0.020779715951691723
INFO:root:No drift in 23
INFO:root:No drift in 24
WARNING:root:Drift detected in 25: p=0.02259593421912716
WARNING:root:Drift detected in 26: p=2.86730062139326e-12
WARNING:root:Drift detected in 27: p=1.2479892016787744e-07
WARNING:root:Drift detected in 28: p=1.0748564136877228e-06
WARNING:root:Drift detected in 29: p=1.0748564136877228e-06
WARNING:root:Drift detected in 30: p=0.040101308793924074
INFO:root:No drift in 31
@@ -0,0 +1,20 @@
import pandas as pd
from scipy.stats import ks_2samp
import logging

logging.basicConfig(filename='monitor.log', level=logging.INFO)

# Load train/test distributions
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')

# Check data drift (KS test per feature)
for col in X_train.columns:
    stat, p = ks_2samp(X_train[col], X_test[col])
    if p < 0.05:
        logging.warning(f"Drift detected in {col}: p={p}")
    else:
        logging.info(f"No drift in {col}")

# Explanation drift: compare top SHAP over time (simplified; run periodically)
print("Monitoring complete. Check monitor.log")
@@ -0,0 +1,15 @@
pandas>=2.0.0
numpy>=1.24.0
xgboost>=2.0.0
scikit-learn>=1.3.0
imbalanced-learn>=0.10.0
shap>=0.41.0
dice-ml>=0.9.0
redis>=4.5.0
psycopg2-binary>=2.9.0
fastapi>=0.95.0
uvicorn>=0.20.0
streamlit>=1.20.0
matplotlib>=3.7.0
plotly>=5.10.0
requests>=2.28.0
@@ -0,0 +1,24 @@
import pandas as pd
import xgboost as xgb
from sklearn.metrics import precision_recall_curve, auc, roc_auc_score
import joblib

# Load data (keep features as DataFrames so the model retains the column names used by the serving code)
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv').values.ravel()
X_test = pd.read_csv('X_test.csv')
y_test = pd.read_csv('y_test.csv').values.ravel()

# Train
model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='aucpr', scale_pos_weight=1)
model.fit(X_train, y_train)

# Evaluate
y_pred_proba = model.predict_proba(X_test)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
pr_auc = auc(recall, precision)  # recall on the x-axis (monotonic), precision on the y-axis
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"PR-AUC: {pr_auc:.4f}, ROC-AUC: {roc_auc:.4f}")

# Save
joblib.dump(model, 'fraud_model.pkl')
print("Model trained and saved.")