Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
mlflow.db
46 changes: 46 additions & 0 deletions data/drift_report.html

Large diffs are not rendered by default.

86 changes: 82 additions & 4 deletions data/iris_RandomForestClassifier.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,94 @@
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset
import warnings
warnings.filterwarnings('ignore')

# MLflow configuration: every run in this script is recorded against this
# tracking server and grouped under this experiment.
TRACKING_URI = "http://127.0.0.1:5000"
EXPERIMENT_NAME = "Iris_Classification_Drift_Detection"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

# Echo the URI the client actually resolved, as a quick sanity check.
print(f"Tracking URI: {mlflow.get_tracking_uri()}")

# Enable MLflow autologging before any model is fitted.
mlflow.autolog()

# Step 1: load the Iris dataset as a DataFrame and separate features/target.
print("\nStep 1: Loading data and preparing for training...")
iris_data = load_iris(as_frame=True)
df = iris_data.frame
X = df.drop(columns=["target"])
y = df["target"]

# One seeded 80/20 split. The earlier unseeded split was dead code: its result
# was overwritten immediately by this call.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 2: train the baseline model and record it in a dedicated MLflow run.
print("\nStep 2: Training and logging baseline model...")
with mlflow.start_run(run_name="iris_rf_baseline") as run:
    # Train model (seeded so the baseline is reproducible).
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Calculate and log hold-out accuracy.
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    mlflow.log_metric("accuracy", accuracy)

    # Log the fitted model as a run artifact.
    mlflow.sklearn.log_model(model, "random_forest_model")

    # Log the training features as the reference set for drift detection.
    mlflow.log_text(X_train.to_csv(index=False), "reference_data.csv")

    # Keep the run id so later steps can re-open this same run.
    run_id = run.info.run_id

print(f"Model trained with accuracy: {accuracy: .4f}")
print(f"Mlflow Run ID: {run_id}")

# Step 3: build a drifted copy of the test features.
print("\nStep 3: Simulating feature drift...")
X_drifted = X_test.copy()

# Shift sepal length by Gaussian noise centred on +2.0 cm (std 0.3).
# A seeded generator keeps the simulated drift reproducible, in line with the
# seeded split and model used elsewhere in this script.
drift_rng = np.random.default_rng(42)
X_drifted["sepal length (cm)"] += drift_rng.normal(
    loc=2.0, scale=0.3, size=len(X_drifted)
)

# Step 4: compare the training features (reference) against the drifted
# features (current) with Evidently's data-drift preset.
print("\nStep 4: Detecting drift with Evidently... ")
report = Report(metrics=[DataDriftPreset()])
report.run(reference_data=X_train, current_data=X_drifted)

# Save report as HTML in the current working directory.
report.save_html("drift_report.html")

# Step 5: attach the HTML report to the baseline run by re-opening it via
# its run id.
print('\nStep 5: Logging drift report to Mlflow...')
with mlflow.start_run(run_id=run_id):
    mlflow.log_artifact("drift_report.html", "drift_reports")

# Summarise the drift report on stdout.
result = report.as_dict()

# The counts printed below are read from the first metric entry of the report.
dataset_drift = result['metrics'][0]['result']
n_drifted_features = dataset_drift['number_of_drifted_columns']
n_features = dataset_drift['number_of_columns']
drift_percentage = (n_drifted_features / n_features) * 100

print("\nDrift Detection Result:")
print(f"Features analyzed: {n_features}")
print(f"Features drifted: {n_drifted_features}")
print(f"Drift percentage: {drift_percentage: .2f}%")

# Step 6: score the baseline model on the drifted features. The labels are
# unchanged (y_test), so any drop in accuracy is caused by the feature shift.
print("\nStep 6: Evaluating model on drifted data...")
y_pred_drifted = model.predict(X_drifted)
accuracy_drifted = accuracy_score(y_test, y_pred_drifted)

print(f"\nModel accuracy on drifted data: {accuracy_drifted: .4f}")

# Record the drifted accuracy on the same run as the baseline accuracy so the
# two metrics sit side by side.
with mlflow.start_run(run_id=run_id):
    mlflow.log_metric("accuracy_drifted", accuracy_drifted)

# Final status messages. The leftover unseeded retrain that used to sit here
# was removed: it overwrote the evaluated baseline `model` after all results
# were logged and ran another fit while autologging is enabled.
print("\nProcess completed! Check Mlflow UI for results.")
print("Run 'mlflow ui --backend-store-uri sqlite:///mlflow.db' to view the results in MLflow UI")

Large diffs are not rendered by default.

Loading