ironhack-labs · Aminmoh9 · Oct 29, 2025 · Oct 29, 2025
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+mlflow.db
diff --git a/data/drift_report.html b/data/drift_report.html
diff --git a/data/iris_RandomForestClassifier.py b/data/iris_RandomForestClassifier.py
@@ -1,16 +1,94 @@
 import mlflow
+import mlflow.sklearn
+import numpy as np
+import pandas as pd
 from sklearn.datasets import load_iris
 from sklearn.model_selection import train_test_split
 from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import accuracy_score
+from evidently.report import Report
+from evidently.metric_preset import DataDriftPreset
+import warnings
+warnings.filterwarnings('ignore')
+
+mlflow.set_tracking_uri("http://127.0.0.1:5000")
+mlflow.set_experiment("Iris_Classification_Drift_Detection")
+
+print(f"Tracking URI: {mlflow.get_tracking_uri()}")
+
+# Enable MLflow autologging before any model is trained
+mlflow.autolog()
 
 # Load data and prep
+print("\nStep 1: Loading data and preparing for training...")
 iris_data = load_iris(as_frame=True)
 df = iris_data.frame
 X = df.drop(columns=["target"])
 y = df["target"]
 
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+# Train and log baseline model with MLflow
+print("\nStep 2: Training and logging baseline model...")
+with mlflow.start_run(run_name="iris_rf_baseline") as run:
+    # Train model (fixed random_state keeps the baseline reproducible)
+    model = RandomForestClassifier(n_estimators=100, random_state=42)
+    model.fit(X_train, y_train)
+
+    # Calculate and log accuracy on the held-out test split
+    y_pred = model.predict(X_test)
+    accuracy = accuracy_score(y_test, y_pred)
+    mlflow.log_metric("accuracy", accuracy)
+
+    # Log the fitted model under the "random_forest_model" artifact path
+    mlflow.sklearn.log_model(model, "random_forest_model")
+
+    # Log training features as the reference set for drift detection
+    mlflow.log_text(X_train.to_csv(index=False), "reference_data.csv")
+
+    run_id = run.info.run_id
+    print(f"Model trained with accuracy: {accuracy:.4f}")
+    print(f"Mlflow Run ID: {run_id}")
+
+# Simulate feature drift on a copy so X_test itself stays untouched
+print("\nStep 3: Simulating feature drift...")
+X_drifted = X_test.copy()
+# Shift sepal length by about +2 cm; a seeded generator keeps the drift reproducible like the split and model
+X_drifted["sepal length (cm)"] += np.random.default_rng(42).normal(loc=2.0, scale=0.3, size=len(X_drifted))
+
+# Detect drift with Evidently: training features are the reference, drifted test features the current data
+print("\nStep 4: Detecting drift with Evidently... ")
+report = Report(metrics=[DataDriftPreset()])
+report.run(reference_data=X_train, current_data=X_drifted)
+
+# Save the report as HTML (relative path, so it lands in the current working directory)
+report.save_html("drift_report.html")
+
+# Reopen the baseline run by ID and attach the report under the "drift_reports" artifact folder
+print('\nStep 5: Logging drift report to Mlflow...')
+with mlflow.start_run(run_id = run_id):
+    mlflow.log_artifact("drift_report.html", "drift_reports")
+
+# Print drift summary; the first metric result of the report carries the column counts read below
+result = report.as_dict()
+n_drifted_features = result['metrics'][0]['result']['number_of_drifted_columns']
+n_features = result['metrics'][0]['result']['number_of_columns']
+
+print("\nDrift Detection Result:")
+print(f"Features analyzed: {n_features}")
+print(f"Features drifted: {n_drifted_features}")
+print(f"Drift percentage: {n_drifted_features / n_features:.2%}")
+
+# Check model performance on drifted data (only features were shifted, so y_test still holds the labels)
+print("\nStep 6: Evaluating model on drifted data...")
+y_pred_drifted = model.predict(X_drifted)
+accuracy_drifted = accuracy_score(y_test, y_pred_drifted)
+
+print(f"\nModel accuracy on drifted data: {accuracy_drifted:.4f}")
+
+# Log drifted performance to the same MLflow run that holds the baseline accuracy
+with mlflow.start_run(run_id=run_id):
+    mlflow.log_metric("accuracy_drifted", accuracy_drifted)
 
-# Train
-model = RandomForestClassifier()
-model.fit(X_train, y_train)
+print("\nProcess completed! Check Mlflow UI for results.")
+print(f"Open {mlflow.get_tracking_uri()} in a browser to view the results in MLflow UI")
diff --git a/mlartifacts/1/9dc9885b8c604676a5c0ee7d39e932f2/artifacts/drift_reports/drift_report.html b/mlartifacts/1/9dc9885b8c604676a5c0ee7d39e932f2/artifacts/drift_reports/drift_report.html