1414import pandas as pd
1515import numpy as np
1616from sklearn .model_selection import train_test_split , cross_val_score
17- from sklearn .preprocessing import LabelEncoder , StandardScaler
17+ from sklearn .preprocessing import LabelEncoder
1818from sklearn .metrics import (
1919 accuracy_score , precision_score , recall_score , f1_score ,
20- roc_auc_score , confusion_matrix , classification_report ,
21- mean_squared_error , mean_absolute_error , r2_score
20+ roc_auc_score , confusion_matrix , mean_squared_error , mean_absolute_error , r2_score
2221)
2322from sklearn .ensemble import RandomForestClassifier , RandomForestRegressor
2423import xgboost as xgb
2726warnings .filterwarnings ('ignore' )
2827
2928# Database utilities
30- import sys
31- import os
29+ import sys # noqa: E402
30+ import os # noqa: E402
3231sys .path .append (os .path .dirname (os .path .dirname (os .path .abspath (__file__ ))))
3332
34- from operations .db_utils import (
35- save_dataframe_to_db ,
36- save_model_performance ,
33+ from operations .db_utils import ( # noqa: E402
34+ save_dataframe_to_db ,
35+ save_model_performance ,
3736 create_model_performance_table ,
3837 test_connection
3938)
40- from operations .db_config import TABLES , DB_CONFIG
39+ from operations .db_config import TABLES , DB_CONFIG # noqa: E402
4140
4241# Get the project root directory
4342PROJECT_ROOT = os .path .dirname (os .path .dirname (os .path .abspath (__file__ )))
@@ -178,7 +177,7 @@ def assign_credential_type(row):
178177
179178df ['target_credential_type' ] = df .apply (assign_credential_type , axis = 1 )
180179
181- print (f "Created target variables:" )
180+ print ("Created target variables:" )
182181print (f" - Retention: { df ['target_retention' ].value_counts ().to_dict ()} " )
183182print (f" - At Risk: { df ['target_at_risk' ].value_counts ().to_dict ()} " )
184183print (f" - Credential Type: { df ['target_credential_type' ].value_counts ().to_dict ()} " )
@@ -285,8 +284,8 @@ def preprocess_features(df, feature_list):
285284print ("TESTING MULTIPLE MODELS WITH CROSS-VALIDATION" )
286285print ("-" * 80 )
287286
288- from sklearn .linear_model import LogisticRegression
289- from sklearn .model_selection import StratifiedKFold
287+ from sklearn .linear_model import LogisticRegression # noqa: E402
288+ from sklearn .model_selection import StratifiedKFold # noqa: E402
290289
291290models_to_test = {
292291 'Logistic Regression' : LogisticRegression (
@@ -349,11 +348,11 @@ def preprocess_features(df, feature_list):
349348 print (f" Gap: { gap :.4f} ({ gap * 100 :.2f} %)" )
350349
351350 if gap < 0.05 :
352- print (f " ✓ No overfitting (gap < 5%)" )
351+ print (" ✓ No overfitting (gap < 5%)" )
353352 elif gap < 0.10 :
354- print (f " ⚠ Minimal overfitting (gap < 10%)" )
353+ print (" ⚠ Minimal overfitting (gap < 10%)" )
355354 else :
356- print (f " ✗ Overfitting detected (gap > 10%)" )
355+ print (" ✗ Overfitting detected (gap > 10%)" )
357356
358357 model_comparison .append ({
359358 'Model' : model_name ,
@@ -413,8 +412,8 @@ def preprocess_features(df, feature_list):
413412
414413print ("\n Confusion Matrix:" )
415414cm = confusion_matrix (y_test , y_pred )
416- print (f " Predicted" )
417- print (f " Not Ret Retained" )
415+ print (" Predicted" )
416+ print (" Not Ret Retained" )
418417print (f"Actual Not { cm [0 ,0 ]:6d} { cm [0 ,1 ]:6d} " )
419418print (f" Ret { cm [1 ,0 ]:6d} { cm [1 ,1 ]:6d} " )
420419
@@ -545,8 +544,8 @@ def assign_alert_level(risk_score):
545544low_retention_low_risk = df [(df ['retention_probability' ] < 0.3 ) & (df ['at_risk_alert' ] == 'LOW' )]
546545print (f"Students with <30% retention flagged as LOW: { len (low_retention_low_risk )} (should be very few)" )
547546
548- print (f "\n Early warning system aligned with retention predictions" )
549- print (f "\n Alert distribution:" )
547+ print ("\n Early warning system aligned with retention predictions" )
548+ print ("\n Alert distribution:" )
550549print (df ['at_risk_alert' ].value_counts ().sort_index ())
551550
552551# ============================================================================
@@ -611,7 +610,7 @@ def assign_alert_level(risk_score):
611610 df ['predicted_time_to_credential' ] = time_model .predict (X_full_retention )
612611 df ['predicted_graduation_year' ] = df ['Cohort' ].str [:4 ].astype (float ) + df ['predicted_time_to_credential' ]
613612
614- print (f "Time predictions generated" )
613+ print ("Time predictions generated" )
615614else :
616615 print ("Warning: Insufficient data for time-to-credential model" )
617616 df ['predicted_time_to_credential' ] = np .nan
@@ -630,7 +629,7 @@ def assign_alert_level(risk_score):
630629y_credential = y_credential [valid_idx ]
631630
632631print (f"\n Dataset size: { len (X_cred ):,} students" )
633- print (f "Credential type distribution:" )
632+ print ("Credential type distribution:" )
634633cred_labels = {0 : 'No Credential' , 1 : 'Certificate' , 2 : 'Associate' , 3 : 'Bachelor' }
635634for k , v in y_credential .value_counts ().sort_index ().items ():
636635 print (f" { cred_labels .get (k , k )} : { v :,} ({ v / len (y_credential )* 100 :.1f} %)" )
@@ -677,7 +676,7 @@ def assign_alert_level(risk_score):
677676 model_name = 'Credential Type Prediction' ,
678677 model_type = 'classification' ,
679678 metrics = {'accuracy' : cred_accuracy , 'f1' : cred_f1 },
680- notes = f 'Random Forest Classifier - 4 classes (No Credential, Certificate, Associate, Bachelor)'
679+ notes = 'Random Forest Classifier - 4 classes (No Credential, Certificate, Associate, Bachelor)'
681680 )
682681
683682# Generate predictions for all students
@@ -699,7 +698,7 @@ def assign_alert_level(risk_score):
699698 if class_idx < len (prob_labels ):
700699 df [prob_labels [int (class_idx )]] = proba [:, i ]
701700
702- print (f "Credential type predictions generated" )
701+ print ("Credential type predictions generated" )
703702
704703# ============================================================================
705704# STEP 8: MODEL 5 - GATEWAY MATH SUCCESS PREDICTION
@@ -784,8 +783,8 @@ def assign_alert_level(risk_score):
784783
785784print ("\n Confusion Matrix:" )
786785cm = confusion_matrix (y_test , y_pred )
787- print (f " Predicted" )
788- print (f " No Pass Pass" )
786+ print (" Predicted" )
787+ print (" No Pass Pass" )
789788print (f"Actual No { cm [0 ,0 ]:6d} { cm [0 ,1 ]:6d} " )
790789print (f" Pass { cm [1 ,0 ]:6d} { cm [1 ,1 ]:6d} " )
791790
@@ -795,7 +794,7 @@ def assign_alert_level(risk_score):
795794 model_name = 'Gateway Math Success Prediction' ,
796795 model_type = 'classification' ,
797796 metrics = {'accuracy' : math_accuracy , 'auc_roc' : math_auc , 'precision' : math_precision , 'recall' : math_recall , 'f1_score' : math_f1 },
798- notes = f 'XGBoost - Predicts gateway math completion Year 1'
797+ notes = 'XGBoost - Predicts gateway math completion Year 1'
799798 )
800799
801800# Generate predictions for all students
@@ -810,7 +809,7 @@ def assign_alert_level(risk_score):
810809 labels = ['High Risk' , 'Moderate Risk' , 'Likely Pass' , 'Very Likely Pass' ]
811810)
812811
813- print (f "Gateway math predictions generated" )
812+ print ("Gateway math predictions generated" )
814813
815814# ============================================================================
816815# STEP 9: MODEL 6 - GATEWAY ENGLISH SUCCESS PREDICTION (NEW!)
@@ -895,8 +894,8 @@ def assign_alert_level(risk_score):
895894
896895print ("\n Confusion Matrix:" )
897896cm = confusion_matrix (y_test , y_pred )
898- print (f " Predicted" )
899- print (f " No Pass Pass" )
897+ print (" Predicted" )
898+ print (" No Pass Pass" )
900899print (f"Actual No { cm [0 ,0 ]:6d} { cm [0 ,1 ]:6d} " )
901900print (f" Pass { cm [1 ,0 ]:6d} { cm [1 ,1 ]:6d} " )
902901
@@ -906,7 +905,7 @@ def assign_alert_level(risk_score):
906905 model_name = 'Gateway English Success Prediction' ,
907906 model_type = 'classification' ,
908907 metrics = {'accuracy' : english_accuracy , 'auc_roc' : english_auc , 'precision' : english_precision , 'recall' : english_recall , 'f1_score' : english_f1 },
909- notes = f 'XGBoost - Predicts gateway English completion Year 1'
908+ notes = 'XGBoost - Predicts gateway English completion Year 1'
910909 )
911910
912911# Generate predictions for all students
@@ -921,7 +920,7 @@ def assign_alert_level(risk_score):
921920 labels = ['High Risk' , 'Moderate Risk' , 'Likely Pass' , 'Very Likely Pass' ]
922921)
923922
924- print (f "Gateway English predictions generated" )
923+ print ("Gateway English predictions generated" )
925924
926925# ============================================================================
927926# STEP 10: MODEL 7 - FIRST-SEMESTER GPA < 2.0 PREDICTION (NEW! - FIXED DATA LEAKAGE)
@@ -1009,8 +1008,8 @@ def assign_alert_level(risk_score):
10091008
10101009print ("\n Confusion Matrix:" )
10111010cm = confusion_matrix (y_test , y_pred )
1012- print (f " Predicted" )
1013- print (f " GPA>=2.0 GPA<2.0" )
1011+ print (" Predicted" )
1012+ print (" GPA>=2.0 GPA<2.0" )
10141013print (f"Actual >=2.0 { cm [0 ,0 ]:6d} { cm [0 ,1 ]:6d} " )
10151014print (f" <2.0 { cm [1 ,0 ]:6d} { cm [1 ,1 ]:6d} " )
10161015
@@ -1020,7 +1019,7 @@ def assign_alert_level(risk_score):
10201019 model_name = 'First-Semester Low GPA Prediction' ,
10211020 model_type = 'classification' ,
10221021 metrics = {'accuracy' : gpa_accuracy , 'auc_roc' : gpa_auc , 'precision' : gpa_precision , 'recall' : gpa_recall , 'f1_score' : gpa_f1 },
1023- notes = f 'XGBoost - Predicts GPA < 2.0 risk (NO DATA LEAKAGE)'
1022+ notes = 'XGBoost - Predicts GPA < 2.0 risk (NO DATA LEAKAGE)'
10241023 )
10251024
10261025# Generate predictions for all students
@@ -1033,7 +1032,7 @@ def assign_alert_level(risk_score):
10331032 labels = ['Low Risk' , 'Moderate Risk' , 'High Risk' , 'Critical Risk' ]
10341033)
10351034
1036- print (f "Low GPA predictions generated" )
1035+ print ("Low GPA predictions generated" )
10371036
10381037# ============================================================================
10391038# STEP 11: SAVE PREDICTIONS TO STUDENT-LEVEL FILE
@@ -1066,7 +1065,7 @@ def assign_alert_level(risk_score):
10661065 if_exists = 'replace'
10671066 )
10681067 if success :
1069- print (f "✓ Student-level predictions saved to database" )
1068+ print ("✓ Student-level predictions saved to database" )
10701069 print (f" Table: { TABLES ['student_predictions' ]} " )
10711070 print (f" Records: { len (df ):,} " )
10721071 print (f" Columns: { len (df .columns )} " )
@@ -1077,7 +1076,7 @@ def assign_alert_level(risk_score):
10771076# Always save CSV files for backup and local analysis
10781077output_file = os .path .join (DATA_DIR , 'bishop_state_student_level_with_predictions.csv' )
10791078df .to_csv (output_file , index = False )
1080- print (f "\n ✓ Saved student-level predictions to CSV:" )
1079+ print ("\n ✓ Saved student-level predictions to CSV:" )
10811080print (f" File: { output_file } " )
10821081print (f" Records: { len (df ):,} " )
10831082print (f" Columns: { len (df .columns )} " )
@@ -1117,15 +1116,15 @@ def assign_alert_level(risk_score):
11171116 if_exists = 'replace'
11181117 )
11191118 if success :
1120- print (f "✓ Course-level predictions saved to database" )
1119+ print ("✓ Course-level predictions saved to database" )
11211120 print (f" Table: { TABLES ['course_predictions' ]} " )
11221121 print (f" Records: { len (merged_with_predictions ):,} " )
11231122 print (f" Columns: { len (merged_with_predictions .columns )} " )
11241123
11251124# Always save CSV files for backup and local analysis
11261125output_file = os .path .join (DATA_DIR , 'bishop_state_merged_with_predictions.csv' )
11271126merged_with_predictions .to_csv (output_file , index = False )
1128- print (f "\n ✓ Saved course-level predictions to CSV:" )
1127+ print ("\n ✓ Saved course-level predictions to CSV:" )
11291128print (f" File: { output_file } " )
11301129print (f" Records: { len (merged_with_predictions ):,} " )
11311130print (f" Columns: { len (merged_with_predictions .columns )} " )
@@ -1165,7 +1164,7 @@ def assign_alert_level(risk_score):
11651164 pct = count / len (df ) * 100
11661165 summary_report += f" { cat :20s} { count :6,} ({ pct :5.1f} %)\n "
11671166
1168- summary_report += f """
1167+ summary_report += """
116911682. EARLY WARNING SYSTEM
11701169 Algorithm: Composite Risk Score (Retention + Performance Metrics)
11711170 Approach: Aligned with retention predictions to eliminate contradictions
@@ -1331,14 +1330,14 @@ def assign_alert_level(risk_score):
13311330 print (" ✗ Database connection failed - used CSV fallback" )
13321331
13331332# Record counts loaded to database
1334- print (f "\n Records Loaded to Database:" )
1333+ print ("\n Records Loaded to Database:" )
13351334if db_connected == 1 :
13361335 print (f" - student_predictions table: { len (df ):,} records" )
13371336 print (f" - course_predictions table: { len (merged_with_predictions ):,} records" )
1338- print (f " - ml_model_performance table: 4 model records" )
1337+ print (" - ml_model_performance table: 4 model records" )
13391338 print (f"\n Total records saved: { len (df ) + len (merged_with_predictions ) + 4 :,} " )
13401339else :
1341- print (f " - No records loaded to database (CSV fallback used)" )
1340+ print (" - No records loaded to database (CSV fallback used)" )
13421341 print (f" - student_predictions.csv: { len (df ):,} records" )
13431342 print (f" - course_predictions.csv: { len (merged_with_predictions ):,} records" )
13441343
0 commit comments