-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbaseline.py
More file actions
207 lines (177 loc) · 6.37 KB
/
baseline.py
File metadata and controls
207 lines (177 loc) · 6.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
"""
Baseline Model for Housing Price Prediction
Several things need to be done by this script:
1. Load the data
2. Drop any data that is nonlinear
3. One-Hot Encode Categorical Data (includes "rating" columns like 'view')
4. Split data into training, validation, and testing.
5. Train SGDRegressor up to 200 epochs (or more), recording MAE, RMSE, and R^2 after every epoch.
6. Stop training early when validation MAE stops improving.
7. Evaluate final model on the test set, print metrics
8. Plot training/validation metrics (3 sub-plots) and save figures to 'results/[metric]_plot.png'
[metric] = specific metric being plotted
You can also save them all to one singular png if you can figure that out too.
9. Save trained model ('models/baseline.joblib') and a JSON file containing all metrics ('results/baseline.json')
"""
# Imports
import json # To dump metrics into a file
import os # File managing
import matplotlib.pyplot as plt # Plotting
import numpy as np # Math functions
import pandas as pd # Data processing
from joblib import dump # Saving model
from sklearn.linear_model import SGDRegressor # Used for model
from sklearn.metrics import mean_absolute_error # Metric calc
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split # Data splitting
# |--------------|
# | Load Dataset |
# |--------------|
DATA_PATH = "data/clean/(Clean) USA Housing Dataset.csv"  # Cleaned CSV produced upstream
df = pd.read_csv(DATA_PATH)  # Full dataset as a DataFrame
# |----------------------------|
# | Remove Interaction Columns |
# |----------------------------|
# Engineered interaction features are nonlinear combinations of the base
# columns; a purely linear baseline should not see them.
interaction_cols = [
    "Lot-Living Ratio",
    "Basement Ratio",
    "Areas Per Bedroom",
    "Bathrooms Per Bedroom",
    "Bedrooms Per Floor",
    "Beds x Baths",
    "Sqft Living x Waterfront",
]
# `date` is dropped in the same pass since it cannot be cast to float32.
df = df.drop(columns=interaction_cols + ["date"])
# |-------------------------------|
# | Separate target from features |
# |-------------------------------|
y = df.pop("price")  # pop() removes the column from df and returns it
# |---------------------------|
# | One-Hot Encode Categories |
# |---------------------------|
cat_cols = [
    "waterfront",
    "view",
    "condition",
    "city",
    "Day of Week",  # Categorical / rating-style columns to encode
    "Season Sold",
    "Is Renovated",
    "State",
    "ZIP Code",
]
# drop_first=True discards one dummy level per category to avoid redundancy
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)
# |--------------------|
# | Convert to float32 |
# |--------------------|
X = df.astype(np.float32)  # Feature matrix
y = y.astype(np.float32)  # Target vector
# |---------------------------------|
# | Train/Validation/Testing Splits |
# |---------------------------------|
# First carve off 30% of the rows, then cut that remainder in half,
# yielding a 70/15/15 train/validation/test partition.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42
)
# |--------------|
# | SGDRegressor |
# |--------------|
# Plain linear regression trained by stochastic gradient descent.
model = SGDRegressor(
    loss="squared_error",      # Ordinary least-squares loss
    penalty=None,              # No regularization -> purely linear baseline
    learning_rate="constant",  # Fixed step size every update
    eta0=1e-3,                 # Step size: converges quickly without diverging
    max_iter=200,              # Upper bound on epochs
    warm_start=True,           # Keep coefficients across repeated .partial_fit() calls
    random_state=42,
)
# |---------------|
# | Training Loop |
# |---------------|
# One .partial_fit() call per epoch; MAE/RMSE/R^2 are recorded on both the
# training and validation sets after every epoch (docstring steps 5-6).
n_epochs = 200         # Maximum number of epochs
patience = 5           # Epochs to wait for a new best validation MAE before stopping
best_val_mae = np.inf  # Best validation MAE observed so far
epochs_no_improve = 0  # Consecutive epochs without a new best
# Per-epoch metric histories
train_mae, val_mae = [], []
train_rmse, val_rmse = [], []
train_r2, val_r2 = [], []
for epoch in range(n_epochs):
    model.partial_fit(X_train, y_train)  # One SGD pass over the training data
    y_pred_train = model.predict(X_train)  # Training
    y_pred_val = model.predict(X_val)  # Validation
    # Save MAE
    train_mae.append(mean_absolute_error(y_train, y_pred_train))
    val_mae.append(mean_absolute_error(y_val, y_pred_val))
    # Save RMSE
    train_rmse.append(np.sqrt(mean_squared_error(y_train, y_pred_train)))
    val_rmse.append(np.sqrt(mean_squared_error(y_val, y_pred_val)))
    # Save R^2
    train_r2.append(r2_score(y_train, y_pred_train))
    val_r2.append(r2_score(y_val, y_pred_val))
    # Early stopping (docstring step 6): stop once validation MAE has not
    # improved for `patience` consecutive epochs. The patience window avoids
    # halting on a single noisy epoch, which the original commented-out
    # one-strike check would have done.
    if val_mae[-1] < best_val_mae:
        best_val_mae = val_mae[-1]
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(f"Early stopping after epoch {epoch + 1}")
            break
# |------------|
# | Final Eval |
# |------------|
# Performance of the trained model on the held-out test split.
test_predictions = model.predict(X_test)
test_mae = mean_absolute_error(y_test, test_predictions)
test_rmse = np.sqrt(mean_squared_error(y_test, test_predictions))
test_r2 = r2_score(y_test, test_predictions)
print("\n=== Test Set Performance ===")
print(f"Test MAE: {test_mae:.2f}")
print(f"Test RMSE: {test_rmse:.2f}")
print(f"Test R^2: {test_r2:.4f}")
# |--------------------------|
# | Plot metrics over epochs |
# |--------------------------|
# One figure with three side-by-side panels: MAE, RMSE, and R^2 curves for
# both the training and validation sets. Each panel is drawn from the same
# (train history, val history, axis label, title) recipe.
epochs = range(1, len(train_mae) + 1)
panels = [
    (train_mae, val_mae, "MAE", "Mean Absolute Error"),
    (train_rmse, val_rmse, "RMSE", "Root Mean Squared Error"),
    (train_r2, val_r2, "R^2", "Coefficient of Determination"),
]
plt.figure(figsize=(12, 4))
for idx, (train_vals, val_vals, ylab, title) in enumerate(panels, start=1):
    plt.subplot(1, 3, idx)
    plt.plot(epochs, train_vals, label=f"Train {ylab}")
    plt.plot(epochs, val_vals, label=f"Val {ylab}", linestyle="--")
    plt.xlabel("Epoch")
    plt.ylabel(ylab)
    plt.title(title)
    plt.legend()
plt.tight_layout()
os.makedirs("results", exist_ok=True)  # Ensure the output directory exists
plt.savefig("results/metrics_plot.png")
# |------------------------|
# | Save Model and Metrics |
# |------------------------|
os.makedirs("models", exist_ok=True)
dump(model, "models/baseline.joblib")  # Serialized fitted model
# Cast every metric to a built-in float before dumping: sklearn/numpy return
# numpy scalar types, and np.float32 in particular is not JSON-serializable,
# so json.dump could raise TypeError on the raw values.
baseline_results = {
    "test_mae": float(test_mae),
    "test_rmse": float(test_rmse),
    "test_r2": float(test_r2),
    "train_mae_per_epoch": [float(v) for v in train_mae],
    "val_mae_per_epoch": [float(v) for v in val_mae],
    "train_rmse_per_epoch": [float(v) for v in train_rmse],
    "val_rmse_per_epoch": [float(v) for v in val_rmse],
    "train_r2_per_epoch": [float(v) for v in train_r2],
    "val_r2_per_epoch": [float(v) for v in val_r2],
}
os.makedirs("results", exist_ok=True)  # Don't depend on the plotting step having run
with open("results/baseline.json", "w") as f:
    json.dump(baseline_results, f, indent=4)
print("\nBaseline model & metrics saved in `models/` and `results/`.")