# competitive-data-science-predict-future-sales.py
# Origin: forked from WecoAI/aideml (77 lines, 63 loc, 2.5 KB).
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
# Read the raw daily transactions and the shop/item pairs to be scored.
sales = pd.read_csv("./input/sales_train.csv")
test = pd.read_csv("./input/test.csv")

# Dates are parsed with the day.month.year format; parse them once and derive
# the calendar year and month that serve as grouping keys below.
parsed_dates = pd.to_datetime(sales["date"], format="%d.%m.%Y")
sales["date"] = parsed_dates
sales["year"] = parsed_dates.dt.year
sales["month"] = parsed_dates.dt.month

# Roll daily counts up to one row per (year, month, shop, item) and name the
# summed column after what it now represents.
group_keys = ["year", "month", "shop_id", "item_id"]
monthly_sales = (
    sales.groupby(group_keys, as_index=False)["item_cnt_day"]
    .sum()
    .rename(columns={"item_cnt_day": "item_cnt_month"})
)
# Create lag features: item_cnt_month_lag_k holds the same shop/item pair's
# monthly total from k calendar months earlier (NaN when that month has no row).
lag_keys = ["year", "month", "shop_id", "item_id"]
# Snapshot only the keys and the target before any lag column is attached, so
# each merge adds exactly one new column instead of re-merging earlier lag
# columns (which pandas would disambiguate with _x/_y suffixes).
base_sales = monthly_sales[lag_keys + ["item_cnt_month"]].copy()
for lag in [1, 2, 3]:
    shifted = base_sales.rename(
        columns={"item_cnt_month": f"item_cnt_month_lag_{lag}"}
    )
    # Months are 1-based (1..12). Do the wrap-around on a zero-based index so
    # that a shift landing on December stays month 12 of the same year rather
    # than becoming month 0 of the following year.
    month_index = shifted["month"] - 1 + lag
    shifted["year"] += month_index // 12
    shifted["month"] = month_index % 12 + 1
    monthly_sales = pd.merge(monthly_sales, shifted, on=lag_keys, how="left")
# Mean encoded features: average monthly count per item and per shop, joined
# back onto every row as item_mean_cnt / shop_mean_cnt.
# NOTE(review): both means are taken over item_cnt_month (the target) across
# all rows, so rows later used for validation contribute to their own
# features - confirm this leakage is acceptable.
item_mean = (
    monthly_sales.groupby("item_id", as_index=False)["item_cnt_month"]
    .mean()
    .rename(columns={"item_cnt_month": "item_mean_cnt"})
)
shop_mean = (
    monthly_sales.groupby("shop_id", as_index=False)["item_cnt_month"]
    .mean()
    .rename(columns={"item_cnt_month": "shop_mean_cnt"})
)
monthly_sales = monthly_sales.merge(item_mean, on="item_id", how="left")
monthly_sales = monthly_sales.merge(shop_mean, on="shop_id", how="left")
# Build the feature matrix and target. The calendar columns and the target
# itself are excluded from the features; the target is clipped to [0, 20].
X = monthly_sales.drop(columns=["item_cnt_month", "year", "month"])
y = monthly_sales["item_cnt_month"].clip(lower=0, upper=20)

# Hold out 20% of the rows for validation (fixed seed for reproducibility).
# NOTE(review): the split is random across months rather than chronological,
# so validation rows can precede training rows in time - verify this is
# intended.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a LightGBM regressor with default hyper-parameters.
model = LGBMRegressor()
model.fit(X_train, y_train)

# Score the hold-out set; predictions are clipped to the same [0, 20] range
# as the target before computing RMSE.
val_raw = model.predict(X_val)
y_pred = np.clip(val_raw, 0, 20)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Validation RMSE: {rmse}")
# Prepare test set.
# monthly_sales has one row per (year, month, shop_id, item_id). Merging it
# onto test by (shop_id, item_id) alone would repeat every test ID once per
# month of history and yield a submission with duplicate IDs. Reduce it to a
# single row per pair - the pair's most recent month - before merging.
# NOTE(review): the lag columns of that row are relative to the pair's last
# observed month, not to the forecast month; confirm this approximation is
# acceptable or build dedicated forecast-month rows instead.
latest_features = (
    monthly_sales.sort_values(["year", "month"])
    .drop_duplicates(subset=["shop_id", "item_id"], keep="last")
    # Drop the target and calendar columns so the remaining columns match the
    # training features (same columns, same order).
    .drop(["item_cnt_month", "year", "month"], axis=1)
)
test = pd.merge(
    test,
    latest_features,
    on=["shop_id", "item_id"],
    how="left",
    # Fail loudly if the right-hand side ever stops being unique per pair.
    validate="many_to_one",
).fillna(0)  # pairs never seen in training get all-zero features
# Make predictions on test set (clipped to the same [0, 20] range as training)
test["item_cnt_month"] = model.predict(test.drop(["ID"], axis=1)).clip(0, 20)
# Save submission: exactly one row per test ID
test[["ID", "item_cnt_month"]].to_csv("./working/submission.csv", index=False)