# tabular-playground-series-oct-2022.py
# Copied from a GitHub repository forked from WecoAI/aideml.
# Original file: 86 lines (71 loc), 2.36 KB.
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
# Load the declared training dtypes. A column declared as 'float16' is read
# as 'float32' instead; every other declared dtype is passed through as-is.
# NOTE(review): presumably float16 is swapped out because the CSV reader
# cannot parse it directly -- confirm.
dtypes_df = pd.read_csv("./input/train_dtypes.csv")
dtypes = {}
for column_name, dtype_name in zip(dtypes_df["column"], dtypes_df["dtype"]):
    dtypes[column_name] = "float32" if dtype_name == "float16" else dtype_name

# Read the ten training shards (train_0.csv .. train_9.csv) with those dtypes
# and stack them into one frame with a fresh 0..n-1 index.
train_dfs = []
for i in range(10):
    train_dfs.append(pd.read_csv(f"./input/train_{i}.csv", dtype=dtypes))
train_df = pd.concat(train_dfs, ignore_index=True)
# Prepare the data.
# The two "*_scoring_within_10sec" columns are the prediction targets; they
# and the other columns listed below are excluded from the model inputs.
target_columns = ["team_A_scoring_within_10sec", "team_B_scoring_within_10sec"]
non_feature_columns = [
    "game_num",
    "event_id",
    "event_time",
    "player_scoring_next",
    "team_scoring_next",
] + target_columns

X = train_df.drop(columns=non_feature_columns)
y = train_df[target_columns]

# Split the data: 80% train / 20% validation, with a fixed seed so the split
# is reproducible.
# NOTE(review): rows are split at random, not grouped by game_num -- confirm
# that is the intended validation scheme.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Fit one LightGBM classifier (library-default parameters) per target column,
# both on the same training features.
y_train_A = y_train["team_A_scoring_within_10sec"]
y_train_B = y_train["team_B_scoring_within_10sec"]

model_A = lgb.LGBMClassifier()
model_B = lgb.LGBMClassifier()

model_A.fit(X_train, y_train_A)
model_B.fit(X_train, y_train_B)
# Predict on the validation set: keep the second predict_proba column
# (index 1) from each model as that target's predicted probability.
val_preds_A = model_A.predict_proba(X_val)[:, 1]
val_preds_B = model_B.predict_proba(X_val)[:, 1]
# Combine predictions. Rows follow the order of X_val; the frame gets a fresh
# 0..n-1 index, so it must be compared to y_val by position, not by label.
val_preds = pd.DataFrame(
    {
        "team_A_scoring_within_10sec": val_preds_A,
        "team_B_scoring_within_10sec": val_preds_B,
    }
)
# Calculate log loss.
# The two targets are separate binary labels predicted by separate models, so
# each column is scored on its own and the two scores are averaged. Passing
# both columns to log_loss in a single call would treat them as mutually
# exclusive classes of one problem (row probabilities expected to sum to 1),
# which is not how these predictions are produced.
column_losses = [
    log_loss(y_val[column].to_numpy(), val_preds[column].to_numpy())
    for column in val_preds.columns
]
val_log_loss = sum(column_losses) / len(column_losses)
print(f"Validation Log Loss: {val_log_loss}")
# Predict on test set.
# Build the dtype map for the test file: 'float16' entries are read as
# 'float32', every other declared dtype is used unchanged.
test_dtypes_df = pd.read_csv("./input/test_dtypes.csv")
test_dtypes = {}
for column_name, dtype_name in zip(
    test_dtypes_df["column"], test_dtypes_df["dtype"]
):
    test_dtypes[column_name] = (
        "float32" if dtype_name == "float16" else dtype_name
    )

test_df = pd.read_csv("./input/test.csv", dtype=test_dtypes)
# Every test column except 'id' is passed to the models.
X_test = test_df.drop(columns=["id"])

# Keep the second predict_proba column (index 1) from each model.
test_preds_A = model_A.predict_proba(X_test)[:, 1]
test_preds_B = model_B.predict_proba(X_test)[:, 1]

# Combine predictions into one frame, one column per target.
test_preds = pd.DataFrame(
    {
        "team_A_scoring_within_10sec": test_preds_A,
        "team_B_scoring_within_10sec": test_preds_B,
    }
)

# Prepare submission: 'id' followed by the two probability columns, written
# as CSV without the DataFrame index.
submission_parts = [test_df["id"], test_preds]
submission = pd.concat(submission_parts, axis=1)
submission.to_csv("./working/submission.csv", index=False)