# icr-identify-age-related-conditions.py
# Forked from WecoAI/aideml.
import os

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss, make_scorer
from sklearn.model_selection import KFold, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
# Read the training and test tables from the input directory.
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

# Replace the "EJ" column with integer codes. The encoder learns its classes
# from the training column and the same mapping is then applied to the test
# column, so both tables share one coding.
le = LabelEncoder()
ej_train_codes = le.fit_transform(train_data["EJ"])
ej_test_codes = le.transform(test_data["EJ"])
train_data["EJ"] = ej_train_codes
test_data["EJ"] = ej_test_codes

# Split into target and features. "Id" is an identifier rather than a
# feature, so it is removed from both feature tables.
y = train_data["Class"]
X = train_data.drop(columns=["Id", "Class"])
X_test = test_data.drop(columns=["Id"])
# Base LightGBM binary classifier that the randomized search clones and tunes.
#
# subsample_freq=1 is needed for the "subsample" values in the grid below to
# have any effect: LightGBM only bags rows when the bagging frequency is
# non-zero, and the estimator's default is 0 (bagging disabled).
# random_state pins the row/column sampling so repeated runs build the same
# models.
# NOTE(review): is_unbalance=True re-weights the classes, which also shifts the
# predicted probabilities -- confirm this is intended for a log-loss target.
model = lgb.LGBMClassifier(
    objective="binary",
    boosting_type="gbdt",
    is_unbalance=True,
    subsample_freq=1,
    random_state=42,
)

# Discrete candidate values for each hyper-parameter; the randomized search
# samples combinations from these lists.
param_grid = {
    "learning_rate": [0.01, 0.05, 0.1],
    "num_leaves": [15, 31, 63],
    "max_depth": [-1, 5, 10],  # -1 means no depth limit in LightGBM
    "min_child_samples": [10, 20, 30],
    "max_bin": [255, 300],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.3, 0.5, 0.7],
}
# Score candidates by negated log loss computed on predicted probabilities.
# The built-in "neg_log_loss" scorer is used instead of
# make_scorer(log_loss, greater_is_better=False, needs_proba=True) because the
# needs_proba argument was deprecated in scikit-learn 1.4 and removed in 1.6;
# the string form is the same scorer and works on both sides of that change.
log_loss_scorer = "neg_log_loss"

# Randomized search over 10 sampled parameter combinations with 10-fold
# cross-validation. Folds are stratified so every validation fold keeps the
# overall class ratio and is guaranteed to contain both classes (log loss is
# undefined on a single-class fold); shuffling and sampling are seeded for
# reproducibility.
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_grid,
    n_iter=10,
    scoring=log_loss_scorer,
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42),
    random_state=42,
    verbose=1,
)
random_search.fit(X, y)
# Best candidate found by the search. The search maximises the negated log
# loss, so the sign is flipped back to report an ordinary (lower-is-better)
# log loss.
best_model = random_search.best_estimator_
best_score = -random_search.best_score_
print(f"Best Log Loss: {best_score}")

# Second probability column of the best model for every test row; it is
# written out as "class_1" and its complement as "class_0".
test_predictions = best_model.predict_proba(X_test)[:, 1]

# Submission table: one row per test Id with the two class probabilities.
submission = pd.DataFrame(
    {
        "Id": test_data["Id"],
        "class_0": 1 - test_predictions,
        "class_1": test_predictions,
    }
)

# to_csv does not create missing directories, so make sure the output folder
# exists before writing.
submission_path = "./working/submission.csv"
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)