# sample_results/playground-series-s3e17.py

from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder

# Load the training and test tables from ./input.
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

# Columns that are replaced by one-hot indicator columns in both frames.
categorical_columns = ["Product ID", "Type"]

# One-hot encode categorical features.
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old spelling was removed in 1.4, so try the new
# name first and fall back only on releases that do not know it.
# NOTE(review): a dense encoding adds one column per distinct category value;
# if "Product ID" has many distinct values this frame can get very large --
# confirm the memory footprint is acceptable.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")

# Fit on the training rows only; with handle_unknown="ignore", categories that
# appear only in the test rows are encoded as all zeros instead of raising.
encoded_features = encoder.fit_transform(train_data[categorical_columns])
encoded_test_features = encoder.transform(test_data[categorical_columns])
encoded_column_names = encoder.get_feature_names_out()

# Add encoded features back to the dataframes. The encoded frames reuse the
# source frame's index so that `join` (which aligns on index) pairs every row
# with its own encoding regardless of how the source frame is indexed.
encoded_df = pd.DataFrame(
    encoded_features, columns=encoded_column_names, index=train_data.index
)
train_data = train_data.join(encoded_df).drop(categorical_columns, axis=1)

encoded_test_df = pd.DataFrame(
    encoded_test_features, columns=encoded_column_names, index=test_data.index
)
test_data = test_data.join(encoded_test_df).drop(categorical_columns, axis=1)

# Separate the label from the predictors; "id" is only a row identifier and is
# therefore excluded from the feature matrix.
target_column = "Machine failure"
y = train_data[target_column]
X = train_data.drop(columns=[target_column, "id"])

# Hold out 20% of the rows for validation (seeded so the split is repeatable)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Hyper-parameter candidates explored exhaustively by the grid search
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Base Random Forest classifier; the fixed seed keeps runs repeatable
rf = RandomForestClassifier(random_state=42)

# 3-fold cross-validated search scored by ROC AUC on all available cores.
# `fit` returns the search object itself, so it can be chained directly.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1,
).fit(X_train, y_train)

# Model refit by the search with the best-scoring parameter combination
best_rf = grid_search.best_estimator_

# Predict on the validation set using the best estimator. ROC AUC is computed
# from scores rather than hard labels, so take the second predict_proba column
# (the probability of the larger class label).
y_pred_proba = best_rf.predict_proba(X_val)[:, 1]

# Calculate and report the AUC-ROC score on the hold-out split
auc_roc = roc_auc_score(y_val, y_pred_proba)
print(f"AUC-ROC score: {auc_roc}")

# Predict on the test set using the best estimator. Select the columns by the
# training feature names so the model sees exactly the features it was fitted
# on, in the same order, instead of relying on the test CSV's column layout.
test_features = test_data[X.columns]
test_predictions = best_rf.predict_proba(test_features)[:, 1]

# Create the submission file, creating the output directory first so that
# writing does not fail when ./working does not exist yet.
submission = pd.DataFrame({"id": test_data["id"], "Machine failure": test_predictions})
submission_path = Path("./working/submission.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)