# tabular-playground-series-jul-2022.py
# Origin: repository forked from WecoAI/aideml.
import os

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
# Cluster the rows of ./input/data.csv with KMeans, choosing the number of
# clusters (2-10) by silhouette score, and write the resulting labels to
# ./working/submission.csv.

# Load the dataset; the "id" column is dropped so it is not used as a feature.
data = pd.read_csv("./input/data.csv")
data.drop("id", axis=1, inplace=True)

# Standardize the features so every column contributes on a comparable scale.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

# Fit one KMeans model per candidate cluster count and record its silhouette
# score. The fitted models are kept so the winner can be reused directly
# instead of being refit (same data + same random_state gives the same model).
# NOTE(review): silhouette_score is evaluated on every row here; if this is
# too slow on large inputs, its `sample_size` argument may help -- confirm
# against the scikit-learn docs before changing, since it alters the scores.
range_n_clusters = list(range(2, 11))
silhouette_scores = []
fitted_models = []
for n_clusters in range_n_clusters:
    clusterer = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = clusterer.fit_predict(data_scaled)
    silhouette_avg = silhouette_score(data_scaled, cluster_labels)
    silhouette_scores.append(silhouette_avg)
    fitted_models.append(clusterer)

# Select the number of clusters with the highest silhouette score
# (the first, i.e. smallest, cluster count wins on ties).
best_index = silhouette_scores.index(max(silhouette_scores))
optimal_n_clusters = range_n_clusters[best_index]

# Reuse the model, labels and score already computed for the winner.
kmeans = fitted_models[best_index]
best_labels = kmeans.labels_
final_silhouette_score = silhouette_scores[best_index]
data["Predicted"] = best_labels
print(
    f"Silhouette Score for optimal clusters {optimal_n_clusters}: {final_silhouette_score}"
)

# Save the predictions to a CSV file.
submission = pd.read_csv("./input/sample_submission.csv")
# Fail loudly on a row-count mismatch rather than letting pandas index
# alignment silently produce NaN or truncated predictions.
if len(submission) != len(best_labels):
    raise ValueError(
        f"sample_submission.csv has {len(submission)} rows but "
        f"{len(best_labels)} predictions were produced"
    )
submission["Predicted"] = best_labels
# Make sure the output directory exists before writing into it.
os.makedirs("./working", exist_ok=True)
submission.to_csv("./working/submission.csv", index=False)