# import libraries
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Data manipulation
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
# model selection
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
# machine learning models
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
import xgboost as xgb
# metrics
from sklearn.metrics import classification_report, accuracy_score, f1_score, mean_squared_error, precision_score, \
    recall_score
# Save models
import joblib
# misc
from scipy import stats
from custom_transformer import CombinedAttributesAdder
df = pd.read_csv("WineQT.csv")
df = df.drop(['Id'], axis=1)
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42, stratify=df['quality'])
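# Quick sanity check (optional): stratifying on 'quality' should leave the class
# proportions nearly identical in the train and test splits.
print(df_train['quality'].value_counts(normalize=True).sort_index())
print(df_test['quality'].value_counts(normalize=True).sort_index())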
# prepare the training data: separate the features from the quality target
df_train_final = df_train.drop("quality", axis=1)
y_train = df_train["quality"].copy()
num_pipeline = Pipeline([
    ('attribs_adder', CombinedAttributesAdder(add_ratio_density=False)),
    ('min_max_scaler', MinMaxScaler())
])
df_train_prepared = num_pipeline.fit_transform(df_train_final.values)
df_train_prepared = pd.DataFrame(df_train_prepared, columns=list(df_train_final.columns) + ["fixed_volatile"],
                                 index=df_train_final.index)
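# For reference, a plausible shape for CombinedAttributesAdder; the real
# implementation lives in custom_transformer.py, and this sketch only assumes
# that the extra "fixed_volatile" column is the fixed/volatile acidity ratio
# (column indices below are illustrative, not confirmed by this script):
#
# class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
#     def __init__(self, add_ratio_density=False):
#         self.add_ratio_density = add_ratio_density
#     def fit(self, X, y=None):
#         return self
#     def transform(self, X):
#         fixed_volatile = X[:, 0] / X[:, 1]  # assumed: fixed acidity / volatile acidity
#         return np.c_[X, fixed_volatile]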
le = LabelEncoder()
y_train_labeled = le.fit_transform(y_train)
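# LabelEncoder remaps the raw quality scores to consecutive integers starting
# at 0, which XGBClassifier requires; le.classes_ keeps the original values.
print("quality classes:", le.classes_)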
models = [RandomForestClassifier(), xgb.XGBClassifier()]
scores = dict()
for m in models:
    results = cross_val_score(m, df_train_prepared, y_train_labeled, cv=10, scoring="accuracy")
    scores[m.__class__.__name__] = results.mean()
    print(f'model: {str(m)}')
    print(results)
    print('-------------')
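# Summarize the cross-validation results collected above as mean accuracy per model.
for name, mean_acc in scores.items():
    print(f'{name}: mean CV accuracy = {mean_acc:.3f}')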
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'min_samples_split': [3, 5, 6, 7],
    'max_leaf_nodes': [3, 6, 9],
    'criterion': ['gini', 'entropy', 'log_loss']
}
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=3, scoring="accuracy")
grid_search.fit(df_train_prepared, y_train_labeled)
final_model = grid_search.best_estimator_
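# GridSearchCV keeps the winning hyper-parameters and their mean CV accuracy.
print("best params:", grid_search.best_params_)
print("best CV accuracy:", grid_search.best_score_)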
df_test_final = df_test.drop("quality", axis=1)
y_test = df_test["quality"].copy()
df_test_prepared = num_pipeline.transform(df_test_final.values)
df_test_prepared = pd.DataFrame(df_test_prepared, columns=list(df_test_final.columns) + ["fixed_volatile"],
                                index=df_test_final.index)
y_test_labeled = le.transform(y_test)
final_pred = final_model.predict(df_test_prepared)
final_pred_2 = le.inverse_transform(final_pred)
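# Held-out evaluation with the metrics imported above; final_pred_2 is already
# back on the original quality scale, so it is compared against the raw y_test.
print("test accuracy:", accuracy_score(y_test_labeled, final_pred))
print(classification_report(y_test, final_pred_2, zero_division=0))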
# Both steps below are already fitted, so the combined pipeline can be
# serialized as-is and used for raw-feature -> quality-code predictions.
production_model = Pipeline([
    ('preparation', num_pipeline),
    ('prediction', final_model)
])
joblib.dump(production_model, "final_prod_model.pkl")
joblib.dump(le, 'labelEncoder.joblib', compress=9)
print("process ended")