-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathFeatureSelection.py
93 lines (78 loc) · 3.46 KB
/
FeatureSelection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# coding:utf-8
import os
import re
import sys
import tqdm
import numpy as np
import pandas as pd
from category_encoders import TargetEncoder
def filter_nan_feature(feature, *, threshold=0.9):
    """
    Decide whether a feature column is too sparse to keep.

    :param feature: feature pd.Series
    :param threshold: missing-value ratio above which the feature is flagged
        (strictly greater than); defaults to 0.9
    :return: True when the share of missing values exceeds ``threshold``,
        otherwise False. An empty series is never flagged.
    """
    total = len(feature)
    if total == 0:
        # Avoid a 0 / 0 division; there is nothing to measure, so do not flag.
        return False
    # Convert to a plain Python bool: the comparison yields a NumPy boolean,
    # which breaks identity checks such as ``flag is True`` in callers.
    return bool(feature.isna().sum() / total > threshold)
class FeatureSelection(object):
    """
    Drop uninformative feature columns and write the reduced train/test CSV files.

    Intended call order: ``data_prepare()``, ``feature_filter()``, ``data_output()``.
    """

    def __init__(self, *, input_path, output_path):
        """
        :param input_path: directory containing ``train_feature_before_df.csv``,
            ``train_feature_after_df.csv`` and ``test_feature_df.csv``
        :param output_path: directory the selected-feature CSV files are written to
        """
        # init
        self.__input_path, self.__output_path = input_path, output_path

        # data prepare
        self.__train_feature_before, self.__train_feature_after = None, None
        self.__train, self.__test = None, None
        self.__train_label = None
        self.__train_feature, self.__test_feature = None, None
        self.__categorical_columns = None

        # data output
        self.__train_select_feature, self.__test_select_feature = None, None

    def data_prepare(self):
        """
        Load the input CSV files and build the working feature frames.

        The two training files are stacked into one frame, ``TARGET`` is split
        off as the label, and every column whose name contains ``SK_ID`` is
        dropped from the features. Object-typed training columns are then
        target-encoded so that numeric statistics can be computed on them.
        :return:
        """
        self.__train_feature_before = pd.read_csv(os.path.join(self.__input_path, "train_feature_before_df.csv"))
        self.__train_feature_after = pd.read_csv(os.path.join(self.__input_path, "train_feature_after_df.csv"))
        # ignore_index=True: each file is read with its own default row index,
        # so a plain concat would leave duplicate row labels in the result.
        self.__train = pd.concat(
            [self.__train_feature_before, self.__train_feature_after], ignore_index=True
        )
        self.__test = pd.read_csv(os.path.join(self.__input_path, "test_feature_df.csv"))

        self.__train_label = self.__train["TARGET"].copy()
        id_columns = [col for col in self.__train.columns.tolist() if re.search(r"SK_ID", col)]
        self.__train_feature = self.__train.drop(["TARGET"] + id_columns, axis=1).copy()
        self.__test_feature = self.__test[self.__train_feature.columns.tolist()].copy()
        self.__categorical_columns = self.__train_feature.select_dtypes(include="object").columns.tolist()

        # Only the training copy is encoded; the encoded values are used solely
        # by feature_filter(), while data_output() writes the raw columns.
        if self.__categorical_columns:
            encoder = TargetEncoder()
            encoder.fit(self.__train_feature[self.__categorical_columns], self.__train_label)
            self.__train_feature[self.__categorical_columns] = encoder.transform(
                self.__train_feature[self.__categorical_columns]
            )

    def feature_filter(self):
        """
        Remove mostly-missing and near-constant columns from the training features.

        Flags are tested by truthiness (``not flag``). An identity test such as
        ``flag is not True`` always holds for NumPy booleans, so it would keep
        every column.
        :return:
        """
        # np.nan feature filter
        keep_columns = [
            col for col in tqdm.tqdm(self.__train_feature.columns)
            if not filter_nan_feature(self.__train_feature[col])
        ]
        self.__train_feature = self.__train_feature[keep_columns]

        # std filter (a NaN standard deviation compares False, so the column is kept)
        keep_columns = [
            col for col in tqdm.tqdm(self.__train_feature.columns)
            if not self.__train_feature[col].std() < 0.01
        ]
        self.__train_feature = self.__train_feature[keep_columns]

    def data_output(self):
        """
        Write the selected columns of the original train and test frames to CSV.

        The training file keeps ``TARGET`` as its first column; both files are
        written without the row index.
        :return:
        """
        selected_columns = self.__train_feature.columns.tolist()
        self.__train_select_feature = self.__train[["TARGET"] + selected_columns]
        self.__test_select_feature = self.__test[selected_columns]

        self.__train_select_feature.to_csv(
            os.path.join(self.__output_path, "train_select_feature_df.csv"), index=False
        )
        self.__test_select_feature.to_csv(
            os.path.join(self.__output_path, "test_select_feature_df.csv"), index=False
        )
if __name__ == "__main__":
    # Usage: python FeatureSelection.py <input_dir> <output_dir>
    input_dir = sys.argv[1]
    output_dir = sys.argv[2]

    # Run the three pipeline stages in order: load, filter, write.
    selector = FeatureSelection(input_path=input_dir, output_path=output_dir)
    selector.data_prepare()
    selector.feature_filter()
    selector.data_output()