# preprocess_data.py
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict, Counter
from itertools import islice, combinations
from datetime import datetime as dt
from typing import Union
import warnings
import argparse

warnings.filterwarnings("ignore")


def changeDateFormat(x):
    """Convert a 'DD-MON-YY' date such as '14-APR-20' to 'YY-MM-DD'."""
    idx = x.index('-')
    if idx == 2:
        date = x[:idx]
    elif idx == 1:
        date = '0' + x[:idx]  # zero-pad single-digit days
    year = x[-2:]
    month = x[-6:-3]
    m = {'Jan': '01', 'JAN': '01',
         'Feb': '02', 'FEB': '02',
         'Mar': '03', 'MAR': '03',
         'Apr': '04', 'APR': '04',
         'May': '05', 'MAY': '05',
         'Jun': '06', 'JUN': '06',
         'Jul': '07', 'JUL': '07',
         'Aug': '08', 'AUG': '08',
         'Sep': '09', 'SEP': '09',
         'Oct': '10', 'OCT': '10',
         'Nov': '11', 'NOV': '11',
         'Dec': '12', 'DEC': '12',
         }
    new_date = year + '-' + m[month] + '-' + date
    return new_date
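
# Quick illustrative check of the converter above, using the sample format
# mentioned near the bottom of this script: changeDateFormat('14-APR-20')
# returns '20-04-14'.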


def changeDateFormat2(x):
    """Convert an ISO timestamp such as '2020-04-21T00:00:00Z' to 'YY-MM-DD'."""
    new_date = x[2:10]
    return new_date
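
# Likewise illustrative: changeDateFormat2('2020-04-21T00:00:00Z') returns '20-04-21'.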


def merge_attributes(df: pd.DataFrame, *args: str) -> None:
    """
    dtype df: dataframe
    dtype *args: strings (attribute names to be combined)
    Adds a new column, named by joining the attribute names with '&', that
    holds the row-wise concatenation of their string values.
    """
    iterables = [df[arg].astype(str) for arg in args]
    columnName = '&'.join(args)
    fs = [''.join(var) for var in zip(*iterables)]
    df.loc[:, columnName] = fs
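
# For example, merge_attributes(df, 'OFFICE', 'HS6') (one of the calls made in
# preprocess below) adds a column 'OFFICE&HS6' whose values concatenate the two
# fields as strings.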


def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """
    dtype df: dataframe
    rtype df: dataframe
    """
    df = df.dropna(subset=['FOB.VALUE', 'TOTAL.TAXES'])  # Remove the 170 rows that lack FOB/CIF values.
    df.loc[:, 'Unitprice'] = df['CIF.VALUE'] / df['QUANTITY']
    df.loc[:, 'WUnitprice'] = df['CIF.VALUE'] / df['GROSS.WEIGHT']
    df.loc[:, 'TaxRatio'] = df['TOTAL.TAXES'] / df['CIF.VALUE']
    df.loc[:, 'TaxUnitquantity'] = df['TOTAL.TAXES'] / df['QUANTITY']
    df.loc[:, 'FOBCIFRatio'] = df['FOB.VALUE'] / df['CIF.VALUE']
    # Truncate the tariff code to its 6-, 4-, and 2-digit HS prefixes.
    df.loc[:, 'HS6'] = df['TARIFF.CODE'].apply(lambda x: int(x // 10000))
    df.loc[:, 'HS4'] = df['HS6'].apply(lambda x: int(x // 100))
    df.loc[:, 'HS2'] = df['HS4'].apply(lambda x: int(x // 100))
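    # For instance, an illustrative 10-digit TARIFF.CODE of 8703231990 yields
    # HS6 = 870323, HS4 = 8703, HS2 = 87.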
    # A general helper, "merge_attributes", supports any combination of attributes.
    merge_attributes(df, 'HS6', 'ORIGIN.CODE')
    merge_attributes(df, 'OFFICE', 'IMPORTER.TIN')
    merge_attributes(df, 'OFFICE', 'HS6')
    merge_attributes(df, 'OFFICE', 'ORIGIN.CODE')
    # # Generating all possible combinations was also tried, but the final AUC was
    # # lower than with just the four combinations enabled above.
    # candFeaturesCombine = ['OFFICE', 'IMPORTER.TIN', 'ORIGIN.CODE', 'HS6', 'DECLARANT.CODE']
    # for subset in combinations(candFeaturesCombine, 2):
    #     merge_attributes(df, *subset)
    # for subset in combinations(candFeaturesCombine, 3):
    #     merge_attributes(df, *subset)
    # Day, week, and month of year derived from SGD.DATE
    tmp2 = {}
    for date in set(df['SGD.DATE']):
        tmp2[date] = dt.strptime(date, '%y-%m-%d')
    tmp_day = {}
    tmp_week = {}
    tmp_month = {}
    for item in tmp2:
        yearStart = dt(tmp2[item].year, 1, 1)  # start of each date's own year
        tmp_day[item] = (tmp2[item] - yearStart).days
        tmp_week[item] = int(tmp_day[item] / 7)
        tmp_month[item] = int(tmp_day[item] / 30)
    df.loc[:, 'SGD.DayofYear'] = df['SGD.DATE'].apply(lambda x: tmp_day[x])
    df.loc[:, 'SGD.WeekofYear'] = df['SGD.DATE'].apply(lambda x: tmp_week[x])
    df.loc[:, 'SGD.MonthofYear'] = df['SGD.DATE'].apply(lambda x: tmp_month[x])
    return df


def find_risk_profile(df: pd.DataFrame, feature: str, topk_ratio: float, adj: float, option: str) -> Union[list, dict]:
    """
    dtype feature: str
    dtype topk_ratio: float (range: 0-1)
    dtype adj: float (smoothing term added to the group count to temper the mean)
    dtype option: str ('topk', 'ratio')
    rtype: list (option='topk') or dict (option='ratio')
    The 'topk' option usually outperforms 'ratio', which is more prone to overfitting.
    """
    # Top-k suspicious item flagging
    if option == 'topk':
        total_cnt = df.groupby([feature])['illicit']
        nrisky_profile = int(topk_ratio * len(total_cnt)) + 1
        # prob_illicit = total_cnt.mean()  # Simple mean
        adj_prob_illicit = total_cnt.sum() / (total_cnt.count() + adj)  # Smoothed mean
        return list(adj_prob_illicit.sort_values(ascending=False).head(nrisky_profile).index)
    # Illicit-ratio encoding (mean target encoding)
    # Refer: http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-munging/target-encoding.html
    # Refer: https://towardsdatascience.com/why-you-should-try-mean-encoding-17057262cd0
    elif option == 'ratio':
        # For target encoding, use only 70% of the training data to avoid
        # overfitting (otherwise, test AUC drops significantly).
        total_cnt = df.sample(frac=0.7).groupby([feature])['illicit']
        # prob_illicit = total_cnt.mean()  # Simple mean
        adj_prob_illicit = total_cnt.sum() / (total_cnt.count() + adj)  # Smoothed mean
        return adj_prob_illicit.to_dict()
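
# Illustratively: with option='topk' and topk_ratio=0.1, the return value is a
# list of the top ~10% of feature values ranked by smoothed illicit rate; with
# option='ratio', it is a dict mapping each feature value to that smoothed rate.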


def tag_risky_profiles(df: pd.DataFrame, profile: str, profiles: Union[list, dict], option: str) -> pd.DataFrame:
    """
    dtype df: dataframe
    dtype profile: str
    dtype profiles: list (option='topk') or dict (option='ratio')
    dtype option: str ('topk', 'ratio')
    rtype: dataframe
    The 'topk' option usually outperforms 'ratio', which is more prone to overfitting.
    """
    # Top-k suspicious item flagging: a binary indicator of membership in the risky set
    if option == 'topk':
        d = defaultdict(int)
        for profile_id in profiles:
            d[profile_id] = 1
        # print(list(islice(d.items(), 10)))  # For debugging
        df.loc[:, 'RiskH.' + profile] = df[profile].apply(lambda x: d[x])
    # Illicit-ratio encoding: unseen values fall back to the overall training ratio
    elif option == 'ratio':
        # NOTE: relies on the global `train`; when scripting, saving it as a class variable is clearer.
        overall_ratio_train = np.mean(train.illicit)
        df.loc[:, 'RiskH.' + profile] = df[profile].apply(lambda x: profiles.get(x, overall_ratio_train))
    return df
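
# E.g., tagging the 'OFFICE&HS6' profile with option='topk' adds a column
# 'RiskH.OFFICE&HS6' that is 1 when the row's combined value is in the risky
# set and 0 otherwise.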


def changeAttributeNames(df):
    """Rename the raw export columns to the canonical names used in this script."""
    df = df.rename(columns={"OFFICE_CODE": "OFFICE",
                            "SGD_NUM": "SGD.NUMBER",
                            "SGD_DATE": "SGD.DATE",
                            "RECEIPT_NUM": "RECEIPT.NUMBER",
                            "IMPORTER_TIN": "IMPORTER.TIN",
                            "IMPORTER_NAME": "IMPORTER.NAME",
                            "DECLARANT_CODE": "DECLARANT.CODE",
                            "DECLARANT_NAME": "DECLARANT.NAME",
                            "INITIAL_ITM_FOB": "FOB.VALUE",
                            "INITIAL_ITM_CIF": "CIF.VALUE",
                            "ITM_NUM": "ITEM.NUMBER",
                            "HSCODE": "TARIFF.CODE",
                            "INITIAL_ITM_QTY": "QUANTITY",
                            "ORIGIN_CODE": "ORIGIN.CODE",
                            "INITIAL_ITM_TOTAL_TAX": "TOTAL.TAXES",
                            "INITIAL_ITM_WEIGHT": "GROSS.WEIGHT",
                            })
    return df


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--week',
                        type=int,
                        default=2,
                        help="week number: e.g., --week 2")
    parser.add_argument('--date',
                        type=str,
                        default='16-01-01',
                        help="training start date")
    args = parser.parse_args()
    input_path = './data/Nigeria_pilot_weekly/week' + str(args.week) + '_ano.csv'
    # Training data
    df = pd.read_csv('./data/train.csv', encoding="ISO-8859-1")
    # Weekly testing data
    dft = pd.read_csv(input_path)
    print("Data is loaded.")
    df = df.dropna(subset=["illicit"])
    dft = changeAttributeNames(dft)
    df['SGD.DATE'] = df['SGD.DATE'].apply(changeDateFormat)
    # For the test dataset, pick the converter that matches the data: use
    # changeDateFormat when SGD_DATE looks like '14-APR-20', and
    # changeDateFormat2 when it looks like '2020-04-21T00:00:00Z'.
    dft['SGD.DATE'] = dft['SGD.DATE'].apply(changeDateFormat2)
    print(df.head(5).T)
    print(dft.head(5).T)
    dates = [args.date]
    valid_start = '17-10-01'
    for start_date in dates[::-1]:
        # 'YY-MM-DD' strings compare chronologically, so plain string comparison is safe.
        assert start_date < valid_start
        train = df[(df['SGD.DATE'] >= start_date) & (df['SGD.DATE'] < valid_start)]
        valid = df[(df['SGD.DATE'] >= valid_start) & (df['SGD.DATE'] < '18-01-01')]
        test = dft
        # Save label data
        train_reg_label = train['RAISED_TAX_AMOUNT'].values
        valid_reg_label = valid['RAISED_TAX_AMOUNT'].values
        test_reg_label = np.zeros(len(test))
        train_cls_label = train["illicit"].values
        valid_cls_label = valid["illicit"].values
        test_cls_label = np.zeros(len(test))
        # Run preprocessing
        train = preprocess(train)
        valid = preprocess(valid)
        test = preprocess(test)
        print("Data size:")
        print(train.shape, valid.shape, test.shape)
        # Add a few more risky profiles
        risk_profiles = {}
        profile_candidates = ['IMPORTER.TIN', 'DECLARANT.CODE', 'TARIFF.CODE', 'QUANTITY', 'HS6', 'HS4', 'HS2', 'OFFICE'] + [col for col in train.columns if '&' in col]
        for profile in profile_candidates:
            option = 'topk'
            risk_profiles[profile] = find_risk_profile(train, profile, 0.1, 10, option=option)
            train = tag_risky_profiles(train, profile, risk_profiles[profile], option=option)
            valid = tag_risky_profiles(valid, profile, risk_profiles[profile], option=option)
            test = tag_risky_profiles(test, profile, risk_profiles[profile], option=option)
        # Features to use in a classifier
        column_to_use = ['FOB.VALUE', 'CIF.VALUE', 'TOTAL.TAXES', 'GROSS.WEIGHT', 'QUANTITY', 'Unitprice', 'WUnitprice', 'TaxRatio', 'FOBCIFRatio', 'TaxUnitquantity', 'TARIFF.CODE', 'HS6', 'HS4', 'HS2', 'SGD.DayofYear', 'SGD.WeekofYear', 'SGD.MonthofYear'] + [col for col in train.columns if 'RiskH' in col]
        X_train = train[column_to_use].values
        X_valid = valid[column_to_use].values
        X_test = test[column_to_use].values
        # Impute NaN with 0 (the replacement value must go in the `nan` keyword;
        # np.nan_to_num's second positional argument is `copy`)
        X_train = np.nan_to_num(X_train, nan=0.0)
        X_valid = np.nan_to_num(X_valid, nan=0.0)
        X_test = np.nan_to_num(X_test, nan=0.0)
        # Store all data in a dictionary
        all_data = {"raw": {"train": train, "valid": valid, "test": test},
                    "xgboost_data": {"train_x": X_train, "train_y": train_cls_label,
                                     "valid_x": X_valid, "valid_y": valid_cls_label,
                                     "test_x": X_test, "test_y": test_cls_label},
                    "revenue": {"train": train_reg_label, "valid": valid_reg_label, "test": test_reg_label}}
        # Make sure the data sizes are consistent
        print("Checking data size...")
        print(X_train.shape[0], train_cls_label.shape, train_reg_label.shape)
        print(X_valid.shape[0], valid_cls_label.shape, valid_reg_label.shape)
        print(X_test.shape[0], test_cls_label.shape, test_reg_label.shape)
        print("Checking label distribution")
        cnt = Counter(train_cls_label)
        print("Training:", cnt[1] / cnt[0])
        cnt = Counter(valid_cls_label)
        print("Validation:", cnt[1] / cnt[0])
        cnt = Counter(test_cls_label)
        print("Testing:", cnt[1] / cnt[0])
        # Pickle the processed data to a file
        file_name = f'./data/processed_data_{start_date}.pickle'
        with open(file_name, 'wb') as file:
            pickle.dump(all_data, file)
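
# Downstream consumers can reload the result like this (illustrative; the actual
# file name depends on the --date argument):
#     with open('./data/processed_data_16-01-01.pickle', 'rb') as f:
#         all_data = pickle.load(f)
#     X_train = all_data['xgboost_data']['train_x']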