-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpred_llm.py
148 lines (137 loc) · 7.43 KB
/
pred_llm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import numpy as np
import argparse
import pandas as pd
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from tabular_llm.predllm_utils import _encode_row_partial
from tabular_llm.predllm import PredLLM
import read_data
# Command-line interface.
# Fix: numeric options were declared as strings (type=str) and converted by
# hand with float()/int() afterwards; declaring type=float/type=int lets
# argparse validate and convert, and rejects non-numeric values with a clear
# error instead of a ValueError traceback. The accepted command lines are
# unchanged.
parser = argparse.ArgumentParser()
parser.add_argument('--dataset', default="iris", type=str, nargs='?', help='dataset name')
parser.add_argument('--method', default="pred_llm", type=str, nargs='?', help='generative method')
parser.add_argument('--trainsize', default=1.0, type=float, nargs='?', help='size of training set')
parser.add_argument('--testsize', default=0.2, type=float, nargs='?', help='size of test set')
parser.add_argument('--gensize', default=1.0, type=float, nargs='?', help='size of generation set')
parser.add_argument('--runs', default=3, type=int, nargs='?', help='no of times to run algorithm')
args = parser.parse_args()
print("dataset: {}, method: {}, train_size: {}, test_size: {}, gen_size: {}".
      format(args.dataset, args.method, args.trainsize, args.testsize, args.gensize))
dataset_input = args.dataset
method_input = args.method
train_size = args.trainsize   # already a float, courtesy of argparse
test_size = args.testsize
gen_size = args.gensize
n_run = args.runs             # already an int
llm_batch_size = 32  # fine-tuning batch size for the LLM
llm_epochs = 50      # fine-tuning epochs for the LLM
# Expand the special "classification" keyword into the full benchmark suite;
# any other value selects that single dataset.
if dataset_input == "classification":
    datasets = ["iris", "breast_cancer", "australian",
                "blood_transfusion", "steel_plates_fault",
                "qsar_biodeg", "phoneme", "waveform",
                "churn", "cardiotocography",
                "kc1", "kc2", "balance_scale",
                "diabetes", "compas", "bank", "adult"]
else:
    datasets = [dataset_input]
print("datasets: {}".format(datasets))
# "all" evaluates both the real-data baseline and the LLM-generated data.
methods = ["original", "pred_llm"] if method_input == "all" else [method_input]
print("methods: {}".format(methods))
# Accuracy bookkeeping, indexed as [dataset][method][run].
xgb_dataset_method_run = []
for dataset in datasets:
    print("dataset: {}".format(dataset))
    # compute no of generated samples
    # NOTE(review): this call is used only for n_generative, n_feature, n_class
    # and feature_names; the actual train/test split is re-drawn per run below
    # with a per-run seed.
    _, _, _, _, n_generative, _, n_feature, n_class, feature_names = read_data.gen_train_test_data(dataset,
                                                                                                   train_size=gen_size,
                                                                                                   normalize_x=None)
    xgb_method_run = []  # accuracies of every method on this dataset
    for method in methods:
        print("method: {}".format(method))
        xgb_run = np.zeros(n_run)  # one accuracy value per run
        for run in range(n_run):
            print("run: {}".format(run))
            np.random.seed(run)  # reproducibility: seed numpy with the run index
            X_train, y_train, X_test, y_test, n_train, n_test, _, _, _ = \
                read_data.gen_train_test_data(dataset, train_size, test_size, normalize_x=True, seed=run)
            # train a classifier to predict labels of synthetic samples
            # (its test accuracy also serves as the real-data baseline printed below)
            xgb_org = XGBClassifier(random_state=run)
            xgb_org.fit(X_train, y_train)
            y_pred = xgb_org.predict(X_test)
            acc_org = round(accuracy_score(y_test, y_pred), 4)
            print("original accuracy-{}: {}".format(run, acc_org))
            # train a generative method
            if method == "original":
                # baseline: the "generated" set is just the real training data
                X_train_new, y_train_new = X_train, y_train
            if method == "pred_llm":
                # Fit the LLM on the joint (features, label) table so it can
                # sample complete labelled rows; label goes in a "target" column.
                X_y_train = np.append(X_train, y_train.reshape(-1, 1), axis=1)
                X_y_train_df = pd.DataFrame(X_y_train)
                X_y_train_df.columns = np.append(feature_names, "target")
                predllm = PredLLM(llm='distilgpt2', batch_size=llm_batch_size, epochs=llm_epochs)
                predllm.fit(X_y_train_df)
                # compute length of input sequence
                # (token count of one encoded row caps the sampling length)
                encoded_text = _encode_row_partial(X_y_train_df.iloc[0], shuffle=False)
                prompt_len = len(predllm.tokenizer(encoded_text)["input_ids"])
                X_y_train_new = predllm.sample_new(n_samples=n_generative, max_length=prompt_len, task="classification")
                # split sampled rows back into features (all but last column) and label
                X_train_new = X_y_train_new.iloc[:, :-1].to_numpy(dtype=float).reshape(-1, n_feature)
                y_train_new = X_y_train_new.iloc[:, -1:].to_numpy(dtype=float).reshape(-1, )
            print("X_train: {}, y_train: {}".format(X_train.shape, y_train.shape))
            print("n_generative: {}".format(n_generative))
            print("X_train_new: {}, y_train_new: {}".format(X_train_new.shape, y_train_new.shape))
            # convert labels from float to int
            y_train_new = np.array([int(y) for y in y_train_new])
            # save result to text file
            # NOTE(review): the files are named .npz but np.save writes .npy
            # format; consumers should read them with np.load on the raw file.
            file_name = "ds{}_tr{}_te{}_ge{}_run{}".format(dataset, train_size, test_size, gen_size, run)
            with open("./results/_classification/{}/X_gen_{}.npz".format(method, file_name), "wb") as f:
                np.save(f, X_train_new)
            with open("./results/_classification/{}/y_gen_{}.npz".format(method, file_name), "wb") as f:
                np.save(f, y_train_new)
            # get number of generative classes
            n_class_generative = len(np.unique(y_train_new))
            # train a classifier
            if n_class_generative != n_class:
                # generator missed (or invented) a class: score this run as 0
                print("generate less/more than the number of real classes")
                acc_new = 0
            else:
                # train on the (possibly synthetic) data, test on the real test set
                xgb_new = XGBClassifier(random_state=run)
                xgb_new.fit(X_train_new, y_train_new)
                y_pred = xgb_new.predict(X_test)
                acc_new = round(accuracy_score(y_test, y_pred), 4)
            print("dataset: {}, method: {}, accuracy-{}: {}".format(dataset, method, run, acc_new))
            # save accuracy of each run of each method for each train_size in each dataset
            xgb_run[run] = acc_new
            # save result to text file
            if run == (n_run - 1):
                # After the last run, write this method's per-dataset summary.
                # NOTE(review): ORG/GEN lines report the final run only, while
                # AVG aggregates all n_run runs.
                with open('./results/_classification/{}/accuracy_{}.txt'.format(method, file_name), 'w') as f:
                    acc_avg = round(np.mean(xgb_run), 4)
                    acc_std = round(np.std(xgb_run), 4)
                    f.write("accuracy ORG: {}\n".format(acc_org))
                    f.write("accuracy GEN: {}\n".format(acc_new))
                    f.write("accuracy AVG: {} ({})\n".format(acc_avg, acc_std))
        # save accuracy of n_run of each method in each dataset
        xgb_method_run.append(xgb_run)
    # save accuracy of n_run of all methods in each dataset
    xgb_dataset_method_run.append(xgb_method_run)
# save all results to csv file
n_dataset = len(datasets)
n_method = len(methods)
file_result = './results/_classification/accuracy_ds{}_me{}_tr{}_te{}_ge{}'.\
format(dataset_input, method_input, train_size, test_size, gen_size)
with open(file_result + ".csv", 'w') as f:
f.write("dataset,method,classifier,train_size,test_size,gen_size,run,accuracy\n")
for data_id in range(n_dataset):
for method_id in range(n_method):
for run_id in range(n_run):
dataset_name = datasets[data_id]
method_name = methods[method_id]
classifier = "xgb"
line = dataset_name + "," + method_name + "," + classifier + "," + \
str(train_size) + "," + str(test_size) + "," + str(gen_size) + "," + \
str(run_id) + "," + str(xgb_dataset_method_run[data_id][method_id][run_id]) + "\n"
f.write(line)
# save accuracy of all datasets to text file
with open(file_result + ".txt", 'w') as f:
acc_avg = round(np.mean(xgb_dataset_method_run), 4)
acc_std = round(np.std(xgb_dataset_method_run), 4)
f.write("accuracy: {} ({})\n".format(acc_avg, acc_std))