-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathabuse.py
66 lines (48 loc) · 2.62 KB
/
abuse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from model.networks import generate_abuse_network
from utilities.data_management import make_dir, make_path, open_w_pandas, check_existence, get_model_path, \
vector_to_file, split_sets, get_embedding_path
from fasttext import load_model
from model.layers.realtime_embedding import RealtimeEmbedding
from keras.callbacks import EarlyStopping
from pandas import DataFrame
from config import dataset, max_tokens, training_verbosity, batch_size
from time import time
# Resolve every location this script reads from or writes to.
abuse_weights_path = get_model_path('abuse')
embedding_path = get_embedding_path()
base_path = make_path('data/processed_data/') / dataset / 'analysis'
dest_dir = base_path / 'abuse'
data_path = make_path('data/prepared_data/abusive_data.csv')

# Check for the input files (embedding model and prepared data), then make
# the directory that will hold the weights and the analysis output directory.
required_inputs = [embedding_path, data_path]
check_existence(required_inputs)
for output_dir in (abuse_weights_path.parent, dest_dir):
    make_dir(output_dir)

print('Config complete.')
# Load the embedding model from disk (load_model expects a string path).
embedding_model = load_model(str(embedding_path))

# Keep only the label and text columns; sample(frac=1) returns all rows in a
# random order. Assumes open_w_pandas returns a pandas DataFrame -- TODO confirm.
abuse_columns = open_w_pandas(data_path)[['is_abusive', 'document_content']]
shuffled_rows = abuse_columns.sample(frac=1)

# Transposing the two-column value array gives one row per column, which
# unpacks into the label vector and the document vector.
labels, documents = shuffled_rows.values.transpose()
labels = labels.astype(bool)
print('Loaded data.')

# Partition documents and labels into training and testing sets
# (the split proportions are decided inside split_sets).
training_data, testing_data, training_labels, testing_labels = split_sets(documents, labels=labels)
# Generate model
# Wrap each split in a RealtimeEmbedding that shares the loaded embedding model.
# NOTE(review): presumably this embeds documents batch by batch while training;
# confirm against model.layers.realtime_embedding.
training = RealtimeEmbedding(embedding_model, training_data, training_labels, uniform_weights=True)
training.set_usage_mode(True)
testing = RealtimeEmbedding(embedding_model, testing_data, testing_labels, uniform_weights=True)
testing.set_usage_mode(True)

# The network's input size comes from the configured token limit and the
# embedding dimension reported by the training generator.
model = generate_abuse_network(max_tokens, embedding_dimension=training.embedding_dimension)

# Model.summary() prints the layer table itself and returns None, so it must be
# called on its own; passing its result to print() emitted the table before the
# header and then a stray "None".
print('Generated model')
model.summary()
# Train with early stopping on validation loss, timing the whole run.
start = time()
stopping_conditions = EarlyStopping(
    monitor='val_loss',
    patience=3,
    verbose=1,
    restore_best_weights=True,
)
# The testing generator doubles as the validation data that drives early stopping.
fit_result = model.fit_generator(
    training,
    epochs=50,
    verbose=training_verbosity,
    callbacks=[stopping_conditions],
    validation_data=testing,
    shuffle=True,
)
history = fit_result.history
training_time = time() - start
print('Completed training in', training_time, 's')

# Store the per-epoch training history next to the other analysis output.
history_frame = DataFrame(history)
history_frame.to_csv(dest_dir / 'training_history.csv')

# NOTE(review): evaluate_generator returns the loss plus any compiled metrics,
# so the value printed here may not be accuracy alone -- confirm against the
# compile step in generate_abuse_network.
evaluated_accuracy = model.evaluate_generator(testing, verbose=training_verbosity)
print('Model validation accuracy', evaluated_accuracy)

model.save_weights(str(abuse_weights_path))
print('Completed training and saving abuse model.')
# Write the documents and labels of each split to the analysis directory so the
# exact training/testing partition used for this run is kept on disk.
split_exports = (
    (training, 'training_data.csv', 'training_labels.csv'),
    (testing, 'testing_data.csv', 'testing_labels.csv'),
)
for generator, data_file, labels_file in split_exports:
    vector_to_file(generator.data_source, dest_dir / data_file)
    vector_to_file(generator.labels, dest_dir / labels_file)