-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcontext_sequence_matrix.py
32 lines (26 loc) · 1.03 KB
/
context_sequence_matrix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
from sklearn.feature_extraction.text import CountVectorizer
from utilities.data_management import make_path, open_w_pandas, vector_to_file
from scipy.sparse import save_npz
from numpy import asarray
from time import time
from config import dataset
# Build a context-by-ngram count matrix from the extracted contexts and
# persist both the sparse matrix and the list of ngram features.

# Define paths (all artifacts live under the dataset's intent-analysis folder)
base = make_path('data/processed_data/') / dataset / 'analysis' / 'intent'
source = base / 'contexts.csv'
matrix_path = base / 'document_matrix.npz'
feature_path = base / 'ngrams.csv'

# Load contexts; cast to str so missing/non-string cells do not break the vectorizer
contexts = open_w_pandas(source)['contexts'].values.astype(str)
print('Data loaded')

# Initialize vectorizer: 3- to 6-grams, capped at 500k features.
# The custom token pattern keeps every run of word characters as a token,
# including single-character ones.
vectorizer = CountVectorizer(ngram_range=(3, 6), max_features=500000, token_pattern=r'\b\w+\b')

# Compute context-term matrix
start = time()
document_matrix = vectorizer.fit_transform(contexts)
# Elapsed time is "now minus start"; the reverse order reports a negative duration.
print('Computed working sequence matrix in', time() - start, 'seconds')

# scikit-learn >= 1.0 exposes get_feature_names_out(); get_feature_names() was
# removed in 1.2. Prefer the new method and fall back for older installations.
feature_name_getter = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
# Going through list() keeps the result a unicode-string array regardless of
# which getter was used (the new one returns an object-dtype array).
sequences = asarray(list(feature_name_getter()))
print('Context ngram matrix computed, saving')

# Save data
save_npz(matrix_path, document_matrix)
vector_to_file(sequences, feature_path)
print('Save complete')