cleanText.py
"""
Fake news detection
The Doc2Vec pre-processing
"""
import numpy as np
import re
import string
import pandas as pd
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument  # LabeledSentence was removed in gensim 1.x
from gensim import utils
from nltk.corpus import stopwords

def textClean(text):
    """
    Get rid of characters other than letters, digits, and a small set
    of punctuation, then remove English stop words.
    """
    # The hyphen is escaped so it is a literal character inside the
    # class, not an accidental '+'-to-'=' range as in the original.
    text = re.sub(r"[^A-Za-z0-9^,!./'+\-=]", " ", text)
    text = text.lower().split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if w not in stops]
    text = " ".join(text)
    return text

def cleanup(text):
    text = textClean(text)
    # str.translate takes a translation table in Python 3; the
    # two-argument text.translate(None, ...) form was Python 2 only.
    text = text.translate(str.maketrans("", "", string.punctuation))
    return text
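
# Example of the combined cleaning (hypothetical input; assumes the
# NLTK stopword list has been fetched via nltk.download("stopwords")):
#   cleanup("The stock market CRASHED!")  ->  "stock market crashed"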

def constructLabeledSentences(data):
    sentences = []
    # Series.iteritems() was removed in pandas 2.0; items() replaces it.
    for index, row in data.items():
        sentences.append(TaggedDocument(utils.to_unicode(row).split(),
                                        ['Text_%s' % str(index)]))
    return sentences
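
# The Doc2Vec import above is unused in the original script; the helper
# below is an illustrative sketch of how the tagged sentences could be
# turned into document vectors. The hyperparameters are assumptions,
# not values taken from the source.
def trainDoc2Vec(sentences, vector_dimension=300):
    model = Doc2Vec(min_count=1, window=5, vector_size=vector_dimension,
                    sample=1e-4, negative=5, workers=4, epochs=10)
    model.build_vocab(sentences)
    model.train(sentences, total_examples=model.corpus_count,
                epochs=model.epochs)
    return model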

def clean_data():
    """
    Generate the processed strings: read the raw CSV, drop rows with
    missing text, clean every article, shuffle, and save the first
    article as the test sample.
    """
    path1 = 'test1.csv'
    data1 = pd.read_csv(path1)
    # path = 'train.csv'
    # data = pd.read_csv(path)
    vector_dimension = 300  # reserved for the Doc2Vec step; unused here

    # NaN is the only value that compares unequal to itself, so this
    # flags rows whose 'text' field is missing.
    missing_rows = []
    for i in range(len(data1)):
        if data1.loc[i, 'text'] != data1.loc[i, 'text']:
            missing_rows.append(i)
    # Drop those rows before cleaning, otherwise cleanup() fails on NaN.
    # (The original commented-out line also dropped an 'id' column.)
    data1 = data1.drop(missing_rows).reset_index(drop=True)

    for i in range(len(data1)):
        data1.loc[i, 'text'] = cleanup(data1.loc[i, 'text'])

    # Shuffle and keep the first cleaned article as the test sample.
    data1 = data1.sample(frac=1).reset_index(drop=True)
    X = data1.loc[0, 'text']
    xtest = X
    np.save('xtest.npy', xtest)
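
# Minimal usage sketch (not in the original file): run the cleaning
# step and read the saved sample back. The file names match the ones
# used above.
if __name__ == '__main__':
    clean_data()
    sample = np.load('xtest.npy')
    print(sample.item())  # the cleaned text of the first shuffled row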