-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathget_characters.py
49 lines (41 loc) · 1.13 KB
/
get_characters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import codecs
# Collect every character used by the ground-truth transcriptions of the
# samples listed in train.txt and valid.txt, merge them into the seed alphabet
# below, and write the whitespace-free result to ch.txt.
#
# The statements use only constructs that behave the same on Python 2 and
# Python 3 (single-argument print(...), u"" literal, list instead of filter).

# Seed alphabet. Characters found in the ground-truth files that are missing
# from this string are appended to it further down.
# NOTE(review): the name says "digits" but the string holds Latin, Arabic-script
# and punctuation characters -- presumably inherited from other code; confirm.
# The backslash is written as "\\/" (same value as the former "\/") so that
# newer interpreters do not warn about an invalid escape sequence.
DIGITS = u"~!%'()+,-.\\/0123456789:ABCDEFGIJKLMNOPRSTUVWYabcdefghiklmnoprstuvwxz-V،د‘“ ؤب,گ0ذصط3وLِbT2dh9ٰٴxAڈlژ؛؟أGاpث4/س7ًtCهKیُS\"۔WOcgk…ٓosw(ﷺجڑ.آئکتخز6غEشہقنضDNR8ظ:fnrvzپچB’”لء%)ْFحر5عںھف!JمIM#ّےUYَae'Pimة1uٹ+"

# Sample paths, one per line, from the training list followed by the
# validation list. Blank lines are skipped: an empty entry would otherwise
# resolve to the bogus path '.gt.txt'.
files = []
for list_name in ('train.txt', 'valid.txt'):
    with open(list_name, 'r') as list_file:
        for line in list_file:
            entry = line.strip('\n')
            if entry:
                files.append(entry)

# char maps every character seen in any ground-truth file to 1 (used as a set).
char = {}
for f in files:
    # The transcription sits next to the sample: same path, extension
    # replaced by '.gt.txt'.
    gt_path = os.path.splitext(f)[0] + '.gt.txt'
    # 'with' closes the file even if decoding fails part-way through.
    with codecs.open(gt_path, 'r', encoding='utf8') as f1:
        # One buffered read instead of a read(1) call per character.
        for c in f1.read():
            char[c] = 1

print(len(char))
print(len(DIGITS))
print(len(set(char)))

# Characters found in the data, without the newline. Sorted so that the order
# in which new characters are appended (and therefore ch.txt) is reproducible;
# plain set iteration order is arbitrary.
found = sorted(c for c in char if c != '\n')
for y in found:
    if y not in DIGITS:
        print(y)
        DIGITS += y
print(len(found))
print(len(DIGITS))

# Drop all whitespace by splitting into whitespace-delimited chunks, keep the
# first occurrence of each chunk, and glue the chunks back together in order.
# NOTE(review): this removes repeated *chunks*, not repeated characters, so
# characters duplicated inside the seed alphabet are still written to ch.txt,
# and the space character is dropped -- confirm that this is what the consumer
# of ch.txt expects.
chunks = []
for chunk in DIGITS.split():
    if chunk not in chunks:
        chunks.append(chunk)
d = ''.join(chunks)
print(len(d))

with codecs.open('ch.txt', 'w', 'utf-8') as g:
    g.write(d)