-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathat_glance.py
224 lines (206 loc) · 9.56 KB
/
at_glance.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
import six
assert six.PY3, "Run me with Python3"
import jinja2
import json
import argparse
import yaml
from yaml.loader import SafeLoader
import sys
import functools
import os.path
import re
def sum_dicts(dicts):
    """
    Sum the integer-valued entries of several dictionaries, key by key.

    The keys are taken from the first dictionary. Only keys whose value in the
    first dictionary is an int are summed; all other keys (lists, nested
    dictionaries, strings...) are left out of the result. Every other
    dictionary must contain each of the summed keys, otherwise KeyError is
    raised.

    dicts -- an indexable sequence of dictionaries (it is traversed once per key)
    Returns a new dictionary key -> sum. An empty sequence yields an empty
    dictionary.
    """
    # Without any dictionary there is no first one to take the keys from.
    if not dicts:
        return {}
    first = dicts[0]
    return {
        k: sum(d[k] for d in dicts)
        for k, v in first.items()
        if isinstance(v, int)
    }
def thousand_sep_filter(val,use_k=False):
    """
    Template filter: format a number with comma as the thousand separator.

    With use_k set, the number is shown in whole thousands followed by "K":
    zero is shown as "-" and anything below one thousand as "<1K".
    """
    if use_k:
        if val==0:
            return "-"
        if val<1000:
            return "<1K"
        # Integer division drops the remainder, i.e. 1999 is shown as 1K.
        thousands=val//1000
        return "{:,}K".format(thousands)
    return "{:,}".format(val)
def tag_filter(counts):
    """
    Template filter: produce the three L-F-D tag slots for a treebank.

    Each slot is either a logo with a hint or an empty placeholder span:
      L (lemmas)   -- more than 10% of the words have a lemma
      F (features) -- more than five entries in counts["fvals"]
      D (secondary dependencies) -- there are words and counts["word_w_deps"] exceeds 10
    """
    empty_span='<span class="tagspan"></span>'
    tag_span='<span class="tagspan"><span class="hint--top hint--info" data-hint="%s"><img class="propertylogo" src="logos/%s.svg" /></span></span>'
    # The word count is tested first so that we never divide by zero.
    has_lemmas=bool(counts["word"]) and counts["word_w_lemma"]/counts["word"]>0.1
    has_features=len(counts["fvals"])>5
    has_deps=bool(counts["word"]) and counts["word_w_deps"]>10
    slots=(
        (has_lemmas,"Lemmas","L"),
        (has_features,"Features","F"),
        (has_deps,"Secondary dependencies","D"),
    )
    return "".join(
        tag_span%(hint,logo) if present else empty_span
        for present,hint,logo in slots
    )
def annotation_filter(metadata):
    """
    Template filter: produce the conversion logo of a treebank.

    The logo is selected by metadata["source"]["all"]; any value other than
    "automatic", "semi-automatic" and "manual" yields a question mark.
    """
    known_sources=(
        ("automatic", '<span class="hint--top hint--info" data-hint="Automatic conversion"><i class="fa fa-cogs"></i></span>'),
        ("semi-automatic", '<span class="hint--top hint--info" data-hint="Automatic conversion with manual corrections"><i class="fa fa-cogs"></i><i class="fa fa-check"></i></span>'),
        ("manual", '<span class="hint--top hint--info" data-hint="Full manual check of the data"><i class="fa fa-user"></i></span>'),
    )
    source=metadata["source"]["all"]
    for kind,logo in known_sources:
        if source==kind:
            return logo
    return '<span class="hint--top hint--info" data-hint="Unknown">?</span>'
def genre_filter(genres, genre_symbols={}):
    """
    Template filter: produce one icon per genre, wrapped in a span whose hint
    lists the genre names.

    Duplicate genres are removed and the rest is sorted alphabetically.
    genre_symbols maps a genre to the name of its icon; genres without an
    entry get the generic "file-o" icon. The default dictionary is only read,
    never modified.
    """
    unique_genres = sorted(set(genres))
    icons = []
    for genre in unique_genres:
        icon_name = genre_symbols.get(genre,"file-o")
        icons.append('<i class="fa fa-%s"></i>'%icon_name)
    hint = ' '.join(unique_genres)
    return '<span class="hint--top hint--info" data-hint="%s">%s</span>'%(hint,''.join(icons))
def family_filter(language_family_genus):
    """
    Template filter: produce the language family, followed by the genus if
    there is one.

    language_family_genus is a sequence whose first item is the family and
    whose optional second item is the genus (it may be missing or empty).
    """
    family = language_family_genus[0]
    if len(language_family_genus) <= 1 or not language_family_genus[1]:
        return family
    genus = language_family_genus[1]
    # Shorten genera that are too long for the accordion.
    shortenings = (
        (r'^Central Malayo-Polynesian$', r'Malayo-Polynesian'),
        (r' Pama-Nyungan$', r''), # e.g. Western Pama-Nyungan
    )
    for pattern, replacement in shortenings:
        genus = re.sub(pattern, replacement, genus)
    return family + ', ' + genus
def license_filter(lic):
    """
    Template filter: produce the license logo.

    lic -- a pair (abbreviation, full name), something like
           ("BY-SA", "CC BY-SA 4.0 unported")
    Returns an HTML span whose hint is the full license name. It contains the
    logo image if the license is recognized and a question mark otherwise.
    """
    lic_abbr,lic_name=lic
    if lic_abbr == "GNU":
        logo_file = "gpl"
    elif lic_name.startswith("CC0"):
        logo_file = "cc-zero"
    elif lic_name.startswith("CC"):
        # Creative Commons logos are named after the lowercased abbreviation.
        logo_file = lic_abbr.lower()
    elif "LGPLLR" in (lic_abbr, lic_name):
        # The unpacked fields must be tested here: comparing the whole pair to
        # a string is never true, which made this branch unreachable.
        logo_file = "LGPLLR"
    else:
        logo_file = None
    if logo_file:
        return '<span class="hint--top hint--info" data-hint="%s"><img class="license" src="logos/%s.svg" /></span>'%(lic_name,logo_file)
    else:
        return '<span class="hint--top hint--info" data-hint="%s">?</span>'%(lic_name)
def contributor_filter(contributors):
    """
    Template filter: turn a list of contributors into one comma-separated string.

    A name written as "Last, First" is turned into "First Last" (only the
    first ", " in the name is taken as the boundary); a name without ", " is
    kept as it is.
    """
    def natural_order(name):
        last,comma,first=name.partition(", ")
        if comma:
            return first+" "+last
        return name
    return ", ".join(natural_order(c) for c in contributors)
def stars_filter(scorestars):
    """
    Template filter: produce the image with the star rating of the treebank.

    Takes a pair of floats (score,stars). The score goes to the hint; the
    image file name contains the number of stars multiplied by ten, as two
    digits (e.g. 3.5 stars -> stars35.png).
    """
    rating_html='<span class="hint--top hint--info" data-hint="%f"><img src="/img/stars%02d.png" style="max-height:1em; vertical-align:middle" /></span>'
    score,stars=scorestars[0],scorestars[1]
    return rating_html%(score,stars*10)
if __name__=="__main__":
    # Reads the language metadata, the release database, the genre symbols and
    # the per-treebank statistics, and prints one rendered 'language.md'
    # template per language to the standard output.
    opt_parser = argparse.ArgumentParser(description='Generates the index page table')
    opt_parser.add_argument('--codes-flags', help="YAML file with language codes and flags.")
    opt_parser.add_argument('--releases', help="JSON file with release descriptions.")
    opt_parser.add_argument('--genre-symbols', help="JSON file with genre symbols.")
    opt_parser.add_argument('--subset', default=None, action='store', help="Default: print all. Optionally select one of 'current', 'sapling', 'retired'.")
    opt_parser.add_argument('--docs-dir', default="docs-src", action="store", help="Docs dir so we can check for existence of files. Default '%(default)s'.")
    opt_parser.add_argument('input', nargs='+', help='Input corpus stat json files')
    args=opt_parser.parse_args()

    with open(args.codes_flags) as f:
        codes_flags = yaml.load(f, Loader=SafeLoader)
    # Get the set of used genera for each language family.
    # We will only display the genus if we have languages from multiple genera of that family.
    genera_by_family = {}
    for lang_info in codes_flags.values():
        genera = genera_by_family.setdefault(lang_info['family'], set())
        if 'genus' in lang_info:
            genera.add(lang_info['genus'])
    # Only the number of genera per family is needed from now on.
    family_genera = {family: len(genera) for family, genera in genera_by_family.items()}

    with open(args.releases) as f:
        releases = json.load(f)['releases']
    # The database of releases is a dictionary but the keys should be already sorted.
    release_numbers = list(releases)
    last_release_number = release_numbers[-1]
    print("Last release number = %s" % last_release_number, file=sys.stderr)
    last_release_treebanks = releases[last_release_number]['treebanks']

    with open(args.genre_symbols) as f:
        genre_symbols = json.load(f)

    t_env = jinja2.Environment(loader=jinja2.PackageLoader('at_glance', 'templates'), autoescape=True)
    t_env.filters['tsepk'] = thousand_sep_filter
    t_env.filters['tag_filter'] = tag_filter
    t_env.filters['annotation_filter'] = annotation_filter
    t_env.filters['genre_filter'] = functools.partial(genre_filter,genre_symbols=genre_symbols)
    t_env.filters['family_filter'] = family_filter
    t_env.filters['license_filter'] = license_filter
    t_env.filters['contributor_filter'] = contributor_filter
    t_env.filters['stars_filter'] = stars_filter

    tbanks={} # language -> [tbank, tbank, ...]
    for f_name in args.input:
        # Loading is best-effort: a file that cannot be read or parsed is
        # reported and skipped. Only ordinary errors are caught, so that
        # Ctrl+C and sys.exit() are not swallowed.
        try:
            with open(f_name) as f:
                tbank = json.load(f)
            tbanks.setdefault(tbank['language_name'], []).append(tbank)
        except Exception as err:
            print("Whoa, couldn't load", f_name, "(%s: %s)" % (type(err).__name__, err), file=sys.stderr)

    lang_template = t_env.get_template('language.md')
    for lang, lang_tbanks in sorted(tbanks.items()):
        # Select the required subset of treebanks. If no subset is required, all treebanks will be output.
        if args.subset == 'current':
            # Treebanks that are part of the last release.
            lang_tbanks = [t for t in lang_tbanks if t['repo_name'] in last_release_treebanks]
        elif args.subset == 'sapling':
            # Treebanks without a first release.
            lang_tbanks = [t for t in lang_tbanks if not t['first_release']]
        elif args.subset == 'retired':
            # Treebanks with a first release that are absent from the last release.
            lang_tbanks = [t for t in lang_tbanks if t['first_release'] and t['repo_name'] not in last_release_treebanks]
        # A language with no treebank in the subset is not printed at all.
        if not lang_tbanks:
            continue
        sum_counts = sum_dicts([t['counts'] for t in lang_tbanks])
        union_genres = set()
        for t in lang_tbanks:
            union_genres |= set(t['meta']['genre'])
        union_genres = list(union_genres)
        # Sort treebanks by evaluation score, best first.
        # (They used to be sorted by size, i.e. by tb["counts"]["word"].)
        lang_tbanks.sort(key=lambda tb: tb['score'], reverse=True)
        lang_info = codes_flags[lang]
        language_code = lang_info['lcode']
        # The list is known to be non-empty here; the short name is taken from
        # the first treebank after sorting.
        language_name_short = lang_tbanks[0]['language_name_short']
        # Link to the language hub and to the treebank comparison only if the
        # corresponding source page exists in the docs directory.
        if os.path.exists(os.path.join(args.docs_dir, '_'+language_code, 'index.md')):
            language_hub = 'index.md'
        else:
            language_hub = None
        if os.path.exists(os.path.join(args.docs_dir, 'treebanks', language_code+'-comparison.md')):
            tbank_comparison = language_code+'-comparison.html'
        else:
            tbank_comparison = None
        # The genus is passed on only if the family has more than one genus.
        language_genus = None
        if 'genus' in lang_info and family_genera[lang_info['family']] > 1:
            language_genus = lang_info['genus']
        r = lang_template.render(
            flag=lang_info['flag'],
            language_name=lang,
            language_name_short=language_name_short,
            language_code=language_code,
            language_hub=language_hub,
            tbank_comparison=tbank_comparison,
            counts=sum_counts,
            treebanks=lang_tbanks,
            genres=union_genres,
            language_family=lang_info['family'],
            language_genus=language_genus)
        print(r)