-
Notifications
You must be signed in to change notification settings - Fork 46
/
Copy pathevaluate_treebank.pl
executable file
·489 lines (477 loc) · 18.9 KB
/
evaluate_treebank.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
#!/usr/bin/env perl
# Evaluates quality of a UD treebank. Should help to determine if there are
# multiple treebanks in one language, which is the best one to use.
# Copyright © 2018 Dan Zeman <[email protected]>
# License: GNU GPL
use utf8;
use open ':utf8';
binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');
use Getopt::Long;
use File::Which; # find executable files in $PATH
use udlib;
my $verbose = 0;
my $forcemaster = 0;
GetOptions
(
'verbose' => \$verbose,
'forcemaster!' => \$forcemaster
);
# Path to the local copy of the UD repository (e.g., UD_Czech).
my $folder = $ARGV[0];
if(!defined($folder))
{
die("Usage: $0 path-to-ud-folder");
}
$folder =~ s:/$::;
$folder =~ s:^\./::;
if($folder =~ m:/:)
{
print STDERR ("WARNING: argument '$folder' contains a slash, indicating that the treebank is not a direct subfolder of the current folder. Some tests will probably not work as expected!\n");
}
if($verbose)
{
# Log the current version of evaluate_treebank.pl, validate.py, and the data files used by validate.py.
###!!! Here we assume that 'tools' is a subfolder of the current folder.
###!!! However, later we take validate.py from PATH and it could be a different copy at a different location!
print STDERR ("Running the following version of UD tools:\n");
system("cd tools ; (git log | head -3 1>&2) ; cd ..");
}
# The ranking that we apply to the list of treebanks of a given language (on UD title page)
# should be based on the most recent official release, i.e., on the master branch.
if($forcemaster)
{
###!!! At present we ignore the fact that multiple CGI processes may attempt to
###!!! fiddle with the repository at the same time! When that happens, the output
###!!! will be wrong!
system("cd $folder ; (git checkout master 1>&2) ; cd ..");
}
if($verbose)
{
print STDERR ("Evaluating the following revision of $folder:\n");
system("cd $folder ; (git log | head -3 1>&2) ; cd ..");
}
my $record = udlib::get_ud_files_and_codes($folder);
my $metadata = udlib::read_readme($folder);
my $n = 0;
my $ntrain = 0;
my $ndev = 0;
my $ntest = 0;
my %forms;
my %lemmas;
my %tags;
my $n_words_with_features = 0;
my %udeprels;
foreach my $file (@{$record->{files}})
{
open(FILE, "$folder/$file") or die("Cannot read $folder/$file: $!");
while(<FILE>)
{
if(m/^\d+\t/)
{
s/\r?\n$//;
my @f = split(/\t/, $_);
my $form = $f[1];
my $lemma = $f[2];
my $upos = $f[3];
my $feat = $f[5];
my $udeprel = $f[7];
$udeprel =~ s/:.*$//;
$n++;
if($file =~ m/ud-train/)
{
$ntrain++;
}
elsif($file =~ m/ud-dev/)
{
$ndev++;
}
elsif($file =~ m/ud-test/)
{
$ntest++;
}
$forms{$form}++;
$lemmas{$lemma}++;
$tags{$upos}++;
$n_words_with_features++ if($feat ne '_');
$udeprels{$udeprel}++;
}
}
close(FILE);
}
# Compute partial scores.
my %score;
#------------------------------------------------------------------------------
# Size. Project size to the interval <0; 1>.
# Do not modify the real number of words, $n. It will be needed in other metrics, too.
my $ntrunc = $n;
$ntrunc = 1000000 if($ntrunc > 1000000);
$ntrunc = 1 if($ntrunc <= 0);
my $lognn = log(($ntrunc/1000)**2); $lognn = 0 if($lognn < 0);
$score{size} = $lognn / log(1000000);
if($verbose)
{
print STDERR ("Size: counted $ntrunc of $n words (nodes).\n");
print STDERR ("Size: min(0, log((N/1000)**2)) = $lognn.\n");
printf STDERR ("Size: maximum value %f is for 1000000 words or more.\n", log(1000000));
}
#------------------------------------------------------------------------------
# Split. This is also very much related to size, but per individual parts.
$score{split} = 0.01;
$score{split} += 0.33 if($ntrain > 10000);
$score{split} += 0.33 if($ndev >= 10000);
$score{split} += 0.33 if($ntest >= 10000);
if($verbose)
{
if($ntrain > 10000)
{
print STDERR ("Split: Found more than 10000 training words.\n");
}
else
{
print STDERR ("Split: Did not find more than 10000 training words.\n");
}
if($ndev >= 10000)
{
print STDERR ("Split: Found at least 10000 development words.\n");
}
else
{
print STDERR ("Split: Did not find at least 10000 development words.\n");
}
if($ntest >= 10000)
{
print STDERR ("Split: Found at least 10000 test words.\n");
}
else
{
print STDERR ("Split: Did not find at least 10000 test words.\n");
}
}
#------------------------------------------------------------------------------
# Lemmas. If the most frequent lemma is '_', we infer that the corpus does not annotate lemmas.
my @lemmas = sort {$lemmas{$b} <=> $lemmas{$a}} (keys(%lemmas));
my $lsource = $metadata->{Lemmas} eq 'manual native' ? 1 : $metadata->{Lemmas} eq 'converted with corrections' ? 0.9 : $metadata->{Lemmas} eq 'converted from manual' ? 0.8 : $metadata->{Lemmas} eq 'automatic with corrections' ? 0.5 : 0.4;
$score{lemmas} = (scalar(@lemmas) < 1 || $lemmas[0] eq '_') ? 0.01 : $lsource;
if($verbose)
{
if(scalar(@lemmas)<1)
{
print STDERR ("Lemmas: No lemmas found.\n");
}
elsif($lemmas[0] eq '_')
{
print STDERR ("Lemmas: '_' is the most frequent lemma.\n");
}
else
{
print STDERR ("Lemmas: source of annotation (from README) factor is $lsource.\n");
}
}
#------------------------------------------------------------------------------
# Tags. How many of the 17 universal POS tags have been seen at least once?
# Some languages may not have use for some tags, and some tags may be very rare.
# But for comparison within one language this is useful. If a tag exists in the
# language but the corpus does not contain it, maybe it cannot distinguish it.
my $tsource = $metadata->{UPOS} eq 'manual native' ? 1 : $metadata->{UPOS} eq 'converted with corrections' ? 0.9 : $metadata->{UPOS} eq 'converted from manual' ? 0.8 : 0.1;
my $maxtags = 17;
$score{tags} = (scalar(keys(%tags)) / $maxtags) * $tsource;
$score{tags} = 0.01 if($score{tags}<0.01);
if($verbose)
{
printf STDERR ("Universal POS tags: %d out of $maxtags found in the corpus.\n", scalar(keys(%tags)));
print STDERR ("Universal POS tags: source of annotation (from README) factor is $tsource.\n");
}
#------------------------------------------------------------------------------
# Features. There is no universal rule how many features must be in every language.
# It is only sure that every language can have some features. It may be misleading
# to say that a treebank has features if at least one feature has been observed:
# Some treebanks have just NumType=Card with every NUM but nothing else (and this
# is just a consequence of how Interset works). Therefore we will distinguish several
# very coarse-grained degrees.
my $fsource = $metadata->{Features} eq 'manual native' ? 1 : $metadata->{Features} eq 'converted with corrections' ? 0.9 : $metadata->{Features} eq 'converted from manual' ? 0.8 : $metadata->{Features} eq 'automatic with corrections' ? 0.5 : 0.4;
$score{features} = $n_words_with_features==0 ? 0.01 : $n_words_with_features<$n/3 ? 0.3*$fsource : $n_words_with_features<$n/2 ? 0.5*$fsource : 1*$fsource;
if($verbose)
{
print STDERR ("Features: $n_words_with_features out of $n total words have one or more features.\n");
print STDERR ("Features: source of annotation (from README) factor is $fsource.\n");
}
#------------------------------------------------------------------------------
# Dependency relations. How many of the 37 universal relation types have been
# seen at least once? Some languages may not have use for some relations, and
# some relations may be very rare. But for comparison within one language this
# is useful. If a relation exists in the language but the corpus does not
# contain it, maybe it cannot distinguish it.
my $rsource = $metadata->{Relations} eq 'manual native' ? 1 : $metadata->{Relations} eq 'converted with corrections' ? 0.9 : $metadata->{Relations} eq 'converted from manual' ? 0.8 : 0.1;
my $maxudeprels = 37;
$score{udeprels} = (scalar(keys(%udeprels)) / $maxudeprels) * $rsource;
$score{udeprels} = 0.01 if($score{udeprels}<0.01);
if($verbose)
{
printf STDERR ("Universal relations: %d out of $maxudeprels found in the corpus.\n", scalar(keys(%udeprels)));
print STDERR ("Universal relations: source of annotation (from README) factor is $rsource.\n");
}
#------------------------------------------------------------------------------
# Udapi MarkBugs (does the content follow the guidelines?)
# Measured only if the corpus is not empty and if udapy is found at the expected place.
if($n > 0)
{
$score{udapi} = 1;
my $command = get_udapi_command($folder, $record->{lcode});
###!!! In verbose mode we should log the exact version of Udapi that we used, like we do for the tools repository.
###!!! This may be more difficult given that in CGI mode we use the copy in the curren folder instead of the git clone.
if(defined($command))
{
my $output = `$command`;
my $nbugs = 0;
my $maxwordsperbug = 10; # if there are more bugs than every n-th word, we will count maximum error rate
if($output =~ m/(\d+)/)
{
$nbugs = $1;
# Evaluate the proportion of bugs to the size of the treebank
# (with a ceiling defined by $maxwordsperbug; anything above is considered "absolutely bad").
my $nbugs1 = $nbugs>$n/$maxwordsperbug ? $n/$maxwordsperbug : $nbugs;
$score{udapi} = 1-$nbugs1/($n/$maxwordsperbug);
$score{udapi} = 0.01 if($score{udapi}<0.01);
}
if($verbose)
{
print STDERR ("Udapi:\n");
print STDERR ($output);
print STDERR ("Udapi: found $nbugs bugs.\n");
print STDERR ("Udapi: worst expected case (threshold) is one bug per $maxwordsperbug words. There are $n words.\n");
}
}
elsif($verbose)
{
print STDERR ("WARNING: Udapi not found. The content-based tests were not performed.\n");
print STDERR (" In order to get a full evaluation, you need to:\n");
print STDERR (" 1. Put this script in the current folder:\n");
print STDERR (" https://github.com/UniversalDependencies/docs-automation/blob/master/valdan/udapi-markbugs.sh\n");
print STDERR (" 2. Install Python version of Udapi.\n");
print STDERR (" 3. Copy udapi-python and pythonlib to the current folder, as indicated in udapi-markbugs.sh.\n");
}
}
else
{
$score{udapi} = 0;
}
#------------------------------------------------------------------------------
# Genres. Idea: an attempt at a balance of many genres provides for a more
# versatile dataset. Of course this is just an approximation. We cannot verify
# how well the authors described the genres in their corpus and how much they
# managed to make it balanced. We look only for the listed, "officially known"
# genres. (Sometimes there are typos in the READMEs and besides "news", people
# also use "new" or "newswire"; this is undesirable.)
my @official_genres = ('academic', 'bible', 'blog', 'email', 'fiction', 'grammar-examples', 'learner-essays', 'legal', 'medical', 'news', 'nonfiction', 'poetry', 'reviews', 'social', 'spoken', 'web', 'wiki');
my @genres = grep {my $g = $_; scalar(grep {$_ eq $g} (@official_genres));} (split(/\s+/, $metadata->{Genre}));
my $ngenres = scalar(@genres);
$ngenres = 1 if($ngenres<1);
$score{genres} = $ngenres / scalar(@official_genres);
if($verbose)
{
printf STDERR ("Genres: found %d out of %d known.\n", $ngenres, scalar(@official_genres));
}
#------------------------------------------------------------------------------
# Evaluate availability. If the most frequent form is '_', we infer that the
# corpus does not contain the underlying text (which is done for copyright reasons;
# the user must obtain the underlying text elsewhere and merge it with UD annotation).
my @forms = sort {$forms{$b} <=> $forms{$a}} (keys(%forms));
# At the same time, such corpora should be also labeled in the README metadata
# item "Includes text".
my $availability = $metadata->{'Includes text'} !~ m/^yes$/i || scalar(@forms) < 1 || $forms[0] eq '_' ? 0.1 : 1;
if($verbose)
{
if($metadata->{'Includes text'} !~ m/^yes$/i)
{
print STDERR ("Availability: README does not say Includes text: yes\n");
}
if(scalar(@forms) < 1)
{
print STDERR ("Availability: No words found.\n");
}
elsif($forms[0] eq '_')
{
print STDERR ("Availability: '_' is the most frequent form.\n");
}
}
#------------------------------------------------------------------------------
# Evaluate validity. Formally invalid data should get a score close to zero.
my $folder_success = 1;
foreach my $file (@{$record->{files}})
{
my $command = get_validator_command($folder, $file, $record->{lcode});
if(defined($command))
{
system("echo $command");
my $result = saferun("$command 2>&1");
$folder_success = $folder_success && $result;
}
elsif($verbose)
{
print STDERR ("WARNING: Validator not found. We will assume that all files are valid.\n");
last;
}
}
my $validity = $folder_success ? 1 : 0.01;
if($verbose)
{
print STDERR ("Validity: $validity\n");
}
#------------------------------------------------------------------------------
# Score of empty treebanks should be zero regardless of the other features.
my $score = 0;
if($n > 1)
{
my %weights =
(
'size' => 10,
'split' => 2,
'lemmas' => 3,
'tags' => 3,
'features' => 3,
'udeprels' => 3,
'udapi' => 12,
'genres' => 3
);
my @dimensions = sort(keys(%weights));
my $wsum = 0;
foreach my $d (@dimensions)
{
$wsum += $weights{$d};
}
foreach my $d (@dimensions)
{
my $nweight = $weights{$d} / $wsum;
$score += $nweight * $score{$d};
if($verbose)
{
my $wscore = $nweight * $score{$d};
print STDERR ("(weight=$nweight) * (score{$d}=$score{$d}) = $wscore\n");
}
}
# The availability and validity dimensions are show stoppers. Instead of weighted combination, we multiply the score by them.
if($verbose)
{
print STDERR ("(TOTAL score=$score) * (availability=$availability) * (validity=$validity) = ");
}
$score *= $availability * $validity;
if($verbose)
{
print STDERR ("$score\n");
}
}
my $stars = sprintf("%d", $score*10+0.5)/2;
if($verbose)
{
print STDERR ("STARS = $stars\n");
}
print("$folder\t$score\t$stars\n");
# When we are done we must switch the repository back from master to the dev branch.
# (Provided we actually switched it to master. And ignoring the possibility that there is a third branch or that it already was in master when we came.)
if($forcemaster)
{
###!!! At present we ignore the fact that multiple CGI processes may attempt to
###!!! fiddle with the repository at the same time! When that happens, the output
###!!! will be wrong!
system("cd $folder ; (git checkout dev 1>&2) ; cd ..");
}
#------------------------------------------------------------------------------
# Figures out whether the UD validator is available and how to invoke it.
#------------------------------------------------------------------------------
sub get_validator_command
{
my $folder = shift;
my $file = shift;
my $lcode = shift;
my $command;
# If evaluation runs under a CGI script on quest, validation must be called through an envelope script.
if(-x './validate.sh')
{
$command = "./validate.sh --lang $lcode --max-err=10 $folder/$file";
}
else
{
my $validate = which('validate.py');
if($validate)
{
$command = "$validate --lang $lcode --max-err=10 $folder/$file";
}
}
return $command;
}
#------------------------------------------------------------------------------
# Figures out whether Udapi is available and how to invoke it.
#------------------------------------------------------------------------------
sub get_udapi_command
{
my $folder = shift;
my $lcode = shift;
my $command;
# If evaluation runs under a CGI script on quest, Udapi must be called through an envelope script.
if(-x './udapi-markbugs.sh')
{
$command = "(cat $folder/*.conllu | ./udapi-markbugs.sh 2>&1) | grep TOTAL";
}
elsif(which('udapy'))
{
# If Udapi knows the language (given as the zone of each sentence),
# it can customize selected tests for the language.
my $read_zone = 'read.Conllu';
if(defined($lcode))
{
$read_zone = "read.Conllu zone=$lcode";
}
$command = "(cat $folder/*.conllu | udapy $read_zone ud.MarkBugs 2>&1) | grep TOTAL";
}
return $command;
}
#------------------------------------------------------------------------------
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# This function comes from Dan's library dzsys. I do not want to depend on that
# library here, so I am copying the function. I have also modified it so that
# it does not throw exceptions.
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#
# Calls an external program. Uses system(). In addition, echoes the command
# line to the standard error output, and returns true/false according to
# whether the call was successful and the external program returned 0 (success)
# or non-zero (error).
#
# Typically called as follows:
# saferun($command) or die;
#------------------------------------------------------------------------------
sub saferun
{
my $command = join(' ', @_);
#my $ted = cas::ted()->{datumcas};
#print STDERR ("[$ted] Executing: $command\n");
system($command);
# The external program does not exist, is not executable or the execution failed for other reasons.
if($?==-1)
{
print STDERR ("ERROR: Failed to execute: $command\n $!\n");
return;
}
# We were able to start the external program but its execution failed.
elsif($? & 127)
{
printf STDERR ("ERROR: Execution of: $command\n died with signal %d, %s coredump\n",
($? & 127), ($? & 128) ? 'with' : 'without');
return;
}
# The external program ended "successfully" (this still does not guarantee
# that the external program returned zero!)
else
{
my $exitcode = $? >> 8;
print STDERR ("Exit code: $exitcode\n") if($exitcode);
# Return false if the program returned a non-zero value.
# It is up to the caller how they will handle the return value.
# (The easiest is to always write:
# saferun($command) or die;
# )
return ! $exitcode;
}
}