-
Notifications
You must be signed in to change notification settings - Fork 52
/
Copy pathparlamint2distro.pl
executable file
·577 lines (538 loc) · 24.2 KB
/
parlamint2distro.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
#!/usr/bin/env perl
# Make ParlaMint corpora ready for distribution:
# 1. Finalize input corpora (version, date, handle, extent)
# 2. Validate corpora
# 3. Produce derived formats
# For help on parameters do
# $ parlamint2distro.pl -h
#
use warnings;
use utf8;
use open ':utf8';
use FindBin qw($Bin);
use File::Temp qw/ tempfile tempdir /; #creation of tmp files and directory
# Scratch space: a per-run temporary directory is created under tmp/ next to
# this script; CLEANUP => 1 asks File::Temp to remove it again.
my $tempdirroot = "$Bin/tmp";
-d $tempdirroot or mkdir($tempdirroot);
my $tmpDir = tempdir(CLEANUP => 1, DIR => $tempdirroot);
# The three standard streams are read and written as UTF-8
binmode($_, ':utf8') foreach (\*STDIN, \*STDOUT, \*STDERR);
# Print the command line synopsis, the meaning of every parameter and
# process flag, and an example invocation to STDERR.
sub usage {
    my @helpText = (
        "Usage:\n",
        "$0 -help\n",
        "$0 [<procFlags>] -codes '<Codes>' -version <Version> -teihandle <TeiHandle> -anahandle <AnaHandle>",
        " -schema [<Schema>] -docs [<Docs>] -in <Input> -out <Output>\n",
        " Prepares ParlaMint corpora for distribution.\n",
        " <Codes> is the list of country codes of the corpora to be processed.\n",
        " <Schema> is the directory where ParlaMint RNG schemas are.\n",
        " <Docs> is the directory where ParlaMint README files are.\n",
        " <TeiHandle> is the handle of the plain text corpus.\n",
        " <AnaHandle> is the handle of the linguistically annotated (.ana) corpus.\n",
        " <Input> is the directory where ParlaMint-XX.TEI/ and ParlaMint-XX.TEI.ana/ are.\n",
        " <Output> is the directory where output directories are written.\n",
        " <procFlags> are process flags that set which operations are carried out:\n",
        " * -ana: finalizes the TEI.ana directory\n",
        " * -tei: finalizes the TEI directory (needs TEI.ana output)\n",
        " * -sample: produces samples (from TEI.ana and TEI output)\n",
        " * -valid: validates TEI, TEI.ana and samples\n",
        " * -vert: produces vertical files (from TEI.ana output)\n",
        " * -txt: produces plain text files with metadata files (from TEI output)\n",
        " * -conll: produces conllu files with metadata files (from TEI.ana output)\n",
        " * -all: do all of the above.\n",
        " The flags can be also negated, e.g. \"-all -novalid\".\n",
        " Example: \n",
        " ./parlamint2distro.pl -all -novalid -codes 'BE ES' \\\n",
        " -schema ../Schema -docs My/Docs/ -in Originals/ -out Final/ \\\n",
        " 2> ParlaMint.ana.log\n",
    );
    return print STDERR @helpText;
}
use Getopt::Long;
use FindBin qw($Bin);
use File::Spec;
use File::Copy;
use File::Copy::Recursive qw(dircopy);
# Process flags. -all is a plain switch; every stage flag is tri-state:
#   2 = not mentioned on the command line, 1 = given (-ana), 0 = negated (-noana).
# A stage runs with -all unless it was negated, and without -all only when
# it was given explicitly (see the stage conditions in the main loop).
my $procAll = 0;
my $procAna = 2;
my $procTei = 2;
my $procSample = 2;
my $procValid = 2;
my $procTxt = 2;
my $procConll = 2;
my $procVert = 2;

# GetOptions returns false when it meets an unknown or malformed option
# (it has then already written a message to STDERR). Stop right there instead
# of starting a long conversion with settings the user did not intend.
GetOptions
    (
     'help' => \$help,
     'codes=s' => \$countryCodes,
     'schema=s' => \$schemaDir,
     'docs=s' => \$docsDir,
     'version=s' => \$Version,
     'teihandle=s'=> \$handleTEI,
     'anahandle=s'=> \$handleAna,
     'in=s' => \$inDir,
     'out=s' => \$outDir,
     'all' => \$procAll,
     'ana!' => \$procAna,
     'tei!' => \$procTei,
     'sample!' => \$procSample,
     'valid!' => \$procValid,
     'txt!' => \$procTxt,
     'conll!' => \$procConll,
     'vert!' => \$procVert,
    ) or do {
        usage();
        exit 1;
    };

if ($help) {
    usage();
    exit;
}
# Turn every directory that was given on the command line into an absolute path
foreach my $dirRef (\$schemaDir, \$docsDir, \$inDir, \$outDir) {
    $$dirRef = File::Spec->rel2abs($$dirRef) if $$dirRef;
}
#Execution
#$Parallel = "parallel --gnu --halt 2 --jobs 15";
my $saxonJar = "$Bin/bin/saxon.jar";
$Saxon = "java -jar $saxonJar";
# Problem with Out of heap space with TR, NL, GB for ana
$SaxonX = "java -Xmx240g -jar $saxonJar";
# State of the logger subroutine, which reports how long parts of the
# processing take: corpus code, start time and message of the running part
my $logger = {
    message => undef,
    time    => undef,
    code    => '',
};
# We substitute the local taxonomy with common one,
# reduced to the relevant languages, if the language exists in the taxonomy
# We assume the location of taxonomies is relative to the Scripts/ (i.e. $Bin/) directory
$taxonomyDir = "$Bin/../Build/Taxonomies";
# Each common taxonomy is stored in $taxonomyDir as <name>.xml
foreach my $taxonomyName (qw(
    ParlaMint-taxonomy-parla.legislature
    ParlaMint-taxonomy-politicalOrientation
    ParlaMint-taxonomy-speaker_types
    ParlaMint-taxonomy-subcorpus
    ParlaMint-taxonomy-NER.ana
    ParlaMint-taxonomy-CHES
    ParlaMint-taxonomy-UD-SYN.ana
    ParlaMint-taxonomy-sentiment.ana
)) {
    $taxonomy{$taxonomyName} = "$taxonomyDir/$taxonomyName.xml";
}
# Mapping of countries to languages, we need it for mapping of common taxonomies.
# Multilingual corpora list their languages separated by ", ".
my %languagesOf = (
    'AT'    => 'de',
    'BA'    => 'bs',
    'BE'    => 'nl, fr',
    'BG'    => 'bg',
    'CZ'    => 'cs',
    'DE'    => 'de',
    'DK'    => 'da',
    'EE'    => 'et',
    'ES'    => 'es',
    'ES-AN' => 'es',
    'ES-CT' => 'ca, es',
    'ES-GA' => 'gl',
    'ES-PV' => 'eu, es',
    'FI'    => 'fi',
    'FR'    => 'fr',
    'GB'    => 'en',
    'GR'    => 'el',
    'HR'    => 'hr',
    'HU'    => 'hu',
    'IS'    => 'is',
    'IT'    => 'it',
    'LT'    => 'lt',
    'LV'    => 'lv',
    'NL'    => 'nl',
    'NO'    => 'no',
    'PL'    => 'pl',
    'PT'    => 'pt',
    'RO'    => 'ro',
    'RS'    => 'sr',
    'SE'    => 'sv',
    'SI'    => 'sl',
    'SK'    => 'sk',
    'TR'    => 'tr',
    'UA'    => 'uk, ru',
    # Fake country for testing:
    'XX'    => 'hr',
);
@country2lang{keys %languagesOf} = values %languagesOf;
# Helper stylesheets and scripts, all located in the directory of this script.
# The variables and the file names below are paired by position.
(
    $scriptRelease,
    $scriptCommon,
    $scriptTaxonomy,
    $scriptPolish,
    $scriptValid,
    $scriptSample,
    $scriptTexts,
    $scriptVerts,
    $scriptConls,
) = map { "$Bin/$_" } qw(
    parlamint2release.xsl
    parlamint-add-common-content.xsl
    parlamint-init-taxonomy.xsl
    polish-xml.pl
    validate-parlamint.pl
    corpus2sample.xsl
    parlamintp-tei2text.pl
    parlamintp-tei2vert.pl
    parlamintp2conllu.pl
);
# Name template in which XX gets replaced by the code of the processed corpus
$XX_template = "ParlaMint-XX";
# The external command being run, kept for error messages
my $cmd;
# Main driver: for every requested country code run the selected stages
# (ana, tei, sample, valid, txt, conll, vert) in this fixed order.
#
# Country codes are mandatory, and so is the output directory: without it all
# output paths below would start at "/", and the stages begin by running
# "rm -fr" and "mkdir" on them.
unless ($countryCodes) {
    print STDERR "Need some country codes.\n";
    print STDERR "For help: parlamint2distro.pl -h\n";
    exit 1;
}
die "FATAL ERROR: Need output directory\n" unless $outDir;

foreach my $countryCode (split(/[, ]+/, $countryCodes)) {
    print STDERR "INFO: *****Converting $countryCode (" . localtime(). ")\n";
    $logger->{code} = $countryCode;

    # Is this an MTed corpus? If so, $MT is the lower-case language suffix
    # of the code (e.g. 'en'), otherwise 0. $MT is a global that is also read
    # outside this loop.
    if ($countryCode =~ m/-([a-z]{2,3})$/) {$MT = $1}
    else {$MT = 0}

    my $XX = $XX_template;
    $XX =~ s|XX|$countryCode|g;

    my $teiDir = "$XX.TEI";
    my $anaDir = "$XX.TEI.ana";
    my $teiRoot = "$teiDir/$XX.xml";
    my $anaRoot = "$anaDir/$XX.ana.xml";

    # The input roots stay undefined when no input directory was given
    my $inTeiRoot = $inDir ? "$inDir/$teiRoot" : undef;
    my $inAnaRoot = $inDir ? "$inDir/$anaRoot" : undef;

    #In case input dir is for samples remove .TEI(.ana)
    if ($inTeiRoot) {
        unless (-e $inTeiRoot) {
            my $altTeiRoot = $inTeiRoot;
            $altTeiRoot =~ s/\.TEI// ;
            print STDERR "WARN: Can't find input TEI root $inTeiRoot, trying sample $altTeiRoot\n";
            if (-e $altTeiRoot) {$inTeiRoot = $altTeiRoot}
            else {print STDERR "WARN: Can't find sample TEI root $altTeiRoot\n"}
        }
    }
    if ($inAnaRoot) {
        unless (-e $inAnaRoot) {
            my $altAnaRoot = $inAnaRoot;
            $altAnaRoot =~ s/\.TEI\.ana// ;
            print STDERR "WARN: Can't find input ana root $inAnaRoot, trying sample $altAnaRoot\n";
            if (-e $altAnaRoot) {$inAnaRoot = $altAnaRoot}
            else {print STDERR "WARN: Can't find sample ana root $altAnaRoot\n"}
        }
    }

    my $outTeiDir  = "$outDir/$teiDir";
    my $outTeiRoot = "$outDir/$teiRoot";
    my $outAnaDir  = "$outDir/$anaDir";
    my $outAnaRoot = "$outDir/$anaRoot";
    my $outSmpDir  = "$outDir/$XX";
    my $outTxtDir  = "$outDir/$XX.txt";
    my $outConlDir = "$outDir/$XX.conllu";
    my $outVertDir = "$outDir/$XX.vert";

    # Location, name and extention of registry files, need $Version to compute it!
    if ($Version) {
        $regiDir = $docsDir . '/registry';
        $vertRegi = 'parlamint' . $Version . '_' . lc $countryCode;
        $vertRegi =~ s/\.//g; #e.g. 3.1 -> 31, so we will get e.g. parlamint31_at
        $vertRegi =~ s/-/_/g; #e.g. parlamint31_es-ct.regi to parlamint31_es_ct
        # Remove -en suffix as we don't have parlamint99_xx_en registry files
        $vertRegi =~ s/_$MT$// if $MT;
        $regiExt = 'regi'
    }

    # Stage ana: finalize the TEI.ana corpus
    if (($procAll and $procAna) or (!$procAll and $procAna == 1)) {
        print STDERR "INFO: ***Finalizing $countryCode TEI.ana\n";
        die "FATAL ERROR: Need version\n" unless $Version;
        die "FATAL ERROR: Can't find input ana root $inAnaRoot\n" unless -e $inAnaRoot;
        die "FATAL ERROR: No handle given for ana distribution\n" unless $handleAna;
        logger('Preparing TEI.ana corpus directory');
        # Output top level readme
        cp_readme_top($countryCode, $MT, 'ana', $handleAna, $Version, $docsDir, $outDir);
        `rm -fr $outAnaDir; mkdir $outAnaDir`;
        if ($MT) {$inReadme = "$docsDir/README-$MT.TEI.ana.txt"}
        else {$inReadme = "$docsDir/README.TEI.ana.txt"}
        cp_readme($countryCode, $handleAna, $Version, $inReadme, "$outAnaDir/00README.txt");
        cp_schema($schemaDir, $outAnaDir);
        my $tmpOutDir = "$tmpDir/release.ana";
        my $tmpAnaRoot = "$tmpOutDir/$anaRoot";
        print STDERR "INFO: ***Fixing TEI.ana corpus for release\n";
        logger('Fixing TEI.ana corpus for release');
        $cmd = "$SaxonX outDir=$tmpOutDir -xsl:$scriptRelease $inAnaRoot";
        `$cmd`;
        die "FATAL ERROR: $cmd exited with $?\n" if $?;
        print STDERR "INFO: ***Adding common content to TEI.ana corpus\n";
        logger('Adding common content to TEI.ana corpus');
        $cmd = "$SaxonX version=$Version handle-ana=$handleAna anaDir=$outAnaDir outDir=$outDir -xsl:$scriptCommon $tmpAnaRoot";
        `$cmd`;
        die "FATAL ERROR: $cmd exited with $?\n" if $?;
        commonTaxonomies($countryCode, $outAnaDir);
        logger('Polishing TEI.ana corpus');
        polish($outAnaDir);
        logger();
    }

    # Stage tei: finalize the TEI corpus (the stylesheets are passed the TEI.ana output directory)
    if (($procAll and $procTei) or (!$procAll and $procTei == 1)) {
        print STDERR "INFO: ***Finalizing $countryCode TEI\n";
        die "FATAL ERROR: Need version\n" unless $Version;
        die "FATAL ERROR: Can't find input tei root $inTeiRoot\n" unless -e $inTeiRoot;
        die "FATAL ERROR: No handle given for TEI distribution\n" unless $handleTEI;
        logger('Preparing TEI corpus directory');
        # Output top level readme
        cp_readme_top($countryCode, $MT, 'tei', $handleTEI, $Version, $docsDir, $outDir);
        `rm -fr $outTeiDir; mkdir $outTeiDir`;
        if ($MT) {$inReadme = "$docsDir/README-$MT.TEI.txt"}
        else {$inReadme = "$docsDir/README.TEI.txt"}
        cp_readme($countryCode, $handleTEI, $Version, $inReadme, "$outTeiDir/00README.txt");
        cp_schema($schemaDir, $outTeiDir);
        my $tmpOutDir = "$tmpDir/release.tei";
        my $tmpTeiRoot = "$tmpOutDir/$teiRoot";
        print STDERR "INFO: ***Fixing TEI corpus for release\n";
        logger('Fixing TEI corpus for release');
        $cmd = "$SaxonX anaDir=$outAnaDir outDir=$tmpOutDir -xsl:$scriptRelease $inTeiRoot";
        `$cmd`;
        die "FATAL ERROR: $cmd exited with $?\n" if $?;
        print STDERR "INFO: ***Adding common content to TEI corpus\n";
        logger('Adding common content to TEI corpus');
        $cmd = "$SaxonX version=$Version handle-txt=$handleTEI anaDir=$outAnaDir outDir=$outDir -xsl:$scriptCommon $tmpTeiRoot";
        `$cmd`;
        die "FATAL ERROR: $cmd exited with $?\n" if $?;
        commonTaxonomies($countryCode, $outTeiDir);
        logger('Polishing TEI corpus');
        polish($outTeiDir);
        logger();
    }

    # Stage sample: make samples, with their derived files, from the TEI and TEI.ana output
    if (($procAll and $procSample) or (!$procAll and $procSample == 1)) {
        print STDERR "INFO: ***Making $countryCode samples\n";
        logger('Making samples');
        `rm -fr $outSmpDir; mkdir $outSmpDir`;
        if (-e $outTeiRoot) {
            `$Saxon outDir=$outSmpDir -xsl:$scriptSample $outTeiRoot`;
            `$scriptTexts $outSmpDir $outSmpDir`;
        }
        else {print STDERR "WARN: No TEI files for $countryCode samples (needed root file is $outTeiRoot)\n"}
        if (-e $outAnaRoot) {
            `$Saxon outDir=$outSmpDir -xsl:$scriptSample $outAnaRoot`;
            #Make also derived files
            # The text files were already made above if the TEI output exists.
            # The test has to be on the file: the path itself is always a true string.
            `$scriptTexts $outSmpDir $outSmpDir` unless -e $outTeiRoot;
            `$scriptVerts $outSmpDir $outSmpDir`;
            if (-e "$regiDir/$vertRegi") {`cp $regiDir/$vertRegi $outSmpDir/$vertRegi.$regiExt`}
            else {print STDERR "WARN: registry file $vertRegi not found\n"}
            `$scriptConls $outSmpDir $outSmpDir`;
        }
        else {print STDERR "ERROR: No .ana files for $countryCode samples (needed root file is $outAnaRoot)\n"}
        #For some reason both ParlaMint-XX_YYY-MM-DD-meta-en.tsv and ParlaMint-XX_YYY-MM-DD.ana-meta-en.tsv
        #are present in Sample directory, remove the .ana variant:
        `rm -f $outSmpDir/*.ana-meta-en.tsv`;
        commonTaxonomies($countryCode, $outSmpDir);
        # Output top level readme but not for $MTed version, as it would overwrite the original
        # The Sample readme does not have handle or version, as the sample can change irrespective of them
        cp_readme_top($countryCode, '', 'sample', '', '', $docsDir, $outSmpDir)
            unless $MT;
        polish($outSmpDir);
        dirify($outSmpDir);
    }

    # Stage valid: validate whichever of the sample, TEI and TEI.ana outputs exist
    if (($procAll and $procValid) or (!$procAll and $procValid == 1)) {
        print STDERR "INFO: ***Validating $countryCode TEI\n";
        logger('Validating TEI');
        die "FATAL ERROR: Can't find schema directory\n" unless $schemaDir and -e $schemaDir;
        `$scriptValid $schemaDir $outSmpDir` if -e $outSmpDir;
        `$scriptValid $schemaDir $outTeiDir` if -e $outTeiDir;
        `$scriptValid $schemaDir $outAnaDir` if -e $outAnaDir;
    }

    # Stage txt: plain text files, from the TEI output or, failing that, the TEI.ana output
    if (($procAll and $procTxt) or (!$procAll and $procTxt == 1)) {
        print STDERR "INFO: ***Making $countryCode text\n";
        logger('Making text');
        # We have an opportunistic handle, could be $handleTEI or $handleAna, depending on which one exists
        if ($handleTEI) {$handleTxt = $handleTEI}
        elsif ($handleAna) {$handleTxt = $handleAna}
        else {die "FATAL ERROR: No handle given for TEI or .ana distribution\n"}
        `rm -fr $outTxtDir; mkdir $outTxtDir`;
        if ($MT) {$inReadme = "$docsDir/README-$MT.text.txt"}
        else {$inReadme = "$docsDir/README.text.txt"}
        cp_readme($countryCode, $handleTxt, $Version, $inReadme, "$outTxtDir/00README.txt");
        if (-e $outTeiDir) {`$scriptTexts $outTeiDir $outTxtDir`}
        elsif (-e $outAnaDir) {`$scriptTexts $outAnaDir $outTxtDir`}
        else {die "FATAL ERROR: Neither $outTeiDir nor $outAnaDir exits\n"}
        dirify($outTxtDir);
    }

    # Stage conll: CoNLL-U files from the TEI.ana output
    if (($procAll and $procConll) or (!$procAll and $procConll == 1)) {
        print STDERR "INFO: ***Making $countryCode CoNLL-U\n";
        logger('Making CoNLL-U');
        die "FATAL ERROR: Can't find input ana dir $outAnaDir\n" unless -e $outAnaDir;
        die "FATAL ERROR: No handle given for ana distribution\n" unless $handleAna;
        `rm -fr $outConlDir; mkdir $outConlDir`;
        if ($MT) {$inReadme = "$docsDir/README-$MT.conll.txt"}
        else {$inReadme = "$docsDir/README.conll.txt"}
        cp_readme($countryCode, $handleAna, $Version, $inReadme, "$outConlDir/00README.txt");
        `$scriptConls $outAnaDir $outConlDir`;
        dirify($outConlDir);
    }

    # Stage vert: vertical files, plus the registry file, from the TEI.ana output
    if (($procAll and $procVert) or (!$procAll and $procVert == 1)) {
        print STDERR "INFO: ***Making $countryCode vert\n";
        logger('Making vert');
        die "FATAL ERROR: Can't find input ana dir $outAnaDir\n" unless -e $outAnaDir;
        die "FATAL ERROR: No handle given for ana distribution\n" unless $handleAna;
        `rm -fr $outVertDir; mkdir $outVertDir`;
        if ($MT) {$inReadme = "$docsDir/README-$MT.vert.txt"}
        else {$inReadme = "$docsDir/README.vert.txt"}
        cp_readme($countryCode, $handleAna, $Version, $inReadme, "$outVertDir/00README.txt");
        if (-e "$regiDir/$vertRegi") {`cp $regiDir/$vertRegi $outVertDir/$vertRegi.$regiExt`}
        else {print STDERR "WARN: registry file $vertRegi not found\n"}
        `$scriptVerts $outAnaDir $outVertDir`;
        dirify($outVertDir);
    }

    logger();
    print STDERR "INFO: ***Finished processing $countryCode corpus.\n";
}
# Substitute local with common taxonomies & reduce languages to en + corpus one(s)
#   $Country: country code of the corpus, possibly with the language suffix of an MTed corpus
#   $outDir:  directory into which the taxonomy files are written
# Every common taxonomy in %taxonomy is run through $scriptTaxonomy and the
# result is stored as $outDir/<taxonomy>.xml. The .ana taxonomies are left
# out when $outDir is a plain (non .ana) TEI directory.
# Dies if the country code has no entry in %country2lang; returns 1 otherwise.
sub commonTaxonomies {
    my $Country = shift;
    my $outDir = shift;
    # If this is an MTed corpus then fix Country to be without language suffix.
    # The main loop recognises suffixes of 2 or 3 lower-case letters as MT,
    # so the same pattern has to be stripped here.
    $Country =~ s/-[a-z]{2,3}$//;
    foreach my $taxonomy (sort keys %taxonomy) {
        # Skip linguistic annotation taxonomies for plain TEI output
        next if $taxonomy =~ /\.ana/ and $outDir !~ /\.ana/ and $outDir =~ /\.TEI/;
        unless (-e $taxonomy{$taxonomy}) {
            print STDERR "ERROR: Can't find common taxonomy $taxonomy at $taxonomy{$taxonomy}\n";
            next;
        }
        unless (exists($country2lang{$Country})) {
            die "FATAL ERROR: Can't find mapping between country code and language: ".
                "pls. add \$country2lang{'$Country'} to parlamint2distro.pl!\n"
        }
        my $Language = $country2lang{$Country};
        $Language =~ s/, .+//; #For multilingual corpora take the first language as main language
        my $command = "$Saxon if-lang-missing=skip langs='$Language' -xsl:$scriptTaxonomy";
        `$command $taxonomy{$taxonomy} > $outDir/$taxonomy.xml`;
    }
    return 1;
}
#Format XML file to be a bit nicer & smaller
# Runs $scriptPolish over every *.xml file in $dir and in its immediate
# subdirectories. The script reads the file on STDIN and its output goes to
# <file>.tmp, which then replaces the file.
# The replacement only happens when the script succeeded: a failed run leaves
# an empty or partial .tmp file, which must not overwrite the corpus file.
sub polish {
    my $dir = shift;
    foreach my $file (glob("$dir/*.xml $dir/*/*.xml")) {
        my $tmpFile = "$file.tmp";
        `$scriptPolish < $file > $tmpFile`;
        if ($?) {
            print STDERR "ERROR: $scriptPolish failed on $file (exit status $?), keeping the unpolished file\n";
            unlink($tmpFile);
            next;
        }
        rename($tmpFile, $file)
            or print STDERR "ERROR: Can't replace $file with its polished version: $!\n";
    }
}
#If a directory has more than $MAX files, store them in year directories
# The year is the four digits that follow "ParlaMint-<something>_" in the
# name of the file; entries without such a name stay where they are.
# Only the file name is inspected, never the path leading to it: otherwise a
# parent directory called e.g. ParlaMint-release_2023 would send every entry,
# the README included, into a 2023 directory.
sub dirify {
    my $MAX = 1; #In ParlaMint II we always put them in year directories
    my $inDir = shift;
    my @files = glob("$inDir/*");
    if (scalar @files > $MAX) {
        foreach my $file (@files) {
            my ($name) = $file =~ m|([^/]+)$|;
            next unless defined $name;
            if (my ($year) = $name =~ m|ParlaMint-.+?_(\d\d\d\d)|) {
                my $newDir = "$inDir/$year";
                unless (-d $newDir or mkdir($newDir)) {
                    print STDERR "ERROR: Can't create year directory $newDir: $!\n";
                    next;
                }
                move($file, $newDir)
                    or print STDERR "ERROR: Can't move $file to $newDir: $!\n";
            }
        }
    }
}
#Read in the appropriate top level $inFile README, modify it and output it $outFile
#   $country: country code of the corpus, for MTed corpora with the language suffix
#   $mt:      language suffix of an MTed corpus (e.g. 'en'), false otherwise
#   $type:    'sample', 'tei' or 'ana'
#   $handle:  handle of the distribution (not needed for samples)
#   $version: version of the distribution (not needed for samples)
#   $inDir:   directory with README.md/README-<country>.md
#   $outDir:  directory where the new README is written
# The output is README.md for samples, README-<country>.md for tei and
# README-<country>.ana.md for ana. Dies on missing parameters, on files that
# cannot be opened or written and on unexpected README lines.
sub cp_readme_top {
    my $country = shift;
    my $mt = shift;
    my $type = shift;
    my $handle = shift;
    my $version = shift;
    my $inDir = shift;
    my $outDir = shift;
    my $countryName; # Country name obtained from existing README
    my $countryCode; # Country code obtained from existing README
    die "FATAL ERROR: No country for cp_readme_top\n" unless $country;
    die "FATAL ERROR: No handle for cp_readme_top\n" unless $handle or $type eq 'sample';
    die "FATAL ERROR: No version for cp_readme_top\n" unless $version or $type eq 'sample';

    my $inFile = "$inDir/README.md/README-$country.md";
    # Need to remove e.g. '-en' from input readme, as we don't have such input files.
    # Only the end of the file name is touched, so that an '-en' somewhere in
    # the path of $inDir stays as it is.
    $inFile =~ s|-\Q$mt\E\.md$|.md| if $mt;

    # Construct output filename: in sample it is just README.md, other types add on a suffix
    my $outFile = "$outDir/README";
    $outFile .= "-" . $country if $type eq 'ana' or $type eq 'tei';
    $outFile .= ".ana" if $type eq 'ana';
    $outFile .= ".md";

    open(my $in, '<:utf8', $inFile) or die "FATAL ERROR: Can't open input top README $inFile\n";
    open(my $out, '>:utf8', $outFile) or die "FATAL ERROR: Can't open output top README $outFile\n";
    # Output depends on $type, $mt, and $country:
    # sample: # Samples of the ParlaMint-XX corpus
    # en-smp: # Samples of the ParlaMint-XX corpus (translation to English)
    # TEI: # Corpus of parliamentary debates ParlaMint-XX
    # ana: # Linguistically annotated corpus of parliamentary debates ParlaMint-XX.ana
    # en-TEI: # Corpus of parliamentary debates, ParlaMint-XX-en (translation to English)
    # en-ana: # Linguistically annotated corpus of parliamentary debates ParlaMint-XX-en.ana (translation to English)
    while (my $line = <$in>) {
        if ($line =~ m|^# Samples|) {
            # Title line: take the country code from it and write the title for $type
            ($countryCode) = $line =~ m|-([A-Z]{2}(-[A-Z]{2})?) |
                or die "FATAL ERROR: Bad line in README.md file: $line";
            die "FATAL ERROR: Bad code $countryCode (!= $country) in $inFile\n" unless $country =~ /\Q$countryCode\E/;
            if ($type =~ /sample/i) {print {$out} "# Samples of the ParlaMint-$countryCode corpus"}
            elsif ($type =~ /tei/i) {print {$out} "# Corpus of parliamentary debates ParlaMint-$countryCode"}
            elsif ($type =~ /ana/i) {print {$out} "# Linguistically annotated corpus of parliamentary debates ParlaMint-$countryCode"}
            else {die "FATAL ERROR: Strange type $type for cp_readme_top\n"}
            # The parameter decides, not the global $MT of the main loop
            if ($mt) {print {$out} "-en (translation to English)"}
            print {$out} "\n";
        }
        elsif ($line =~ m|- +Country| or $line =~ m|- +Autonomous region|) {
            if ($line =~ / [A-Z]{2}-[A-Z]{2}/) {print {$out} "- Autonomous region: "}
            elsif ($line =~ / [A-Z]{2}/) {print {$out} "- Country: "}
            else {die "FATAL ERROR: Strange country code for cp_readme_top in $line"}
            unless (($countryName) = $line =~ /\((.+)\)/) {
                die "FATAL ERROR: Strange country name for cp_readme_top in $line"
            }
            print {$out} "$countryCode ($countryName)\n";
        }
        elsif ($line =~ m|- +Language|) {
            if ($mt) {$line =~ s/(Languages:) /$1 en (English) from /}
            print {$out} $line;
            # Version and handle follow the language line, except in samples
            unless ($type eq 'sample') {
                print {$out} "- Version: $version\n";
                print {$out} "- Handle: [$handle]($handle)\n";
            }
        }
        else {print {$out} $line}
    }
    close($in);
    # Buffered write errors only show up when the file is closed
    close($out) or die "FATAL ERROR: Can't write output top README $outFile\n";
}
#Copy the README template $inFile to $outFile, replacing every XX with the
#country code, every YY with the handle and every ZZ with the version.
#Dies if one of the three values is missing or a file cannot be opened.
sub cp_readme {
    my ($country, $handle, $version, $inFile, $outFile) = @_;
    $country or die "FATAL ERROR: No country for cp_readme\n";
    $handle  or die "FATAL ERROR: No handle for cp_readme\n";
    $version or die "FATAL ERROR: No version for cp_readme\n";
    open(my $template, '<:utf8', $inFile)
        or die "FATAL ERROR: Can't open input README $inFile\n";
    open(my $readme, '>:utf8', $outFile)
        or die "FATAL ERROR: Can't open output README $outFile\n";
    while (defined(my $line = <$template>)) {
        # The order matters only if a replacement value itself contains a placeholder
        $line =~ s/XX/$country/g;
        $line =~ s/YY/$handle/g;
        $line =~ s/ZZ/$version/g;
        print {$readme} $line;
    }
    close($template);
    close($readme);
}
#Copy the schema directory $schemaDir to $outDir/Schema and remove from the
#copy the files that are not wanted there (.git*, nohup.*, *.log, Makefile).
#Dies if $schemaDir is not given or does not exist.
sub cp_schema {
    my $schemaDir = shift;
    my $outDir = shift;
    # Do not preserve symlinks when copying (for links in Schema/)
    $File::Copy::Recursive::CopyLink = 0;
    die "FATAL ERROR: Can't find schema directory\n"
        unless $schemaDir and -e $schemaDir;
    # dircopy returns false on failure; report it instead of silently
    # producing a distribution without (or with an incomplete) Schema/
    dircopy($schemaDir, "$outDir/Schema")
        or print STDERR "ERROR: Can't copy schema directory $schemaDir to $outDir/Schema: $!\n";
    # Remove unwanted files
    `rm -fr $outDir/Schema/.git*`;
    `rm -f $outDir/Schema/nohup.*`;
    `rm -f $outDir/Schema/*.log`;
    `rm -f $outDir/Schema/Makefile`;
}
# Timing log on top of the file-level $logger record.
# If a part of the processing is being timed, report it as DONE together with
# the seconds it took and forget it. If a $message is given, report it as
# START and begin timing it. Called without a message it only closes the
# running part.
sub logger {
    my $message = shift;
    my $now = time();
    my $startedAt = $logger->{time};
    if ($startedAt && $logger->{message}) {
        my $elapsed = $now - $startedAt;
        logger_print($logger->{code}, $now, "DONE", $logger->{message}, $elapsed);
        $logger->{message} = undef;
        $logger->{time} = undef;
    }
    if ($message) {
        logger_print($logger->{code}, $now, "START", $message);
        $logger->{message} = $message;
        $logger->{time} = $now;
    }
}
# Write one log line to STDERR:
#   INFO: <code> (<local time of $time>) ### <status>[(<duration> s)]: <message>
# The duration part is left out when no duration is given.
sub logger_print {
    my ($countryCode, $time, $status, $message, $duration) = @_;
    my $stamp = scalar(localtime($time));
    my $took = defined($duration) ? "($duration s)" : "";
    print STDERR "INFO: $countryCode (", $stamp, ") ### $status", $took, ": $message", "\n";
}