Skip to content

Commit

Permalink
Merge pull request #873 from clarin-eric/devel
Browse files Browse the repository at this point in the history
Devel-main
  • Loading branch information
matyaskopp authored May 22, 2024
2 parents 6eee33b + 9d89157 commit c2e0684
Show file tree
Hide file tree
Showing 990 changed files with 345,894 additions and 236,712 deletions.
2 changes: 2 additions & 0 deletions Build/Distro/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Directory holding the machine-translated distribution that is pulled in.
# Defaults to the original location; override on the command line, e.g.
#   make get-cz DISTRO_MT_SRC=/other/path/Distro-MT
DISTRO_MT_SRC ?= /home/kopp/ParlaMint-4.1-distro/Distro-MT

# get-cz is a command, not a file: run it even if a file called `get-cz`
# happens to exist in this directory.
.PHONY: get-cz

# Recursively copy everything matched by $(DISTRO_MT_SRC)/* into the current
# directory (the shell glob does not match dot-files at the top level).
get-cz:
	rsync -rv $(DISTRO_MT_SRC)/* .
662 changes: 341 additions & 321 deletions Build/Distro/ParlaMint-en.ana.xml

Large diffs are not rendered by default.

407 changes: 205 additions & 202 deletions Build/Distro/ParlaMint.ana.xml

Large diffs are not rendered by default.

67 changes: 35 additions & 32 deletions Build/Distro/ParlaMint.xml
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,9 @@
<meeting ana="#parla.uni-PT #parla.term-PT #L.XIV-PT"
xml:lang="pt"
corresp="#ParlaMint-PT">XIV Legislatura</meeting>
<meeting ana="#parla.uni-PT #parla.term-PT #L.XV-PT"
xml:lang="pt"
corresp="#ParlaMint-PT">XV Legislatura</meeting>
<meeting n="4"
corresp="#ParlaMint-RS"
ana="#parla.uni-RS #parla.term-RS #NS.4-RS"
Expand Down Expand Up @@ -974,7 +977,7 @@
</editionStmt>
<extent>
<measure unit="corpora" quantity="29">29 corpora</measure>
<measure unit="speeches" quantity="7995766">7,995,766 speeches</measure>
<measure unit="speeches" quantity="8073406">8,073,406 speeches</measure>
<measure unit="speeches" quantity="231759" corresp="#ParlaMint-AT">231,759 speeches</measure>
<measure unit="speeches" quantity="126252" corresp="#ParlaMint-BA">126,252 speeches</measure>
<measure xmlns:xi="http://www.w3.org/2001/XInclude"
Expand Down Expand Up @@ -1007,13 +1010,13 @@
unit="speeches"
quantity="228326"
corresp="#ParlaMint-PL">228,326 speeches</measure>
<measure unit="speeches" quantity="170937" corresp="#ParlaMint-PT">170,937 speeches</measure>
<measure unit="speeches" quantity="248577" corresp="#ParlaMint-PT">248,577 speeches</measure>
<measure unit="speeches" quantity="316069" corresp="#ParlaMint-RS">316,069 speeches</measure>
<measure unit="speeches" quantity="84662" corresp="#ParlaMint-SE">84,662 speeches</measure>
<measure unit="speeches" quantity="311354" corresp="#ParlaMint-SI">311,354 speeches</measure>
<measure unit="speeches" quantity="681052" corresp="#ParlaMint-TR">681,052 speeches</measure>
<measure unit="speeches" quantity="429437" corresp="#ParlaMint-UA">429437 speeches</measure>
<measure unit="words" quantity="1225187375">1,225,187,375 words</measure>
<measure unit="speeches" quantity="429437" corresp="#ParlaMint-UA">429,437 speeches</measure>
<measure unit="words" quantity="1231036093">1,231,036,093 words</measure>
<measure unit="words" quantity="60839145" corresp="#ParlaMint-AT">60,839,145 words</measure>
<measure unit="words" quantity="18307498" corresp="#ParlaMint-BA">18,307,498 words</measure>
<measure xmlns:xi="http://www.w3.org/2001/XInclude"
Expand All @@ -1022,7 +1025,7 @@
corresp="#ParlaMint-BE">44,372,160 words</measure>
<measure unit="words" quantity="26471533" corresp="#ParlaMint-BG">26,471,533 words</measure>
<measure unit="words" quantity="30774250" corresp="#ParlaMint-CZ">30,774,250 words</measure>
<measure unit="words" quantity="40797597" corresp="#ParlaMint-DK">40,797,597 words</measure>
<measure unit="words" quantity="40801196" corresp="#ParlaMint-DK">40,801,196 words</measure>
<measure unit="words" quantity="22874712" corresp="#ParlaMint-EE">22,874,712 words</measure>
<measure unit="words" quantity="15947446" corresp="#ParlaMint-ES-CT">15,947,446 words</measure>
<measure unit="words" quantity="17837709" corresp="#ParlaMint-ES-GA">17,837,709 words</measure>
Expand All @@ -1046,12 +1049,12 @@
unit="words"
quantity="36064778"
corresp="#ParlaMint-PL">36,064,778 words</measure>
<measure unit="words" quantity="17646820" corresp="#ParlaMint-PT">17,646,820 words</measure>
<measure unit="words" quantity="23491939" corresp="#ParlaMint-PT">23,491,939 words</measure>
<measure unit="words" quantity="84568976" corresp="#ParlaMint-RS">84,568,976 words</measure>
<measure unit="words" quantity="28980081" corresp="#ParlaMint-SE">28,980,081 words</measure>
<measure unit="words" quantity="69921953" corresp="#ParlaMint-SI">69,921,953 words</measure>
<measure unit="words" quantity="49255262" corresp="#ParlaMint-TR">49,255,262 words</measure>
<measure unit="words" quantity="41997790" corresp="#ParlaMint-UA">41997790 words</measure>
<measure unit="words" quantity="41997790" corresp="#ParlaMint-UA">41,997,790 words</measure>
</extent>
<publicationStmt>
<publisher>
Expand All @@ -1063,7 +1066,7 @@
<licence>http://creativecommons.org/licenses/by/4.0/</licence>
<p>This work is licensed under the <ref target="http://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</ref>.</p>
</availability>
<date when="2024-03-06">March 6, 2024</date>
<date when="2024-05-21">May 21, 2024</date>
</publicationStmt>
<sourceDesc>
<listBibl corresp="#ParlaMint-AT">
Expand Down Expand Up @@ -1319,7 +1322,7 @@
<title>Minutes of the Assembleia da República of Portugal</title>
<publisher xml:lang="pt">Assembleia da República</publisher>
<idno type="URI" subtype="parliament">https://debates.parlamento.pt/catalogo/r3/dar</idno>
<date from="2015-01-07" to="2022-03-22">07.01.2015 - 22.03.2022</date>
<date from="2015-01-07" to="2024-03-26">07.01.2015 - 26.03.2024</date>
</bibl>
</listBibl>
<listBibl corresp="#ParlaMint-RS">
Expand Down Expand Up @@ -1532,20 +1535,20 @@
</editorialDecl>
<tagsDecl>
<namespace name="http://www.tei-c.org/ns/1.0">
<tagUsage gi="body" occurs="46007"/>
<tagUsage gi="desc" occurs="1620489"/>
<tagUsage gi="div" occurs="179644"/>
<tagUsage gi="body" occurs="46204"/>
<tagUsage gi="desc" occurs="1650244"/>
<tagUsage gi="div" occurs="180038"/>
<tagUsage gi="gap" occurs="120870"/>
<tagUsage gi="head" occurs="146981"/>
<tagUsage gi="head" occurs="147383"/>
<tagUsage gi="incident" occurs="79527"/>
<tagUsage gi="kinesic" occurs="936156"/>
<tagUsage gi="note" occurs="8167107"/>
<tagUsage gi="kinesic" occurs="954808"/>
<tagUsage gi="note" occurs="8175909"/>
<tagUsage gi="pb" occurs="160325"/>
<tagUsage gi="seg" occurs="20119140"/>
<tagUsage gi="text" occurs="46007"/>
<tagUsage gi="seg" occurs="20289537"/>
<tagUsage gi="text" occurs="46204"/>
<tagUsage gi="time" occurs="464496"/>
<tagUsage gi="u" occurs="7995766"/>
<tagUsage gi="vocal" occurs="468774"/>
<tagUsage gi="u" occurs="8073406"/>
<tagUsage gi="vocal" occurs="479877"/>
<tagUsage gi="body" occurs="1221" corresp="#ParlaMint-AT"/>
<tagUsage gi="body" occurs="743" corresp="#ParlaMint-BA"/>
<tagUsage xmlns:xi="http://www.w3.org/2001/XInclude"
Expand Down Expand Up @@ -1578,7 +1581,7 @@
gi="body"
occurs="686"
corresp="#ParlaMint-PL"/>
<tagUsage gi="body" occurs="704" corresp="#ParlaMint-PT"/>
<tagUsage gi="body" occurs="901" corresp="#ParlaMint-PT"/>
<tagUsage gi="body" occurs="2060" corresp="#ParlaMint-RS"/>
<tagUsage gi="body" occurs="938" corresp="#ParlaMint-SE"/>
<tagUsage gi="body" occurs="1572" corresp="#ParlaMint-SI"/>
Expand Down Expand Up @@ -1609,7 +1612,7 @@
gi="desc"
occurs="250002"
corresp="#ParlaMint-PL"/>
<tagUsage gi="desc" occurs="62704" corresp="#ParlaMint-PT"/>
<tagUsage gi="desc" occurs="92459" corresp="#ParlaMint-PT"/>
<tagUsage gi="desc" occurs="5989" corresp="#ParlaMint-RS"/>
<tagUsage gi="desc" occurs="9656" corresp="#ParlaMint-SE"/>
<tagUsage gi="desc" occurs="42322" corresp="#ParlaMint-SI"/>
Expand Down Expand Up @@ -1647,7 +1650,7 @@
gi="div"
occurs="686"
corresp="#ParlaMint-PL"/>
<tagUsage gi="div" occurs="1408" corresp="#ParlaMint-PT"/>
<tagUsage gi="div" occurs="1802" corresp="#ParlaMint-PT"/>
<tagUsage gi="div" occurs="2060" corresp="#ParlaMint-RS"/>
<tagUsage gi="div" occurs="15843" corresp="#ParlaMint-SE"/>
<tagUsage gi="div" occurs="1572" corresp="#ParlaMint-SI"/>
Expand Down Expand Up @@ -1689,7 +1692,7 @@
gi="head"
occurs="686"
corresp="#ParlaMint-PL"/>
<tagUsage gi="head" occurs="1430" corresp="#ParlaMint-PT"/>
<tagUsage gi="head" occurs="1832" corresp="#ParlaMint-PT"/>
<tagUsage gi="head" occurs="15819" corresp="#ParlaMint-SE"/>
<tagUsage gi="head" occurs="4706" corresp="#ParlaMint-SI"/>
<tagUsage gi="incident" occurs="1864" corresp="#ParlaMint-BA"/>
Expand Down Expand Up @@ -1743,7 +1746,7 @@
gi="kinesic"
occurs="94217"
corresp="#ParlaMint-PL"/>
<tagUsage gi="kinesic" occurs="42606" corresp="#ParlaMint-PT"/>
<tagUsage gi="kinesic" occurs="61258" corresp="#ParlaMint-PT"/>
<tagUsage gi="kinesic" occurs="2390" corresp="#ParlaMint-RS"/>
<tagUsage gi="kinesic" occurs="9656" corresp="#ParlaMint-SE"/>
<tagUsage gi="kinesic" occurs="903" corresp="#ParlaMint-SI"/>
Expand Down Expand Up @@ -1780,7 +1783,7 @@
gi="note"
occurs="241406"
corresp="#ParlaMint-PL"/>
<tagUsage gi="note" occurs="34745" corresp="#ParlaMint-PT"/>
<tagUsage gi="note" occurs="43547" corresp="#ParlaMint-PT"/>
<tagUsage gi="note" occurs="318697" corresp="#ParlaMint-RS"/>
<tagUsage gi="note" occurs="370551" corresp="#ParlaMint-SE"/>
<tagUsage gi="note" occurs="392734" corresp="#ParlaMint-SI"/>
Expand Down Expand Up @@ -1822,7 +1825,7 @@
gi="seg"
occurs="1097269"
corresp="#ParlaMint-PL"/>
<tagUsage gi="seg" occurs="458643" corresp="#ParlaMint-PT"/>
<tagUsage gi="seg" occurs="629040" corresp="#ParlaMint-PT"/>
<tagUsage gi="seg" occurs="318781" corresp="#ParlaMint-RS"/>
<tagUsage gi="seg" occurs="535639" corresp="#ParlaMint-SE"/>
<tagUsage gi="seg" occurs="1009004" corresp="#ParlaMint-SI"/>
Expand Down Expand Up @@ -1860,7 +1863,7 @@
gi="text"
occurs="686"
corresp="#ParlaMint-PL"/>
<tagUsage gi="text" occurs="704" corresp="#ParlaMint-PT"/>
<tagUsage gi="text" occurs="901" corresp="#ParlaMint-PT"/>
<tagUsage gi="text" occurs="2060" corresp="#ParlaMint-RS"/>
<tagUsage gi="text" occurs="938" corresp="#ParlaMint-SE"/>
<tagUsage gi="text" occurs="1572" corresp="#ParlaMint-SI"/>
Expand Down Expand Up @@ -1901,7 +1904,7 @@
gi="u"
occurs="228326"
corresp="#ParlaMint-PL"/>
<tagUsage gi="u" occurs="170937" corresp="#ParlaMint-PT"/>
<tagUsage gi="u" occurs="248577" corresp="#ParlaMint-PT"/>
<tagUsage gi="u" occurs="316069" corresp="#ParlaMint-RS"/>
<tagUsage gi="u" occurs="84662" corresp="#ParlaMint-SE"/>
<tagUsage gi="u" occurs="311354" corresp="#ParlaMint-SI"/>
Expand Down Expand Up @@ -1932,7 +1935,7 @@
gi="vocal"
occurs="106762"
corresp="#ParlaMint-PL"/>
<tagUsage gi="vocal" occurs="20098" corresp="#ParlaMint-PT"/>
<tagUsage gi="vocal" occurs="31201" corresp="#ParlaMint-PT"/>
<tagUsage gi="vocal" occurs="1710" corresp="#ParlaMint-RS"/>
<tagUsage gi="vocal" occurs="2725" corresp="#ParlaMint-SI"/>
<tagUsage gi="vocal" occurs="9655" corresp="#ParlaMint-TR"/>
Expand Down Expand Up @@ -2051,7 +2054,7 @@
<name type="address">Kossuth tér 1-3.</name>
<name type="city">Budapest</name>
<name type="country" key="HU">Hungary</name>
<date from="2014-05-06" to="2023-07-31">2014.05.06 - 2023.07.31.</date>
<date from="2014-05-06" to="2023-07-31"/>
</setting>
<setting corresp="#ParlaMint-IS">
<name type="address">Kirkjustræti</name>
Expand Down Expand Up @@ -2096,7 +2099,7 @@
<name type="address">Alameda da Universidade, 1600-214 Lisboa, Portugal</name>
<name type="city">Lisboa</name>
<name type="country" key="PT">Portugal</name>
<date from="2015-01-07" to="2022-03-22">07.01.2015 - 22.03.2022</date>
<date from="2015-01-07" to="2024-03-26">07.01.2015 - 26.03.2024</date>
</setting>
<setting corresp="#ParlaMint-RS">
<name type="address">Trg Nikole Pašića 13</name>
Expand Down Expand Up @@ -2447,7 +2450,7 @@
</langUsage>
</profileDesc>
<revisionDesc>
<change when="2024-03-06">
<change when="2024-05-21">
<name>Tomaž Erjavec</name>: Generate root</change>
</revisionDesc>
</teiHeader>
Expand Down
61 changes: 31 additions & 30 deletions Build/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,9 @@

### COMPLETE SET OF CORPORA
CORPORA=AT BA BE BG CZ DK EE ES ES-CT ES-GA ES-PV FI FR GB GR HR HU IS IT LV NL NO PL PT RS SE SI TR UA
#CORPORA=AT HU BA BE CZ EE ES-CT ES-GA ES-PV / DK
CORPORA=SI BA
#CORPORA=
# Used in targets that run only for one corpus
#CORPUS=
CORPUS=

# Absolute paths are needed; otherwise there are problems with XSLT
PARLAMINT := $(shell realpath .. | tr -d '\n')# get real absolute path to ParlaMint directory
Expand Down Expand Up @@ -68,9 +67,9 @@ source-metadata:
test:
date
test-tei2:
${FINALIZE} -valid -codes UA -in ${HERE}/Distro -out ${HERE}/Distro
${FINALIZE} -valid -codes TR -in ${HERE}/Distro -out ${HERE}/Distro
test-tei1:
${FINALIZE} -tei -valid -codes DK -in ${HERE}/Distro -out ${HERE}/Distro
${FINALIZE} -tei -codes BG -in ${HERE}/Sources-TEI -out ${HERE}/Distro
test-vert6:
../Scripts/parlamintp-tei2vert-xx.pl ${HERE}/Distro/ParlaMint-LV.TEI.ana Test/ParlaMint-LV-xx.vert
test-vert5:
Expand Down Expand Up @@ -150,13 +149,10 @@ mt-samples:
done;
#Merge original and MTed samples into official Samples directory
cp-samples:
for CORPUS in ${CORPORA}; do \
Scripts/cp-samples.pl Distro/ParlaMint-$${CORPUS} ../Samples; \
done;
Scripts/cp-samples.pl 'Distro/ParlaMint-*' ../Samples
#cp Logs/ParlaMint-$${CORPUS}-samples.log ../Samples/ParlaMint-$${CORPUS}; \


# Make vertical fine with en metadata, a hack:
# Make vertical file with en metadata, a hack:
XX-CORPORA = AT-xx BA-xx BE-xx BG-xx CZ-xx DK-xx EE-xx ES-xx ES-CT-xx ES-GA-xx ES-PV-xx FI-xx FR-xx GB-xx GR-xx HR-xx HU-xx IS-xx IT-xx LV-xx NL-xx NO-xx PL-xx PT-xx RS-xx SE-xx SI-xx TR-xx UA-xx
# Test: make make-verts-xx CORPORA='LV ES-CT'
make-verts-xx-nohup:
Expand Down Expand Up @@ -188,12 +184,12 @@ source-roots:
$s base=${HERE}/Sources-TEI type=TEI.ana -xsl:../Scripts/parlamint2root.xsl \
../Scripts/ParlaMint-rootTemplate.xml > ${HERE}/Sources-TEI/ParlaMint.ana.xml
master-roots:
$s base=${HERE}/Distro -xsl:../Scripts/parlamint2root.xsl \
../Scripts/ParlaMint-template.xml > ${HERE}/Distro/ParlaMint.xml
$s base=${HERE}/Distro -xsl:../Scripts/parlamint2root.xsl \
../Scripts/ParlaMint-template.ana.xml > ${HERE}/Distro/ParlaMint.ana.xml
$s base=${HERE}/Distro -xsl:../Scripts/parlamint2root.xsl \
../Scripts/ParlaMint-template-en.ana.xml > ${HERE}/Distro/ParlaMint-en.ana.xml
$s base=${HERE}/Distro type=TEI -xsl:../Scripts/parlamint2root.xsl \
../Scripts/ParlaMint-rootTemplate.xml > ${HERE}/Distro/ParlaMint.xml
$s base=${HERE}/Distro type=TEI.ana -xsl:../Scripts/parlamint2root.xsl \
../Scripts/ParlaMint-rootTemplate.xml > ${HERE}/Distro/ParlaMint.ana.xml
$s base=${HERE}/Distro type=en.TEI.ana -xsl:../Scripts/parlamint2root.xsl \
../Scripts/ParlaMint-rootTemplate.xml > ${HERE}/Distro/ParlaMint-en.ana.xml

mt-logs:
for CORPUS in ${CORPORA}; do \
Expand Down Expand Up @@ -252,19 +248,19 @@ FINALIZE-MT=perl ../Scripts/parlamint2distro.pl -version ${VERSION} -anahandle $

# Targets
mt-nohup1:
nice nohup time make mt-all-final > Logs/ParlaMint-en.log &
nice nohup time make mt-all > Logs/ParlaMint-en.log &
mt-nohup2:
nice nohup time make mt-all-final > Logs/ParlaMint-en.2.log &
nice nohup time make mt-all > Logs/ParlaMint-en.2.log &
mt-nohup3:
nice nohup time make mt-all-final > Logs/ParlaMint-en.3.log &
nice nohup time make mt-all > Logs/ParlaMint-en.3.log &
mt-nohup4:
nice nohup time make mt-all-final > Logs/ParlaMint-en.4.log &
nice nohup time make mt-all > Logs/ParlaMint-en.4.log &
mt-nohup5:
nice nohup time make mt-all-final > Logs/ParlaMint-en.5.log &
nice nohup time make mt-all > Logs/ParlaMint-en.5.log &
mt-nohup6:
nice nohup time make mt-all-final > Logs/ParlaMint-en.6.log &
nice nohup time make mt-all > Logs/ParlaMint-en.6.log &

mt-all-final: mt-convert
mt-all: mt-convert
mt-xall-final: mt-convert mt-verts mt-pack mt-web

# Make MT .txt and CoNLL files
Expand All @@ -278,9 +274,9 @@ mt-web:
nohup-mt-pack:
nohup time make mt-pack > mt-pack.log &
mt-pack:
perl ../Scripts/pack-parlamint.pl -codes '${CORPORA}-en' -in Distro -out Packed
rsync -av Packed/*-en*.tgz ${WEB}/Repo
cp Packed/*-en*.tgz /project/clarin-upload/ParlaMint
for CORPUS in ${CORPORA}; do \
perl Scripts/pack-parlamint.pl -codes $${CORPUS}-en -in Distro -out Packed ; \
done;

# Make MT CoNLL-U files only
mt-nohup-conll:
Expand All @@ -305,7 +301,7 @@ mt-make-verts:

# Join verts only
mt-verts:
perl ../Scripts/join-all-verts.pl -codes '${CORPORA}' -in 'Distro' -out Verts/ParlaMint-XX.${VERSION}.vert
#perl ../Scripts/join-all-verts.pl -codes '${CORPORA}' -in 'Distro' -out Verts/ParlaMint-XX.${VERSION}.vert
perl ../Scripts/join-all-verts.pl -en -codes '${CORPORA}' -in 'Distro' -out Verts/ParlaMint-XX-en.${VERSION}.vert

# Sanity check for alignment
Expand Down Expand Up @@ -362,7 +358,7 @@ mt-test7:
$s meta=${HERE}/Distro/ParlaMint-AT-en.TEI.ana/ParlaMint-AT-en.ana.xml -xsl:../Scripts//check-links.xsl \
${HERE}/Distro/ParlaMint-AT-en.TEI.ana/2022/ParlaMint-AT-en_2022-01-20-027-XXVII-NRSITZ-00139.ana.xml
mt-test6:
${FINALIZE-MT} -vert -codes AT-en -in ${TEMP} -out ${HERE}/Distro
${FINALIZE-MT} -sample -codes ES-CT-en -out ${HERE}/Distro
mt-test5:
${vta} Test/ParlaMint-AT-en.TEI.ana/ParlaMint-taxonomy-*.xml
${vlp} Test/ParlaMint-AT-en.TEI.ana/ParlaMint-AT-listPerson.xml
Expand Down Expand Up @@ -405,7 +401,8 @@ merge-taxos:
done;
${vta} Taxonomies/ParlaMint-taxonomy-*.xml

### Some idea, need to think about it!
### Some ideas, need to think about them...

#REGIS=at ba be bg cz dk es_ct fr gb gr hr hu is it lv nl no pl pt rs se si tr ua
REGIS=ua
QUERY=https://dev:[email protected]/noske-beta/parlamint.cgi/wordlist?
Expand All @@ -417,9 +414,13 @@ body:
done

###################### SCRIPT VARIABLES
##$JAVA-MEMORY## Set the maximum Java heap size in GB
JAVA-MEMORY=240
JM := $(shell test -n "$(JAVA-MEMORY)" && echo -n "-Xmx$(JAVA-MEMORY)g")

P = parallel --citation --gnu --halt 2
#Run java with a large heap, as a complete corpus needs to be read in
s = java -jar -Xmx240g ../Scripts/bin/saxon.jar
s = java -jar $(JM) ../Scripts/bin/saxon.jar
j = java -jar ../Scripts/bin/jing.jar

pc = -I % $s -xi -xsl:../Scripts/copy.xsl % | $j parla-clarin.rng
Expand Down
Loading

0 comments on commit c2e0684

Please sign in to comment.