-
Notifications
You must be signed in to change notification settings - Fork 52
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #873 from clarin-eric/devel
Devel-main
- Loading branch information
Showing
990 changed files
with
345,894 additions
and
236,712 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
get-cz: | ||
rsync -rv /home/kopp/ParlaMint-4.1-distro/Distro-MT/* . |
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,10 +6,9 @@ | |
|
||
### COMPLETE SET OF CORPORA | ||
CORPORA=AT BA BE BG CZ DK EE ES ES-CT ES-GA ES-PV FI FR GB GR HR HU IS IT LV NL NO PL PT RS SE SI TR UA | ||
#CORPORA=AT HU BA BE CZ EE ES-CT ES-GA ES-PV / DK | ||
CORPORA=SI BA | ||
#CORPORA= | ||
# Used in targets that run only for one corpus | ||
#CORPUS= | ||
CORPUS= | ||
|
||
#Absolute paths are needed otherwise problems with XSLT | ||
PARLAMINT := $(shell realpath .. | tr -d '\n')# get real absolute path to ParlaMint directory | ||
|
@@ -68,9 +67,9 @@ source-metadata: | |
test: | ||
date | ||
test-tei2: | ||
${FINALIZE} -valid -codes UA -in ${HERE}/Distro -out ${HERE}/Distro | ||
${FINALIZE} -valid -codes TR -in ${HERE}/Distro -out ${HERE}/Distro | ||
test-tei1: | ||
${FINALIZE} -tei -valid -codes DK -in ${HERE}/Distro -out ${HERE}/Distro | ||
${FINALIZE} -tei -codes BG -in ${HERE}/Sources-TEI -out ${HERE}/Distro | ||
test-vert6: | ||
../Scripts/parlamintp-tei2vert-xx.pl ${HERE}/Distro/ParlaMint-LV.TEI.ana Test/ParlaMint-LV-xx.vert | ||
test-vert5: | ||
|
@@ -150,13 +149,10 @@ mt-samples: | |
done; | ||
#Merge original and MTed samples into official Samples directory | ||
cp-samples: | ||
for CORPUS in ${CORPORA}; do \ | ||
Scripts/cp-samples.pl Distro/ParlaMint-$${CORPUS} ../Samples; \ | ||
done; | ||
Scripts/cp-samples.pl 'Distro/ParlaMint-*' ../Samples | ||
#cp Logs/ParlaMint-$${CORPUS}-samples.log ../Samples/ParlaMint-$${CORPUS}; \ | ||
|
||
|
||
# Make vertical fine with en metadata, a hack: | ||
# Make vertical file with en metadata, a hack: | ||
XX-CORPORA = AT-xx BA-xx BE-xx BG-xx CZ-xx DK-xx EE-xx ES-xx ES-CT-xx ES-GA-xx ES-PV-xx FI-xx FR-xx GB-xx GR-xx HR-xx HU-xx IS-xx IT-xx LV-xx NL-xx NO-xx PL-xx PT-xx RS-xx SE-xx SI-xx TR-xx UA-xx | ||
# Test: make make-verts-xx CORPORA='LV ES-CT' | ||
make-verts-xx-nohup: | ||
|
@@ -188,12 +184,12 @@ source-roots: | |
$s base=${HERE}/Sources-TEI type=TEI.ana -xsl:../Scripts/parlamint2root.xsl \ | ||
../Scripts/ParlaMint-rootTemplate.xml > ${HERE}/Sources-TEI/ParlaMint.ana.xml | ||
master-roots: | ||
$s base=${HERE}/Distro -xsl:../Scripts/parlamint2root.xsl \ | ||
../Scripts/ParlaMint-template.xml > ${HERE}/Distro/ParlaMint.xml | ||
$s base=${HERE}/Distro -xsl:../Scripts/parlamint2root.xsl \ | ||
../Scripts/ParlaMint-template.ana.xml > ${HERE}/Distro/ParlaMint.ana.xml | ||
$s base=${HERE}/Distro -xsl:../Scripts/parlamint2root.xsl \ | ||
../Scripts/ParlaMint-template-en.ana.xml > ${HERE}/Distro/ParlaMint-en.ana.xml | ||
$s base=${HERE}/Distro type=TEI -xsl:../Scripts/parlamint2root.xsl \ | ||
../Scripts/ParlaMint-rootTemplate.xml > ${HERE}/Distro/ParlaMint.xml | ||
$s base=${HERE}/Distro type=TEI.ana -xsl:../Scripts/parlamint2root.xsl \ | ||
../Scripts/ParlaMint-rootTemplate.xml > ${HERE}/Distro/ParlaMint.ana.xml | ||
$s base=${HERE}/Distro type=en.TEI.ana -xsl:../Scripts/parlamint2root.xsl \ | ||
../Scripts/ParlaMint-rootTemplate.xml > ${HERE}/Distro/ParlaMint-en.ana.xml | ||
|
||
mt-logs: | ||
for CORPUS in ${CORPORA}; do \ | ||
|
@@ -252,19 +248,19 @@ FINALIZE-MT=perl ../Scripts/parlamint2distro.pl -version ${VERSION} -anahandle $ | |
|
||
# Targets | ||
mt-nohup1: | ||
nice nohup time make mt-all-final > Logs/ParlaMint-en.log & | ||
nice nohup time make mt-all > Logs/ParlaMint-en.log & | ||
mt-nohup2: | ||
nice nohup time make mt-all-final > Logs/ParlaMint-en.2.log & | ||
nice nohup time make mt-all > Logs/ParlaMint-en.2.log & | ||
mt-nohup3: | ||
nice nohup time make mt-all-final > Logs/ParlaMint-en.3.log & | ||
nice nohup time make mt-all > Logs/ParlaMint-en.3.log & | ||
mt-nohup4: | ||
nice nohup time make mt-all-final > Logs/ParlaMint-en.4.log & | ||
nice nohup time make mt-all > Logs/ParlaMint-en.4.log & | ||
mt-nohup5: | ||
nice nohup time make mt-all-final > Logs/ParlaMint-en.5.log & | ||
nice nohup time make mt-all > Logs/ParlaMint-en.5.log & | ||
mt-nohup6: | ||
nice nohup time make mt-all-final > Logs/ParlaMint-en.6.log & | ||
nice nohup time make mt-all > Logs/ParlaMint-en.6.log & | ||
|
||
mt-all-final: mt-convert | ||
mt-all: mt-convert | ||
mt-xall-final: mt-convert mt-verts mt-pack mt-web | ||
|
||
# Make MT .txt and CoNLL files | ||
|
@@ -278,9 +274,9 @@ mt-web: | |
nohup-mt-pack: | ||
nohup time make mt-pack > mt-pack.log & | ||
mt-pack: | ||
perl ../Scripts/pack-parlamint.pl -codes '${CORPORA}-en' -in Distro -out Packed | ||
rsync -av Packed/*-en*.tgz ${WEB}/Repo | ||
cp Packed/*-en*.tgz /project/clarin-upload/ParlaMint | ||
for CORPUS in ${CORPORA}; do \ | ||
perl Scripts/pack-parlamint.pl -codes $${CORPUS}-en -in Distro -out Packed ; \ | ||
done; | ||
|
||
# Make MT CoNNL-U files only | ||
mt-nohup-conll: | ||
|
@@ -305,7 +301,7 @@ mt-make-verts: | |
|
||
# Join verts only | ||
mt-verts: | ||
perl ../Scripts/join-all-verts.pl -codes '${CORPORA}' -in 'Distro' -out Verts/ParlaMint-XX.${VERSION}.vert | ||
#perl ../Scripts/join-all-verts.pl -codes '${CORPORA}' -in 'Distro' -out Verts/ParlaMint-XX.${VERSION}.vert | ||
perl ../Scripts/join-all-verts.pl -en -codes '${CORPORA}' -in 'Distro' -out Verts/ParlaMint-XX-en.${VERSION}.vert | ||
|
||
# Sanity check for alignment | ||
|
@@ -362,7 +358,7 @@ mt-test7: | |
$s meta=${HERE}/Distro/ParlaMint-AT-en.TEI.ana/ParlaMint-AT-en.ana.xml -xsl:../Scripts//check-links.xsl \ | ||
${HERE}/Distro/ParlaMint-AT-en.TEI.ana/2022/ParlaMint-AT-en_2022-01-20-027-XXVII-NRSITZ-00139.ana.xml | ||
mt-test6: | ||
${FINALIZE-MT} -vert -codes AT-en -in ${TEMP} -out ${HERE}/Distro | ||
${FINALIZE-MT} -sample -codes ES-CT-en -out ${HERE}/Distro | ||
mt-test5: | ||
${vta} Test/ParlaMint-AT-en.TEI.ana/ParlaMint-taxonomy-*.xml | ||
${vlp} Test/ParlaMint-AT-en.TEI.ana/ParlaMint-AT-listPerson.xml | ||
|
@@ -405,7 +401,8 @@ merge-taxos: | |
done; | ||
${vta} Taxonomies/ParlaMint-taxonomy-*.xml | ||
|
||
### Some idea, need to think about it! | ||
### Some ideas, need to think about them... | ||
|
||
#REGIS=at ba be bg cz dk es_ct fr gb gr hr hu is it lv nl no pl pt rs se si tr ua | ||
REGIS=ua | ||
QUERY=https://dev:[email protected]/noske-beta/parlamint.cgi/wordlist? | ||
|
@@ -417,9 +414,13 @@ body: | |
done | ||
|
||
###################### SCRIPT VARIABLES | ||
##$JAVA-MEMORY## Set a java memory maxsize in GB | ||
JAVA-MEMORY=240 | ||
JM := $(shell test -n "$(JAVA-MEMORY)" && echo -n "-Xmx$(JAVA-MEMORY)g") | ||
|
||
P = parallel --citation --gnu --halt 2 | ||
#Run java with a large heap, as a complete corpus needs to be read in | ||
s = java -jar -Xmx240g ../Scripts/bin/saxon.jar | ||
s = java -jar $(JM) ../Scripts/bin/saxon.jar | ||
j = java -jar ../Scripts/bin/jing.jar | ||
|
||
pc = -I % $s -xi -xsl:../Scripts/copy.xsl % | $j parla-clarin.rng | ||
|
Oops, something went wrong.