-
Notifications
You must be signed in to change notification settings - Fork 52
/
Copy pathcorpus2sample.xsl
executable file
·255 lines (232 loc) · 9.02 KB
/
corpus2sample.xsl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
<?xml version="1.0"?>
<!-- Take root corpus file and output sample in $outDir directory -->
<!-- Script retains roughly $Files evenly spaced component files (all of them if the corpus has fewer than 2 * $Files), and the first and last $Range utterances in each of them -->
<xsl:stylesheet
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xi="http://www.w3.org/2001/XInclude"
xmlns:tei="http://www.tei-c.org/ns/1.0"
xmlns="http://www.tei-c.org/ns/1.0"
xmlns:et="http://nl.ijs.si/et"
exclude-result-prefixes="#all"
version="2.0">
<xsl:import href="parlamint-lib.xsl"/>
<!-- Output directory for samples; used as the path prefix of every
     xsl:result-document written by this stylesheet. No default is given. -->
<xsl:param name="outDir"/>
<!-- Revision responsible person; the name is written into the "Made sample"
     change element produced by the revisionSample template -->
<xsl:param name="revRespPers">Tomaž Erjavec</xsl:param>
<!-- How many TEI component files to take (approximately); if the corpus root
     has fewer than 2 * $Files components, all of them are taken -->
<xsl:param name="Files">3</xsl:param>
<!-- How many utterances to select from start and end of component files -->
<xsl:param name="Range">2</xsl:param>
<!-- Location of the GitHub project containing the output files.
     NOTE(review): in this file it is referenced only inside the commented-out
     publicationStmt/sourceDesc templates. -->
<xsl:param name="GitHub-project">https://github.com/clarin-eric/ParlaMint</xsl:param>
<!-- Current date formatted as YYYY-MM-DD.
     NOTE(review): not referenced in this file; the templates below use
     $today-iso, which is not defined here (presumably it comes from the
     imported parlamint-lib.xsl; confirm). -->
<xsl:variable name="today" select="format-date(current-date(), '[Y0001]-[M01]-[D01]')"/>
<!-- Select the XInclude components of the corpus root that go into the sample.
     The variable holds copies of the chosen xi:include elements.
     * If the root has fewer than 2 * $Files components, all of them are kept.
     * Otherwise every $step-th component is kept, with
       $step = floor($n div $Files), which gives roughly $Files components.
     Each selected component is reported with an INFO message. -->
<xsl:variable name="components">
  <xsl:variable name="n" select="count(/tei:teiCorpus/xi:include)"/>
  <xsl:choose>
    <!-- When too few files -->
    <!-- The comparison operator must be escaped as &lt; inside an attribute value -->
    <xsl:when test="$n &lt; 2 * $Files">
      <xsl:message select="concat('INFO: from ', $n , ' files selecting all of them: ')"/>
      <xsl:for-each select="/tei:teiCorpus/xi:include">
        <xsl:message select="concat('INFO: selecting component file ', @href)"/>
        <xsl:copy-of select="."/>
      </xsl:for-each>
    </xsl:when>
    <xsl:otherwise>
      <!-- Distance between two selected components -->
      <xsl:variable name="step" select="floor($n div $Files)"/>
      <xsl:message select="concat('INFO: from ', $n , ' component files selecting ~', $Files, ' files:')"/>
      <xsl:for-each select="/tei:teiCorpus/xi:include">
        <!-- True for positions $step, 2 * $step, 3 * $step, ... -->
        <xsl:if test="(position() - 1) mod $step = $step - 1">
          <xsl:message select="concat('INFO: selecting component file ', @href)"/>
          <xsl:copy-of select="."/>
        </xsl:if>
      </xsl:for-each>
    </xsl:otherwise>
  </xsl:choose>
</xsl:variable>
<!-- Unnamed output definition: serialize as XML with indentation -->
<xsl:output method="xml" indent="yes"/>
<!-- Entry point. Writes the sampled corpus root into $outDir under its
     original file name, then writes one processed file for every selected
     component and for every xi:include found inside the teiHeader. -->
<xsl:template match="/">
  <!-- File name and directory of the input, both derived from its base URI -->
  <xsl:variable name="rootName" select="replace(base-uri(), '.+/([^/]+$)', '$1')"/>
  <xsl:variable name="sourceDir" select="replace(base-uri(), '/[^/]+$', '')"/>
  <!-- Sampled root file -->
  <xsl:result-document method="xml" href="{$outDir}/{$rootName}">
    <xsl:apply-templates select="node()"/>
  </xsl:result-document>
  <!-- Component file samples and files included from the header -->
  <xsl:for-each select="$components/xi:include | //tei:teiHeader//xi:include">
    <!-- Only the last path step of @href is used for the output name,
         so subdirectories of the original are flattened -->
    <xsl:variable name="leaf" select="replace(@href, '.+/', '')"/>
    <!-- The file is read relative to the directory of the corpus root -->
    <xsl:variable name="source" select="document(concat($sourceDir, '/', @href))"/>
    <xsl:result-document method="xml" href="{$outDir}/{$leaf}">
      <xsl:apply-templates select="$source" mode="component"/>
    </xsl:result-document>
  </xsl:for-each>
</xsl:template>
<!-- Document node of a component file: its children are processed by the
     default-mode templates, so no further result documents are started -->
<xsl:template match="/" mode="component">
  <xsl:apply-templates select="node()"/>
</xsl:template>
<!-- Corpus root: keep the attributes and the (processed) teiHeader, then
     emit one xi:include per selected component, carrying over only @href -->
<xsl:template match="tei:teiCorpus">
  <xsl:copy>
    <xsl:apply-templates select="@*"/>
    <xsl:apply-templates select="tei:teiHeader"/>
    <xsl:for-each select="$components/xi:include">
      <xi:include>
        <xsl:attribute name="href" select="@href"/>
      </xi:include>
    </xsl:for-each>
  </xsl:copy>
</xsl:template>
<!-- Header: process attributes and content; if the header has no
     revisionDesc child, append one that holds the "Made sample" change -->
<xsl:template match="tei:teiHeader">
  <xsl:copy>
    <xsl:apply-templates select="@*"/>
    <xsl:apply-templates select="node()"/>
    <xsl:if test="empty(tei:revisionDesc)">
      <revisionDesc>
        <xsl:call-template name="revisionSample"/>
      </revisionDesc>
    </xsl:if>
  </xsl:copy>
</xsl:template>
<!-- Main title: mark it as a sample. Each "]" (optionally preceded by
     " SAMPLE") is replaced by " SAMPLE]", so a title that is already
     marked is not marked a second time. -->
<xsl:template match="tei:titleStmt/tei:title[@type='main']">
  <xsl:variable name="marked" select="replace(., '( SAMPLE)?\]', ' SAMPLE]')"/>
  <xsl:copy>
    <xsl:apply-templates select="@*"/>
    <xsl:value-of select="$marked"/>
  </xsl:copy>
</xsl:template>
<!-- Publication date: both @when and the text content are set to $today-iso;
     the attributes and content of the original element are not carried over.
     NOTE(review): $today-iso is not defined in this file (presumably it is
     supplied by the imported parlamint-lib.xsl). -->
<xsl:template match="tei:publicationStmt/tei:date">
  <xsl:copy>
    <xsl:attribute name="when">
      <xsl:value-of select="$today-iso"/>
    </xsl:attribute>
    <xsl:value-of select="$today-iso"/>
  </xsl:copy>
</xsl:template>
<!-- This makes a "proper" sample, but is confusing for those that
take the samples as a model of how to prepare their corpora
<xsl:template match="tei:publicationStmt/tei:pubPlace"/>
<xsl:template match="tei:publicationStmt/tei:idno[@type='handle']">
<idno type="URL">
<xsl:value-of select="$GitHub-project"/>
</idno>
<pubPlace>
<ref target="{$GitHub-project}">
<xsl:value-of select="$GitHub-project"/>
</ref>
</pubPlace>
</xsl:template>
<xsl:template match="tei:sourceDesc">
<xsl:copy>
<xsl:apply-templates select="@*"/>
<bibl>
<title>Multilingual comparable corpora of parliamentary debates ParlaMint</title>
<xsl:copy-of select="ancestor::tei:teiHeader//tei:publicationStmt/tei:idno[@type='handle']"/>
</bibl>
<xsl:apply-templates/>
</xsl:copy>
</xsl:template>
-->
<!-- Size information: copied unchanged, but preceded by an XML comment
     warning that the numbers were not recomputed for the sample -->
<xsl:template match="tei:extent | tei:tagsDecl">
  <xsl:copy>
    <xsl:apply-templates select="@*"/>
    <xsl:comment select="'These numbers do not reflect the size of the sample!'"/>
    <xsl:apply-templates select="node()"/>
  </xsl:copy>
</xsl:template>
<!-- Existing revision description: the "Made sample" change is inserted
     as the first child, followed by the original content -->
<xsl:template match="tei:revisionDesc">
  <xsl:copy>
    <xsl:apply-templates select="@*"/>
    <xsl:call-template name="revisionSample"/>
    <xsl:apply-templates select="node()"/>
  </xsl:copy>
</xsl:template>
<!-- Builds the change element that records the sampling: dated $today-iso
     and attributed to $revRespPers -->
<xsl:template name="revisionSample">
  <change>
    <xsl:attribute name="when" select="$today-iso"/>
    <name>
      <xsl:value-of select="$revRespPers"/>
    </name>
    <xsl:text>: Made sample.</xsl:text>
  </change>
</xsl:template>
<!-- Here we pick the first and last $Range utterances and all
     immediately preceding and intervening other elements.
     The template computes two utterance ids and passes them to the children
     of body as parameters:
       to   = @xml:id of the last utterance of the leading part
       from = @xml:id of the first utterance of the trailing part, or 0
              (the "not set" default of the debateSection templates) when
              there is no separate trailing part -->
<xsl:template match="tei:body">
  <!-- Utterances of this body in document order; counting and positional
       selection both use this one sequence so that they always agree -->
  <xsl:variable name="us" select=".//tei:u"/>
  <xsl:variable name="all" select="count($us)"/>
  <xsl:copy>
    <xsl:apply-templates select="@*"/>
    <xsl:variable name="to">
      <xsl:choose>
        <!-- If there are too few <u>s in the document, take up to the last one -->
        <xsl:when test="$all &lt; $Range">
          <xsl:value-of select="$us[last()]/@xml:id"/>
        </xsl:when>
        <xsl:otherwise>
          <xsl:value-of select="$us[position() = $Range]/@xml:id"/>
        </xsl:otherwise>
      </xsl:choose>
    </xsl:variable>
    <xsl:variable name="from">
      <xsl:choose>
        <!-- If there are too few <u>s in the document, there is no trailing part -->
        <xsl:when test="$all &lt; 2 * $Range">0</xsl:when>
        <xsl:otherwise>
          <!-- The utterance that starts the last $Range utterances -->
          <xsl:value-of select="$us[position() = $all - ($Range - 1)]/@xml:id"/>
        </xsl:otherwise>
      </xsl:choose>
    </xsl:variable>
    <xsl:apply-templates>
      <xsl:with-param name="from" select="$from"/>
      <xsl:with-param name="to" select="$to"/>
    </xsl:apply-templates>
  </xsl:copy>
</xsl:template>
<!-- Debate section: build a sampled copy and output it only if it still
     contains at least one utterance.
     Parameters:
       from  id of the utterance that starts the trailing part ('0' = not set)
       to    id of the utterance that ends the leading part ('0' = not set)
     The children are processed twice: once with only $to (leading part) and
     once with only $from (trailing part). When the leading part contains an
     element, it is followed by a gap element marking the sampling. -->
<xsl:template match="tei:div[@type='debateSection']">
  <xsl:param name="from">0</xsl:param>
  <xsl:param name="to">0</xsl:param>
  <!--xsl:message select="concat('SELECTING ', /tei:TEI/@xml:id, ': ', $to, ' AND ', $from)"/-->
  <xsl:variable name="sampled">
    <xsl:copy>
      <xsl:apply-templates select="@*"/>
      <!-- Trailing part: children from the $from utterance onwards -->
      <xsl:variable name="tail">
        <xsl:apply-templates select="node()">
          <xsl:with-param name="from" select="$from"/>
        </xsl:apply-templates>
      </xsl:variable>
      <!-- Leading part: children up to and including the $to utterance -->
      <xsl:variable name="head">
        <xsl:apply-templates select="node()">
          <xsl:with-param name="to" select="$to"/>
        </xsl:apply-templates>
      </xsl:variable>
      <xsl:if test="$head/tei:*">
        <xsl:copy-of select="$head"/>
        <gap reason="editorial">
          <desc xml:lang="en">SAMPLING</desc>
        </gap>
      </xsl:if>
      <xsl:copy-of select="$tail"/>
    </xsl:copy>
  </xsl:variable>
  <!-- Sections left without any utterance are dropped entirely -->
  <xsl:copy-of select="$sampled[.//tei:u]"/>
</xsl:template>
<!-- Child node of a debate section: kept only if it belongs to the
     requested part of the section.
       leading part  ($from is '0'): the node itself or an element following
                     it carries the id $to
       trailing part ($to is '0'):   the node itself or an element preceding
                     it carries the id $from
     Kept elements are copied with their processed content and kept text
     nodes are copied as text; any other kind of node produces nothing. -->
<xsl:template match="tei:div[@type='debateSection']/node()">
  <xsl:param name="from">0</xsl:param>
  <xsl:param name="to">0</xsl:param>
  <xsl:variable name="wanted" select="
      ($from = '0' and (self::tei:* | following::tei:*)[@xml:id = $to]) or
      ($to = '0' and (self::tei:* | preceding::tei:*)[@xml:id = $from])"/>
  <!-- A SAMPLING gap already present in the input is never copied -->
  <xsl:variable name="oldGap" select="self::tei:gap[@reason='editorial' and ./tei:desc/text() = 'SAMPLING']"/>
  <xsl:if test="$wanted and not($oldGap)">
    <xsl:choose>
      <xsl:when test="self::tei:*">
        <xsl:copy>
          <xsl:apply-templates select="@*"/>
          <xsl:apply-templates select="node()"/>
        </xsl:copy>
      </xsl:when>
      <xsl:when test="self::text()">
        <xsl:value-of select="."/>
      </xsl:when>
    </xsl:choose>
  </xsl:if>
</xsl:template>
<!-- Fallback for TEI elements: shallow copy, then process attributes and
     child nodes through the templates -->
<xsl:template match="tei:*">
  <xsl:copy>
    <xsl:apply-templates select="@*"/>
    <xsl:apply-templates select="node()"/>
  </xsl:copy>
</xsl:template>
<!-- Attributes are carried over unchanged -->
<xsl:template match="@*">
  <xsl:copy-of select="."/>
</xsl:template>
<!-- XInclude inside the header: report it and keep the element as it is -->
<xsl:template match="xi:include[ancestor::tei:teiHeader]">
  <xsl:variable name="note" select="concat('INFO: selecting meta file ', @href)"/>
  <xsl:message select="$note"/>
  <xsl:copy-of select="."/>
</xsl:template>
</xsl:stylesheet>