2323import shutil
2424import glob
2525import signal
26+ import re
2627
2728from lxml import etree
2829
@@ -273,6 +274,49 @@ def init_alpino_lookup(configfile, inputdir, outputdir):
273274 inputfiles .append (str (inputfile ))
274275
275276
def copy_modify_metadata(outputdir: str, source_filename: str, total_filename: str) -> None:
    """Create the metadata file for a merged CSV from one document's metadata.

    Reads ``.<source_filename>.METADATA`` in ``outputdir`` and writes
    ``.<total_filename>.METADATA`` in the same directory. In the copied text
    "Document statistics" becomes "Corpus statistics", and every occurrence of
    the source file's base name (``source_filename`` with its trailing
    ``.<something>.csv`` removed) becomes "TOTAL".

    Args:
        outputdir: Directory holding both metadata files.
        source_filename: Name of the per-document CSV whose metadata is copied.
        total_filename: Name of the merged CSV the new metadata belongs to.
    """
    source_path = os.path.join(outputdir, f".{source_filename}.METADATA")
    target_path = os.path.join(outputdir, f".{total_filename}.METADATA")
    # Strip the last two dotted components ('.<something>.csv') from the name.
    document_name = re.sub(r'\.[^\.]+\.csv$', '', source_filename)

    # The source is opened first so a missing source never creates the target.
    with open(source_path, 'r') as source_file, open(target_path, 'w') as target_file:
        text = source_file.read()
        text = text.replace("Document statistics", "Corpus statistics")
        target_file.write(text.replace(document_name, "TOTAL"))
284+
285+
def merge_output(outputdir: str, type: str) -> None:
    """Merge every ``*.<type>.csv`` file in ``outputdir`` into ``total.<type>.csv``.

    The merged file is (re)created inside ``outputdir``. It starts with the
    header line of the first non-empty source file, followed by all data rows
    of every source file; the header lines of the remaining files are dropped.
    The metadata file for the merged CSV is derived from the first source file
    via ``copy_modify_metadata``.

    Args:
        outputdir: Directory containing the per-document CSV files; the merged
            file is written here as well.
        type: Middle component of the CSV file names (e.g. "words"). The name
            shadows the builtin but is kept so existing callers keep working.
    """
    total_filename = f"total.{type}.csv"
    total_path = os.path.join(outputdir, total_filename)

    # Escape the directory so glob metacharacters in its name are taken
    # literally, and sort so the row order of the merged file is reproducible.
    pattern = os.path.join(glob.escape(outputdir), f"*.{type}.csv")
    source_paths = sorted(
        path for path in glob.glob(pattern)
        # ignore a lingering total file from an earlier run
        if os.path.basename(path) != total_filename
    )

    # Opening with 'w' truncates a stale total file in outputdir itself
    # (not in the current working directory).
    with open(total_path, 'w') as target_file:
        header_written = False
        for source_path in source_paths:
            if not header_written:
                # Metadata is taken from the file that supplies the header.
                copy_modify_metadata(
                    outputdir, os.path.basename(source_path), total_filename)

            with open(source_path, 'r') as source_file:
                for line_number, line in enumerate(source_file):
                    if line_number == 0:
                        if header_written:
                            # column names are already in the merged file
                            continue
                        header_written = True
                    if not line.endswith("\n"):
                        # keep the last row of this file separate from the
                        # first row of the next one
                        line += "\n"
                    target_file.write(line)
318+
319+
276320def sigterm_handler ():
277321 #collect output
278322 clam .common .status .write (statusfile , "Postprocessing after forceful abortion" , 90 ) # status update
@@ -282,19 +326,10 @@ def sigterm_handler():
282326 #tscan writes CSV file in input directory, move:
283327 os .system ("mv -f " + inputdir + "/*.csv " + outputdir )
284328
285- os .system ("cat " + outputdir + "/*.words.csv | head -n 1 > " + outputdir + "/total.word.csv" )
286- os .system ("cat " + outputdir + "/*.paragraphs.csv | head -n 1 > " + outputdir + "/total.par.csv" )
287- os .system ("cat " + outputdir + "/*.sentences.csv | head -n 1 > " + outputdir + "/total.sen.csv" )
288- os .system ("cat " + outputdir + "/*.document.csv | head -n 1 > " + outputdir + "/total.doc.csv" )
289-
290- for f in glob .glob (outputdir + "/*.words.csv" ):
291- os .system ("sed 1d " + f + " >> " + outputdir + "/total.word.csv" )
292- for f in glob .glob (outputdir + "/*.paragraphs.csv" ):
293- os .system ("sed 1d " + f + " >> " + outputdir + "/total.par.csv" )
294- for f in glob .glob (outputdir + "/*.sentences.csv" ):
295- os .system ("sed 1d " + f + " >> " + outputdir + "/total.sen.csv" )
296- for f in glob .glob (outputdir + "/*.document.csv" ):
297- os .system ("sed 1d " + f + " >> " + outputdir + "/total.doc.csv" )
329+ merge_output (outputdir , "words" )
330+ merge_output (outputdir , "paragraphs" )
331+ merge_output (outputdir , "sentences" )
332+ merge_output (outputdir , "document" )
298333 sys .exit (5 )
299334
300335signal .signal (signal .SIGTERM , sigterm_handler )
@@ -345,20 +380,10 @@ def sigterm_handler():
# tscan writes its CSV files into the input directory; move them to the output
# directory. This is done in Python rather than through a shell command so
# that directory names containing spaces or shell metacharacters are safe.
for csv_path in glob.glob(os.path.join(glob.escape(inputdir), "*.csv")):
    # An explicit destination file name replaces an existing file, like `mv -f`.
    shutil.move(csv_path, os.path.join(outputdir, os.path.basename(csv_path)))

# Merge the per-document results into one file per level ('total.<type>.csv')
for level in ("words", "paragraphs", "sentences", "document"):
    merge_output(outputdir, level)
362387
363388# Merge all the Alpino output
364389if 'alpinoOutput' in clamdata and clamdata ['alpinoOutput' ] != 'no' :
0 commit comments