Skip to content

Commit 7fcca16

Browse files
committed
Fixes for merging very large result sets
1 parent 778c3a0 commit 7fcca16

1 file changed

Lines changed: 52 additions & 27 deletions

File tree

webservice/tscanservice/tscanwrapper.py

Lines changed: 52 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import shutil
2424
import glob
2525
import signal
26+
import re
2627

2728
from lxml import etree
2829

@@ -273,6 +274,49 @@ def init_alpino_lookup(configfile, inputdir, outputdir):
273274
inputfiles.append(str(inputfile))
274275

275276

277+
def copy_modify_metadata(outputdir: str, source_filename: str, total_filename: str) -> None:
    """Derive the metadata file for a merged CSV from one source file's metadata.

    Reads ``.<source_filename>.METADATA`` from *outputdir*, rewrites its text so
    it describes the merged result, and writes it to
    ``.<total_filename>.METADATA`` in the same directory.

    Args:
        outputdir: Directory that holds both metadata files.
        source_filename: Base name of the CSV file whose metadata is copied.
        total_filename: Base name of the merged CSV file.

    Raises:
        OSError: If the source metadata cannot be read or the target cannot
            be written.
    """
    source_path = os.path.join(outputdir, f".{source_filename}.METADATA")
    target_path = os.path.join(outputdir, f".{total_filename}.METADATA")

    # Read the source completely before opening the target for writing, so a
    # failed read never leaves an empty (truncated) target file behind.
    with open(source_path, 'r') as source_file:
        content = source_file.read()

    content = content.replace("Document statistics", "Corpus statistics")

    # Drop the trailing ".<something>.csv" to get the part of the file name
    # that is replaced by "TOTAL" throughout the metadata.
    document_name = re.sub(r'\.[^\.]+\.csv$', '', source_filename)
    if document_name:
        # Guard: str.replace with an empty search string would insert "TOTAL"
        # between every character of the metadata.
        content = content.replace(document_name, "TOTAL")

    with open(target_path, 'w') as target_file:
        target_file.write(content)
284+
285+
286+
def merge_output(outputdir: str, type: str) -> None:
    """Merge every ``*.<type>.csv`` file in *outputdir* into ``total.<type>.csv``.

    The merged file starts with the complete first source file (including its
    column-name header line), followed by every other source file with its
    first line removed. The metadata of the first source file is copied and
    adapted for the merged file via ``copy_modify_metadata``.

    Files are streamed line by line, so very large result sets are never
    loaded into memory as a whole.

    Args:
        outputdir: Directory containing the CSV files; the merged file is
            written here as well.
        type: Result type as used in the file names (callers pass "words",
            "paragraphs", "sentences" and "document"). The name shadows the
            builtin but is kept so existing callers keep working.

    Raises:
        OSError: If a source file cannot be read or the merged file cannot be
            written (this includes a missing metadata file for the first
            source file).
    """
    total_filename = f"total.{type}.csv"
    total_path = os.path.join(outputdir, total_filename)

    # Escape the directory so glob metacharacters in its name are taken
    # literally. Sort, because glob returns names in arbitrary order and the
    # merged file should be reproducible. A lingering total file from an
    # earlier run is not a source file and is left out.
    pattern = os.path.join(glob.escape(outputdir), f"*.{type}.csv")
    source_filenames = sorted(
        filename
        for filename in map(os.path.basename, glob.glob(pattern))
        if filename != total_filename
    )

    # Opening with 'w' inside outputdir both creates the file when there are
    # no sources and discards the content of a lingering total file.
    with open(total_path, 'w') as target_file:
        for index, filename in enumerate(source_filenames):
            if index == 0:
                # copy and modify metadata
                copy_modify_metadata(outputdir, filename, total_filename)

            with open(os.path.join(outputdir, filename), 'r') as source_file:
                if index > 0:
                    # Only the first file contributes the column names; skip
                    # the header line of every later file.
                    next(source_file, None)
                target_file.writelines(source_file)
318+
319+
276320
def sigterm_handler():
277321
#collect output
278322
clam.common.status.write(statusfile, "Postprocessing after forceful abortion", 90) # status update
@@ -282,19 +326,10 @@ def sigterm_handler():
282326
#tscan writes CSV file in input directory, move:
283327
os.system("mv -f " + inputdir + "/*.csv " + outputdir)
284328

285-
os.system("cat " + outputdir + "/*.words.csv | head -n 1 > " + outputdir + "/total.word.csv")
286-
os.system("cat " + outputdir + "/*.paragraphs.csv | head -n 1 > " + outputdir + "/total.par.csv")
287-
os.system("cat " + outputdir + "/*.sentences.csv | head -n 1 > " + outputdir + "/total.sen.csv")
288-
os.system("cat " + outputdir + "/*.document.csv | head -n 1 > " + outputdir + "/total.doc.csv")
289-
290-
for f in glob.glob(outputdir + "/*.words.csv"):
291-
os.system("sed 1d " + f + " >> " + outputdir + "/total.word.csv")
292-
for f in glob.glob(outputdir + "/*.paragraphs.csv"):
293-
os.system("sed 1d " + f + " >> " + outputdir + "/total.par.csv")
294-
for f in glob.glob(outputdir + "/*.sentences.csv"):
295-
os.system("sed 1d " + f + " >> " + outputdir + "/total.sen.csv")
296-
for f in glob.glob(outputdir + "/*.document.csv"):
297-
os.system("sed 1d " + f + " >> " + outputdir + "/total.doc.csv")
329+
merge_output(outputdir, "words")
330+
merge_output(outputdir, "paragraphs")
331+
merge_output(outputdir, "sentences")
332+
merge_output(outputdir, "document")
298333
sys.exit(5)
299334

300335
signal.signal(signal.SIGTERM, sigterm_handler)
@@ -345,20 +380,10 @@ def sigterm_handler():
345380
os.system("mv -f " + inputdir + "/*.csv " + outputdir)
346381

347382
#write the csv headers to a file with all results ('total.<type>.csv')
348-
os.system("cat " + outputdir + "/*.words.csv | head -n 1 > " + outputdir + "/total.word.csv")
349-
os.system("cat " + outputdir + "/*.paragraphs.csv | head -n 1 > " + outputdir + "/total.par.csv")
350-
os.system("cat " + outputdir + "/*.sentences.csv | head -n 1 > " + outputdir + "/total.sen.csv")
351-
os.system("cat " + outputdir + "/*.document.csv | head -n 1 > " + outputdir + "/total.doc.csv")
352-
353-
#move the contents of the files to the total files
354-
for f in glob.glob(outputdir + "/*.words.csv"):
355-
os.system("sed 1d \"" + f + "\" >> " + outputdir + "/total.word.csv")
356-
for f in glob.glob(outputdir + "/*.paragraphs.csv"):
357-
os.system("sed 1d \"" + f + "\" >> " + outputdir + "/total.par.csv")
358-
for f in glob.glob(outputdir + "/*.sentences.csv"):
359-
os.system("sed 1d \"" + f + "\" >> " + outputdir + "/total.sen.csv")
360-
for f in glob.glob(outputdir + "/*.document.csv"):
361-
os.system("sed 1d \"" + f + "\" >> " + outputdir + "/total.doc.csv")
383+
merge_output(outputdir, "words")
384+
merge_output(outputdir, "paragraphs")
385+
merge_output(outputdir, "sentences")
386+
merge_output(outputdir, "document")
362387

363388
# Merge all the Alpino output
364389
if 'alpinoOutput' in clamdata and clamdata['alpinoOutput'] != 'no':

0 commit comments

Comments
 (0)