From ecf009854e56cb42f3ba6991a9d9d70d3fc29ba3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maty=C3=A1=C5=A1=20Kopp?= <kopp@ufal.mff.cuni.cz> Date: Mon, 20 Jan 2025 10:02:31 +0100 Subject: [PATCH] move character validation to separate file #846 --- Scripts/check-chars.pl | 40 ++++++++++++++++++++++++++++ Scripts/validate-parlamint.pl | 49 ++++++++++------------------------- 2 files changed, 53 insertions(+), 36 deletions(-) create mode 100755 Scripts/check-chars.pl diff --git a/Scripts/check-chars.pl b/Scripts/check-chars.pl new file mode 100755 index 000000000..539711baa --- /dev/null +++ b/Scripts/check-chars.pl @@ -0,0 +1,40 @@ +#!/usr/bin/env perl +use warnings; +use strict; +use utf8; +my @INFILES = glob(shift); +binmode(STDOUT,'utf8'); +binmode(STDERR,'utf8'); + +foreach my $file (@INFILES) { + chars($file); +} + +# Check if $file contains bad characters +sub chars { + my $file = shift; + my %c; + my @bad = (); + my ($fName) = $file =~ m|([^/]+)$| + or die "FATAL ERROR: Bad file '$file'\n"; + print STDERR "INFO: Char validation for $fName\n"; + open(IN, '<:utf8', $file); + undef $/; + my $txt = <IN>; + undef %c; + for my $c (split(//, $txt)) {$c{$c}++} + for my $c (sort keys %c) { + if (ord($c) == hex('00A0') or #NO-BREAK SPACE + ord($c) == hex('2011') or #NON-BREAKING HYPHEN + ord($c) == hex('00AD') or #SOFT HYPHEN + ord($c) == hex('FFFD') or #REPLACEMENT CHAR + (ord($c) >= hex('2000') and ord($c) <= hex('200A')) or #NON-STANDARD SPACES + (ord($c) >= hex('E000') and ord($c) <= hex('F8FF')) #PUA + ) { + my $message = sprintf("U+%X (%dx)", ord($c), $c{$c}); + push(@bad, $message) + } + } + print STDERR "WARN: File $fName contains bad chars: " . join('; ', @bad) . "\n" + if @bad +} \ No newline at end of file diff --git a/Scripts/validate-parlamint.pl b/Scripts/validate-parlamint.pl index f0afe9804..67a6ea8fe 100755 --- a/Scripts/validate-parlamint.pl +++ b/Scripts/validate-parlamint.pl @@ -40,6 +40,7 @@ sub usage $Compose = "$Bin/parlamint-composite-teiHeader.xsl"; $Links = "$Bin/check-links.xsl"; +$Chars = "$Bin/check-chars.pl"; $Valid = "$Bin/validate-parlamint.xsl"; $Valid_particDesc = "$Bin/validate-parlamint-particDesc.xsl"; $Includes = "$Bin/get-includes.xsl"; @@ -87,7 +88,7 @@ sub validate { my $interfix = $type; $interfix =~ s/^TEI//; print STDERR "INFO: Validating $type root $rootFile\n"; - &chars($rootFile); + &run($Chars, $rootFile); &run("$Jing $schemaDir/ParlaMint-teiCorpus$interfix.rng", $rootFile); &run("$Saxon outDir=$tmpDir -xsl:$Compose", $rootFile); &run("$Jing $schemaDir/ParlaMint.odd.rng", "$tmpDir/$fileName"); @@ -100,12 +101,12 @@ sub validate { if (-e $file) { if($file =~ m/ParlaMint-(?:[A-Z]{2}(?:-[A-Z0-9]{1,3})?(?:-[a-z]{2,3})?)?.?(taxonomy|listPerson|listOrg).*\.xml/){ print STDERR "INFO: Validating file included in teiHeader $file\n"; - &chars($file); + &run($Chars, $file); &run("$Jing $schemaDir/ParlaMint-$1.rng", $file); &run("$Saxon meta=$rootFile -xsl:$Links", $file); } else { print STDERR "INFO: Validating component $type file $file\n"; - &chars($file); + &run($Chars, $file); &run("$Jing $schemaDir/ParlaMint-TEI$interfix.rng", $file); &run("$Jing $schemaDir/ParlaMint.odd.rng", $file); &run("$Saxon -xsl:$Valid", $file); @@ -116,54 +117,30 @@ sub validate { } } -# Check if $file contains bad characters -sub chars { - my $file = shift; - my %c; - my @bad = (); - my ($fName) = $file =~ m|([^/]+)$| - or die "FATAL ERROR: Bad file '$file'\n"; - print STDERR "INFO: Char validation for $fName\n"; - open(IN, '<:utf8', $file); - undef $/; - my $txt = <IN>; - undef %c; - for $c (split(//, $txt)) {$c{$c}++} - for $c (sort keys %c) { - if (ord($c) == hex('00A0') or #NO-BREAK SPACE - ord($c) == hex('2011') or #NON-BREAKING HYPHEN - ord($c) == hex('00AD') or #SOFT HYPHEN - ord($c) == hex('FFFD') or #REPLACEMENT CHAR - (ord($c) >= hex('2000') and ord($c) <= hex('200A')) or #NON-STANDARD SPACES - (ord($c) >= hex('E000') and ord($c) <= hex('F8FF')) #PUA - ) { - $message = sprintf("U+%X (%dx)", ord($c), $c{$c}); - push(@bad, $message) - } - } - print STDERR "WARN: File $fName contains bad chars: " . join('; ', @bad) . "\n" - if @bad -} - + sub run { my $command = shift; my $file = shift; my ($fName) = $file =~ m|([^/]+)$| or die "FATAL ERROR: Bad file '$file'\n"; + my $msg = ''; if ($command =~ /$Jing/) { - print STDERR "INFO: XML validation for $fName\n" + $msg = "INFO: XML validation for $fName\n" } elsif ($command =~ /$Compose/) { } + elsif ($command =~ /$Chars/) { + } elsif ($command =~ /$Valid/) { - print STDERR "INFO: Content validaton for $fName\n" + $msg = "INFO: Content validaton for $fName\n" } elsif ($command =~ /$Valid_particDesc/) { - print STDERR "INFO: particDesc content validaton for $fName\n" + $msg = "INFO: particDesc content validaton for $fName\n" } elsif ($command =~ /$Links/) { - print STDERR "INFO: Link checking for $fName\n" + $msg = "INFO: Link checking for $fName\n" } else {die "FATAL ERROR: Weird command $command!\n"} + print STDERR $msg; `$command $file 1>&2`; }