-
Notifications
You must be signed in to change notification settings - Fork 46
/
Copy pathconllu_cut.pl
executable file
·89 lines (84 loc) · 2.35 KB
/
conllu_cut.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/env perl
# Takes a CoNLL-U file, cuts a section out of it, and prints the section.
# The section is identified by the id of the first and the last sentence.
# Copyright © 2022 Dan Zeman <[email protected]>
# License: GNU GPL
use utf8;
use open ':utf8';
binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');
use Getopt::Long;
# Prints a short description of the command-line interface to STDERR.
# Takes no arguments.
sub usage
{
    # One entry per output line; the newline is appended when printing.
    my @help_lines =
    (
        'Usage: perl conllu_cut.pl --first sent350 --last sent367 < whole.conllu > section.conllu',
        ' where sent350 and sent367 are sent_ids of the first and the last sentence included in the output',
        ' if --first is omitted, section starts at the beginning of the input file',
        ' if --last is omitted, section ends at the end of the input file'
    );
    print STDERR (map {"$_\n"} (@help_lines));
}
# Main script: read the options, then read the input sentence by sentence
# (a sentence ends with an empty line) and pass each sentence to
# process_sentence(), which decides whether to print it.
my $firstsid; # value of --first: sent_id of the first sentence of the section
my $lastsid;  # value of --last: sent_id of the last sentence of the section
GetOptions
(
    'first=s' => \$firstsid,
    'last=s'  => \$lastsid
);
if(!defined($firstsid) && !defined($lastsid))
{
    print STDERR ("WARNING: Neither the first nor the last sentence specified; the entire input will be passed through.\n");
}
# If --first was not given, the section starts with the very first sentence,
# so we are inside the section right from the start.
my $inside = !defined($firstsid);
my @sentence = ();
while(<>)
{
    # Strip the line terminator (LF or CRLF); it is re-added as LF on output.
    s/\r?\n$//;
    push(@sentence, $_);
    # An empty (or whitespace-only) line terminates the current sentence.
    if(m/^\s*$/)
    {
        process_sentence(@sentence);
        @sentence = ();
    }
}
# If the input does not end with an empty line, the last sentence is still
# waiting in the buffer. Do not drop it silently: supply the missing
# terminating empty line and process it like any other sentence.
if(scalar(@sentence) > 0)
{
    push(@sentence, '');
    process_sentence(@sentence);
    @sentence = ();
}
# If we encountered the end of the section, we exited the script in process_sentence().
# Being here means that we either want the section to reach the end of the input,
# or we failed to find the beginning of the section, or we found the beginning
# but not the requested last sentence.
if(!$inside)
{
    print STDERR ("WARNING: The first sentence of the section, '$firstsid', was not found.\n");
}
elsif(defined($lastsid))
{
    print STDERR ("WARNING: The last sentence of the section, '$lastsid', was not found.\n");
}
# Processes one sentence: prints it if it belongs to the requested section.
#
# Arguments: the lines of the sentence (without line terminators), including
#   the terminating empty line.
# Reads the file-level variables $firstsid and $lastsid and both reads and
#   updates $inside (true while we are within the section to be printed).
# Side effects: prints the sentence to STDOUT when inside the section; warns
#   on STDERR if the sentence has no sent_id; exits the whole script with
#   status 0 right after printing the sentence whose sent_id equals $lastsid.
#
# Note: the sub is deliberately declared without a prototype. It takes
# arguments, so an empty prototype "()" would make any call compiled after
# this definition a compile-time error.
sub process_sentence
{
    my @lines = @_;
    # Get the sentence id from the first "# sent_id = ..." comment line.
    my $sid;
    foreach my $line (@lines)
    {
        if($line =~ m/^\#\s*sent_id\s*=\s*(\S+)$/)
        {
            $sid = $1;
            last;
        }
    }
    if(!defined($sid))
    {
        print STDERR ("WARNING: Sentence has no sent_id\n");
        # Use an empty id so that the string comparisons below are well defined.
        $sid = '';
    }
    # $inside can be false only if --first was given, hence $firstsid is defined here.
    if(!$inside && $sid eq $firstsid)
    {
        $inside = 1;
    }
    if($inside)
    {
        print(join("\n", @lines), "\n");
        if(defined($lastsid) && $sid eq $lastsid)
        {
            # No need to read the rest of the input.
            exit(0);
        }
    }
    return;
}