-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathperlgrep.perl
executable file
·193 lines (175 loc) · 6.13 KB
/
perlgrep.perl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
#!/usr/bin/perl -sw
# *-*-perl-*-*
eval 'exec perl -Ssw $0 "$@"'
if 0;
#!/usr/bin/perl -sw
#
# perlgrep.perl: quick n' dirty version of (e)grep via perl regex matching
#
# This faciliates the use of tab characters, as well the use of Perl's
# extended regular expression pattern matching (eg, \d for [0-9]).
#
# NOTES:
# - The -para option now supports paragraph matching (same as perl -0 option)
# perlgrep.perl -d=6 -i coling comps.bib |& less
# - If escape sequences are used in patterns, then the pattern should be
# single quoted rather than double quoted when using csh or bash. Otherwise,
# an error similar to the following might occur.
# $ perlgrep "\\TODO{[^}]+}[^\s%]" *.tex
# Unrecognized escape \T passed through before HERE mark in regex m/\T << HERE ODO {[^}]+}[^\s%]/ at /e/cartera-de-tomas/bin/perlgrep.perl line 79, <> line 1.
#
#
# TODO2: convert to Python
#
# TODO:
# - *** Track down missing files:
# ex: perl-grep -c '\x00' test_sort_json_annots.d*.19nov21.*.log
# [see /home/tomohara/osr/docs/_adhoc.22nov21.log]
# - ** Add test suite against regular grep.
# - Use parentheses around print and printf throughout.
#
# Load in the common module, making sure the script dir is in the Perl lib path
BEGIN {
my $dir = $0; $dir =~ s/[^\\\/]*$//; unshift(@INC, $dir);
require 'common.perl';
}
# Specify additional diagnostics and strict variable usage, excepting those
# for command-line arguments (see init_var's in &init).
use strict;
use vars qw/$v $i $para $slurp $context $C $A $B $n $max $show_filename $w $c $h/;
# Determine values of command-line arguments
# TODO: use expanded names for all options (e.g., -not_matching for -v)
&init_var(*v, &FALSE); # grep -v option to show lines not matching
&init_var(*i, &FALSE); # grep -i for case insensitive matching
&init_var(*para, &FALSE); # paragraph input mode
&init_var(*slurp, &FALSE); # apply the pattern to entire files
&init_var(*context, 0); # grep 'context' (i.e., before & after) option
&init_var(*C, $context); # alias for -context
&init_var(*A, $C); # grep 'after' context option
&init_var(*B, $C); # grep 'before' context option
&init_var(*n, &FALSE); # grep line-number option
&init_var(*c, &FALSE); # just show count
my($just_count) = $c;
&init_var(*max, &MAXINT); # maximum number of matches to show
&init_var(*h, &FALSE); # grep -h option to hide filename
&init_var(*show_filename, # include file name in output
((scalar @ARGV) > 2) && !$h);
&init_var(*w, &FALSE); # match word boundaries
# Show usage statement if insufficient arguments
#
if (!defined($ARGV[0])) {
my($options) = "options = [-v] [-i] [-w] [-para] [-A=n | -B=n | -C=n] [-max=N] [-show_filename | -h] [-c]";
my($example) = "examples:\n\n$script_name -v \"\\t\\t\" 052698_nb_ci_062498_nb55.diff\n\n";
$example .= "perlgrep.perl -para -i \"coling\" comps.bib | & less\n\n";
$example .= "$0 -show_filename -max=1 \"\\r\\n\" *.lisp\n\n";
print STDERR "\nusage: $script_name [options]\n\n$options\n\n$example\n\n";
&exit();
}
my($pattern) = shift @ARGV;
select(STDOUT); $| = 1; # set stdout unbuffered
# Optionally set paragraph input mode
$/ = "" if ($para); # paragraph input mode
undef $/ if ($slurp); # complete-file input mode
# Modify the pattern to reflect word boundaries
if ($w) {
$pattern = "\\b$pattern\\b";
}
my(@before); # lines before current target context
&debug_print(&TL_DETAILED, "checking for pattern '$pattern'; just_count=$just_count\n");
my($hit_count) = 0;
my($after_context) = 0;
my($current_file) = defined($ARGV[0]) ? $ARGV[0] : "<stdin>";
my($record_num) = 0;
my($para_line_num) = 1;
my($current_count) = 0;
my($undisplayed_counts) = &TRUE;
# TODO: use Iterator::Files to account for problems with files haing leadng spaces
# See http://search.cpan.org/~jv/Iterator-Diamond-1.01/lib/Iterator/Files.pm.
while (<>) {
my($num_para_lines) = 0;
&debug_print(&TL_VERY_VERBOSE, "[$current_file] ");
if ($para) {
&debug_out(&TL_VERY_DETAILED, "P%d L%d:\t%s", $., $para_line_num, $_);
my(@para_lines) = split(/\n/, $_);
$num_para_lines = ($#para_lines + 1);
&assert($num_para_lines > 0);
&debug_out(&TL_VERY_VERBOSE, "# para lines: %d\n", $num_para_lines);
}
else {
&dump_line("$_", &TL_VERY_VERBOSE);
}
$record_num++;
my($include) = &FALSE;
# See if the pattern matches the current line
# NOTE: s qualifier treats string as single line (in case -para specified)
if (($i && /$pattern/is) || (/$pattern/s)) {
$include = (! $v);
$after_context = ($include ? ($A + 1) : 0);
}
else {
$include = $v;
$after_context--;
}
&debug_out(&TL_VERY_VERBOSE, "include=%d; after_context=%d\n", $include, $after_context);
if ($include) {
$current_count++;
$undisplayed_counts = &TRUE;
if (! $just_count) {
if ($B > 0) {
printf "%s", join("", @before);
@before = ();
}
if ($show_filename) {
print "$current_file:";
}
if ($n) {
printf("%d: ", ($para ? $para_line_num : $record_num));
}
printf("%s", $_);
}
# See whether to stop the search due to max hit count
if (++$hit_count >= $max) {
&debug_out(&TL_DETAILED, "max hits (%d) reached\n", $max);
last;
}
}
elsif ($after_context > 0) {
printf "%s", $_ unless ($just_count);
}
elsif ($B > 0) {
# update the lines-before context
# NOTE: This is only done for lines not otherwise printed
# TODO: look into a more efficient queue implementation
push (@before, $_);
shift @before if ($#before == $B);
}
if (eof) {
# Display the counts for the previous file
if ($just_count) {
if ($show_filename) {
print "$current_file:";
}
print "$current_count\n";
}
# Update the current-file status indicators
$current_file = defined($ARGV[0]) ? $ARGV[0] : "<stdin>";
$record_num = 0;
$para_line_num = 1;
$current_count = 0;
$undisplayed_counts = &FALSE;
}
else {
## TEST: $current_file = defined($ARGV[0]) ? $ARGV[0] : "<stdin>";
#
if ($para) {
$para_line_num += ($num_para_lines + 1);
}
}
}
# Show the final counts unless just displayed
if ($just_count && $undisplayed_counts) {
if ($show_filename) {
print "$current_file:";
}
print "$current_count\n";
}