-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcount_it.perl
executable file
·318 lines (294 loc) · 12.5 KB
/
count_it.perl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
# *-*-perl-*-*
eval 'exec perl -Ssw $0 "$@"'
if 0;
#!/usr/bin/perl -sw
#
# count_it.perl: script to count the occurrences of a pattern in the input
#
# NOTE: This is a simple script that turns out to be very useful
# for a variety of tasks, especially in corpus analysis.
#
# examples:
#
# tabulating most commonly used commands:
#
# $ history | count_it.perl "^\s*\d+\s*(\S+)" - | less
# ps_mine 13
# w 11
# gr 7
# top 6
# ...
#
# tabulating part-of-speech usage for particular words
#
# $ count_it "(outside\/\S+)" ~/OpenMind/data/omcsraw.tag
# outside/IN 502
# outside/RB 137
# outside/NN 53
# outside/JJ 19
#
# Copyright (c) 2000-2001 Cycorp, Inc. All rights reserved.
# Portions Copyright (c) 1996-1999 Tom O'Hara (at NMSU). All rights reserved.
#
# TODO:
# - Add more descriptions of options to usage text.
# - Add automatic carriage return removal to avoid problems such as CR being pulled in for "([^;]+)" (see ~/fast/port-notes.txt).
# - Add paratheses to print calls.
# - Fix bug with backslash-style references:
# ¢ echo "abba" | count-it -d=6 -restore='\2' '(.(.))' - 2>&1 | grep restoring | head
# restoring 'SCALAR(0xa5c900)' to current text
# restoring 'SCALAR(0xd32d70)' to current text
# ...
# restoring 'SCALAR(0xa5caf8)' to current text
#
# Load in the common module, making sure the script dir is in Perl's lib path
BEGIN {
my $dir = `dirname $0`; chomp $dir; unshift(@INC, $dir);
require 'common.perl';
use vars qw/$utf8 $BOM $verbose/;
require 'extra.perl';
}
# Specify additional diagnostics and strict variable usage, excepting those
# for command-line arguments (see init_var's in &init).
use strict "refs"; # allow string deferencing (for -occurrences support)
use vars qw/$utf8 $i $ignore_case $i $ignore_case $foldcase $fold $preserve
$para $slurp $multi_per_line $one_per_line $freq_first $alpha $compact $cumulative
$occurrences $occurrence_field $percents $multiple $nonsingletons $min2 $min_freq $trim $unaccent $pattern_file $pattern $locale $chomp/;
use vars qw/$nonsingleton $restore $field $show_zeros/;
# Check the command-line options
# Each variable initialized corresponds to -var=value commandline option
&init_var(*i, &FALSE); # ignore case in the pattern matching
&init_var(*ignore_case, $i); # alias for -i option
&init_var(*foldcase, $i); # fold (convert) text to lowercase
&init_var(*fold, $i); # alias for -foldcase
&init_var(*preserve, ! ($foldcase || $fold)); # preserve the case of text matching pattern
&init_var(*para, &FALSE); # apply the pattern to paragraphs not lines
&init_var(*slurp, &FALSE); # apply the pattern to entire files
&init_var(*freq_first, &FALSE); # put frequency counts first (ie, <freq><tab><data>)
&init_var(*alpha, &FALSE); # alphabetical sort
&init_var(*compact, &FALSE); # compact all whitespace sequences
&init_var(*cumulative, &FALSE); # include column for cumulative counts
&init_var(*occurrences, &FALSE); # the tags being counted are actually occurrence counts
&init_var(*occurrence_field, # field giving occurrence count (e.g., 1 for $1)
$occurrences ? $occurrences + 1 : 0);
&init_var(*field, 1); # field number to use for pattern
&init_var(*percents, &FALSE); # shows the relative percents
&init_var(*min2, &FALSE); # alias for -nonsingletons
&init_var(*multiple, $min2); # "
&init_var(*nonsingleton, $multiple); # "
&init_var(*nonsingletons, $nonsingleton); # omit cases that occur once
&init_var(*show_zeros, &FALSE); # initialize pattern's count to 0
my($min_freq_default) = ($show_zeros ? 0 : ($nonsingletons ? 2 : 1));
&init_var(*min_freq, # min frequency to show in output
$min_freq_default);
&init_var(*trim, &FALSE); # trim whitespace in matched text
&init_var(*unaccent, &FALSE); # remove accent marks from input
&init_var(*pattern_file, ""); # file with pattern (to circumvent shell UTF8 issues)
my($default_pattern) = (($pattern_file ne "") ? &read_file($pattern_file) : "");
&init_var_exp(*pattern, $default_pattern); # regex pattern to check for
&init_var_exp(*locale, &FALSE); # honor environment locale settings
&init_var(*chomp, &FALSE); # strip newline at end (TODO make default unless \n in pattern)
&init_var(*restore, ""); # portion of matching text to be restored
# Get the pattern and options from the command line
# Process the command-line options
if (!defined($ARGV[0])) {
my($options) = "options = [-i(gnore)] [-alpha] [-freq_first] [-para] [-preserve] [-foldcase]\n";
$options .= " [-compact] [-min_freq=N | -nonsingletons | -min2] [-slurp] [-unaccent] [-chomp]\n";
$options .= " [-occurrences] [-cumulative] [-multi_per_line|-one_per_line] [-restore] [-show_zeros]\n";
my($example) = "examples:\n\nls | $script_name '\\.([^\\.]+)\$'\n\n";
$example .= "$0 '(outside\/\\S+)' omcsraw.tag\n\n";
$example .= "perl -CIOE -Sw $script_name '(.)' - < wiki-lang-info/utf8/da >| /tmp/da.freq\n\n";
$example .= "$script_name -restore='\$3' '((\\S+\\s+)((\\S+\\s+){2}\\S+))' time-tracking-aug21.list | head\n\n";
my($note) = "";
$note .= "notes:\n\nThe patterns are regular expressions using Perl's extensions\n";
$note .= "See manual page for details (e.g., 'man perlre')\n\n";
$note .= "To tabulate only parts of the pattern, use parenthesis as illustrated above.\n\n";
## TODO3: show misc options only if -verbose (as in sum_file.perl)
## ... $note .= "- Use -verbose for more help or to show extracted data."
$note .= "Miscellaneous options:\n";
$note .= "-compact to treat runs of whitespace as single space\n";
$note .= "-nonsingletons to ignore item occurring just once\n";
$note .= "-para reads text in paragraph mode (rather than line)\n";
$note .= "-slurp reads the entire file at once (for long-distance patterns)\n";
$note .= "-occurrences incorporates count field (\$1 for pattern & \$2 count)\n";
$note .= "-multi_per_line allows for multple occurrences in a line (assumed unless ^ used)\n";
$note .= "-restore is used to simulate look-ahead (see example above).\n";
$note .= "-show_zeros is used to show patterns not matched.\n";
## TODO: add optional extended help with examples for misc. options
## OLD: die "\nusage: $script_name [options] pattern file ...\n\n$options\n\n$example\n$note\n";
print STDERR "\nusage: $script_name [options] pattern file ...\n\n$options\n\n$example$note\n";
&exit();
}
if ($pattern eq "") {
$pattern = $ARGV[0];
shift @ARGV;
}
# See if regex has line achor (^ or $)
## TODO: exclude \$ at end of string (e.g., "100\$")
## TODO: add unit test: (echo "12 34 56 78" | count_it.perl '\d{2}' | wc -l) => 4
my($has_line_anchor) = ((index($pattern, "^") == 0)
|| (rindex($pattern, "\$") == length($pattern)));
&init_var(*one_per_line, $has_line_anchor); # only count one instance of the pattern per line
&init_var(*multi_per_line, ! $one_per_line); # count multiple instance of the pattern per line (even when ^ and $ are specified)
&assertion(! ($one_per_line && $multi_per_line));
if ($locale) {
&debug_print(&TL_USUAL, "Enabling locale support\n");
# note: used for case-insensitive matching
eval "use locale;'"
}
# Sanity check for whether one-per-line option might be needed
# NOTE: checks against pattern need to occur prior to modification (e.g., paren addition)
# TODO: handle multiple patterns per line (e.g., set line break to null); likewise check for multiple end-of-line matching
if ((&DEBUG_LEVEL > 3) && ($pattern =~ /^\^/) && ($pattern !~ /[\$\n]$/) && (! $multi_per_line)) {
&warning("You might want to specify -multi_per_line if you want \^ and/or \$ interpretted after match removal.\n");
}
# Initialize count for pattern itself (e.g., non-regex)
# TODO2: add initialization for other patterns (e.g., -entries="red,green,blue")
my(%count); # assoc. list for pattern counting
if ($show_zeros) {
&set_entry(\%count, $pattern, 0);
}
# Put grouping parenthesis around pattern, if none present
# NOTE: Escaped parentheses are ignored.
# TODO: (" $pattern " !~ /[^\\]\(.*\)[^\\]/)
if ((($pattern =~ /\\\(.*\\\)/) && ($pattern !~ /[^\\]\(.*\)[^\\]/))
|| ($pattern !~ /\(.*\)/)) {
$pattern = "(" . $pattern . ")";
}
# Process each line of the input stream
&debug_out(&TL_DETAILED, "searching for pattern '%s'; one_per_line=%d; ignore_case=%d...\n",
$pattern, $one_per_line, $ignore_case);
&debug_print(&TL_VERBOSE, "restore=$restore\n");
my($num) = 0; # number of distinct cases (pattern instances)
$/ = "" if ($para); # paragraph input mode
$/ = 0777 if ($slurp); # complete-file input mode
## OLD:
## # Enable UTF8 I/O
## # TODO2: make this automatic in common.perl
## if ($utf8) {
## use Encode 'decode_utf8';
## }
## OLD: my(%count); # assoc. list for pattern counting
my($total_count) = 0;
#
while (<>) {
if ($utf8) {
$_ = decode_utf8($_);
}
if ($unaccent) {
$_ = ($utf8 ? &utf8_remove_diacritics($_) : &iso_remove_diacritics($_));
}
# NOTE: a chop isn't performed by default to allow for using the newline in a pattern.
# This is often more convenient than using $ (e.g., when using csh).
# TODO: add sanity check about DOS carriage returns screwing up pattern matching
if ($chomp) {
chomp;
&dump_line("$_\n");
}
else {
&dump_line();
}
# Perform optional transformations
# - In paragraph mode treat newlines as spaces unless mentioned in the pattern.
# - All whitespace sequences to single spaces
if ($para && ($pattern !~ /\n/)) {
s/\n/ /g;
}
if ($compact) {
s/\s+/ /g;
}
my($text) = $_;
while (($text ne "") && ($text ne "\n")) {
&debug_out(&TL_VERY_VERBOSE, "text=%s\n", $text);
my($found) = &FALSE;
my($tag);
# Try to extract tag from the text
# NOTE: s qualifier treats string as single line (in case -para specified)
if (($ignore_case == 1) && ($text =~ /$pattern/si)) {
$tag = $1;
if ($field > 1) {
&debug_print(&TL_MOST_VERBOSE, "eval \"\$${field}\"");
$tag = eval "\$${field}";
}
$tag = &to_lower($tag) unless ($preserve);
## OLD: $text = $';
$found = &TRUE;
}
elsif (($ignore_case == 0) && ($text =~ /$pattern/s)) {
$tag = $1;
## BAD: $tag = $1 if ($field == 1) else "eval \$${field}";
if ($field > 1) {
&debug_print(&TL_MOST_VERBOSE, "eval \"\$${field}\"");
$tag = eval "\$${field}";
}
## OLD: $text = $';
$found = &TRUE;
}
#
# Update the current line being matched
if ($found && ($restore ne "")) {
my($restore_text) = eval "$restore";
&debug_print(&TL_DETAILED, "restoring '$restore_text' to current text\n");
$text = $restore_text . $'; # '
&debug_print(&TL_VERBOSE, "current text='$text'\n");
}
else {
$text = $'; # '
}
# Update the hash table of the patterns
if ($found) {
$tag = &trim($tag) if ($trim);
## TODO: add unit test for differences using optional suffixes vs. lookahead (e.g., '\w{3}((, )|$)' vs. '\w{3}(?=(, )|$)'
## OLD: &debug_print(&TL_VERY_DETAILED, "adding: '$tag'; \$\&='$&'\n");
&debug_print(&TL_VERY_DETAILED, "adding: '$tag'\n\t\$\&='$&'\n");
$num++;
my($count) = 1;
if ($occurrence_field) {
$count = eval "\$${occurrence_field}";
&debug_print(&TL_VERY_DETAILED, "count: $count\n");
}
&incr_entry(\%count, $tag, $count);
$total_count += $count;
}
# Stop when not found or if just one match per line
if ((! $found) || ($one_per_line)) {
$text = "";
}
}
}
&debug_print(&TL_VERBOSE, "$num patterns found\n");
## OLD: print("total occurrence count is ${total_count}\n") if ($occurrences);
if ($occurrences || $verbose) {
print("total occurrence count is ${total_count}\n");
}
&trace_assoc_array(\%count);
print("Frequency of $pattern\n") if ($verbose);
my($sort_function) = ($alpha ? \&sorted_hash_keys_alphabetic : \&sorted_hash_keys_reverse_numeric);
my($cumulative_tag_count) = 0;
foreach my $tag (&$sort_function(\%count)) {
my($tag_count) = &get_entry(\%count, $tag, 0);
last if ($tag_count < $min_freq);
## TODO3: streamline the above short circuit with below inclusion checks
## OLD if ($tag_count != 0) ...
if (($tag_count != 0) || ($show_zeros)) {
$tag =~ s/\n$//;
if ($freq_first) {
print $tag_count, "\t", $tag;
}
else {
print $tag, "\t", $tag_count;
}
if ($percents) {
print "\t", &round($tag_count / $total_count);
}
# Show optional cumulative count column
if ($cumulative) {
$cumulative_tag_count += $tag_count;
print "\t${cumulative_tag_count}";
if ($percents) {
print "\t", &round($cumulative_tag_count / $total_count);
}
}
print "\n";
}
}