-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcalc_entropy.perl
executable file
·386 lines (354 loc) · 13.2 KB
/
calc_entropy.perl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
# *-*-perl-*-*
eval 'exec perl -Ssw $0 "$@"'
if 0;
#!/usr/local/bin/perl -sw
#
# calc_entropy.perl: calculates entropy of a class using the output of a
# frequency tabulation program (in particular count_it.perl).
#
# sample input:
#
# # Frequency of 'between' in Penn Treebank II WSJ annotations
# 24 pp-clr
# 6 pp-dir
# 4 pp-ext
# 35 pp-loc
# 2 pp-nom
# 7 pp-prd
# 42 pp-tmp
#
# sample output:
#
# # word classes freq entropy max_prob
# between 7 120 2.230 0.350
#
#
# sample usage (simplified input):
# $ calc_entropy.perl -simple .25 .25 .25 .25
# 2.000
#
# TODO:
# - Remove assumptions about word-oriented data.
# - Move just_freq code into separate script (new calc_relative_frequency.perl).
# - Add usgae example using relative frequency usgae:
# $ calc_entropy.perl -verbose -last coca-w1.data > coca-w1.rfreq
# $ cut -f3,5 > coca-w1.coca-w1.rfreq
#
# Load in the common module, making sure the script dir is in the Perl lib path
BEGIN {
my $dir = `dirname $0`; chomp $dir; unshift(@INC, $dir);
require 'common.perl';
use vars qw/$verbose $precision $strict/;
require 'extra.perl';
}
# Specify additional diagnostics and strict variable usage, excepting those
# for command-line arguments (see init_var's in &init).
use strict;
no strict "refs"; # allow for symbolic file handles
use vars qw/$class_filter $max_count $label $word $last $freq_last
$freq_first $header $show_header $skip_header $classes $show_classes
$simple $normalize $fix $alpha $preserve $cumulative $just_freq $no_comments/;
use vars qw /$strip_comments $first $class_width/;
if (! defined($ARGV[0])) {
my($options) = "options = [-max_count=N] [-word=w] [-class_filter=\"list\"] [-simple] [-normalize] [-cumulative] [-no_comments] [-verbose]";
$options .= "[-[freq_]first] [-[freq_]last] [-label=text] [-[show_|skip_]header] [-[show_]classes] [-alpha] [-preserve] [-just_freq] [-class_width=N]";
$options .= "\nother options = " . &COMMON_OPTIONS;
my($example) = "ex: $script_name -d=5 -class_filter=\"pp-bnf pp-dir pp-mnr pp-prp pp-tmp pp=ext pp-loc\" as.freq\n\n";
$example = "examples:\n\n$script_name -verbose -simple .47 .42 .06 .05\n\n";
$example .= "$script_name -freq_last all-fe.freq\n\n";
$example .= "ls /etc | count_it.perl '\\w+' | $script_name -last -verbose -\n\n";
#
$example .= "file=bridge-over-troubled-waters.chords\n";
$example .= "count_it.perl '^(\\S+)\\t' \$file >| \$file.freq\n";
$example .= "$script_name -no_comments -verbose -last \$file.freq | cut.perl -f='3,5' - >| \$file.rfreq\n\n";
print STDERR "\nusage: $script_name [options]\n\n$options\n\n$example";
# TODO: put notes before examples as with other scripts
print STDERR "notes:\n";
print STDERR "- The -strip_comments option is an alias for old -no_comments=0.\n";
print STDERR "- Use -verbose for more details.\n";
if ($verbose) {
# TODO: move into common.perl as new function
my($detailed_usage) = &run_command("extract_matches.perl -fields=2 'init_var\\(\\*(\\w+).*\\s# (.*)' '$0'");
$detailed_usage =~ s/^/ /mg;
$detailed_usage =~ s/\t/: /g;
print STDERR "- Option descriptions:\n$detailed_usage\n";
}
&exit();
}
# Customize how init_var works (see common.perl)
&set_init_var_export(&TRUE); # treat init_var as init_var_exp
&block_init_var_via_env($strict); # block use of env. vars in init_var
&init_var(*class_filter, "");
if ($class_filter ne "") {
$class_filter = &to_lower(" $class_filter ");
}
&init_var(*max_count, &MAXINT); # maximum number of cases to process
&init_var(*label, ""); # label for entropy display
&init_var(*word, "n/a"); # word over which distribution is made
&init_var(*last, &FALSE); # alias for -freq_last
&init_var(*freq_last, $last); # frequency occurs last in the data
&init_var(*freq_first, ! $freq_last); # frequency occurs first in the data
&init_var(*just_freq, &FALSE); # just output relative frequency
&init_var(*header, (! $just_freq)); # alias for show_header
&init_var(*show_header, $header); # display comment header?
&init_var(*skip_header, ! $show_header); # should the header be skipped?
&init_var(*classes, &FALSE); # alias for -show_classes
&init_var(*show_classes, $classes); # display class information?
&init_var(*simple, &FALSE); # data just contains the probabilities
&init_var(*fix, &FALSE); # ensure tab-delimited input
&init_var(*normalize, &FALSE); # normalize the probabilities?
&init_var(*alpha, &FALSE); # show keys alphabetized
&init_var(*preserve, &FALSE); # preserve order of keys
&init_var(*cumulative, &FALSE); # show cumulative probability
## OLD: &init_var(*no_comments, &FALSE); # used to bypass comment stripping
&init_var(*strip_comments, &FALSE); # alias for (confusing) -no_comments=0
&init_var(*no_comments, ! $strip_comments); # used to bypass comment stripping
&init_var(*class_width, 0); # width to use for label (if > 0)
my($LOG2) = log(2);
## DEBUG: printf "round(4.00078) => %s\n", round(4.00078);
# Optionally, normalize the data and reinvoke over that
if ($normalize && $simple) {
&debug_print(&TL_DETAILED, "Re-invoking with normalized/simple data\n");
my($data) = ((defined($ARGV[0]) && ($ARGV[0] ne "-")) ? "@ARGV" : join("\n", <STDIN>));
my($normalized_data) = &run_command_over("normalize.perl - <", $data);
printf "%s\n", &run_command_over("$0 -simple -normalize=0 - <", $normalized_data);
&exit();
}
# Check for simplified version of data (i.e., just the probabilities)
if ($simple) {
my(@data);
# See if command line should be used
if (($#ARGV >= 0) && ($ARGV[0] ne "-")) {
@data = @ARGV;
}
# Otherwise get from standard input
else {
@data = &tokenize(join(" ", <STDIN>));
}
&simple_calc_entropy(@data);
&exit();
}
# If no files explicitly given, process standard input
if (!defined($ARGV[0])) {
®ular_calc_entropy("STDIN", $word);
&exit();
}
# Process each file on the command line
## OLD: printf "# word\tclasses\tfreq\tentropy\tmax_prob\n" unless ($skip_header);
foreach my $file (@ARGV) {
if ($word eq "n/a") {
$word = &remove_dir(&basename($file, ".freq"));
}
if (!open(FREQ, "<$file")) {
&warning("unable to read $file\n");
next;
}
®ular_calc_entropy("FREQ", $word);
close(FREQ);
$word = "n/a";
}
#------------------------------------------------------------------------------
# regular_calc_entropy(handle, label): Calculate entropy for the probability distribution,
# taken from input HANDLE for class label (e.g., word).
#
# sample input:
#
# green 0.5
# eggs 0.25
# with 0.125
# spam 0.125
#
# sample output:
# # class freq prob -p lg(p)
# # green 0 0.500 0.500
# # eggs 0 0.250 0.500
# # with 0 0.125 0.375
# # spam 0 0.125 0.375
# # total 1 1.000 1.750
#
# # word classes freq entropy max_prob
# - 4 1 1.750 0.500
#
sub regular_calc_entropy {
&debug_print(&TL_VERBOSE, "regular_calc_entropy(@_)\n");
# TODO: rename word to label
my($handle, $word) = @_;
my(%class_frequency);
my(@keys);
my($total_freq) = 0;
my($count) = 0;
my($max_label) = "";
my($max_label_len) = 0;
while (<$handle>) {
&dump_line();
chomp;
# Skip comments and blank lines
## BAD: s/#.*// unless ($no_comments);
s/#.*// if ($no_comments);
next if (/^\s*$/);
## OLD: s/\s\s+/\t/g if ($fix); # make sure input is tab-delimited
s/\s+/\t/g if ($fix); # make sure input is tab-delimited
if (! m/\t/) {
&warning("unexpected input at line $. ($_)\nUse -fix if not tab-delimited.\n");
next;
}
# Get the frequency and class name, skipping items not in the filter
my($freq, $class, $rest);
if ($freq_first) {
($freq, $class, $rest) = split(/\t/, $_);
}
else {
($class, $freq, $rest) = split(/\t/, $_);
}
$rest = "" if (!defined($rest));
&debug_print(&TL_DETAILED, "class='$class' freq='$freq' rest='$rest'\n");
# Keep track of max label length, etc.
if (length($class) > $max_label_len) {
$max_label_len = length($class);
$max_label = $class;
}
# See if the item should be ignored
if (! &is_numeric($freq)) {
&warning("unexpected input at line $. ($_)\nUse -last if class comes first\n");
## return;
next;
}
if (($class =~ /^total/i) && ($freq == $total_freq)) {
&debug_print(&TL_DETAILED, "skipping totals class '$class'\n");
next;
}
if (($class_filter ne "") && (index($class_filter, $class) == -1)) {
&debug_print(&TL_DETAILED, "skipping filtered class $class\n");
next;
}
# Tabulate frequency
push(@keys, $class);
&incr_entry(\%class_frequency, $class, $freq);
$total_freq += $freq;
last if (++$count == $max_count);
}
close($handle);
# Make sure sufficient data and show summary
if ($total_freq == 0) {
# TODO: add argument for filename for error message
&warning("unexpected distribution for $handle (all 0)\n");
return;
}
&debug_print(&TL_VERBOSE, "max label: $max_label; len: $max_label_len\n");
my($entropy) = 0.0;
my($max_prob) = 0.0;
my($sum_p) = 0.0;
## OLD: print "#\t\tclass\tfreq\tprob\t-p lg(p)\n" if ($verbose && (! $just_freq));
my($class) = "class";
my($label_width) = (($class_width > 0) ? $class_width : $max_label_len);
if (length($class) < $label_width) {
$class .= (" " x ($label_width - length($class)));
}
print("#\t\t$class\tfreq\tprob\t-p lg(p)\n") if ($verbose && (! $just_freq));
my($num_classes) = 0;
my(@sorted_keys) = ($alpha ? sort(@keys) :
$preserve ? @keys :
&sorted_hash_keys_reverse_numeric(\%class_frequency));
foreach my $class (@sorted_keys) {
&debug_print(&TL_VERBOSE, "class: $class\n");
my($freq) = &get_entry(\%class_frequency, $class);
next if ($freq == 0);
my($prob) = $freq / $total_freq;
$sum_p += $prob;
$max_prob = &max($prob, $max_prob);
my($p_lg_p) = ($prob > 0) ? (- $prob * (log($prob)/$LOG2)) : 0;
$entropy += $p_lg_p;
my($prob_value) = ($cumulative ? $sum_p : $prob);
if ($just_freq) {
print "$class\t" if ($verbose);
print &round($prob_value), "\n";
}
elsif ($verbose) {
## OLD: printf ("#\t\t%s\t%d\t%s\t%s\n", $class, $freq, &round($prob_value), &round($p_lg_p));
my($label_width) = (($class_width > 0) ? $class_width : $max_label_len);
&debug_print(&TL_VERBOSE, "width: $label_width\n");
## TEST: printf("#\t\t%.*s%d\t%s\t%s\n", $label_width, $class, $freq, &round($prob_value), &round($p_lg_p));
if (length($class) < $label_width) {
$class .= (" " x ($label_width - length($class)));
}
printf("#\t\t%s\t%d\t%s\t%s\n", $class, $freq, &round($prob_value), &round($p_lg_p));
}
$num_classes++;
}
if ($just_freq) {
# do nothing
}
elsif ($verbose) {
printf ("#\t\ttotal\t%d\t%s\t%s\n", $total_freq, &round(1.0), &round($entropy));
print "\n";
printf "# word\tclasses\tfreq\tentropy\tmax_prob\n" unless ($skip_header);
printf "%s\t%d\t%d\t%s\t%s\n", $word, $num_classes, $total_freq, &round($entropy), &round($max_prob);
}
elsif (! $just_freq) {
print &round($entropy), "\n";
}
}
# simple_calc_entropy(dist): Calculate entropy for the probability distribution
#
# example input:
# 0.5 0.25 0.125 0.125
#
# example output:
# 1.750
#
sub simple_calc_entropy {
&debug_print(&TL_VERBOSE, "simple_calc_entropy(@_)\n");
my(@data) = @_;
my($label_spec) = ($label ne "") ? "$label\t" : "";
my($label_header) = ($label ne "") ? "\t" : "";
my($entropy) = 0.0;
my($max_prob) = 0.0;
if ($verbose && (! $just_freq)) {
printf "#\tprob\t-p lg(p) max p\n";
printf "#%s\n", "-" x 32;
}
my($num_classes) = (1 + $#data);
my($sum_p) = 0.0;
foreach my $prob (@data) {
$sum_p += $prob;
$max_prob = &max($prob, $max_prob);
my($p_lg_p) = ($prob > 0) ? (- $prob * (log($prob)/$LOG2)) : 0;
$entropy += $p_lg_p;
my($prob_value) = ($cumulative ? $sum_p : $prob);
## OLD: printf "#\t%.3f\t%.3f\n", $prob_value, $p_lg_p if ($verbose && (! $just_freq));
printf "#\t%s\t%s\n", &round($prob_value), &round($p_lg_p) if ($verbose && (! $just_freq));
print &round($prob_value), "\n" if ($just_freq);
}
if ($verbose && (! $just_freq)) {
printf "#%s\n", "-" x 32;
printf "# word\tclasses\tfreq\tentropy\tmax_prob\n" unless ($skip_header);
## OLD: printf "#\t%.3f\t%.3f\t %.3f\n", $sum_p, $entropy, $max_prob;
printf "#\t%s\t%s\t %s\n", &round($sum_p), &round($entropy), &round($max_prob);
}
&debug_print(&TL_VERBOSE, "simple_calc_entropy(@data) => $entropy\n");
if (! $skip_header) {
print $label_header;
print "Classes\t" if ($show_classes);
print "Entropy\n";
}
if (! $just_freq) {
print $label_spec;
print "${num_classes}\t" if ($show_classes);
print &round($entropy), "\n";
}
}
#------------------------------------------------------------------------
# DEBUG: define local version of round
## # round(number, [decimal places])
## #
## # Rounds the number to the specified number of decimal places, usually 3.
## # TODO: see why version from common.perl not used
## #
## sub round {
## my($number, $places) = @_;
## $places = $::precision if (!defined($places));
## my($format) = "%.${places}f";
## my($result) = sprintf($format, $number);
## &debug_print(&TL_ALL, "round($number, [$places]) => $result\n");
## return ($result);
## }