-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathqd_trans_spanish.perl
executable file
·360 lines (316 loc) · 11.7 KB
/
qd_trans_spanish.perl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
# *-*-perl-*-*
eval 'exec perl -Ssw $0 "$@"'
if 0;
# -*- coding: utf-8 -*-
#!/usr/bin/perl -sw
#
# qd_trans_spanish.perl: quick and dirty translation of Spanish text.
# Currently, this is just a dictionary lookup.
#
# Example invocation (see .emacs.trans-sp):
# - perl -Ssw qd_trans_spanish.perl -maxlen=256 -redirect -sp_remove_diacritics -verbose -
#
# NOTE:
# - via spanish.perl:
# $sp_word_chars = "\\wáéíóúñü-";
# $sp_word_pattern = "([$sp_word_chars]+)([^$sp_word_chars])"
# - The UNDER_EMACS environment variable is defined via my ~/.emacs config.
#
# TODO:
# - * Make utf8 the default.
# - Use strict-mode w/ variable-specific import exceptions.
# - Remove verb conjugations during -pretty -first_sense output.
# - Use edit distance to suggest terms when no results (e.g., tearal => teatral).
# - add support for UTF-8 input as done in spanish.perl (also see count_it.perl)
# - Make sure space after semicolons in verb conjugations:
# ex: +reir => ... Subj. Pres.: ría, rías, ría, ríamos, riáis, rían.
# - Record words looked up in order to find out which words not being learned (i.e., requiring repeated lookups).
# - Make sure global and/or ignore is used for regex changes involving listing optonally with word in pattern.
# - Put word at start of line if past or present participle. For example,
# "\t1. (subir past_part) intr. to go up" => "subdir\n\t1. (past_part) intr. to go up"
# - Put verb conjugation on separate line (from last verbal sense). For example,
# "\t7. to make a slip, to blunder. \ Conjug. like contar." => "\t7. to make a slip, to blunder.\n\tConjug. like contar."
#
# Copyright (c) 2005-2022 Tom O'Hara and New Mexico State University.
# Freely available via GNU General Public License (see GNU_public_license.txt).
#
BEGIN {
my $dir = $0; $dir =~ s/[^\\\/]*$//; unshift(@INC, $dir);
require 'common.perl';
}
require 'spanish.perl';
use vars qw/$sp_options $sp_word_pattern $first_sense $subsenses $redirect $sp_remove_diacritics $verbose $trans_line/;
## OLD: use vars qw/$utf8 $prompt/;
## NOTE: utf8 handled by spanish.perl
## TODO1: straighten out perl utf8 morass
use vars qw/$prompt/;
my($misc_usage_notes) = <<END_MISC_NOTES;
Notes:
- Use special indicators before tokens to modify output
+ for longer lines
@ for prettyprinting (e.g., verb conjugations)
& for multi-line output (assumes @)
* for all of the above
- Other special indicators:
< entire line translation
> english translation
! run shell command (with aliases enabled)
END_MISC_NOTES
sub usage {
local ($program_name) = $0;
$program_name =~ s/\.?\.?\/.*\///;
select(STDERR);
print <<USAGE_END;
Usage: $program_name [options] spanish_file ...
options = $sp_options [-sp_make_dict=0|1] [-pretty] [-brief] [-maxlen=N] [-full] [-multi] [-trans_line] [-prompt=text]
Quick and dirty (word-level) translation for the Spanish text.
Example:
$program_name luna_y_sol.text >! luna_y_sol.info
USAGE_END
print($misc_usage_notes);
&exit();
}
# If command line arguments are missing, give a usage message.
# $#ARGV = (# of arguments) - 1
if ($#ARGV < 0) {
&usage();
}
&init_var(*brief, &FALSE); # just show first sense in pretty printing
## OLD: &init_var(*pretty, $brief); # pretty print the entry
## OLD: &init_var(*maxlen, 512); # maximum line length for translation output
&init_var(*full, &FALSE); # show full line (i.e., no truncation at $maxlen)
&init_var(*multi, &FALSE); # show output on multiple lines
&init_var(*trans_line, &FALSE); # translate entire line
&init_var(*pretty, $multi); # pretty print the entry
&init_var(*maxlen, ($multi ? 1024 : 512)); # maximum length for translation output
# Note: Special processing for prompt to support usage under Emacs shell
# (n.b, UNDER_EMACS must be set explicitly).
# TODO: just default to "> " regardless
## TEST
## my($prompt_default) = (&get_env("UNDER_EMACS") ? "> " : "");
## &init_var(*prompt, $prompt_default); # prompt for input
## TODO: disable buffering
&init_var(*prompt, ""); # prompt for input
&init_var(*debug_level, &DEBUG_LEVEL); # workaround for quirk under Emacs
# Temp workround
## TODO: &debug_on(3);
if ($debug_level != &DEBUG_LEVEL) {
&debug_on($debug_level);
}
use vars qw/$preserve_temp/;
&debug_print(3, "preserve_temp=$preserve_temp\n");
# Set stdout unbuffered if prompt used
if ($prompt ne "") {
&debug_print(&TL_DETAILED, "Setting STDOUT unbuffered\n");
select(STDOUT); $| = 1;
## TEST
## select(STDIN); $| = 1;
## select(STDERR); $| = 1;
}
# Set defaults for the dictionary, etc. These can be specified on command
# line, as follows:
#
# perl -s trans_spanish.perl -user_dict=test_dict.list < text_file
#
# $user_dict = "" unless defined($user_dict);
&init_var(*sp_make_dict, &FALSE); # for subsetting the bilingual dictionary
if ($brief) {
$first_sense = &TRUE;
$subsenses = &TRUE;
}
# Read in the Spanish dictionary
#
&sp_read_dicts();
if ($sp_make_dict == &TRUE) {
open (TEMP_DICT, ">sp_temp.dict");
}
# Print a header comment
# TODO: reconcile the notes here with the ones in the usage statement.
if ($verbose) {
print "# qd_trans_spanish\n";
my($misc_usage_comments) = $misc_usage_notes;
# note: m modifier treats string as multiple lines.
$misc_usage_comments =~ s/^/# /gm;
print($misc_usage_comments);
## OLD:
## print "# Notes:\n";
## print "# - Use + prefix to show entire translation (instead of $maxlen chars).\n";
## print "# - Use @ prefix to prettyprint the entry.\n";
## print "# - Use & prefix to use multiple lines when prettyprinting.\n";
print "#\n";
print "# word: translation\n";
print "#\n";
}
# Get the translations for each word in the text. Output each word with
# its translation on a separate line.
#
print($prompt);
outer:
while (<>) {
chomp; s/\r//; # TODO: define my_chomp
&dump_line();
# Check for special directives (e.g., '+' for ignoring max-len restriction)
my($full_line) = $full;
my($pretty_print) = $pretty;
my($multi_line) = $multi;
my($translate_line) = $trans_line;
my($translate_english) = &FALSE;
my($log) = (&TEMP_DIR . "_qd_trans_spanish.$$.log");
inner:
while ($_ =~ /^[\+@&\*\<\>\!]/) {
if ($_ =~ /^\+/) { # mnemonic: longer length
$_ = $'; # ' (maldito emacs)
$full_line = &TRUE;
}
if ($_ =~ /^@/) { # mnemonic ???: perl @ for "@all" prettyprint?
$_ = $'; # ' (maldito emacs)
$pretty_print = &TRUE;
}
if ($_ =~ /^&/) { # mnemonic: line AND'ed together w/ newline (not tab)
$_ = $'; # ' (maldito emacs)
$pretty_print = &TRUE;
$multi_line = &TRUE;
## OLD: $full_line = &TRUE;
}
if ($_ =~ /^\*/) { # mnemonic: * wildcard ('.' conflict w/ period)
$_ = $'; # ' (maldito emacs)
$full_line = &TRUE;
$pretty_print = &TRUE;
$multi_line = &TRUE;
}
if ($_ =~ /^\</) {
$_ = $'; # ' (maldito emacs)
$translate_line = &TRUE;
}
if ($_ =~ /^\>/) {
$_ = $'; # ' (maldito emacs)
$translate_line = &TRUE;
$translate_english = &TRUE;
}
# With ! indicator, run remainder as shell command and resume
# note: useful for sp-, etc. aliases
if ($_ =~ /^!/) { # ' (maldito emacs)
my($command) = $';
## OLD: print(&run_command("bash -i -c '$command' 2>| $log"));
print(&to_utf8(&run_command("bash -i -c '$command' 2>| $log")));
print("\n");
debug_out(6, "log: {\n%s}\n", &read_file($log));
next outer;
}
}
# Translate entire line
# note: doesn't proceed with word lookup (TODO: allow for this along with pruning common terms)
$text = "$_";
if ($translate_line) {
my($from) = ($translate_english ? "en" : "es");
my($to) = ($translate_english ? "es" : "en");
&debug_print(5, "running translation: f='$from' t='$to' txt='$text'\n");
my($env_spec) = "SHOW_ELAPSED=1 FROM=$from TO=$to";
my($save_trace_level) = &DEBUG_LEVEL;
&debug_on(3);
## OLD:
## print(&run_command_over("$env_spec machine_translation.py 2>| $log",
## "$text\n", 3));
print(&to_utf8(&run_command_over("$env_spec machine_translation.py 2>| $log",
"$text\n", 3)));
print("\n\n");
debug_out(6, "log: {\n%s}\n", &read_file($log));
&debug_on($save_trace_level);
next;
}
# Tokenize the line
$text .= "\n";
while ($text =~ /$sp_word_pattern/i) {
$pre_delim = $`;
$word = $1;
$post_delim = $2;
$rest = $'; # ' (maldito emacs)
if ($pre_delim !~ /^[\t\n ]*$/) {
&debug_out(5, "pre: %s (%X)", $pre_delim, $pre_delim);
print "$pre_delim\n";
}
&debug_print(5, "$word ");
## $word =~ tr/A-Z ÁÉÍÓÚÑÜ/a-záéíóúñü/;
$word_lower = &sp_to_lower($word);
# Get the word translation and truncate if necessary
$translation = &sp_lookup_word($word_lower);
if ((length($translation) > $maxlen) && (! $full_line)) {
my($end) = index($translation, " ", $maxlen);
$end = ($maxlen - 1) if ($end == -1);
$translation = substr($translation, 0, $end) . " ...";
}
# Fix quirks in the listing (e.g., no space after semicolon)
# TODO: remove '~' preceding phrasal entries (e.g., "de ~nada" => "de nada"); make this part of sp_lookup_word
$translation =~ s/;(\S)/; $1/g;
# Print the entry with optional pretty printing
# Note: Adds colon after each entry unless ends in one (e.g., 'Verb conjugations:)
if ($pretty_print) {
my($listing) = &sp_pp_entry($word, $translation);
&debug_print(6, "listing: $listing\n");
## OLD: $listing =~ s/\n/:\t/g;
# Replace newlines with ":<TAB>" and remove colon after last entry
$listing =~ s/([^:])\n/$1:\t/g;
$listing =~ s/:\t$//;
if ($multi_line) {
# Restore newlines if multi-line output desired (n.b., replaces colons with semi-colons; TODO: retain first colon as used for word itself such as if there are accents added).
$listing =~ s/:\t/;\n\t/g;
# HACK: fixup missing tab (TODO: fix upstream)
$listing =~ s/\n([^\t])/\n\t$1/g;
# Put indicator of additional entry for word at start of line
# ex: "ser (1)...\n\t1. (2) subst. v. to be" => "ser(1)...\nser (2):\n\t1. subst. v. to be"
$listing =~ s/\n\t(\d\.) (\(\d\)) /\n$word $2:\n\t$1 /ig;
# Similarly put indicator when verb conjugations given, especially after a set of noun definitions
# Note: should only be one but sp_pp_entry over-generates (e.g., showing verb conjugations
# ex: "\t1. (animar pres_ind:2ps) tr. to animate.;" => "animar \t1. (pres_ind:2ps) tr. to animate.;"
$listing =~ s/\n\t(\d\.) \(([\w]*) ([^\:]+:[^\:]+)\) /\n$2:\n\t$1 ($3) /ig;
# sense verb person:POS
}
print "$listing\n";
}
else {
print "$word:\t$translation\n";
}
# Make a version of the Spanish-english dictionary containing just
# the words encountered.
if ($sp_make_dict == &TRUE) {
print TEMP_DICT "$word\t$word\t$translation\n";
}
if ($post_delim !~ /^[\t\n ]*$/) {
&debug_out(5, "post: %s (%X)", $post_delim, $post_delim);
print "$post_delim\n";
}
$text = $rest;
}
print "$text\n"; # print remainder
print($prompt) if (! eof);
}
# Cleanup
if ($sp_make_dict == &TRUE) {
close(TEMP_DICT);
}
&exit();
#-------------------------------------------------------------------------------
# check_word: lookup the translation for a word
#
sub check_word {
local($word) = @_;
local($verb, $tense) = &sp_stem_verb($word);
local($noun, $person) = &sp_stem_noun($word);
if ($verb ne "") {
print "$word:\t($verb, $tense)\t$sp_trans{$verb}\n";
}
if ($noun ne "") {
print "$word:\t($noun, $person)\t$sp_trans{$noun}\n";
}
}
# to_utf8 (text): encodes as UTF8 if enabled
#
sub to_utf8 {
my($text) = @_;
my($result) = $text;
if ($utf8) {
$result = decode_utf8($result);
}
&debug_print(&TL_VERBOSE, "to_utf8(@_) => $text\n");
return ($result);
}