-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathencoding_converter.py
136 lines (117 loc) · 4.56 KB
/
encoding_converter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#! /usr/bin/env python
#
# encoding_converter.py: converts between various text encodings (e.g., Windows-1252 to UTF-8)
#
# Notes:
# - https://stackoverflow.com/questions/31207287/converting-utf-16-to-utf-8.
# - https://stackoverflow.com/questions/492483/setting-the-correct-encoding-when-piping-stdout-in-python
#
# TODO:
# - Remove extraneous byte order marks (BOMs), such as those occurring after the start of the file.
#
#
"""Unicode converter (e.g., UTF-16 to UTF-8)"""
import argparse
import codecs
import os
import sys
import tpo_common as tpo
import debug
import system
UTF8 = "utf-8"
UTF16 = "utf-16"
TO_UTF8 = "to-utf8"
TO_UTF16 = "to-utf16"
FROM_UTF8 = "from-utf8"
FROM_UTF16 = "from-utf16"
TO = "to"
FROM = "from"
def option(label):
    """Return the long command-line flag for LABEL (i.e., LABEL prefixed with two dashes)"""
    # ex: option("to-utf8") => "--to-utf8"
    dashes = "--"
    return "".join((dashes, label))
def arg(label):
    """Return the key under which option LABEL appears in the parsed arguments"""
    # ex: arg("to-utf8") => "to_utf8"
    # Dashes in the option label become underscores in the key.
    pieces = label.split("-")
    return "_".join(pieces)
class EncodedStdout:
    """Context for shadowing sys.stdout with encoding-specific version

    Usage:
        with EncodedStdout('utf-8'):
            ...

    While the context is active, text written to sys.stdout is encoded with
    the requested encoding by a codecs stream writer.  The stream that was in
    place on entry is restored on exit.  An empty encoding leaves sys.stdout
    untouched.
    """

    def __init__(self, enc):
        """Constructor: record encoding ENC and the current stdout"""
        self.enc = enc
        self.stdout = sys.stdout

    def __enter__(self):
        """Initialize context by replacing stdout; returns the context object"""
        # Re-capture stdout here so that the stream restored on exit is the one
        # actually replaced, even if sys.stdout was changed after construction.
        self.stdout = sys.stdout
        if self.enc:
            binary_stream = getattr(sys.stdout, "buffer", None)
            if binary_stream is not None:
                # Text stream layered over a byte stream (the normal case under
                # Python 3): encode directly into the byte layer, as otherwise the
                # text layer's own encoding would be used and ENC ignored.
                # Flush first so earlier output is not reordered after ours.
                sys.stdout.flush()
                sys.stdout = codecs.getwriter(self.enc)(binary_stream)
            elif getattr(sys.stdout, "encoding", None) is None:
                # Stream without an encoding of its own: wrap it directly.
                sys.stdout = codecs.getwriter(self.enc)(sys.stdout)
        return self

    def __exit__(self, exc_ty, exc_val, tb):
        """Exit context by flushing the wrapper (if any) and restoring stdout"""
        try:
            if sys.stdout is not self.stdout:
                sys.stdout.flush()
        finally:
            # Restore even if the flush fails; exceptions are not suppressed.
            sys.stdout = self.stdout
def main():
    """Entry point for script

    Parses the command line, opens the input (the named file, or stdin when
    the filename is "-") decoded with the input encoding, and writes each
    line to stdout inside an EncodedStdout context for the output encoding.

    Explicit --from/--to values take precedence over the shortcut flags
    (--from-utf8, --from-utf16, --to-utf8, --to-utf16); among the shortcuts
    UTF-8 is checked before UTF-16.  An input encoding is required: without
    one, or if the input cannot be found or opened, a message is issued via
    system.print_stderr and the function returns without converting.
    """
    tpo.debug_print("main(): sys.argv=%s" % sys.argv, 4)

    # Check command-line arguments
    # TODO: add in detailed usage notes w/ environment option descriptions (see google_word2vec.py)
    parser = argparse.ArgumentParser(
        description="Convert text from one encoding to another (e.g., UTF-16 to UTF-8)")
    # TODO: use capitalized script description but lowercase argument help
    parser.add_argument(option(FROM_UTF8), default=False, action='store_true',
                        help="Convert from UTF8")
    parser.add_argument(option(FROM_UTF16), default=False, action='store_true',
                        help="Convert from UTF-16")
    parser.add_argument(option(TO_UTF8), default=False, action='store_true',
                        help="Convert to UTF8")
    parser.add_argument(option(TO_UTF16), default=False, action='store_true',
                        help="Convert to UTF-16")
    parser.add_argument(option(FROM), default="",
                        help="Input encoding")
    parser.add_argument(option(TO), default="",
                        help="Output encoding")
    parser.add_argument("filename", nargs='?', default='-',
                        help="Input filename")
    args = vars(parser.parse_args())
    tpo.debug_print("args = %s" % args, 5)

    # Resolve the encodings: an explicit name wins over the shortcut flags
    to_encoding = args[arg(TO)]
    from_encoding = args[arg(FROM)]
    if not to_encoding and args[arg(TO_UTF8)]:
        to_encoding = UTF8
    if not to_encoding and args[arg(TO_UTF16)]:
        to_encoding = UTF16
    if not from_encoding and args[arg(FROM_UTF8)]:
        from_encoding = UTF8
    if not from_encoding and args[arg(FROM_UTF16)]:
        from_encoding = UTF16
    filename = args['filename']
    debug.trace_fmtd(4, "from={fe} to={te} filename={f}", fe=from_encoding, te=to_encoding, f=filename)

    # The input cannot be decoded without knowing its encoding (an empty
    # codec name is rejected by the codecs module), so fail with a clear message.
    if not from_encoding:
        system.print_stderr("No input encoding specified: use {f}, {f8} or {f16}",
                            f=option(FROM), f8=option(FROM_UTF8), f16=option(FROM_UTF16))
        return

    # Open the input as a stream yielding decoded text
    # TODO: put supporting code in module to cut down on boilerplate code
    reading_file = (filename != "-")
    if reading_file and not os.path.exists(filename):
        system.print_stderr("Unable to find {f}", f=filename)
        return
    try:
        if reading_file:
            input_stream = codecs.open(filename, "rb", from_encoding)
        else:
            tpo.debug_print("Processing stdin", 6)
            # Note: the encoding attribute of sys.stdin is read-only, so the
            # underlying byte stream is wrapped with a decoding reader instead
            # (falling back to sys.stdin itself when there is no byte layer).
            byte_stream = getattr(sys.stdin, "buffer", sys.stdin)
            input_stream = codecs.getreader(from_encoding)(byte_stream)
    except LookupError:
        system.print_stderr("Unknown input encoding {e}", e=from_encoding)
        return
    except (IOError, OSError):
        system.print_stderr("Unable to open {f}", f=filename)
        return

    # Process input line by line
    try:
        with EncodedStdout(to_encoding):
            for line_num, line in enumerate(input_stream):
                debug.trace_fmtd(6, "L{n}: {l}", n=line_num, l=line)
                sys.stdout.write(line)
    finally:
        # Only close what was opened here (i.e., leave stdin alone)
        if reading_file:
            input_stream.close()
    return
#------------------------------------------------------------------------
# Run the converter only when executed as a script (not when imported)
if __name__ == "__main__":
    main()