-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbash2python.py
executable file
·1647 lines (1526 loc) · 80 KB
/
bash2python.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
#
# This script converts from Bash snippets to Python. This is not intended as a general purpose
# conversion utility, which is a major undertaking given the complexities of Bash syntax.
# Instead, this is intended to capture commonly used constructs, such as by Tom O'Hara
# during shell interactions. For such constructs, see the header comments in the following:
# https://github.com/tomasohara/shell-scripts/blob/main/template.bash
# The code is adhoc, but that is appropriate given the nature of the task, namely
# conversion accounting for idiosyncratic conventions.
#
# There is optional support for doing the conversion via OpenAI's Codex, which was trained on
# Github. See bash2python_diff.py for code that invokes both the regex approach
# and the Codex approach, showing the conversions in a side-by-side diff listing.
#
# Notes:
# - This just outputs the Python code, along with import for helper module and initialization.
# The user might have to do some minor fixup's before the code will run properly.
# - Simple Bash statements get converted into shell invocation calls (a la os.system).
# pushd ~/; cp -f .[a-z]* /tmp; popd => run("pushd ~/; cp -f .[a-z]* /tmp; popd")
# - Simple variable assignments get translated directly, but complex runs are converted into echo command.
# log="run-experiment.log"; => log = "run-experiment.log"
# today=$(date '+%d%b%y') => today = run("echo \"date '+%d%b%y'\"")
# - Simple loops get converted into Python loops, namely for-in and while:
# for v in abc def; do echo $v; done => for v in ["abc", "def"]: gh.run("echo {v}")
# - Unsupported or unrecognized constructs are flagged as runtime errors:
# for (( i=0; i<10; i++ )); do echo $i; done
# =>
# # not supported: for (( i=0; i<10; i++ )); do echo $i; done
# raise NotImplementedError()
# - The result will likely need manual revision, a la Python 2 to 3 conversion (2to3).
# - Global specifications for code linter(s):
# # pylint: disable=eval-used
# - OpenAI API reference: https://platform.openai.com/docs/api-reference
# - Bash manual: https://www.gnu.org/software/bash/manual/bash.html
# - Developed by Tana Alvarez and Tom O'Hara.
# - Regex cheatsheet:
# (?:regex) non-capturing group
# (?<!regex) negative lookbehind
# (?!regex) negative lookahead
# (?=regex) positive lookahead
# *? and +? non-greedy match
# - See https://www.rexegg.com/regex-quickstart.html for comprehensive cheatsheet.
#
# Tips:
# - **** Err on the side of special case code rather than general purpose. It is easier to
# isolate problems when decomposed for special case handling. In addition, one of the
# updates clobbered special case conversion of for-in loops with support for list iteration.
# - *** Always add new test cases when making significant changes.
# - ** Avoid regular string searching or replacements (e.g., "done" in line => re.search('\bdone\b', line)).
# - * Be liberal with variables (e.g., don't reuse the same variable for different purposes, and make names self-explanatory).
#
# TODO1:
# - Add a bunch of more sanity checks (e.g., via debug.assertion).
# - Clean up var_replace, which is a veritable achilles heel. There should be a separate version that
# converts partly converted bash code.
# TODO2:
# - Make sure pattern matching accounts for word boundaries (e.g., line.replace("FOR", ...) => re.sub(r"\bFOR\b", "...", line).
# - Also make sure strings excluded from matches (e.g., "# This for loop ..." =/=> "# This FOR loop").
# - Use bashlex and/or shlex for sanity checks on the regex parsing used here.
# TODO3:
# - Flag constructs not yet implemented:
# -- C-style for loops (maybe use cursed-for module--more for extending Python syntax).
# -- Bash arrays and hashes
# TODO4:
# - Add option for Black formatting
#
"""Bash snippet to Python conversion using heuristics for common constructs"""
# Standard modules
from collections import defaultdict
import json
import os
import re
import subprocess
# Installed modules
import click
import diskcache
import openai
# Local modules
import mezcla
from mezcla.bash_ast import BashAST
from mezcla import debug
from mezcla import glue_helpers as gh
from mezcla.main import Main, BRIEF_USAGE, INDENT
from mezcla.my_regex import my_re
from mezcla import system
from mezcla.text_utils import version_to_number
# Version check
debug.assertion(version_to_number("1.3.4") <= version_to_number(mezcla.__version__))

# Constants
TL = debug.TL
CODEX_PREFIX = "# codex: "
NEWLINE = "\n" # for use in regex's

# Environment constants
# note: These are module-level statements, so each one is read once at import time.
# The arguments to the system.getenv_xyz helpers are (variable name, default, description).
USER = system.getenv_text("USER", "unknown-user",
                          "User ID")
USE_MEZCLA = system.getenv_bool("USE_MEZCLA", (USER == "tomohara"),
                                "Whether to use mezcla support, such as for regex tracing")
INIT_FILE = system.getenv_value("INIT_FILE", None,
                                "File to source before running Bash command")
DISK_CACHE = system.getenv_value("DISK_CACHE", None,
                                 "Path to directory with disk cache")
OPENAI_API_KEY = system.getenv_value("OPENAI_API_KEY", None,
                                     "API key for OpenAI")
OPENAI_PAUSE = system.getenv_number("OPENAI_PAUSE", 1.0,
                                    "Delay in seconds after each OpenAI API call")
JUST_CODEX = system.getenv_bool("JUST_CODEX", False,
                                "Only do conversion via Codex")
# note: The default is coerced to a proper boolean, so that the text of the API key
# is never handed over as the "default value" of a boolean option.
USE_CODEX = system.getenv_bool("USE_CODEX", bool(OPENAI_API_KEY or JUST_CODEX),
                               "Whether to use codex support")
## TEMP: nuclear option to disable Codex
SKIP_CODEX = system.getenv_bool("SKIP_CODEX", (not USE_CODEX),
                                "Temporary hack to ensure codex not invoked")
INLINE_CODEX = system.getenv_bool("INLINE_CODEX", False,
                                  "Add inline comments with Codex conversion")
INCLUDE_ORIGINAL = system.getenv_bool("INCLUDE_ORIGINAL", False,
                                      "Include original code as comment")
SKIP_HEADER = system.getenv_bool("SKIP_HEADER", False,
                                 "Omit header with imports and run definition")
INCLUDE_HEADER = (not SKIP_HEADER)
SKIP_COMMENTS = system.getenv_bool("SKIP_COMMENTS", False,
                                   "Omit comments from generated code")
STRIP_INPUT = system.getenv_bool("STRIP_INPUT", False,
                                 "Strip blank lines and comments from input code")
OUTPUT_BASENAME = system.getenv_value("OUTPUT_BASENAME", None,
                                      "Basename for optional output files such as Codex json responses")
EXPORT_RESPONSE = system.getenv_bool("EXPORT_RESPONSE", (OUTPUT_BASENAME is not None),
                                     "Export Codex reponses to JSON")
MODEL_NAME = system.getenv_text("MODEL_NAME", "text-davinci-002",
                                description="Name of OpenAI model/engine")
TEMPERATURE = system.getenv_float("TEMPERATURE", 0.6,
                                  description="Next token probability")
PARA_SPLIT = system.getenv_bool("PARA_SPLIT", False,
                                "Split code in Perl-style paragraph mode")
BASHLEX_CHECKS = system.getenv_bool("BASHLEX_CHECKS", False,
                                    "Includee bashlex sanity checks over AST--abstract syntax tree")
SKIP_VAR_INIT = system.getenv_bool("SKIP_VAR_INIT", False,
                                   "Don't initialize variables")

# Global settings
# note: regular_re keeps a handle on the standard re module, because the name re
# gets rebound to mezcla's my_re wrapper whenever USE_MEZCLA is set.
regular_re = re
if USE_MEZCLA:
    re = my_re
# Boilerplate placed at the top of the generated Python code: imports plus the
# run() and arg() helpers that the converted statements call.
# note: The function bodies must stay indented, as this text is emitted verbatim.
PYTHON_HEADER = """# Output from bash2python.py
'''Python code from Bash snippet'''
import os
import sys
from mezcla.glue_helpers import run_via_bash
from mezcla import system
INIT_FILE = system.getenv_value("INIT_FILE", None,
"File to source before running Bash command")
def run(command, skip_print=False):
    '''Runs COMMAND and return output. Also, prints non-empty output unless SKIP_PRINT'''
    result = run_via_bash(command, init_file=INIT_FILE)
    if (not skip_print) and result:
        print(result)
    return result
def arg(arg_num):
    '''Returns Nth arg or ""'''
    return (sys.argv[arg_num] if (arg_num < len(sys.argv)) else "")
"""
# TODO1: rework run via class so that the commands can be batched
# TODO2: track down quirk with gh.run("echo ...") not outputting result--by design?
# TODO3: isolate stderr from stdout
#...............................................................................
def get_bash_var_hash():
    """Build a lookup table of the variable names reported by Bash
    Notes:
    - Runs the Bash "set" builtin via gh.run_via_bash (with INIT_FILE as the init file)
      and records the name from each NAME=value line of its output.
    - Returns a defaultdict(bool) mapping each such name to True.
    - This includes environment variables as well as regular ones.
    """
    # Sample listing from Bash set command:
    # ANACONDA_HOME=/
    # BASH=/bin/bash
    # ...
    # zless ()
    # {
    # zcat "$@" | $PAGER
    # }
    # TODO2: track down the source of the hangup when invoked via test-all.bash

    # Pull the variable names out of the set listing (function bodies won't match)
    set_output = gh.run_via_bash("set", init_file=INIT_FILE,
                                 trace_level=7)
    var_hash = defaultdict(bool)
    for entry in set_output.splitlines():
        if not my_re.search(r"^([A-Za-z0-9_]+)=", entry):
            continue
        var_hash[my_re.group(1)] = True

    # Sanity checks (detailed debugging only): each environment variable should be listed
    if debug.detailed_debugging():
        env_vars = sorted(os.environ)
        bash_vars = sorted(var_hash)
        debug.trace_expr(5, bash_vars, max_len=4096)
        debug.trace_expr(5, env_vars, max_len=4096)
        debug.assertion(not system.difference(env_vars, bash_vars))
        debug.assertion(sorted(system.intersection(env_vars, bash_vars)) == env_vars)
    debug.trace(6, f"get_bash_var_hash() => {var_hash}")
    return var_hash
def embedded_in_quoted_string(subtext, text):
    """Whether SUBTEXT occurs inside a single- or double-quoted stretch of TEXT
    Note: returns the result of the regex search (not a strict boolean), and
    the search is done via my_re so its match state gets updated"""
    # ex: embedded_in_quoted_string("#", "echo '#-sign'")
    # ex: not embedded_in_quoted_string("#", 'let x++ # incr x w/ init')
    # ex: embedded_in_quoted_string("$fu", "echo '$fu'")
    escaped = my_re.escape(subtext)
    # Single-quoted strings are checked first; double-quoted ones only if that fails
    result = my_re.search(fr"'[^']*{escaped}[^']*'", text)
    if not result:
        result = my_re.search(rf'"[^"]*{escaped}[^"]*"', text)
    debug.trace(7, f"embedded_in_quoted_string{(subtext, text)} => {result}")
    return result
# Most recent line shown by trace_line (used to suppress repeated traces)
last_line = None
#
def trace_line(line, label=None):
    """Trace LINE (at debug level 6) unless same as the line traced previously
    Note: LABEL prefixes the output, defaulting to "ln" """
    global last_line
    if line == last_line:
        return
    prefix = ("ln" if (label is None) else label)
    debug.trace(6, f"{prefix}=\t{line!r}")
    last_line = line
## TEST
def remove_extra_quotes(line, label=None):
    """Remove extraneous quote characters from LINE
    Notes:
    - Each pass locates four quotes of the same kind and drops the inner two.
    - This repeats as long as such a match exists, with double quotes tried before
      single quotes on each pass.
    - The result is traced via trace_line using LABEL, and then returned.
    """
    # ex: remove_extra_quotes('"a"b"c"') => '"abc"'
    # NOTE(review): an input with six double quotes, such as
    # 'f"print("{os.getenv("HOME")}")"', gets reduced twice (down to two quotes).
    ## TODO
    ## line = my_re.sub(r'("[^"]*)"([^"]*)"([^"]*")', line,
    ## r"\1\2\3")
    # note: The single-quote pattern used to begin with a stray quote ahead of the ^ anchor,
    # so it could never match; it now mirrors the double-quote pattern.
    while (my_re.search(r'^(.*)("[^"]*)"([^"]*)"([^"]*")(.*)$', line) or
           my_re.search(r"^(.*)('[^']*)'([^']*)'([^']*')(.*)$", line)):
        # The inner pair of quotes lies outside the groups, so joining the groups drops it
        line = "".join(my_re.groups())
    trace_line(line, label=label)
    return line
def fix_embedded_quotes(line, label=None, reset_on_spaces=False):
    """Fix quote characters embedded in other quote characters in LINE
    Notes:
    - The first quote seen is treated as the outer opening quote and the most recent
      later quote as the outer closing one.
    - Embedded double quotes are escaped with a backslash unless already escaped.
    - The outer quotes are changed to double quotes, dropping any escape before them.
    - If RESET_ON_SPACES then the embedding status is reset whenever a space is encountered.
    - A quote without a partner (e.g., apostrophe, or a string cut by the space reset)
      only has that single quote converted: no other character is modified.
    - The result is traced via trace_line using LABEL, and then returned.
    """
    # EX: fix_embedded_quotes('print(f"{os.getenv("HOME")}")') => 'print(f"{os.getenv(\\"HOME\\")}")'
    # EX: fix_embedded_quotes(' "abc"def"ghi" ') => ' "abc\"def\"ghi" '
    # EX: fix_embedded_quotes(" 'abc'def\"ghi' ") => ' "abc'def\"ghi" '
    # EX: fix_embedded_quotes(' "abc" "def" ', reset_on_spaces=False) => (' "abc\" \"def" ')
    # EX: fix_embedded_quotes(' "abc" "def" ', reset_on_spaces=True) => (' "abc" "def" ')
    quote_num = 0
    new_line_buffer = []
    # Buffer positions of the outer quotes (-1 when not yet seen)
    first_quote = last_quote = -1

    def adjust_quotes():
        """Switch to outer double quote if embedded; reset state"""
        nonlocal first_quote, last_quote, quote_num
        new_line_buffer[first_quote] = '"'
        # note: The position checks keep an index of -1 from wrapping around to the end
        # of the buffer, which previously clobbered the last character.
        if (first_quote > 0) and (new_line_buffer[first_quote - 1] == '\\'):
            new_line_buffer[first_quote - 1] = ""
        if last_quote >= 0:
            new_line_buffer[last_quote] = '"'
            if new_line_buffer[last_quote - 1] == '\\':
                new_line_buffer[last_quote - 1] = ""
        first_quote = last_quote = -1
        quote_num = 0

    # Escape embedded double quotes, keeping track of outer quote positions
    for start, char in enumerate(line):
        if char in ('"', "'"):
            quote_num += 1
            if (quote_num == 1):
                first_quote = len(new_line_buffer)
            else:
                if ((start > 0) and (char == '"') and (line[start - 1] != "\\")):
                    new_line_buffer.append("\\")
                last_quote = len(new_line_buffer)
            debug.trace_expr(8, start, line[start], quote_num, first_quote, last_quote)
        new_line_buffer.append(char)

        # Reset quote status on spaces (n.b., removes final escape)
        if ((char == " ") and (first_quote >= 0) and reset_on_spaces):
            adjust_quotes()

    # Change outer quote to double if any embedded (n.b., removes final escape)
    if (first_quote >= 0):
        adjust_quotes()
    new_line = "".join(new_line_buffer)
    trace_line(new_line, label=label)
    debug.trace(7, f"fix_embedded_quotes({line!r}) => {new_line!r}")
    return new_line
def has_multiple_statements(line):
    """Checks whether LINE consists of two or more semicolon-separated statements
    Returns tuple with the text before and after the first such semicolon, or None.
    Note: with BASHLEX_CHECKS, the result is cross-checked against the BashAST dump
    """
    # EX: has_multiple_statements("echo 1; echo 2; echo 3") => ("echo 1", " echo 2; echo 3")
    result = None
    if my_re.search(r"^(.*?(?:\s*\S+\s*));((?:\s*\S+\s*).*)$", line):
        # Ignore the split when the line ends in a quoted string that itself ends in a semicolon
        # note: The standard re module is used here, so that the groups for my_re are retained.
        # NOTE(review): \1 within [...] is an octal escape (i.e., chr(1)) not a backreference,
        # so [^\1]* matches just about anything.
        semicolon_in_final_quote = regular_re.search(r"""(['"])[^\1]*;\1$""", line)
        if not semicolon_in_final_quote:
            result = tuple(my_re.groups())
    if BASHLEX_CHECKS:
        ast_pp = BashAST(line).dump()
        debug.assertion(bool(result) == ast_pp.startswith("ListNode"))
    debug.trace(7, f"has_multiple_statements({line!r}) => {result}")
    return result
def split_statements(line):
    """Break LINE up into a list of its statements, each stripped of outer whitespace
    Note: the splitting is done one statement at a time via has_multiple_statements"""
    # EX: split_statements("echo 1; echo 2; echo 3") => ["echo 1", "echo 2", "echo 3"]
    # TODO3: account for braces
    statements = []
    pending = line
    while pending:
        parts = has_multiple_statements(pending)
        if parts:
            # Peel off the first statement and continue with the rest
            (current, pending) = parts
        else:
            # Nothing left to split: the remaining text is the final statement
            (current, pending) = (pending, "")
        statements.append(current.strip())
    debug.trace(6, f"split_statements({line!r}) => {statements!r}")
    return statements
#................................................................................
class Bash2Python:
"""Returns a Python-like file based on Bash input"""
KEYWORD_MAP = {
"function": "def",
"true": "pass"}
LOOP_CONTROL = ["break", "continue"]
def __init__(self, bash, skip_comments=None, strip_input=None, segment_prefix=None, segment_divider=None, skip_var_init=None):
    """Class initializer: records the BASH command(s) to be converted
    Notes:
    - SKIP_COMMENTS omits the conversion annotations and STRIP_INPUT drops
      comments and blank lines from the input.
    - SEGMENT_PREFIX is used for filtering segment comments, and SEGMENT_DIVIDER
      is the separator for command iteration (by default, newline or blank line if PARA_SPLIT).
    - SKIP_VAR_INIT disables variable initialization.
    - Options left as None default to the corresponding environment-based constant.
    """
    ## TODO3: skip_comments=>skip_annotations
    debug.trace(6, "Bash2Python.__init__()")
    self.cmd = bash
    self.bash_var_hash = get_bash_var_hash()
    # Names of variables encountered during conversion
    self.string_variables = []
    self.numeric_variables = []
    # Options, with fallback to environment-based defaults
    self.skip_comments = (SKIP_COMMENTS if (skip_comments is None) else skip_comments)
    self.strip_input = (STRIP_INPUT if (strip_input is None) else strip_input)
    # note: PARA_SPLIT emulates input done by bash2python_diff.py
    default_divider = ("\n\n" if PARA_SPLIT else "\n")
    self.segment_divider = (default_divider if (segment_divider is None) else segment_divider)
    self.segment_prefix = segment_prefix
    self.skip_var_init = (SKIP_VAR_INIT if (skip_var_init is None) else skip_var_init)
    # Conversion state
    self.global_ast = None
    self.cache = None
    self.codex_count = 0
    self.line_num = 0
    if DISK_CACHE:
        # notes:
        # - Uses JSON for serialization due to issues with pickling using tuples. See
        # https://grantjenks.com/docs/diskcache/tutorial.html
        # - The cache itself is still stored via SQLite3.
        # - Changing disk type without regenerating cache can lead to subtle errors.
        cache_options = {
            "disk": diskcache.core.JSONDisk,
            "disk_compress_level": 0,       # no compression
            "cull_limit": 0,                # no automatic pruning
        }
        self.cache = diskcache.Cache(DISK_CACHE, **cache_options)
    debug.trace_object(5, self, label=f"{self.__class__.__name__} instance")
def contains_embedded_for(self, bash_snippet):
    """Simple method to get perl regex detection of embedded for loops
    Notes:
    - Comments and quoted strings are first removed from BASH_SNIPPET.
    - Returns the output of the perl script: a warning line if a second "for" occurs
      before the "done" for the first one, otherwise an empty string.
    - The snippet is fed to perl via standard input rather than being spliced into
      a shell command, so quotes or shell metacharacters in it cannot break (or
      subvert) the invocation.
    - An empty string is also returned if perl cannot be run.
    """
    # NOTE: obsolete
    debug.assertion(not system.file_exists(bash_snippet))
    bash_snippet = re.sub(r"#.*$", "", bash_snippet, flags=re.MULTILINE) # Remove comments
    bash_snippet = re.sub(r"'([^'\\]*(?:\\.[^'\\]*)*)'", "", bash_snippet) # Remove single-quoted strings
    bash_snippet = re.sub(r'"([^"\\]*(?:\\.[^"\\]*)*)"', "", bash_snippet) # Remove double-quoted strings
    # note: The perl script lowercases the first line, abbreviates each done as D,
    # and then looks for two for's without an intervening D.
    perl_script = r's/.*/\L$&/; s/\bdone\b/D/g; print("Warning: embedded for: $&\n") if (/\bfor\b[^D]*\bfor\b/m);'
    try:
        # note: the trailing newline matches what the echo-based pipeline used to supply
        completed = subprocess.run(["perl", "-0777", "-ne", perl_script],
                                   input=(bash_snippet + "\n").encode(),
                                   stdout=subprocess.PIPE, check=False)
        result = completed.stdout.decode()
    except OSError:
        # e.g., perl not installed
        system.print_exception_info("perl invocation in contains_embedded_for")
        result = ""
    debug.trace(6, f"contains_embedded_for({bash_snippet!r})\n\tself={self} => {result}")
    return result
def map_keyword(self, line):
    """Convert the leading keyword of LINE using KEYWORD_MAP
    Note: the indentation and the rest of the line are preserved, and the line
    is returned as is when the first word has no mapping"""
    converted = line
    if my_re.search(r"^(\s*)(\w+)(.*)$", line):
        (indent, keyword, remainder) = my_re.groups()
        python_keyword = self.KEYWORD_MAP.get(keyword, keyword)
        converted = indent + python_keyword + remainder
    debug.trace(5, f"map_keyword({line!r}) => {converted!r}")
    return converted
def codex_convert(self, line):
    """Translate Bash LINE into Python via OpenAI Codex
    Note: tracing wrapper around codex_convert_aux, which does the actual work"""
    converted = self.codex_convert_aux(line)
    debug.trace(6, f"codex_convert({line!r}) => {converted!r}")
    return converted
def codex_convert_aux(self, line):
    """Helper to codex_convert
    Notes:
    - Leading segment comments (i.e., lines starting with self.segment_prefix) are moved
      from LINE to the start of the result; any others are removed from LINE.
    - Returns the Codex completion for LINE with each line prefixed by CODEX_PREFIX.
      If conversion is not possible (e.g., SKIP_CODEX, blank input or no API key),
      a prefixed explanatory comment is returned instead.
    - Responses are looked up in and added to the disk cache (if enabled), and
      are optionally exported to JSON files (see EXPORT_RESPONSE).
    - Exceptions during the request are reported via system.print_exception_info,
      in which case just the segment comments (if any) are returned.
    """
    debug.trace(6, f"codex_convert_aux({line!r})")

    # Strip segment comments (see bash2python_diff.py)
    # ex: "#s# Segment 1\nfu=3\n" => "fu=3\n"
    comment = ""
    if self.segment_prefix:
        debug.assertion(self.segment_prefix.startswith("#"))
        # note: The outer group spans the entire run of leading segment comments.
        # (A group that is itself repeated would only retain the last comment line.)
        while my_re.search(fr"^((?:{self.segment_prefix}[^{NEWLINE}]*{NEWLINE})+)(.*)", line,
                           flags=my_re.DOTALL):
            comment += my_re.group(1)
            line = my_re.group(2)
            debug.trace_expr(6, comment, line)
        if my_re.search(fr"^({self.segment_prefix}[^{NEWLINE}]*{NEWLINE})", line, flags=my_re.MULTILINE):
            debug.trace(5, f"FYI: Stripping residual segment comments in line: {my_re.group(1)!r}")
            line = my_re.sub(fr"^{self.segment_prefix}[^{NEWLINE}]*{NEWLINE}", "", line, flags=my_re.MULTILINE)
            debug.trace_expr(6, line)
    # note: normalize for sake of caching
    line = line.rstrip("\n") + "\n"

    # Note: Non-code uses both prefix and comment indicator (n.b., former stripped in convert_snippet below)
    if SKIP_CODEX:
        return CODEX_PREFIX + "# internal warning (SKIP_CODEX set)"
    debug.assertion(USE_CODEX)
    if not USE_CODEX:
        return CODEX_PREFIX + "# internal error (USE_CODEX not set)"
    if not line.strip():
        return CODEX_PREFIX + "# blank line"

    # Make sure OpenAI key set
    # note: Apply for an API Key at https://beta.openai.com
    debug.assertion(OPENAI_API_KEY or DISK_CACHE)
    if (not (((OPENAI_API_KEY or "").strip()) or DISK_CACHE)):
        return CODEX_PREFIX + "# OPENAI_API_KEY not set"
    openai.api_key = OPENAI_API_KEY

    # Define the code generation prompt
    TARGET_LANG = system.getenv_text("TARGET_LANG", "Python",
                                     "Target language for Codex")
    prompt = f"Convert this Bash snippet to {TARGET_LANG}:\n{line}"
    debug.trace_expr(6, prompt)

    # Call the Codex API
    try:
        # Optionally check cache
        params = {
            "engine": MODEL_NAME,
            "prompt": prompt,
            "max_tokens": 3 * len(line.split()),
            "n": 1,
            "stop": None,
            "temperature": TEMPERATURE,
        }
        # note: the parameter values serve as the cache key
        params_tuple = tuple(params.values())
        response = None
        # Check if the response is already in the cache before calling the API
        if self.cache is not None:
            debug.trace(5, "Checking cached Codex result")
            response = self.cache.get(params_tuple)

        # Submit request to OpenAI
        if response is None:
            # If the response is not in the cache, make the API call and store the result in the cache
            debug.trace(5, "Submitting Codex request")
            response = openai.Completion.create(**params)
            if self.cache is not None:
                self.cache[params_tuple] = response
            # note: pause only applies after an actual API call (see OPENAI_PAUSE)
            system.sleep(OPENAI_PAUSE)
        if EXPORT_RESPONSE:
            self.codex_count += 1
            json_filename = f"{OUTPUT_BASENAME}-codex-{self.codex_count}.json"
            response["prompt"] = prompt
            system.write_file(json_filename, json.dumps(response, default=str))
        debug.trace_expr(5, response, max_len=4096)

        # Extract text for first choice and convert into single-line comment
        comment += CODEX_PREFIX + response["choices"][0]["text"].replace("\n", "\n" + CODEX_PREFIX).rstrip()
    except openai.APIError: # Catch OpenAI API errors
        system.print_exception_info("Error communicating with OpenAI API in codex_convert")
    except IOError: # Catch file I/O errors
        system.print_exception_info("File I/O error in codex_convert")
    except Exception: # Catch all other exceptions
        # note: KeyboardInterrupt and SystemExit are deliberately not trapped here
        system.print_exception_info("Unexpected error in codex_convert")
    return comment
def var_replace(self, line, other_vars=None, indent=None, is_loop=False,
converted_statement=False):
"""Replaces bash variables with python variables and also derive run call for LINE
Notes:
- Optional converts OTHER_VARS in line.
- Use INDENT to overide the line space indentation.
"""
# TODO1: return (was-converted, python, remainder), as with process_xyz functions
def derive_indentation(line):
"""Derives indentation"""
debug.trace(7, f"in derive_indentation({line!r})")
nonlocal indent
if my_re.search(r"^(\s+)(.*)", line):
if not indent:
indent = my_re.group(1)
line = my_re.group(2)
trace_line(line, "ln2")
return line
def handle_arithmetic_expansion(line):
"""Returns corrected arithmetic expansion"""
# - ex: $(( x + y )) => f"{x + y}"
debug.trace(7, f"in handle_arithmetic_expansion({line!r})")
while my_re.search(r"(.*)\$\(\( *(.*) *\)\)(.*)", line):
debug.trace(4, "processing arithmetic expansion")
(pre, expression, post) = my_re.groups()
# Note: converts expression to f-string, omitting (user) quotes as arithmetic
line = pre.strip("'\"") + "{" + expression + "}" + post.strip("'\"")
trace_line(line, "ln2a")
nonlocal has_python_var
has_python_var = True
# Make note of variable references
self.numeric_variables += my_re.findall(r"\b[a-z][a-z0-9]*", expression, flags=my_re.IGNORECASE)
trace_line(line, "ln3")
return line
def replace_var_references(line):
"""Processes variables (e.g., $x => {x})"""
debug.trace(7, f"in replace_var_references({line!r})")
# Inicialize variables
nonlocal bash_commands
bash_commands = re.findall(r'\$\(.*\)', line) # finds all bash commands
nonlocal has_python_var, has_default
bash_vars_with_defaults = re.findall(r'\$\{\w+:-[^\}]+\}', line)
bash_vars = re.findall(r"\$[A-Za-z][A-Za-z0-9_]*", line)
bash_commmon_special_vars = re.findall(r'\$[\$\?\@\*0-9]', line)
if other_vars:
bash_vars += other_vars
has_default = False
# Iterate through the list of variables to convert Bash variable syntax to Python variable syntax
for var in system.unique_items(bash_vars + bash_vars_with_defaults + bash_commmon_special_vars):
debug.assertion(var.startswith("$"))
converted = False
# If the variable is in the bash_vars_with_defaults list
if var in bash_vars_with_defaults:
# Extract the variable name
var_name = re.search(r'\$\{(\w+):-[^\}]+\}', var).group(1)
# Extract the default value
var_default = re.search(r'\$\{\w+:-(.*)\}', var).group(1)
# If the variable name is uppercase (assumed to be an environment variable), replace it with Python os.getenv syntax
if my_re.search(r"^[A-Z0-9_]+$", var_name):
replacement = f'os.getenv("{var_name}", "{var_default}")'
else:
# Replace the variable with Python syntax and a check for None
replacement = f"{{{var_name}}} if {{{var_name}}} is not None else '{var_default}'"
# Replace the Bash variable in the line with the Python-style variable
var_regex = my_re.escape(var)
line = my_re.sub(fr"{var_regex}", replacement, line)
trace_line(line, "ln3a")
has_python_var = True
has_default = True
converted = True
elif var in bash_commmon_special_vars:
# If the variable is a commonly used special variable, replace it with a function run call
# exs: '[ $? -eq 0 ]' => 'run("echo $?") -gt 0'; "$1"' => "{sys.argv[1]}"
if (is_loop and var in ["$?"]):
old_line = line
var_escape = my_re.escape(var)
line = my_re.sub(fr"(?<!run\(\"echo ){var_escape}",
f"int(run(\"echo {var}\"))", line)
debug.trace(4, "Bash special variable in loop")
trace_line(line, "ln3b")
debug.assertion(line != old_line)
else:
# TODO: keep track of context (e.g., function vs. script); use lookup table
replacement = var[1:]
if my_re.search(r"\$([0-9])", var):
replacement = "arg(" + my_re.group(1) + ")"
elif my_re.search(r"\$\$", var):
replacement = 'os.getpid()'
elif my_re.search(r"\$\*", var):
replacement = '" ".join(sys.argv)'
elif my_re.search(r"\$\@", var):
replacement = '" ".join(f"{v}" for v in sys.argv)'
line = line.replace(var, f"{{{replacement}}}")
debug.trace(4, "Bash special variable not in loop")
trace_line(line, "ln3b2")
converted = True
has_python_var = True
elif ((var[1:] in self.bash_var_hash) and (var not in self.string_variables)):
# If the variable is Bash-defined variable or already processed, exclude it from conversion
# TODO3: See whether this clauser is still needed
debug.trace(4, f"Excluding Bash-defined or known var {var})")
else:
debug.trace(7, f"Regular var {var}")
# If the variable wasn't converted yet
if (not converted):
# If it's a loop, drop the $-prefix from the variable
python_var = var[1:]
if (is_loop and (not embedded_in_quoted_string(var, line))):
line = my_re.sub(fr"\${python_var}\b", python_var, line)
trace_line(line, "ln3c")
# Treat capitalized variable as from environment
elif my_re.search(r"^[A-Z]+$", python_var):
# NOTE: initialized via INIT_VAR check
line = my_re.sub(fr"\${python_var}\b", f'{{{python_var}}}', line)
trace_line(line, "ln3ca")
has_python_var = True
# Otherwise, replace the Bash variable in the line with the Python-style variable
else:
line = my_re.sub(fr"\${python_var}\b", "{" + python_var + "}", line)
trace_line(line, "ln3d")
has_python_var = True
# Make note of variable references
self.string_variables.append(python_var)
# Remove extraneous embedded quotes
## BAD: line = remove_extra_quotes(line, "ln3e")
line = fix_embedded_quotes(line, "ln3e", reset_on_spaces=True)
# Trace output
trace_line(line, "ln4")
return line
def process_conditions_and_loops(line):
"""Adhoc processing for loop lines"""
debug.trace(7, f"in process_conditions_and_loops({line!r})")
## BAD: nonlocal is_loop
# Early exit for loops
if is_loop:
# Make sure true and false capitalized
# ex: "true" => "True" when used as test (e.g., "if true; ..." => "if True: ...")
if my_re.search(r"^(\s*)(true|false)(\s*)$", line):
(pre, keyword, post) = my_re.groups()
line = pre + keyword.capitalize() + post
debug.trace(5, f"Capitalized {keyword} in {line!r}")
# note: static is a hack to account for variable assignments
if var_pos == 1:
line = f"{static}f{line}"
debug.trace(5, f"Reinstated pre-static {static!r} in line: {line!r}")
elif var_pos == 0:
line = f"f{line}{static}"
debug.trace(5, f"Reinstated post-static {static!r} in line: {line!r}")
elif my_re.search("^([^\'\"]*[^f])([\'\"].*{.*}.*[\'\"])(.*)$", line):
(pre, quoted_string, post) = my_re.groups()
line = pre + f"f{quoted_string}" + post
debug.trace(5, f"Adding f-string prefix to loop line string: {line!r}")
## TODO3 (use strings for all variables and expressions)
## # Convert unqouted numbers to strings
## while my_re.search(r"^(.* )([0-9]+)( .*)$", line):
## pre, num, post = my_re.groups()
## debug.assertion(not embedded_in_quoted_string("num", line))
## line = pre + "'" + num + "'" + post
## debug.trace(5, f"quoting number in loop test: {line!r}")
# note: gets bare variable references (excluding tokens in tests like -eq)
# HACK: excludes python tokens for already processed expression
variable_refs = my_re.findall(r"[^\w-]([a-z][a-z0-9]*)", line, flags=my_re.IGNORECASE)
self.string_variables += system.difference(variable_refs, "os getenv sys argv arg int run echo".split())
debug.trace(5, f"[early exit 3a; process_conditions_and_loops] {var_replace_call} => ({line!r}, '')")
return line, ""
# Isolate comment
inline_comment = ""
if my_re.search(r"^([^#]+)(\s*#.*)", line):
line, inline_comment = my_re.groups()
debug.assertion(not embedded_in_quoted_string("#", line))
debug.assertion("\n" not in inline_comment)
line = line.strip()
trace_line(line, "ln4.3")
if my_re.search(r"^(.*)(#.*)$", line):
line, inline_comment = my_re.groups()
debug.assertion(not embedded_in_quoted_string("#", line))
debug.assertion("\n" not in inline_comment)
line = line.strip()
trace_line(line, "ln4.4")
# Do special processing for single statement lines
is_multiline_statement = ("\n" in line)
if is_multiline_statement:
system.print_stderr("Warning: unexpected multi-line statement in var_replace")
debug.trace(5, f"[early exit 3b; process_conditions_and_loops] {var_replace_call} => ({line!r}, {inline_comment!r})")
return line, inline_comment
if has_multiple_statements(line):
debug.assertion(not embedded_in_quoted_string(";", line))
inline_comment += " # Warning: review conversion of compound statement"
if not line.strip():
debug.trace(6, "Ignoring blank line")
pass
elif my_re.search(r"^\s*(\w+)\s*=", line):
debug.trace(6, "Ignoring assignment")
pass
elif my_re.search(r"^\s*#", line):
debug.trace(6, "Ignoring comment")
pass
## TEST
## elif not has_python_var:
## trace_line(line, "ln4.75")
## pass
else:
## TEST
## # Make sure line has outer single quotes, with any internal ones quoted
## if "'" in line[1:-1]:
## # note: Bash $'...' strings allows for escaped single quotes unlike '...'
## # The regex (?<...) is for negative lookbehind
## # OLD: line = re.sub(r"[^\\]'", r"\\\\'", line)
## line = re.sub(r"(?<!\\)(')", r"\\\\\1", line)
## line = f"'{line!r}'"
##
# Make sure line has outer double quotes, with any internal ones quoted
if '"' in line:
# note: Bash $'...' strings allows for escaped single quotes unlike '...'
line = re.sub(r'(?<!\\)(")', r"\\\1", line)
line = f'"{line}"'
trace_line(line, "ln5")
debug.assertion(re.search(r"""^(['"]).*\1$""", line))
#
# Use f-strings if local Python variable to be resolved
if has_python_var:
line = "f" + line
debug.assertion(re.search(r"""^f?(['"]).*\1$""", line))
trace_line(line, "ln5.5")
return line, inline_comment
def derive_run_invocation(line):
"""Creates run() for necessary lines"""
debug.trace(7, f"in derive_run_invocation({line!r})")
nonlocal has_assignment
# Derive run invocation, with output omitted for variable assignments
## BAD: bash_commands = re.findall(r'\$\(.*\)', line) # finds all bash commands
nonlocal bash_commands
has_assignment = (variable != "")
comment = ""
trace_line(line, "ln5.55")
if (has_assignment and ("$" not in line)):
# Remove outer quotes (e.g., '"my dog"' => "my dog" and '123' => 123)
line = re.sub(r"^'(.*)'$", r"\1", line)
trace_line(line, "ln5.65")
elif not line.strip():
debug.trace(6, "ignoring empty line")
elif bash_commands:
trace_line(line, "ln5.85")
# Note: uses echo command with $(...) unless line already uses one
if (not re.search(r"^f?'echo ", line)):
line = re.sub("'", "'echo ", line, count=1)
trace_line(line, "ln5.86")
if INLINE_CODEX:
comment = self.codex_convert(line)
line = f"run({line}, skip_print={has_assignment})"
trace_line(line, "ln5.89")
elif converted_statement:
debug.trace(6, "Ignoring converted")
elif has_default:
debug.trace(6, "Ignoring has_default")
elif my_re.search(r"^\s*#", line):
debug.trace(7, "Ignoring comment")
elif INLINE_CODEX:
comment = self.codex_convert(line)
line = f"run({line})"
trace_line(line, "ln5.95")
else:
# Run shell over entire line
line = f"run({line})"
trace_line(line, "ln5.92")
debug.trace_expr(3, line, comment)
# Add variable assignment and indentation
try:
if var_pos == 1:
line = f"{static}{line}"
trace_line(line, "ln5.93")
if var_pos == 0:
line = f"{line}{static}"
trace_line(line, "ln5.94")
except:
pass
if indent:
line = indent + line
trace_line(line, "ln6")
return line
def special_case_fixups(line, inline_comment):
    """Adhoc fixups (e.g., inner to outer f-string)

    Applied in order:
    1. a line still starting with echo is wrapped as run(f'...'), with inner
       f-string prefixes dropped
    2. two or more inner f"{...}" strings are merged into one trailing f-string
    3. backslashes are removed from lines that look like they hold an f-string
    4. INLINE_COMMENT is appended unless self.skip_comments is set

    Returns the revised line.
    """
    debug.trace(7, f"in special_case_fixups({line!r}, {inline_comment!r})")

    # Check for echo statement not converted
    # ex: 'echo "a + b = " f"{a + b}"' => 'run(f\'echo "a + b = " "{a + b}"\')'
    if my_re.search(r"^\s*echo .*", line):
        debug.trace(5, "Special case unconverted-echo fixup")
        # The inner f prefixes are dropped because the whole command becomes one f-string
        line = my_re.sub(r'\bf"', '"', line)
        line = f"run(f{line!r})"
        trace_line(line, "ln6a")

    # Convert from inner f-strings to a single outer one
    # NOTE(review): the captured expressions are re-emitted without their
    # surrounding braces (e.g., f"{a}" f"{b}" yields f"a b"); confirm this is intended.
    matches = re.findall(r'f"{(.*?)}"', line)
    if len(matches) >= 2:
        debug.trace(4, f"Combining f-strings in {line!r}")
        contents = ' '.join(matches)
        line = re.sub(r'f"{.*?}"', '', line)
        line += f'f"{contents}"'
        trace_line(line, "ln6b")

    # Remove backslash characters in f-strings [maldito Python limitation]
    # ex: "f'a\"b'" => "f'a'b"
    ## TODO2: line = my_re.sub(r'(f)(?:[\"\'])) \ ([^\1]+\1)', r'\1\2\3', line, flags=my_re.VERBOSE)
    ## TODO3:
    ## if ("\\" in line):
    ##     line = my_re.sub(r'(f)([\"\'].*?) \\" (.*[\"\'])', r'\1\2' + "'" + r'\3', line, flags=my_re.VERBOSE)
    ##     line = my_re.sub(r'(f)([\"\'].*?) \\ (.*[\"\'])', r'\1\2\3', line, flags=my_re.VERBOSE)
    ##     trace_line(line, "ln6c")
    # Each pass turns one escaped double quote into a single quote; when a pass
    # changes nothing, all backslashes are stripped, which ends the loop.
    while (my_re.search(r"\bf\b.*\\", line)):
        save_line = line
        if my_re.search(r'^(.*f)([\"\'].*?) \\" (.*[\"\'].*)$', line, flags=my_re.VERBOSE):
            line = (my_re.group(1) + my_re.group(2) + "'" + my_re.group(3))
        if (save_line == line):
            # note: plain message, so traced via trace rather than trace_expr
            debug.trace(4, "Warning: nucluear option on f-string escape fix")
            line = line.replace("\\", "")
    trace_line(line, "ln6c")

    # Restore comment
    if (not self.skip_comments):
        line = (line + inline_comment)
    trace_line(line, "ln7")
    return line
# Main function body for var_replace
in_line = line
if indent is None:
indent = ""
var_pos = ""
variable = ""
static = ""
has_python_var = ""
has_default = ""
has_assignment = ""
var_replace_call = f"var_replace({in_line!r}, othvar={other_vars!r} ind={indent!r})"
# TODO2: straighten out spaghetti references
bash_commands = None
# Make sure the line has no compound statements
if (BASHLEX_CHECKS and line.strip() and not converted_statement):
ast_pp = BashAST(line).dump()
debug.assertion(not ast_pp.startswith("CompoundNode"))
# Check for assignment
# TODO3: document what's going on!
## TODO2: if is_loop:
if my_re.search(r"^\s*(\w+)=(.*)", line):
var, value = my_re.groups()
debug.assertion(not var[0].isdigit())
debug.assertion(not value.startswith("("))
line = line.replace("[", "").replace("]", "")
## TODO2:
## if (system.is_numeric(value)):
## system.numeric_variables.append(var)
## else:
## system.numeric_variables.append(var)
## TODO2: *** Don't mix-n-match variable types with the same name: very confusing! ***
## TODO3: pre_assign, post_assign = line.split(" = ", maxsplit=1)
if " = " in line:
line = line.split(" = ")
else:
line = line.split("=")
trace_line(line, "ln0a")
if "$" in line[0] and "$" in line[1]:
line = " = ".join(line)
trace_line(line, "ln0b")
elif "$" in line[0]:
variable = line[0]
static = " = " + line[1]
var_pos = 0
elif "$" in line[1]:
variable = line[1]
static = line[0] + " = "
var_pos = 1
elif not "$" in line[0] and not "$" in line[1]:
result = " = ".join(line)
converted = True
debug.trace(5, f"[early exit 1] {var_replace_call} => ({converted}, {result!r}, '')")
return (converted, result, "")
if variable != "":
# TODO2: add constraint to avoid clobbering entire line
line = variable
trace_line(line, "ln0c")
# Check for assignment within expression statement; ex: (( z = x + y ))
if my_re.search(r"^\s+\(\(\s*(\w+)\s*=.*\)\)", line):
variable = my_re.group(1)
# HACK: check for Python assignment if already converted
## TODO2:
## if converted_statement and my_re.search(r"^\s*(\w+)\s*=", line):
## variable = my_re.group(1)
if my_re.search(r"^\s*(\w+)\s*=", line):
variable = my_re.group(1)
converted_statement = True
debug.trace_expr(5, static, var_pos, variable, converted_statement)
trace_line(line, "ln1")
line = derive_indentation(line)
line = handle_arithmetic_expansion(line)
line = replace_var_references(line)
line, inline_comment = process_conditions_and_loops(line)
if not is_loop:
line = derive_run_invocation(line)
else:
debug.assertion(not inline_comment.strip())
line = special_case_fixups(line, inline_comment)
converted = True
debug.trace(5, f"{var_replace_call} => ({converted}, {line!r}, '')\n\tself={self}")
return (converted, line, "")
def operators(self, line):
"""Returns line with operators converted to Python equivalents
Note: Assumes the line contains a single test expression (in isolation).
"""
# Dictionary with Bash operators and their Python equivalents
# Split into binary and unary operators
binary_operators = {
# TODO2: drop space around '=' and '!=' (and use regex boundary matching)
" = ": " == ",
" != ": " != ",
"-eq ": " == ",
"-ne ": " != ",
"-gt ": " > ",
"-ge ": " >= ",
"-lt ": " < ",
"-le ": " <= ",
"&&": " and ",
r"\|\|": " or ", # NOTE: need to escape | for Python
}
unary_operators = {
" ! ": " not ",
"-z ": " '' == ",
"-n ": " '' != ",
}
file_operators = {
"-d": ("os.path.isdir({})"),
"-f": ("os.path.isfile({})"),
"-e": ("os.path.exists({})"),
"-L": ("os.path.islink({})"),
"-r": ("os.access({}, os.R_OK)"),
"-w": ("os.access({}, os.W_OK)"),
"-x": ("os.access({}, os.X_OK)"),
"-s": ("os.path.getsize({}) > 0"),
}
in_line = line
# Combine binary and unary operators
operators = {**binary_operators, **unary_operators}
# Iterate over operators and replace them with Python equivalents
# TODO3: account for token boundaries (e.g., make sure [ or ] not in string)
for bash_operator, python_equivalent in operators.items():
## OLD:
line = re.sub(fr"(\S*) *{bash_operator} *(\S*)", fr"\1{python_equivalent}\2", line)
## BAD: line = re.sub(fr"(?<!\S) *{bash_operator} *(?!\S)", python_equivalent, line)
## NEW: line = re.sub(fr"(?<!\S) +{bash_operator.strip()} +(?!\S)", python_equivalent, line)
trace_line(line, "ln8a")
# Likewise handle file operators (TODO3: handle unquoted tokens)
# ex: '[ -f "$filename" ]' => 'os.path.isfile(f"filename")'; likewise '[ -f f"{sys.argv[1]}" ]'
for bash_operator, python_template in file_operators.items():
## OLD:
## line = re.sub(fr"([\[\(]\s*){bash_operator}\s+(f?{quoted_string})",