Skip to content

Commit

Permalink
Use a sed script to vis-encode non-ASCII bytes for mtree
Browse files Browse the repository at this point in the history
Bazel's Starlark does not provide access to a string's bytes, only its
codepoints, so we are unable to do this escaping in Starlark. A second
pass is therefore needed, at least until the spec and implementation work
to add a [`bytes` type](bazelbuild/starlark#112) lands.

Fixes bazel-contrib#794
  • Loading branch information
plobsing committed Nov 22, 2024
1 parent 4d77429 commit b9996f8
Show file tree
Hide file tree
Showing 5 changed files with 172 additions and 5 deletions.
1 change: 1 addition & 0 deletions lib/private/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ exports_files(
"modify_mtree.awk",
"parse_status_file.jq",
"parse_status_file.yq",
"vis_escape_nonascii.sed",
],
visibility = ["//visibility:public"],
)
Expand Down
3 changes: 3 additions & 0 deletions lib/private/gen_vis_scripts/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@ run_binary(
name = "run_gen_vis_scripts",
outs = [
"vis_escape_ascii.bzl",
"vis_escape_nonascii.sed",
],
args = [
"vis_escape_ascii.bzl=$(location vis_escape_ascii.bzl)",
"vis_escape_nonascii.sed=$(location vis_escape_nonascii.sed)",
],
tool = ":gen_vis_scripts",
)
Expand All @@ -26,5 +28,6 @@ write_source_files(
#
files = {
"//lib/private:vis_escape_ascii.bzl": ":vis_escape_ascii.bzl",
"//lib/private:vis_escape_nonascii.sed": ":vis_escape_nonascii.sed",
},
)
15 changes: 15 additions & 0 deletions lib/private/gen_vis_scripts/gen_vis_scripts.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ func main() {
switch name {
case "vis_escape_ascii.bzl":
writeEscapeASCIIBzl(f)
case "vis_escape_nonascii.sed":
writeEscapeNonASCIISed(f)
default:
log.Fatal("unknown generated content:", name)
}
Expand Down Expand Up @@ -63,3 +65,16 @@ VIS_ESCAPE_ASCII = maketrans({
}
fmt.Fprintln(w, "})")
}

// writeEscapeNonASCIISed writes a sed script to w that replaces each byte in
// the range 0x80 through 0xFF with a backslash followed by that byte's
// three-digit octal value (for example, `s/\x80/\\200/g`).
//
// The script begins with a generated-code header comment and a blank line,
// followed by one substitution command per byte value.
func writeEscapeNonASCIISed(w io.Writer) {
	const header = `
# Code generated by gen_vis_scripts. DO NOT EDIT.
# Replace non-ASCII bytes with their octal escape sequences.
# Escaping of ASCII is done in Starlark prior to writing content out.
`
	// TrimSpace removes the newlines that bracket the raw string literal.
	fmt.Fprintln(w, strings.TrimSpace(header))
	fmt.Fprintln(w)

	// Argument 1 (the byte value) is formatted twice: as two hex digits for
	// the match pattern and as three octal digits for the replacement.
	// Argument 2 is the line terminator, declared elsewhere in this file.
	for b := 0x80; b < 0x100; b++ {
		fmt.Fprintf(w, `s/\x%02[1]x/\\%03[1]o/g%[2]c`, b, newline)
	}
}
26 changes: 21 additions & 5 deletions lib/private/tar.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -121,11 +121,13 @@ Possible values:
values = [-1, 0, 1],
),
"_compute_unused_inputs_flag": attr.label(default = Label("//lib:tar_compute_unused_inputs")),
"_vis_escape_nonascii": attr.label(allow_single_file = True, default = Label("//lib/private:vis_escape_nonascii.sed")),
}

_mtree_attrs = {
"srcs": attr.label_list(doc = "Files that are placed into the tar", allow_files = True),
"out": attr.output(doc = "Resulting specification file to write"),
"_vis_escape_nonascii": attr.label(allow_single_file = True, default = Label("//lib/private:vis_escape_nonascii.sed")),
}

def _add_compression_args(compress, args):
Expand Down Expand Up @@ -255,14 +257,14 @@ def _configured_unused_inputs_file(ctx, srcs, keep):
# See also: https://github.com/bazel-contrib/bazel-lib/issues/794
ctx.actions.run_shell(
outputs = [unused_inputs],
inputs = [prunable_inputs, keep_inputs, ctx.file.mtree],
inputs = [prunable_inputs, keep_inputs, ctx.file.mtree, ctx.file._vis_escape_nonascii],
tools = [coreutils],
command = '''
"$COREUTILS" join -v 1 \\
<("$COREUTILS" sort -u "$PRUNABLE_INPUTS") \\
<(sed -f "$VIS_ESCAPE_NONASCII" "$PRUNABLE_INPUTS" | "$COREUTILS" sort -u) \\
<("$COREUTILS" sort -u \\
<(grep -o '\\bcontents\\?=\\S*' "$MTREE" | "$COREUTILS" cut -d'=' -f 2-) \\
"$KEEP_INPUTS" \\
<(sed -f "$VIS_ESCAPE_NONASCII" "$KEEP_INPUTS") \\
) \\
| "$COREUTILS" cut -d' ' -f 2- \\
> "$UNUSED_INPUTS"
Expand All @@ -273,6 +275,7 @@ def _configured_unused_inputs_file(ctx, srcs, keep):
"KEEP_INPUTS": keep_inputs.path,
"MTREE": ctx.file.mtree.path,
"UNUSED_INPUTS": unused_inputs.path,
"VIS_ESCAPE_NONASCII": ctx.file._vis_escape_nonascii.path,
},
mnemonic = "UnusedTarInputs",
toolchain = "@aspect_bazel_lib//lib:coreutils_toolchain_type",
Expand Down Expand Up @@ -373,7 +376,8 @@ def _to_rlocation_path(file, workspace):
return workspace + "/" + file.short_path

def _vis_encode(filename):
    """Applies the VIS_ESCAPE_ASCII translation table to `filename`.

    Escaping of non-ASCII bytes cannot be performed within Starlark, so it is
    not done here. After the content is written out, a second pass is
    performed with vis_escape_nonascii.sed.
    """
    ascii_escaped = str_translate(filename, VIS_ESCAPE_ASCII)
    return ascii_escaped

def _expand(file, expander, transform = to_repository_relative_path):
Expand Down Expand Up @@ -401,6 +405,7 @@ def _expand(file, expander, transform = to_repository_relative_path):

def _mtree_impl(ctx):
out = ctx.outputs.out or ctx.actions.declare_file(ctx.attr.name + ".spec")
unescaped = ctx.actions.declare_file(ctx.attr.name + ".spec.unescaped")

content = ctx.actions.args()
content.set_param_file_format("multiline")
Expand Down Expand Up @@ -445,7 +450,18 @@ def _mtree_impl(ctx):
_mtree_line(_vis_encode(runfiles_dir + "/_repo_mapping"), "file", content = _vis_encode(repo_mapping.path)),
)

ctx.actions.write(out, content = content)
ctx.actions.write(unescaped, content = content)
ctx.actions.run_shell(
outputs = [out],
inputs = [unescaped, ctx.file._vis_escape_nonascii],
command = 'sed -f "$VIS_ESCAPE_NONASCII" "$UNESCAPED" > "$OUT"',
env = {
"VIS_ESCAPE_NONASCII": ctx.file._vis_escape_nonascii.path,
"UNESCAPED": unescaped.path,
"OUT": out.path,
},
mnemonic = "EscapeNonAscii",
)

return DefaultInfo(files = depset([out]), runfiles = ctx.runfiles([out]))

Expand Down
132 changes: 132 additions & 0 deletions lib/private/vis_escape_nonascii.sed
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
# Code generated by gen_vis_scripts. DO NOT EDIT.
# Replace non-ASCII bytes with their octal escape sequences.
# Escaping of ASCII is done in Starlark prior to writing content out.

s/\x80/\\200/g
s/\x81/\\201/g
s/\x82/\\202/g
s/\x83/\\203/g
s/\x84/\\204/g
s/\x85/\\205/g
s/\x86/\\206/g
s/\x87/\\207/g
s/\x88/\\210/g
s/\x89/\\211/g
s/\x8a/\\212/g
s/\x8b/\\213/g
s/\x8c/\\214/g
s/\x8d/\\215/g
s/\x8e/\\216/g
s/\x8f/\\217/g
s/\x90/\\220/g
s/\x91/\\221/g
s/\x92/\\222/g
s/\x93/\\223/g
s/\x94/\\224/g
s/\x95/\\225/g
s/\x96/\\226/g
s/\x97/\\227/g
s/\x98/\\230/g
s/\x99/\\231/g
s/\x9a/\\232/g
s/\x9b/\\233/g
s/\x9c/\\234/g
s/\x9d/\\235/g
s/\x9e/\\236/g
s/\x9f/\\237/g
s/\xa0/\\240/g
s/\xa1/\\241/g
s/\xa2/\\242/g
s/\xa3/\\243/g
s/\xa4/\\244/g
s/\xa5/\\245/g
s/\xa6/\\246/g
s/\xa7/\\247/g
s/\xa8/\\250/g
s/\xa9/\\251/g
s/\xaa/\\252/g
s/\xab/\\253/g
s/\xac/\\254/g
s/\xad/\\255/g
s/\xae/\\256/g
s/\xaf/\\257/g
s/\xb0/\\260/g
s/\xb1/\\261/g
s/\xb2/\\262/g
s/\xb3/\\263/g
s/\xb4/\\264/g
s/\xb5/\\265/g
s/\xb6/\\266/g
s/\xb7/\\267/g
s/\xb8/\\270/g
s/\xb9/\\271/g
s/\xba/\\272/g
s/\xbb/\\273/g
s/\xbc/\\274/g
s/\xbd/\\275/g
s/\xbe/\\276/g
s/\xbf/\\277/g
s/\xc0/\\300/g
s/\xc1/\\301/g
s/\xc2/\\302/g
s/\xc3/\\303/g
s/\xc4/\\304/g
s/\xc5/\\305/g
s/\xc6/\\306/g
s/\xc7/\\307/g
s/\xc8/\\310/g
s/\xc9/\\311/g
s/\xca/\\312/g
s/\xcb/\\313/g
s/\xcc/\\314/g
s/\xcd/\\315/g
s/\xce/\\316/g
s/\xcf/\\317/g
s/\xd0/\\320/g
s/\xd1/\\321/g
s/\xd2/\\322/g
s/\xd3/\\323/g
s/\xd4/\\324/g
s/\xd5/\\325/g
s/\xd6/\\326/g
s/\xd7/\\327/g
s/\xd8/\\330/g
s/\xd9/\\331/g
s/\xda/\\332/g
s/\xdb/\\333/g
s/\xdc/\\334/g
s/\xdd/\\335/g
s/\xde/\\336/g
s/\xdf/\\337/g
s/\xe0/\\340/g
s/\xe1/\\341/g
s/\xe2/\\342/g
s/\xe3/\\343/g
s/\xe4/\\344/g
s/\xe5/\\345/g
s/\xe6/\\346/g
s/\xe7/\\347/g
s/\xe8/\\350/g
s/\xe9/\\351/g
s/\xea/\\352/g
s/\xeb/\\353/g
s/\xec/\\354/g
s/\xed/\\355/g
s/\xee/\\356/g
s/\xef/\\357/g
s/\xf0/\\360/g
s/\xf1/\\361/g
s/\xf2/\\362/g
s/\xf3/\\363/g
s/\xf4/\\364/g
s/\xf5/\\365/g
s/\xf6/\\366/g
s/\xf7/\\367/g
s/\xf8/\\370/g
s/\xf9/\\371/g
s/\xfa/\\372/g
s/\xfb/\\373/g
s/\xfc/\\374/g
s/\xfd/\\375/g
s/\xfe/\\376/g
s/\xff/\\377/g

0 comments on commit b9996f8

Please sign in to comment.