Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: vis-encode all path characters except visible ASCII #985

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/tar.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

41 changes: 40 additions & 1 deletion lib/private/BUILD.bazel
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
load("//lib:run_binary.bzl", "run_binary")
load("//lib:utils.bzl", "is_bazel_7_or_greater")
load("//lib:write_source_files.bzl", "write_source_files")

exports_files(
[
Expand All @@ -8,6 +10,9 @@ exports_files(
"modify_mtree.awk",
"parse_status_file.jq",
"parse_status_file.yq",
"unvis_canonical.sed",
"vis_canonicalize.sed",
"vis_escape_nonascii.sed",
],
visibility = ["//visibility:public"],
)
Expand Down Expand Up @@ -279,9 +284,13 @@ bzl_library(

bzl_library(
name = "tar",
srcs = ["tar.bzl"],
srcs = [
"tar.bzl",
"vis_escape_ascii.bzl",
],
visibility = ["//lib:__subpackages__"],
deps = [
":strings.bzl",
"@aspect_bazel_lib//lib:paths",
"@bazel_skylib//rules:common_settings",
],
Expand Down Expand Up @@ -362,10 +371,40 @@ bzl_library(
name = "strings",
srcs = ["strings.bzl"],
visibility = ["//lib:__subpackages__"],
deps = [
"@bazel_skylib//lib:types",
],
)

bzl_library(
name = "zstd_toolchain",
srcs = ["zstd_toolchain.bzl"],
visibility = ["//lib:__subpackages__"],
)

# Runs //tools/gen_vis_scripts once to produce the three sed scripts and the Starlark
# table used for vis-encoding paths. Each argument has the form `<name>=<output path>`,
# pairing a script name with the declared output it should be written to (presumably
# how the tool decides what to emit where -- confirm against the tool's CLI).
# The outputs carry a leading underscore, which keeps them distinct from the same-named
# files that this package exports / loads from the source tree.
run_binary(
name = "run_gen_vis_scripts",
outs = [
"_unvis_canonical.sed",
"_vis_canonicalize.sed",
"_vis_escape_ascii.bzl",
"_vis_escape_nonascii.sed",
],
args = [
"unvis_canonical.sed=$(location _unvis_canonical.sed)",
"vis_canonicalize.sed=$(location _vis_canonicalize.sed)",
"vis_escape_ascii.bzl=$(location _vis_escape_ascii.bzl)",
"vis_escape_nonascii.sed=$(location _vis_escape_nonascii.sed)",
],
tool = "//tools/gen_vis_scripts",
)

# Pairs each source-tree file in this package (dict key) with the generated output of
# :run_gen_vis_scripts (dict value) that it is derived from.
# NOTE(review): write_source_files is expected to copy the generated outputs into the
# source tree and check that the checked-in copies are up to date -- confirm against
# //lib:write_source_files.bzl.
write_source_files(
name = "write_vis_scripts",
files = {
"unvis_canonical.sed": ":_unvis_canonical.sed",
"vis_canonicalize.sed": ":_vis_canonicalize.sed",
"vis_escape_ascii.bzl": ":_vis_escape_ascii.bzl",
"vis_escape_nonascii.sed": ":_vis_escape_nonascii.sed",
},
)
103 changes: 103 additions & 0 deletions lib/private/strings.bzl
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"String utilities"

load("@bazel_skylib//lib:types.bzl", "types")

CHAR_TO_INT = {
"\0": 0,
"\1": 1,
Expand Down Expand Up @@ -653,3 +655,104 @@ def split_args(s):
if arg != "":
args.append(arg)
return args

def maketrans(x):
    """
    Return a translation table usable with translate().

    Subset of Python [builtin](https://docs.python.org/3.10/library/stdtypes.html#str.maketrans)
    of the same name. Only the single-argument (dict) form is supported.

    Translation of Unicode codepoints outside of U+0000..U+00FF (Basic Latin + Latin-1) is currently not
    possible. Entries for characters outside this range will trigger a failure.

    Values are copied into the table as given; they are not validated here.
    If the input contains both an integer key and the equivalent character key,
    the entry encountered later in iteration order wins.

    Args:
        x: dictionary mapping Unicode ordinals (integers) or characters (length-1 strings)
            to Unicode ordinals, strings, or None. Character keys will be converted to ordinals.

    Returns:
        dict. The translation table, keyed by integer ordinals in the range 0..255.
    """

    if not types.is_dict(x):
        fail("if you give only one argument to maketrans it must be a dict")

    table = {}

    for (k, v) in x.items():
        if types.is_int(k):
            # A negative ordinal can never match a character, so reject it up front
            # instead of silently producing a dead table entry.
            if k < 0:
                fail("integer keys in translate table must not be negative")
            if k > 0xFF:
                fail("most Unicode is unsupported")
            table[k] = v
        elif types.is_string(k):
            if len(k) != 1:
                fail("string keys in translate table must be of length 1")

            # ord() here is this file's helper; a None result is treated as
            # "character not representable" and reported as a failure.
            codepoint = ord(k)
            if codepoint == None:
                fail("could not compute ord('{}'), most Unicode is unsupported".format(k))
            table[codepoint] = v
        else:
            fail("keys in translate table must be strings or integers")

    return table

def translate(s, table):
    """
    Replace characters in a string according to a translation table.

    Subset of Python [builtin](https://docs.python.org/3.10/library/stdtypes.html#str.translate)
    of the same name.

    For every character of the input:

    * if its ordinal has an entry in the table, the entry is emitted in its place
      (an integer entry is converted with chr(), a string entry is emitted verbatim,
      and a None entry deletes the character);
    * otherwise the character is copied to the output unchanged.

    Translation of Unicode codepoints outside of U+0000..U+00FF (Basic Latin + Latin-1) is currently not
    possible. Characters for which no ordinal can be computed are silently copied to the output
    without consulting the translation table.

    Args:
        s: str. Input string upon which to perform replacements.
        table: dict. Translation table. Maps from Unicode ordinals (ints) keys to other Unicode ordinals, strings, or None.

    Returns:
        str. Output string derived from input string with substitutions and deletions applied from table.
    """

    if not types.is_string(s):
        fail("first argument to translate must be a string")
    if not types.is_dict(table):
        fail("second argument to translate must be a dict")

    pieces = []

    # Start index of the run of untranslated characters currently being
    # accumulated, or None when no such run is open. Runs are sliced out of
    # the input in one go rather than appended character by character.
    run_begin = None

    for (pos, char) in enumerate(s.elems()):
        code = ord(char)

        if code == None or code not in table:
            # Untranslated character: open a literal run unless one is already open.
            if run_begin == None:
                run_begin = pos
            continue

        # A translated character ends any literal run in progress.
        if run_begin != None:
            pieces.append(s[run_begin:pos])
            run_begin = None

        mapped = table[code]
        if mapped == None:
            # Mapped to None: the character is dropped from the output.
            continue
        if types.is_int(mapped):
            pieces.append(chr(mapped))
        elif types.is_string(mapped):
            pieces.append(mapped)
        else:
            fail("character mapping must return integer, None or str")

    # Emit the trailing literal run, if the input ended inside one.
    if run_begin != None:
        pieces.append(s[run_begin:])

    # A single piece can be returned as-is without joining.
    return pieces[0] if len(pieces) == 1 else "".join(pieces)
70 changes: 45 additions & 25 deletions lib/private/tar.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

load("@bazel_skylib//rules:common_settings.bzl", "BuildSettingInfo")
load("//lib:paths.bzl", "to_repository_relative_path")
load(":strings.bzl", str_translate = "translate")
load(":vis_escape_ascii.bzl", "VIS_ESCAPE_ASCII")

TAR_TOOLCHAIN_TYPE = "@aspect_bazel_lib//lib:tar_toolchain_type"

Expand Down Expand Up @@ -103,10 +105,8 @@ parallelism of builds. Pruned files do not need to be transferred to remote-exec
workers, which can reduce network costs.

Risks: pruning an actually-used input file can lead to unexpected, incorrect results. The
comparison performed between `srcs` and `mtree` is currently inexact and may fail to
handle handwritten or externally-derived mtree specifications. However, it is safe to use
this feature when the lines found in `mtree` are derived from one or more `mtree_spec`
rules, filtered and/or merged on whole-line basis only.
comparison performed between `srcs` and `mtree` is exact. There are no known
circumstances where incorrect results are anticipated.

Possible values:

Expand All @@ -119,11 +119,15 @@ Possible values:
values = [-1, 0, 1],
),
"_compute_unused_inputs_flag": attr.label(default = Label("//lib:tar_compute_unused_inputs")),
"_unvis_canonical": attr.label(allow_single_file = True, default = Label("//lib/private:unvis_canonical.sed")),
"_vis_canonicalize": attr.label(allow_single_file = True, default = Label("//lib/private:vis_canonicalize.sed")),
"_vis_escape_nonascii": attr.label(allow_single_file = True, default = Label("//lib/private:vis_escape_nonascii.sed")),
}

_mtree_attrs = {
"srcs": attr.label_list(doc = "Files that are placed into the tar", allow_files = True),
"out": attr.output(doc = "Resulting specification file to write"),
"_vis_escape_nonascii": attr.label(allow_single_file = True, default = Label("//lib/private:vis_escape_nonascii.sed")),
}

def _add_compression_args(compress, args):
Expand Down Expand Up @@ -188,15 +192,9 @@ def _is_unprunable(file):
def _fmt_pruanble_inputs_line(file):
if _is_unprunable(file):
return None

# The tar.prunable_inputs.txt file has two columns:
# 1. vis-encoded paths of the files, used in comparison
# 2. un-vis-encoded paths of the files, used for reporting back to Bazel after filtering
path = file.path
return _vis_encode(path) + " " + path
return _vis_encode(file.path)

def _fmt_keep_inputs_line(file):
    """Format the tar.keep_inputs.txt line for one file to keep.

    That file has a single column holding the vis-encoded paths of the files to keep.

    Args:
        file: object exposing a `path` attribute (presumably a Bazel File).

    Returns:
        The vis-encoded form of `file.path`.
    """
    keep_path = file.path
    return _vis_encode(keep_path)

def _configured_unused_inputs_file(ctx, srcs, keep):
Expand Down Expand Up @@ -243,26 +241,33 @@ def _configured_unused_inputs_file(ctx, srcs, keep):
# * are not found in any content= or contents= keyword in the MTREE
# * are not in the hardcoded KEEP_INPUTS set
#
# Comparison and filtering of PRUNABLE_INPUTS is performed in the vis-encoded representation, stored in field 1,
# before being written out in the un-vis-encoded form Bazel understands, from field 2.
# Comparison and filtering of PRUNABLE_INPUTS is performed in the vis-encoded representation
# before being written out in the un-vis-encoded form Bazel understands.
#
# Note: bsdtar (libarchive) accepts both content= and contents= to identify source file:
# ref https://github.com/libarchive/libarchive/blob/a90e9d84ec147be2ef6a720955f3b315cb54bca3/libarchive/archive_read_support_format_mtree.c#L1640
#
# TODO: Make comparison exact by converting all inputs to a canonical vis-encoded form before comparing.
# See also: https://github.com/bazel-contrib/bazel-lib/issues/794
ctx.actions.run_shell(
outputs = [unused_inputs],
inputs = [prunable_inputs, keep_inputs, ctx.file.mtree],
inputs = [
prunable_inputs,
keep_inputs,
ctx.file.mtree,
ctx.file._unvis_canonical,
ctx.file._vis_canonicalize,
ctx.file._vis_escape_nonascii,
],
tools = [coreutils],
command = '''
"$COREUTILS" join -v 1 \\
<("$COREUTILS" sort -u "$PRUNABLE_INPUTS") \\
<(sed -f "$VIS_ESCAPE_NONASCII" "$PRUNABLE_INPUTS" | "$COREUTILS" sort -u) \\
<("$COREUTILS" sort -u \\
<(grep -o '\\bcontents\\?=\\S*' "$MTREE" | "$COREUTILS" cut -d'=' -f 2-) \\
"$KEEP_INPUTS" \\
<(grep -o '\\bcontents\\?=\\S*' "$MTREE" \\
| "$COREUTILS" cut -d'=' -f 2- \\
| sed -Ef "$VIS_CANONICALIZE" \\
) \\
<(sed -f "$VIS_ESCAPE_NONASCII" "$KEEP_INPUTS") \\
) \\
| "$COREUTILS" cut -d' ' -f 2- \\
| sed -f "$UNVIS_CANONICAL" \\
> "$UNUSED_INPUTS"
''',
env = {
Expand All @@ -271,14 +276,16 @@ def _configured_unused_inputs_file(ctx, srcs, keep):
"KEEP_INPUTS": keep_inputs.path,
"MTREE": ctx.file.mtree.path,
"UNUSED_INPUTS": unused_inputs.path,
"UNVIS_CANONICAL": ctx.file._unvis_canonical.path,
"VIS_CANONICALIZE": ctx.file._vis_canonicalize.path,
"VIS_ESCAPE_NONASCII": ctx.file._vis_escape_nonascii.path,
},
mnemonic = "UnusedTarInputs",
toolchain = "@aspect_bazel_lib//lib:coreutils_toolchain_type",
)

return unused_inputs


# TODO(3.0): Access field directly after minimum bazel_compatibility advanced to or beyond v7.0.0.
def _repo_mapping_manifest(files_to_run):
    """Return `files_to_run.repo_mapping_manifest`, or None when the field is absent.

    Args:
        files_to_run: object that may or may not expose a `repo_mapping_manifest` attribute.

    Returns:
        The attribute's value if it exists, otherwise None.
    """
    if hasattr(files_to_run, "repo_mapping_manifest"):
        return files_to_run.repo_mapping_manifest
    return None
Expand Down Expand Up @@ -372,8 +379,9 @@ def _to_rlocation_path(file, workspace):
return workspace + "/" + file.short_path

def _vis_encode(filename):
# TODO(#794): correctly encode all filenames by using vis(3) (or porting it)
return filename.replace(" ", "\\040")
# Escaping of non-ASCII bytes cannot be performed within Starlark.
# After writing content out, a second pass is performed with vis_escape_nonascii.sed.
return str_translate(filename, VIS_ESCAPE_ASCII)

def _expand(file, expander, transform = to_repository_relative_path):
expanded = expander.expand(file)
Expand All @@ -400,6 +408,7 @@ def _expand(file, expander, transform = to_repository_relative_path):

def _mtree_impl(ctx):
out = ctx.outputs.out or ctx.actions.declare_file(ctx.attr.name + ".spec")
unescaped = ctx.actions.declare_file(ctx.attr.name + ".spec.unescaped")

content = ctx.actions.args()
content.set_param_file_format("multiline")
Expand Down Expand Up @@ -444,7 +453,18 @@ def _mtree_impl(ctx):
_mtree_line(_vis_encode(runfiles_dir + "/_repo_mapping"), "file", content = _vis_encode(repo_mapping.path)),
)

ctx.actions.write(out, content = content)
ctx.actions.write(unescaped, content = content)
ctx.actions.run_shell(
outputs = [out],
inputs = [unescaped, ctx.file._vis_escape_nonascii],
command = 'sed -f "$VIS_ESCAPE_NONASCII" "$UNESCAPED" > "$OUT"',
env = {
"VIS_ESCAPE_NONASCII": ctx.file._vis_escape_nonascii.path,
"UNESCAPED": unescaped.path,
"OUT": out.path,
},
mnemonic = "EscapeNonAscii",
)

return DefaultInfo(files = depset([out]), runfiles = ctx.runfiles([out]))

Expand Down
Loading
Loading