bazel-contrib · plobsing · Oct 14, 2024 · Oct 14, 2024 · Nov 21, 2024 · Nov 21, 2024
diff --git a/docs/tar.md b/docs/tar.md
diff --git a/lib/private/BUILD.bazel b/lib/private/BUILD.bazel
@@ -1,5 +1,7 @@
 load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
+load("//lib:run_binary.bzl", "run_binary")
 load("//lib:utils.bzl", "is_bazel_7_or_greater")
+load("//lib:write_source_files.bzl", "write_source_files")
 
 exports_files(
     [
@@ -8,6 +10,9 @@ exports_files(
         "modify_mtree.awk",
         "parse_status_file.jq",
         "parse_status_file.yq",
+        "unvis_canonical.sed",
+        "vis_canonicalize.sed",
+        "vis_escape_nonascii.sed",
     ],
     visibility = ["//visibility:public"],
 )
@@ -279,9 +284,13 @@ bzl_library(
 
 bzl_library(
     name = "tar",
-    srcs = ["tar.bzl"],
+    srcs = [
+        "tar.bzl",
+        "vis_escape_ascii.bzl",
+    ],
     visibility = ["//lib:__subpackages__"],
     deps = [
+        ":strings.bzl",
         "@aspect_bazel_lib//lib:paths",
         "@bazel_skylib//rules:common_settings",
     ],
@@ -362,10 +371,40 @@ bzl_library(
     name = "strings",
     srcs = ["strings.bzl"],
     visibility = ["//lib:__subpackages__"],
+    deps = [
+        "@bazel_skylib//lib:types",
+    ],
 )
 
 bzl_library(
     name = "zstd_toolchain",
     srcs = ["zstd_toolchain.bzl"],
     visibility = ["//lib:__subpackages__"],
 )
+
+run_binary(
+    name = "run_gen_vis_scripts",  # One run of //tools/gen_vis_scripts declares all four outputs below.
+    outs = [  # Underscore-prefixed; :write_vis_scripts maps each to the same name without the underscore.
+        "_unvis_canonical.sed",
+        "_vis_canonicalize.sed",
+        "_vis_escape_ascii.bzl",
+        "_vis_escape_nonascii.sed",
+    ],
+    args = [  # One "<script name>=<output path>" pair per file; presumably where the tool writes each script.
+        "unvis_canonical.sed=$(location _unvis_canonical.sed)",
+        "vis_canonicalize.sed=$(location _vis_canonicalize.sed)",
+        "vis_escape_ascii.bzl=$(location _vis_escape_ascii.bzl)",
+        "vis_escape_nonascii.sed=$(location _vis_escape_nonascii.sed)",
+    ],
+    tool = "//tools/gen_vis_scripts",
+)
+
+write_source_files(
+    name = "write_vis_scripts",
+    files = {  # Source-tree file name -> generated output of :run_gen_vis_scripts that supplies its content.
+        "unvis_canonical.sed": ":_unvis_canonical.sed",
+        "vis_canonicalize.sed": ":_vis_canonicalize.sed",
+        "vis_escape_ascii.bzl": ":_vis_escape_ascii.bzl",
+        "vis_escape_nonascii.sed": ":_vis_escape_nonascii.sed",
+    },
+)
diff --git a/lib/private/strings.bzl b/lib/private/strings.bzl
@@ -1,5 +1,7 @@
 "String utilities"
 
+load("@bazel_skylib//lib:types.bzl", "types")
+
 CHAR_TO_INT = {
     "\0": 0,
     "\1": 1,
@@ -653,3 +655,104 @@ def split_args(s):
     if arg != "":
         args.append(arg)
     return args
+
+def maketrans(x):
+    """
+    Return a translation table usable with translate().
+
+    Subset of Python [builtin](https://docs.python.org/3.10/library/stdtypes.html#str.maketrans)
+    of the same name.
+
+    Translation of Unicode codepoints outside of U+0000..U+00FF (Basic Latin + Latin-1) is currently not
+    possible. Entries for keys outside this range, negative ordinals included, will trigger a failure.
+
+    Args:
+        x: dictionary mapping Unicode ordinals (integers) or characters (length-1 strings)
+           to Unicode ordinals, strings, or None. Character keys will be converted to ordinals.
+
+    Returns:
+        dict. The translation table, keyed by ordinal; values are carried over from x unchanged.
+    """
+
+    if not types.is_dict(x):
+        fail("if you give only one argument to maketrans it must be a dict")
+
+    table = {}
+    for (k, v) in x.items():
+        if types.is_int(k):
+            # Negative ordinals match no character; reject them like any other out-of-range key.
+            if k < 0 or k > 0xFF:
+                fail("ordinal {} is outside of U+0000..U+00FF, most Unicode is unsupported".format(k))
+            table[k] = v
+        elif types.is_string(k):
+            if len(k) != 1:
+                fail("string keys in translate table must be of length 1")
+            codepoint = ord(k)
+            if codepoint == None:
+                fail("could not compute ord('{}'), most Unicode is unsupported".format(k))
+            table[codepoint] = v
+        else:
+            fail("keys in translate table must be strings or integers")
+
+    return table
+
+def translate(s, table):
+    """
+    Replace characters in a string according to a translation table.
+
+    Subset of Python [builtin](https://docs.python.org/3.10/library/stdtypes.html#str.translate)
+    of the same name.
+
+    A character whose ordinal is a key of the table is substituted by the mapped value:
+    an integer is emitted as chr() of that integer, a string is emitted as-is, and None
+    deletes the character. Characters without an entry are copied to the output unchanged.
+
+    Translation of Unicode codepoints outside of U+0000..U+00FF (Basic Latin + Latin-1) is currently not
+    possible. Characters for which ord() returns None are copied to the output without consulting
+    the translation table.
+
+    Args:
+        s: str. Input string upon which to perform replacements.
+        table: dict. Translation table. Maps from Unicode ordinals (ints) keys to other Unicode ordinals,
+               strings, or None.
+
+    Returns:
+        str. Output string derived from input string with substitutions and deletions applied from table.
+    """
+
+    if not types.is_string(s):
+        fail("first argument to translate must be a string")
+    if not types.is_dict(table):
+        fail("second argument to translate must be a dict")
+
+    chunks = []
+    run_begin = None  # Offset where the pending run of untranslated characters began, or None.
+    for (pos, char) in enumerate(s.elems()):
+        ordinal = ord(char)
+        if ordinal == None or ordinal not in table:
+            # Untranslated character: open a run here unless one is already pending.
+            if run_begin == None:
+                run_begin = pos
+            continue
+
+        # A translated character ends the pending run; emit the run as a single slice.
+        if run_begin != None:
+            chunks.append(s[run_begin:pos])
+            run_begin = None
+
+        substitute = table[ordinal]
+        if types.is_int(substitute):
+            chunks.append(chr(substitute))
+        elif types.is_string(substitute):
+            chunks.append(substitute)
+        elif substitute != None:  # None means "delete", so nothing is emitted for it.
+            fail("character mapping must return integer, None or str")
+
+    # Emit whatever run is still pending once the input is exhausted.
+    if run_begin != None:
+        chunks.append(s[run_begin:])
+
+    # A lone chunk is already the complete result, so it is returned without joining.
+    if len(chunks) != 1:
+        return "".join(chunks)
+    return chunks[0]
diff --git a/lib/private/tar.bzl b/lib/private/tar.bzl
@@ -2,6 +2,8 @@
 
 load("@bazel_skylib//rules:common_settings.bzl", "BuildSettingInfo")
 load("//lib:paths.bzl", "to_repository_relative_path")
+load(":strings.bzl", str_translate = "translate")
+load(":vis_escape_ascii.bzl", "VIS_ESCAPE_ASCII")
 
 TAR_TOOLCHAIN_TYPE = "@aspect_bazel_lib//lib:tar_toolchain_type"
 
@@ -103,10 +105,8 @@ parallelism of builds. Pruned files do not need to be transferred to remote-exec
 workers, which can reduce network costs.
 
 Risks: pruning an actually-used input file can lead to unexpected, incorrect results. The
-comparison performed between `srcs` and `mtree` is currently inexact and may fail to
-handle handwritten or externally-derived mtree specifications. However, it is safe to use
-this feature when the lines found in `mtree` are derived from one or more `mtree_spec`
-rules, filtered and/or merged on whole-line basis only.
+comparison performed between `srcs` and `mtree` is exact. There are no known
+circumstances where incorrect results are anticipated.
 
 Possible values:
 
@@ -119,11 +119,15 @@ Possible values:
         values = [-1, 0, 1],
     ),
     "_compute_unused_inputs_flag": attr.label(default = Label("//lib:tar_compute_unused_inputs")),
+    "_unvis_canonical": attr.label(allow_single_file = True, default = Label("//lib/private:unvis_canonical.sed")),
+    "_vis_canonicalize": attr.label(allow_single_file = True, default = Label("//lib/private:vis_canonicalize.sed")),
+    "_vis_escape_nonascii": attr.label(allow_single_file = True, default = Label("//lib/private:vis_escape_nonascii.sed")),
 }
 
 _mtree_attrs = {
     "srcs": attr.label_list(doc = "Files that are placed into the tar", allow_files = True),
     "out": attr.output(doc = "Resulting specification file to write"),
+    "_vis_escape_nonascii": attr.label(allow_single_file = True, default = Label("//lib/private:vis_escape_nonascii.sed")),
 }
 
 def _add_compression_args(compress, args):
@@ -188,15 +192,9 @@ def _is_unprunable(file):
 def _fmt_pruanble_inputs_line(file):
     if _is_unprunable(file):
         return None
-
-    # The tar.prunable_inputs.txt file has a two columns:
-    #   1. vis-encoded paths of the files, used in comparison
-    #   2. un-vis-encoded paths of the files, used for reporting back to Bazel after filtering
-    path = file.path
-    return _vis_encode(path) + " " + path
+    return _vis_encode(file.path)
 
 def _fmt_keep_inputs_line(file):
-    # The tar.keep_inputs.txt file has a single column of vis-encoded paths of the files to keep.
     return _vis_encode(file.path)
 
 def _configured_unused_inputs_file(ctx, srcs, keep):
@@ -243,26 +241,33 @@ def _configured_unused_inputs_file(ctx, srcs, keep):
     #   * are not found in any content= or contents= keyword in the MTREE
     #   * are not in the hardcoded KEEP_INPUTS set
     #
-    # Comparison and filtering of PRUNABLE_INPUTS is performed in the vis-encoded representation, stored in field 1,
-    # before being written out in the un-vis-encoded form Bazel understands, from field 2.
+    # Comparison and filtering of PRUNABLE_INPUTS is performed in the vis-encoded representation
+    # before being written out in the un-vis-encoded form Bazel understands.
     #
     # Note: bsdtar (libarchive) accepts both content= and contents= to identify source file:
     # ref https://github.com/libarchive/libarchive/blob/a90e9d84ec147be2ef6a720955f3b315cb54bca3/libarchive/archive_read_support_format_mtree.c#L1640
-    #
-    # TODO: Make comparison exact by converting all inputs to a canonical vis-encoded form before comparing.
-    #       See also: https://github.com/bazel-contrib/bazel-lib/issues/794
     ctx.actions.run_shell(
         outputs = [unused_inputs],
-        inputs = [prunable_inputs, keep_inputs, ctx.file.mtree],
+        inputs = [
+            prunable_inputs,
+            keep_inputs,
+            ctx.file.mtree,
+            ctx.file._unvis_canonical,
+            ctx.file._vis_canonicalize,
+            ctx.file._vis_escape_nonascii,
+        ],
         tools = [coreutils],
         command = '''
             "$COREUTILS" join -v 1                                                            \\
-                <("$COREUTILS" sort -u "$PRUNABLE_INPUTS")                                    \\
+                <(sed -f "$VIS_ESCAPE_NONASCII" "$PRUNABLE_INPUTS" | "$COREUTILS" sort -u)    \\
                 <("$COREUTILS" sort -u                                                        \\
-                    <(grep -o '\\bcontents\\?=\\S*' "$MTREE" | "$COREUTILS" cut -d'=' -f 2-)  \\
-                    "$KEEP_INPUTS"                                                            \\
+                    <(grep -o '\\bcontents\\?=\\S*' "$MTREE"                                  \\
+                        | "$COREUTILS" cut -d'=' -f 2-                                        \\
+                        | sed -Ef "$VIS_CANONICALIZE"                                         \\
+                    )                                                                         \\
+                    <(sed -f "$VIS_ESCAPE_NONASCII" "$KEEP_INPUTS")                           \\
                 )                                                                             \\
-                | "$COREUTILS" cut -d' ' -f 2-                                                \\
+                | sed -f "$UNVIS_CANONICAL"                                                   \\
                 > "$UNUSED_INPUTS"
         ''',
         env = {
@@ -271,14 +276,16 @@ def _configured_unused_inputs_file(ctx, srcs, keep):
             "KEEP_INPUTS": keep_inputs.path,
             "MTREE": ctx.file.mtree.path,
             "UNUSED_INPUTS": unused_inputs.path,
+            "UNVIS_CANONICAL": ctx.file._unvis_canonical.path,
+            "VIS_CANONICALIZE": ctx.file._vis_canonicalize.path,
+            "VIS_ESCAPE_NONASCII": ctx.file._vis_escape_nonascii.path,
         },
         mnemonic = "UnusedTarInputs",
         toolchain = "@aspect_bazel_lib//lib:coreutils_toolchain_type",
     )
 
     return unused_inputs
 
-
 # TODO(3.0): Access field directly after minimum bazel_compatibility advanced to or beyond v7.0.0.
 def _repo_mapping_manifest(files_to_run):
     return getattr(files_to_run, "repo_mapping_manifest", None)
@@ -372,8 +379,9 @@ def _to_rlocation_path(file, workspace):
         return workspace + "/" + file.short_path
 
 def _vis_encode(filename):
-    # TODO(#794): correctly encode all filenames by using vis(3) (or porting it)
-    return filename.replace(" ", "\\040")
+    # Escaping of non-ASCII bytes cannot be performed within Starlark.
+    # After writing content out, a second pass is performed with vis_escape_nonascii.sed.
+    return str_translate(filename, VIS_ESCAPE_ASCII)
 
 def _expand(file, expander, transform = to_repository_relative_path):
     expanded = expander.expand(file)
@@ -400,6 +408,7 @@ def _expand(file, expander, transform = to_repository_relative_path):
 
 def _mtree_impl(ctx):
     out = ctx.outputs.out or ctx.actions.declare_file(ctx.attr.name + ".spec")
+    unescaped = ctx.actions.declare_file(ctx.attr.name + ".spec.unescaped")
 
     content = ctx.actions.args()
     content.set_param_file_format("multiline")
@@ -444,7 +453,18 @@ def _mtree_impl(ctx):
                 _mtree_line(_vis_encode(runfiles_dir + "/_repo_mapping"), "file", content = _vis_encode(repo_mapping.path)),
             )
 
-    ctx.actions.write(out, content = content)
+    ctx.actions.write(unescaped, content = content)
+    ctx.actions.run_shell(
+        outputs = [out],
+        inputs = [unescaped, ctx.file._vis_escape_nonascii],
+        command = 'sed -f "$VIS_ESCAPE_NONASCII" "$UNESCAPED" > "$OUT"',
+        env = {
+            "VIS_ESCAPE_NONASCII": ctx.file._vis_escape_nonascii.path,
+            "UNESCAPED": unescaped.path,
+            "OUT": out.path,
+        },
+        mnemonic = "EscapeNonAscii",
+    )
 
     return DefaultInfo(files = depset([out]), runfiles = ctx.runfiles([out]))