bazel-contrib · plobsing · Oct 14, 2024 · Oct 14, 2024 · Nov 21, 2024 · Nov 21, 2024
diff --git a/docs/tar.md b/docs/tar.md
diff --git a/lib/private/BUILD.bazel b/lib/private/BUILD.bazel
@@ -8,13 +8,19 @@ exports_files(
         "modify_mtree.awk",
         "parse_status_file.jq",
         "parse_status_file.yq",
+        "unvis_canonical.sed",
+        "vis_canonicalize.sed",
+        "vis_escape_nonascii.sed",
     ],
     visibility = ["//visibility:public"],
 )
 
 exports_files(
     glob(["*.bzl"]),
-    visibility = ["//lib/private/docs:__pkg__"],
+    visibility = [
+        "//lib/private/docs:__pkg__",
+        "//lib/private/gen_vis_scripts:__pkg__",
+    ],
 )
 
 bzl_library(
@@ -279,9 +285,13 @@ bzl_library(
 
 bzl_library(
     name = "tar",
-    srcs = ["tar.bzl"],
+    srcs = [
+        "tar.bzl",
+        "vis_escape_ascii.bzl",
+    ],
     visibility = ["//lib:__subpackages__"],
     deps = [
+        ":strings.bzl",
         "@aspect_bazel_lib//lib:paths",
         "@bazel_skylib//rules:common_settings",
     ],
@@ -362,6 +372,9 @@ bzl_library(
     name = "strings",
     srcs = ["strings.bzl"],
     visibility = ["//lib:__subpackages__"],
+    deps = [
+        "@bazel_skylib//lib:types",
+    ],
 )
 
 bzl_library(

diff --git a/lib/private/gen_vis_scripts/BUILD.bazel b/lib/private/gen_vis_scripts/BUILD.bazel
@@ -0,0 +1,39 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+load("//lib:run_binary.bzl", "run_binary")
+load("//lib:write_source_files.bzl", "write_source_files")
+
+# Builds the code generator for the vis-encoding support scripts.
+go_binary(
+    name = "gen_vis_scripts",
+    srcs = ["gen_vis_scripts.go"],
+)
+
+# Runs the generator to produce all four outputs in a single invocation.
+# Each argument is a NAME=PATH spec: NAME selects which content the tool
+# generates and PATH is the location of the declared output it is written to.
+run_binary(
+    name = "run_gen_vis_scripts",
+    outs = [
+        "unvis_canonical.sed",
+        "vis_canonicalize.sed",
+        "vis_escape_ascii.bzl",
+        "vis_escape_nonascii.sed",
+    ],
+    args = [
+        "unvis_canonical.sed=$(location unvis_canonical.sed)",
+        "vis_canonicalize.sed=$(location vis_canonicalize.sed)",
+        "vis_escape_ascii.bzl=$(location vis_escape_ascii.bzl)",
+        "vis_escape_nonascii.sed=$(location vis_escape_nonascii.sed)",
+    ],
+    tool = ":gen_vis_scripts",
+)
+
+# Pairs each vis script under //lib/private with the generated output in this
+# package that provides its content.
+write_source_files(
+    name = "write_vis_scripts",
+
+    # Required to support cross-package references.
+    check_that_out_file_exists = False,
+    files = {
+        "//lib/private:unvis_canonical.sed": ":unvis_canonical.sed",
+        "//lib/private:vis_canonicalize.sed": ":vis_canonicalize.sed",
+        "//lib/private:vis_escape_ascii.bzl": ":vis_escape_ascii.bzl",
+        "//lib/private:vis_escape_nonascii.sed": ":vis_escape_nonascii.sed",
+    },
+)
diff --git a/lib/private/gen_vis_scripts/gen_vis_scripts.go b/lib/private/gen_vis_scripts/gen_vis_scripts.go
@@ -0,0 +1,171 @@
+// Code generator for vis-encoding support scripts.
+package main
+
+import (
+	"fmt"
+	"io"
+	"log"
+	"os"
+	"strings"
+	"unicode"
+)
+
+// main writes one generated file per command-line argument.
+//
+// Each argument is a spec of the form NAME=DEST: NAME selects which content to
+// generate and DEST is the path of the file to create. The process exits with
+// a fatal log message on a malformed spec, an unknown NAME, or an I/O failure.
+func main() {
+	for _, arg := range os.Args[1:] {
+		name, dest, ok := strings.Cut(arg, "=")
+		if !ok {
+			log.Fatalf("invalid generation spec: %q", arg)
+		}
+
+		// Resolve the writer before touching the filesystem so that an unknown
+		// NAME does not leave a freshly created, empty DEST behind.
+		var write func(io.Writer)
+		switch name {
+		case "vis_escape_ascii.bzl":
+			write = writeEscapeASCIIBzl
+		case "vis_escape_nonascii.sed":
+			write = writeEscapeNonASCIISed
+		case "vis_canonicalize.sed":
+			write = writeVisCanonicalizeSed
+		case "unvis_canonical.sed":
+			write = writeUnvisCanonicalSed
+		default:
+			log.Fatalf("unknown generated content: %q", name)
+		}
+
+		f, err := os.Create(dest)
+		if err != nil {
+			log.Fatal(err)
+		}
+		write(f)
+
+		// Close explicitly instead of deferring: a defer inside the loop would
+		// keep every file open until main returns, and log.Fatal exits without
+		// running deferred calls, so a deferred Close could be skipped entirely.
+		mustClose(f)
+	}
+}
+
+// mustClose closes f and terminates the program via log.Fatal if Close
+// reports an error.
+func mustClose(f *os.File) {
+	err := f.Close()
+	if err == nil {
+		return
+	}
+	log.Fatal(err)
+}
+
+// newline is passed as a %c argument to terminate generated lines: the format
+// strings in this file are raw (backquoted) literals, in which "\n" would be
+// taken literally rather than as an escape.
+const newline rune = '\n'
+
+// shouldEscape reports whether b has to be written as an octal escape.
+//
+// That is the set of characters identified by mtree(5) as requiring escaping,
+// plus whitespace: the backslash, every byte above the ASCII range, and every
+// ASCII byte that is a space character or is not printable.
+func shouldEscape(b byte) bool {
+	switch r := rune(b); {
+	case b == '\\', b > unicode.MaxASCII:
+		return true
+	case unicode.IsSpace(r):
+		return true
+	default:
+		return !unicode.IsPrint(r)
+	}
+}
+
+// writeEscapeASCIIBzl emits a Starlark file defining VIS_ESCAPE_ASCII, a
+// maketrans table that maps every ASCII code for which shouldEscape holds to
+// a backslash followed by its three-digit octal value.
+func writeEscapeASCIIBzl(w io.Writer) {
+	const header = `
+# Code generated by gen_vis_scripts. DO NOT EDIT.
+"A translation table for vis-encoding the ASCII range for mtree."
+
+load(":strings.bzl", "maketrans")
+
+VIS_ESCAPE_ASCII = maketrans({
+	`
+	fmt.Fprintln(w, strings.TrimSpace(header))
+
+	// One dictionary entry per escaped character, in ascending code order.
+	for code := byte(0); code <= unicode.MaxASCII; code++ {
+		if !shouldEscape(code) {
+			continue
+		}
+		fmt.Fprintf(w, `    %[1]d: r"\%03[1]o",%[2]c`, code, newline)
+	}
+
+	fmt.Fprintln(w, "})")
+}
+
+// writeEscapeNonASCIISed emits a sed script holding one substitution for each
+// byte value in 0x80..0xFF; every rule rewrites that byte as a backslash
+// followed by its three-digit octal value.
+func writeEscapeNonASCIISed(w io.Writer) {
+	const header = `
+# Code generated by gen_vis_scripts. DO NOT EDIT.
+# Replace non-ASCII bytes with their octal escape sequences.
+# Escaping of ASCII is done in Starlark prior to writing content out.
+	`
+	// The second Fprintln separates the header from the rules with a blank line.
+	fmt.Fprintln(w, strings.TrimSpace(header))
+	fmt.Fprintln(w, "")
+
+	const first, last = 0x80, 0xFF
+	for code := first; code <= last; code++ {
+		fmt.Fprintf(w, `s/\x%02[1]x/\\%03[1]o/g%[2]c`, code, newline)
+	}
+}
+
+// writeVisCanonicalizeSed emits a sed script that rewrites vis-encoded text
+// into the canonical form described in the generated header: characters for
+// which shouldEscape holds appear as three-digit octal escapes, and every
+// other character appears literally.
+//
+// The script has three parts: a fixed prologue that normalizes backslashes and
+// the named escapes (\a, \b, \n, \s, ...) to octal, generated rules that strip
+// octal escaping from characters that do not need it, and generated rules that
+// add octal escaping to raw bytes that do.
+//
+// Backspace (\b) is byte 0x08, which is octal 010; "008" is not an octal
+// number and would be misread by the NUL rule that follows the named escapes.
+func writeVisCanonicalizeSed(w io.Writer) {
+	fmt.Fprintln(w, strings.TrimSpace(`
+# Code generated by gen_vis_scripts. DO NOT EDIT.
+#
+# Convert vis-encoded content to a bespoke canonical form. After canonicalization, equality checks are trivial.
+# Backslash, space characters, and all characters outside the 95 printable ASCII set are represented using escaped three-digit octal.
+# The remaining characters are not escaped; they represent themselves.
+#
+# Input is interpreted as libarchive would, with a wider set of escape sequences:
+#   * \\, \a, \b, \f, \n, \r, \t, \v have their conventional C-based meanings
+#   * \0 means NUL when not the start of an three-digit octal escape sequence
+#   * \s means SPACE
+#   * \ is valid as an ordinary backslash when not the start of a valid escape sequence
+#
+# See: https://github.com/libarchive/libarchive/blob/a90e9d84ec147be2ef6a720955f3b315cb54bca3/libarchive/archive_read_support_format_mtree.c#L1942
+
+# Escaping of backslashes must be applied first to avoid double-interpretation.
+s/\\\\|\\([^0-3abfnrstv\\]|$)/\\134\1/g
+s/\\([1-3]([^0-7]|$|[0-7]([^0-7]|$)))/\\134\1/g
+
+s/\\a/\\007/g
+s/\\b/\\010/g
+s/\\f/\\014/g
+s/\\n/\\012/g
+s/\\r/\\015/g
+s/\\s/\\040/g
+s/\\t/\\011/g
+s/\\v/\\013/g
+
+# NUL special form must be disambiguated from ordinary octal escape sequences.
+s/\\0([^0-7]|$|[0-7]([^0-7]|$))/\\000\1/g
+
+# Remove octal escaping from characters that don't need it.
+	`))
+
+	for i := 0; i <= 0xFF; i++ {
+		b := byte(i)
+		if shouldEscape(b) {
+			continue
+		}
+		switch b {
+		case '/':
+			// A literal '/' would end the s/// command early, so this one
+			// rule uses ':' as its delimiter instead.
+			fmt.Fprintf(w, `s:\\%03[1]o:%[1]c:g%[2]c`, b, newline)
+		case '&':
+			// A bare '&' in a sed replacement expands to the matched text,
+			// which would turn the rule into a no-op; escape it.
+			fmt.Fprintf(w, `s/\\%03[1]o/\%[1]c/g%[2]c`, b, newline)
+		default:
+			fmt.Fprintf(w, `s/\\%03[1]o/%[1]c/g%[2]c`, b, newline)
+		}
+	}
+	fmt.Fprintln(w, "")
+
+	fmt.Fprintln(w, "# Add octal escaping for characters that need it.")
+	for i := 0; i <= 0xFF; i++ {
+		b := byte(i)
+		if !shouldEscape(b) {
+			continue
+		}
+		// Backslashes were already normalized by the prologue, and no rule is
+		// generated for the newline byte.
+		if b == '\\' || b == '\n' {
+			continue
+		}
+		fmt.Fprintf(w, `s/\x%02[1]x/\\%03[1]o/g%[2]c`, b, newline)
+	}
+}
+
+// writeUnvisCanonicalSed emits a sed script that decodes the canonical form:
+// one rule per escaped byte value turning its three-digit octal escape back
+// into the raw byte, followed by a final rule that decodes the backslash.
+func writeUnvisCanonicalSed(w io.Writer) {
+	const header = `
+# Code generated by gen_vis_scripts. DO NOT EDIT.
+# Replace octal escape sequences with the bytes they represent.
+# NOTE: not a fully general unvis program; assumes the canonical form produced by vis_canonicalize.sed
+	`
+	const footer = `
+# Unvis of backslash must be applied last to avoid double-interpretation.
+s/\\134/\\/g
+	`
+
+	fmt.Fprintln(w, strings.TrimSpace(header))
+	fmt.Fprintln(w, "")
+
+	for i := 0x00; i <= 0xFF; i++ {
+		b := byte(i)
+		// The backslash is decoded by the footer rule; characters that are
+		// never escaped get no rule at all.
+		if b == '\\' || !shouldEscape(b) {
+			continue
+		}
+		fmt.Fprintf(w, `s/\\%03[1]o/\x%02[1]x/g%[2]c`, b, newline)
+	}
+	fmt.Fprintln(w, "")
+
+	fmt.Fprintln(w, strings.TrimSpace(footer))
+}
diff --git a/lib/private/strings.bzl b/lib/private/strings.bzl
@@ -1,5 +1,7 @@
 "String utilities"
 
+load("@bazel_skylib//lib:types.bzl", "types")
+
 CHAR_TO_INT = {
     "\0": 0,
     "\1": 1,
@@ -653,3 +655,104 @@ def split_args(s):
     if arg != "":
         args.append(arg)
     return args
+
+def maketrans(x):
+    """
+    Return a translation table usable with translate().
+
+    Subset of Python [builtin](https://docs.python.org/3.10/library/stdtypes.html#str.maketrans)
+    of the same name.
+
+    Translation of Unicode codepoints outside of U+0000..U+00FF (Basic Latin + Latin-1) is currently not
+    possible. Entries for characters outside this range will trigger a failure, as will negative
+    integer keys.
+
+    Args:
+        x: dictionary mapping Unicode ordinals (integers) or characters (length-1 strings)
+           to Unicode ordinals, strings, or None. Character keys will be converted to ordinals.
+
+    Returns:
+        dict. The translation table, keyed by integer ordinal. Values are copied through unchanged.
+    """
+
+    if not types.is_dict(x):
+        fail("if you give only one argument to maketrans it must be a dict")
+
+    table = {}
+
+    for (k, v) in x.items():
+        if types.is_int(k):
+            # A negative ordinal can never match a character, so reject it
+            # instead of silently carrying a dead entry in the table.
+            if k < 0:
+                fail("integer keys in translate table must be non-negative")
+            if k > 0xFF:
+                fail("most Unicode is unsupported")
+            table[k] = v
+        elif types.is_string(k):
+            if len(k) != 1:
+                fail("string keys in translate table must be of length 1")
+            codepoint = ord(k)
+            if codepoint == None:
+                fail("could not compute ord('{}'), most Unicode is unsupported".format(k))
+            table[codepoint] = v
+        else:
+            fail("keys in translate table must be strings or integers")
+
+    return table
+
+def translate(s, table):
+    """
+    Replace characters in a string according to a translation table.
+
+    Subset of Python [builtin](https://docs.python.org/3.10/library/stdtypes.html#str.translate)
+    of the same name.
+
+    A character whose ordinal is a key of the table is replaced by the mapped value:
+    an integer is emitted as the character with that ordinal, a string is emitted as-is,
+    and None deletes the character. Any other character is copied to the output unchanged.
+
+    Translation of Unicode codepoints outside of U+0000..U+00FF (Basic Latin + Latin-1) is currently not
+    possible. Characters outside this range will be silently mirrored to the output without consulting
+    the translation table.
+
+    Args:
+        s: str. Input string upon which to perform replacements.
+        table: dict. Translation table. Maps from Unicode ordinals (ints) keys to other Unicode ordinals, strings, or None.
+
+    Returns:
+        str. Output string derived from input string with substitutions and deletions applied from table.
+    """
+
+    if not types.is_string(s):
+        fail("first argument to translate must be a string")
+    if not types.is_dict(table):
+        fail("second argument to translate must be a dict")
+
+    pieces = []
+
+    # Start index of the pending run of untranslated characters, or None when
+    # no run is open. Runs are copied with a single slice, not per character.
+    run_start = None
+    for (pos, char) in enumerate(s.elems()):
+        ordinal = ord(char)
+        if ordinal == None or ordinal not in table:
+            # Untranslated character: open a run unless one is already open.
+            if run_start == None:
+                run_start = pos
+            continue
+
+        # A translated character closes the pending run, if any.
+        if run_start != None:
+            pieces.append(s[run_start:pos])
+            run_start = None
+
+        mapped = table[ordinal]
+        if mapped == None:
+            # Mapped to None: the character is deleted.
+            pass
+        elif types.is_int(mapped):
+            pieces.append(chr(mapped))
+        elif types.is_string(mapped):
+            pieces.append(mapped)
+        else:
+            fail("character mapping must return integer, None or str")
+
+    # Emit the trailing run, if the input ended inside one.
+    if run_start != None:
+        pieces.append(s[run_start:])
+
+    # A single piece is returned directly, skipping the join.
+    return pieces[0] if len(pieces) == 1 else "".join(pieces)