Skip to content

Commit 1c17917

Browse files
committed
ask the package server for zstd compressed data
Together with JuliaPackaging/PkgServer.jl#220, this allows Pkg to download registries, packages, and artifacts as zst archives, which, based on benchmarks, decompress significantly faster and also tend to be smaller. Storing the registry as a zst archive is not a backwards-compatibility issue, because even older Pkg versions can decompress these archives with 7z.
1 parent b6e4785 commit 1c17917

File tree

6 files changed

+137
-20
lines changed

6 files changed

+137
-20
lines changed

Project.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
2323
TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
2424
Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
2525
UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
26+
Zstd_jll = "3161d3a3-bdf6-5164-811a-617609db77b4"
2627
p7zip_jll = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
2728

2829
[weakdeps]
@@ -41,11 +42,12 @@ Libdl = "1.11"
4142
Logging = "1.11"
4243
Markdown = "1.11"
4344
Printf = "1.11"
44-
Random = "1.11"
4545
REPL = "1.11"
46+
Random = "1.11"
4647
SHA = "0.7, 1"
4748
TOML = "1"
4849
Tar = "1.10"
4950
UUIDs = "1.11"
51+
Zstd_jll = "1.5.7"
5052
julia = "1.12"
5153
p7zip_jll = "17.5"

docs/src/protocol.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,18 @@ The client can make GET or HEAD requests to the following resources:
136136

137137
Only the `/registries` changes - all other resources can be cached forever and the server will indicate this with the appropriate HTTP headers.
138138

139+
### Compression Negotiation
140+
141+
The Pkg protocol supports multiple compression formats.
142+
143+
- **Zstd compression** (current): Modern clients send `Accept-Encoding: zstd, gzip` to request Zstandard-compressed resources with gzip as a fallback.
144+
- **Gzip compression** (legacy): Older clients that only support gzip send `Accept-Encoding: gzip` or omit the header entirely.
145+
146+
Clients verify the actual compression format by reading file magic bytes after download to determine the correct file extension for caching:
147+
148+
- **Zstd format**: Magic bytes `0x28 0xB5 0x2F 0xFD` (4 bytes)
149+
- **Gzip format**: Magic bytes `0x1F 0x8B` (2 bytes)
150+
139151
### Reference Implementation
140152

141153
A reference implementation of the Pkg Server protocol is available at [PkgServer.jl](https://github.com/JuliaPackaging/PkgServer.jl).

src/PlatformEngines.jl

Lines changed: 98 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,17 @@
44

55
module PlatformEngines
66

7-
using SHA, Downloads, Tar
7+
using SHA, Downloads, Tar, Dates, Printf
88
import ...Pkg: Pkg, TOML, pkg_server, depots1, can_fancyprint, stderr_f, atomic_toml_write
99
using ..MiniProgressBars
10-
using Base.BinaryPlatforms, p7zip_jll
10+
using Base.BinaryPlatforms, p7zip_jll, Zstd_jll
1111

12-
export verify, unpack, package, download_verify_unpack
12+
export verify, unpack, package, download_verify_unpack, get_extract_cmd, detect_archive_format
1313

1414
const EXE7Z_LOCK = ReentrantLock()
1515
const EXE7Z = Ref{String}()
16+
const EXEZSTD_LOCK = ReentrantLock()
17+
const EXEZSTD = Ref{String}()
1618

1719
function exe7z()
1820
# If the JLL is available, use the wrapper function defined in there
@@ -28,6 +30,20 @@ function exe7z()
2830
end
2931
end
3032

"""
    exezstd()

Return a `Cmd` that invokes the `zstd` executable.

The `Zstd_jll` wrapper is used when that JLL reports itself available. Otherwise the
binary path is looked up with `findzstd()` on first use and cached in `EXEZSTD`.
"""
function exezstd()
    # Prefer the wrapper function shipped with the JLL whenever it is usable.
    Zstd_jll.is_available() && return Zstd_jll.zstd()

    # Fallback: resolve the path lazily; the lock guards the cached value.
    lock(EXEZSTD_LOCK)
    try
        isassigned(EXEZSTD) || (EXEZSTD[] = findzstd())
        return Cmd([EXEZSTD[]])
    finally
        unlock(EXEZSTD_LOCK)
    end
end
46+
3147
function find7z()
3248
name = "7z"
3349
Sys.iswindows() && (name = "$name.exe")
@@ -40,6 +56,18 @@ function find7z()
4056
error("7z binary not found")
4157
end
4258

"""
    findzstd()

Return the path of a `zstd` executable.

The directories `../libexec` and `.` relative to `Sys.BINDIR` are checked first, then
`PATH` is searched. Throws an error if no binary is found.
"""
function findzstd()
    exe = Sys.iswindows() ? "zstd.exe" : "zstd"

    # Candidate locations relative to the running Julia binary, in priority order.
    candidates = (
        normpath(Sys.BINDIR::String, joinpath("..", "libexec"), exe),
        normpath(Sys.BINDIR::String, ".", exe),
    )
    for candidate in candidates
        isfile(candidate) && return candidate
    end

    # Nothing bundled with Julia: fall back to a PATH lookup.
    found = Sys.which(exe)
    found === nothing && error("zstd binary not found")
    return found
end
70+
4371
is_secure_url(url::AbstractString) =
4472
occursin(r"^(https://|\w+://(127\.0\.0\.1|localhost)(:\d+)?($|/))"i, url)
4573

@@ -232,6 +260,13 @@ function get_metadata_headers(url::AbstractString)
232260
end
233261
push!(headers, "Julia-CI-Variables" => join(ci_info, ';'))
234262
push!(headers, "Julia-Interactive" => string(isinteractive()))
263+
264+
# Add Accept-Encoding header only for compressed archive resources
265+
# (registries, packages, artifacts - not for metadata endpoints like /registries or /meta)
266+
if occursin(r"/(registry|package|artifact)/", url)
267+
push!(headers, "Accept-Encoding" => "zstd, gzip")
268+
end
269+
235270
for (key, val) in ENV
236271
m = match(r"^JULIA_PKG_SERVER_([A-Z0-9_]+)$"i, key)
237272
m === nothing && continue
@@ -403,22 +438,76 @@ function copy_symlinks()
403438
lowercase(var) in ("false", "f", "no", "n", "0") ? false : nothing
404439
end
405440

"""
    detect_archive_format(tarball_path::AbstractString)

Detect the compression format of the file at `tarball_path` by reading its leading
magic bytes. Return `"zstd"` or `"gzip"`.

Throws an `ErrorException` if `tarball_path` is not an existing file, if the file is
empty, or if it starts with neither the zstd nor the gzip magic number.

Note: This is primarily used for determining the correct file extension after download.
"""
function detect_archive_format(tarball_path::AbstractString)
    # `filesize` reports 0 for a path that does not exist, so a missing file must be
    # checked for explicitly or it would be misreported as "empty".
    isfile(tarball_path) ||
        error("cannot detect compression format: $tarball_path is not a file")

    # `read(io, 4)` returns at most 4 bytes (fewer when the file is shorter).
    magic = open(io -> read(io, 4), tarball_path, "r")

    isempty(magic) &&
        error("cannot detect compression format: $tarball_path is empty")

    # Compare without allocating slices; a short file simply fails the length check.
    has_prefix(sig) =
        length(magic) >= length(sig) && all(magic[i] == sig[i] for i in eachindex(sig))

    # Zstd magic number: 0x28 0xB5 0x2F 0xFD
    has_prefix((0x28, 0xB5, 0x2F, 0xFD)) && return "zstd"
    # Gzip magic number: 0x1F 0x8B
    has_prefix((0x1F, 0x8B)) && return "gzip"

    # Show hex dump of the bytes that were read, for debugging.
    hex_dump = join((@sprintf("0x%02X", b) for b in magic), " ")
    error("unknown compression format for $tarball_path (magic bytes: $hex_dump, expected zstd [0x28 0xB5 0x2F 0xFD] or gzip [0x1F 0x8B])")
end
474+
"""
    get_extract_cmd(tarball_path::AbstractString)

Return the command that decompresses `tarball_path` to stdout.

Archives that start with the zstd or gzip magic number are decompressed with `zstd`.
Any other readable, non-empty archive (for example bzip2 or xz, extensions that
`download_verify_unpack` accepts) is handed to `7z x … -so` instead.
"""
function get_extract_cmd(tarball_path::AbstractString)
    # Sniff the leading bytes. If the file cannot be read, keep the zstd default and
    # let the command itself report the problem when it is run.
    magic = try
        open(io -> read(io, 4), tarball_path, "r")
    catch
        UInt8[]
    end

    is_zstd = magic == [0x28, 0xB5, 0x2F, 0xFD]
    is_gzip = length(magic) >= 2 && magic[1] == 0x1F && magic[2] == 0x8B

    if isempty(magic) || is_zstd || is_gzip
        # NOTE(review): decoding gzip through zstd assumes the zstd binary was built
        # with zlib support; confirm this holds for the Zstd_jll build.
        return `$(exezstd()) -d -c $tarball_path`
    end

    # zstd has no bzip2 decoder, so other formats go through 7z.
    return `$(exe7z()) x $tarball_path -so`
end
485+
406486
function unpack(
407487
tarball_path::AbstractString,
408488
dest::AbstractString;
409489
verbose::Bool = false,
410490
)
411-
return Tar.extract(`$(exe7z()) x $tarball_path -so`, dest, copy_symlinks = copy_symlinks())
491+
return Tar.extract(get_extract_cmd(tarball_path), dest, copy_symlinks = copy_symlinks())
412492
end
413493

414494
"""
415495
package(src_dir::AbstractString, tarball_path::AbstractString)
416496
417497
Compress `src_dir` into a tarball located at `tarball_path`.
498+
Supports both gzip and zstd compression based on file extension.
418499
"""
419500
function package(src_dir::AbstractString, tarball_path::AbstractString; io = stderr_f())
420501
rm(tarball_path, force = true)
421-
cmd = `$(exe7z()) a -si -tgzip -mx9 $tarball_path`
502+
# Choose compression based on file extension (case-insensitive)
503+
tarball_lower = lowercase(tarball_path)
504+
if endswith(tarball_lower, ".zst") || endswith(tarball_lower, ".tar.zst")
505+
# Use zstd compression (level 19 for good compression)
506+
cmd = `$(exezstd()) -19 -c -T -o $tarball_path`
507+
else
508+
# Use gzip compression (default)
509+
cmd = `$(exe7z()) a -si -tgzip -mx9 $tarball_path`
510+
end
422511
return open(pipeline(cmd, stdout = devnull, stderr = io), write = true) do io
423512
Tar.create(src_dir, io)
424513
end
@@ -496,8 +585,8 @@ function download_verify_unpack(
496585
end
497586

498587
# If extension of url contains a recognized extension, use it, otherwise use ".gz"
499-
ext = url_ext(url)
500-
if !(ext in ["tar", "gz", "tgz", "bz2", "xz"])
588+
ext = lowercase(url_ext(url))
589+
if !(ext in ["tar", "gz", "tgz", "bz2", "xz", "zst"])
501590
ext = "gz"
502591
end
503592

@@ -538,7 +627,7 @@ function download_verify_unpack(
538627
@info("Unpacking $(tarball_path) into $(dest)...")
539628
end
540629
isnothing(progress) || progress(10000, 10000; status = "unpacking")
541-
open(`$(exe7z()) x $tarball_path -so`) do io
630+
open(get_extract_cmd(tarball_path)) do io
542631
Tar.extract(io, dest, copy_symlinks = copy_symlinks())
543632
end
544633
finally
@@ -690,7 +779,7 @@ function verify_archive_tree_hash(tar_gz::AbstractString, expected_hash::Base.SH
690779
# tarball, tree hash verification requires that the file can i) be
691780
# decompressed and ii) is a proper archive.
692781
calc_hash = try
693-
Base.SHA1(open(Tar.tree_hash, `$(exe7z()) x $tar_gz -so`))
782+
Base.SHA1(open(Tar.tree_hash, get_extract_cmd(tar_gz)))
694783
catch err
695784
@warn "unable to decompress and read archive" exception = err
696785
return false

src/Registry/Registry.jl

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ module Registry
4141
import ..Pkg
4242
using ..Pkg: depots, depots1, printpkgstyle, stderr_f, isdir_nothrow, pathrepr, pkg_server,
4343
GitTools, atomic_toml_write, create_cachedir_tag
44-
using ..Pkg.PlatformEngines: download_verify_unpack, download, download_verify, exe7z, verify_archive_tree_hash
44+
using ..Pkg.PlatformEngines: download_verify_unpack, download, download_verify, verify_archive_tree_hash, get_extract_cmd, detect_archive_format
4545
using UUIDs, LibGit2, TOML, Dates
4646
import FileWatching
4747

@@ -240,6 +240,18 @@ function check_registry_state(reg)
240240
return nothing
241241
end
242242

# Map the compression format sniffed from the file's magic bytes to the archive
# extension used when storing it: ".tar.zst" for zstd, ".tar.gz" otherwise.
# This is defensive: zstd is requested, but the server might not support it yet.
function detect_compression_format(filepath::AbstractString)::String
    return detect_archive_format(filepath) == "zstd" ? ".tar.zst" : ".tar.gz"
end
254+
243255
function download_registries(io::IO, regs::Vector{RegistrySpec}, depots::Union{String, Vector{String}} = depots())
244256
# Use the first depot as the target
245257
target_depot = depots1(depots)
@@ -282,8 +294,10 @@ function download_registries(io::IO, regs::Vector{RegistrySpec}, depots::Union{S
282294
reg_unc = uncompress_registry(tmp)
283295
reg.name = TOML.parse(reg_unc["Registry.toml"])["name"]::String
284296
end
285-
mv(tmp, joinpath(regdir, reg.name * ".tar.gz"); force = true)
286-
reg_info = Dict("uuid" => string(reg.uuid), "git-tree-sha1" => string(_hash), "path" => reg.name * ".tar.gz")
297+
# Detect what we actually got from the server (defensive against servers that don't support zstd yet)
298+
ext = detect_compression_format(tmp)
299+
mv(tmp, joinpath(regdir, reg.name * ext); force = true)
300+
reg_info = Dict("uuid" => string(reg.uuid), "git-tree-sha1" => string(_hash), "path" => reg.name * ext)
287301
atomic_toml_write(joinpath(regdir, reg.name * ".toml"), reg_info)
288302
registry_update_log[string(reg.uuid)] = now()
289303
printpkgstyle(io, :Added, "`$(reg.name)` registry to $(Base.contractuser(regdir))")
@@ -546,8 +560,10 @@ function update(regs::Vector{RegistrySpec}; io::IO = stderr_f(), force::Bool = t
546560
Base.rm(reg.path; recursive = true, force = true)
547561
end
548562
registry_path = dirname(reg.path)
549-
mv(tmp, joinpath(registry_path, reg.name * ".tar.gz"); force = true)
550-
reg_info = Dict("uuid" => string(reg.uuid), "git-tree-sha1" => string(hash), "path" => reg.name * ".tar.gz")
563+
# Detect what we actually got from the server (defensive against servers that don't support zstd yet)
564+
ext = detect_compression_format(tmp)
565+
mv(tmp, joinpath(registry_path, reg.name * ext); force = true)
566+
reg_info = Dict("uuid" => string(reg.uuid), "git-tree-sha1" => string(hash), "path" => reg.name * ext)
551567
atomic_toml_write(joinpath(registry_path, reg.name * ".toml"), reg_info)
552568
registry_update_log[string(reg.uuid)] = now()
553569
@label done_tarball_read

src/Registry/registry_instance.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -279,7 +279,7 @@ function uncompress_registry(tar_gz::AbstractString)
279279
data = Dict{String, String}()
280280
buf = Vector{UInt8}(undef, Tar.DEFAULT_BUFFER_SIZE)
281281
io = IOBuffer()
282-
open(`$(exe7z()) x $tar_gz -so`) do tar
282+
open(get_extract_cmd(tar_gz)) do tar
283283
Tar.read_tarball(x -> true, tar; buf = buf) do hdr, _
284284
if hdr.type == :file
285285
Tar.read_data(tar, io; size = hdr.size, buf = buf)

src/precompile.jl

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -82,15 +82,13 @@ function _run_precompilation_script_setup()
8282
repo = "$(escape_string(tmp))/TestPkg.jl"
8383
""",
8484
)
85-
Tar.create("registries/Registry", "registries/Registry.tar")
86-
cmd = `$(Pkg.PlatformEngines.exe7z()) a "registries/Registry.tar.gz" -tgzip "registries/Registry.tar"`
87-
run(pipeline(cmd, stdout = stdout_f(), stderr = stderr_f()))
85+
Pkg.PlatformEngines.package("registries/Registry", "registries/Registry.tar.zst")
8886
write(
8987
"registries/Registry.toml",
9088
"""
9189
git-tree-sha1 = "11b5fad51c4f98cfe0c145ceab0b8fb63fed6f81"
9290
uuid = "37c07fec-e54c-4851-934c-2e3885e4053e"
93-
path = "Registry.tar.gz"
91+
path = "Registry.tar.zst"
9492
""",
9593
)
9694
Base.rm("registries/Registry"; recursive = true)

0 commit comments

Comments
 (0)