Skip to content

Commit 3ca05c0

Browse files
committed
ask the package server for zstd compressed data
Together with JuliaPackaging/PkgServer.jl#220, this allows Pkg to download registries, packages, and artifacts as zstd (`.zst`) archives, which, based on benchmarks, decompress significantly faster and also tend to be smaller. Storing the registry as a zstd archive is not a backwards-compatibility issue, because older Pkg versions can still decompress these archives with 7z.
1 parent b6e4785 commit 3ca05c0

File tree

6 files changed

+131
-20
lines changed

6 files changed

+131
-20
lines changed

Project.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
2323
TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
2424
Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
2525
UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
26+
Zstd_jll = "3161d3a3-bdf6-5164-811a-617609db77b4"
2627
p7zip_jll = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
2728

2829
[weakdeps]
@@ -41,11 +42,12 @@ Libdl = "1.11"
4142
Logging = "1.11"
4243
Markdown = "1.11"
4344
Printf = "1.11"
44-
Random = "1.11"
4545
REPL = "1.11"
46+
Random = "1.11"
4647
SHA = "0.7, 1"
4748
TOML = "1"
4849
Tar = "1.10"
4950
UUIDs = "1.11"
51+
Zstd_jll = "1.5.7"
5052
julia = "1.12"
5153
p7zip_jll = "17.5"

docs/src/protocol.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,18 @@ The client can make GET or HEAD requests to the following resources:
136136

137137
Only the `/registries` changes - all other resources can be cached forever and the server will indicate this with the appropriate HTTP headers.
138138

139+
### Compression Negotiation
140+
141+
The Pkg protocol supports multiple compression formats.
142+
143+
- **Zstd compression** (current): Modern clients send `Accept-Encoding: zstd, gzip` to request Zstandard-compressed resources with gzip as a fallback.
144+
- **Gzip compression** (legacy): Older clients that only support gzip send `Accept-Encoding: gzip` or omit the header entirely.
145+
146+
Clients verify the actual compression format by reading file magic bytes after download to determine the correct file extension for caching:
147+
148+
- **Zstd format**: Magic bytes `0x28 0xB5 0x2F 0xFD` (4 bytes)
149+
- **Gzip format**: Magic bytes `0x1F 0x8B` (2 bytes)
150+
139151
### Reference Implementation
140152

141153
A reference implementation of the Pkg Server protocol is available at [PkgServer.jl](https://github.com/JuliaPackaging/PkgServer.jl).

src/PlatformEngines.jl

Lines changed: 95 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,17 @@
44

55
module PlatformEngines
66

7-
using SHA, Downloads, Tar
7+
using SHA, Downloads, Tar, Dates, Printf
88
import ...Pkg: Pkg, TOML, pkg_server, depots1, can_fancyprint, stderr_f, atomic_toml_write
99
using ..MiniProgressBars
10-
using Base.BinaryPlatforms, p7zip_jll
10+
using Base.BinaryPlatforms, p7zip_jll, Zstd_jll
1111

12-
export verify, unpack, package, download_verify_unpack
12+
export verify, unpack, package, download_verify_unpack, get_extract_cmd, detect_archive_format
1313

1414
const EXE7Z_LOCK = ReentrantLock()
1515
const EXE7Z = Ref{String}()
16+
const EXEZSTD_LOCK = ReentrantLock()
17+
const EXEZSTD = Ref{String}()
1618

1719
function exe7z()
1820
# If the JLL is available, use the wrapper function defined in there
@@ -28,6 +30,20 @@ function exe7z()
2830
end
2931
end
3032

33+
"""
    exezstd()

Return a `Cmd` that invokes the `zstd` executable.

The wrapper shipped by `Zstd_jll` is preferred when that JLL is available.
Otherwise the binary is located once with `findzstd()`, its path is cached in
`EXEZSTD` (guarded by `EXEZSTD_LOCK`), and a `Cmd` for that path is returned.
"""
function exezstd()
    # Prefer the JLL-provided wrapper whenever it can be used.
    Zstd_jll.is_available() && return Zstd_jll.zstd()

    # Fall back to a binary on disk; look it up only on first use.
    zstd_path = @lock EXEZSTD_LOCK begin
        isassigned(EXEZSTD) || (EXEZSTD[] = findzstd())
        EXEZSTD[]
    end
    return Cmd([zstd_path])
end
46+
3147
function find7z()
3248
name = "7z"
3349
Sys.iswindows() && (name = "$name.exe")
@@ -40,6 +56,18 @@ function find7z()
4056
error("7z binary not found")
4157
end
4258

59+
"""
    findzstd()

Locate a `zstd` executable and return its path.

The search order is: `../libexec` relative to `Sys.BINDIR`, then `Sys.BINDIR`
itself, then the system `PATH` (via `Sys.which`). Throws an error if no binary
is found.
"""
function findzstd()
    exe = Sys.iswindows() ? "zstd.exe" : "zstd"
    bindir = Sys.BINDIR::String

    # Look next to the running Julia binary first.
    for reldir in (joinpath("..", "libexec"), ".")
        candidate = normpath(bindir, reldir, exe)
        isfile(candidate) && return candidate
    end

    # Otherwise rely on whatever is on the PATH.
    found = Sys.which(exe)
    found === nothing && error("zstd binary not found")
    return found
end
70+
4371
is_secure_url(url::AbstractString) =
4472
occursin(r"^(https://|\w+://(127\.0\.0\.1|localhost)(:\d+)?($|/))"i, url)
4573

@@ -232,6 +260,13 @@ function get_metadata_headers(url::AbstractString)
232260
end
233261
push!(headers, "Julia-CI-Variables" => join(ci_info, ';'))
234262
push!(headers, "Julia-Interactive" => string(isinteractive()))
263+
264+
# Add Accept-Encoding header only for compressed archive resources
265+
# (registries, packages, artifacts - not for metadata endpoints like /registries or /meta)
266+
if occursin(r"/(registry|package|artifact)/", url)
267+
push!(headers, "Accept-Encoding" => "zstd, gzip")
268+
end
269+
235270
for (key, val) in ENV
236271
m = match(r"^JULIA_PKG_SERVER_([A-Z0-9_]+)$"i, key)
237272
m === nothing && continue
@@ -403,22 +438,73 @@ function copy_symlinks()
403438
lowercase(var) in ("false", "f", "no", "n", "0") ? false : nothing
404439
end
405440

441+
"""
    detect_archive_format(tarball_path::AbstractString)

Detect the compression format of `tarball_path` by reading its leading magic bytes.

Returns `"zstd"` when the file starts with `0x28 0xB5 0x2F 0xFD` and `"gzip"` when
it starts with `0x1F 0x8B`.

Throws an `ErrorException` when the path is not an existing regular file, when the
file is empty, or when the leading bytes match neither format. In the last case
the message includes a hex dump of the bytes that were read.
"""
function detect_archive_format(tarball_path::AbstractString)
    # `filesize` reports 0 for a missing path, so check existence explicitly to
    # avoid describing a missing file as "empty".
    isfile(tarball_path) ||
        error("cannot detect compression format: $tarball_path does not exist or is not a regular file")

    # `read(io, 4)` returns at most 4 bytes (fewer for shorter files), so a
    # separate size query is unnecessary.
    magic = open(io -> read(io, 4), tarball_path, "r")
    isempty(magic) && error("cannot detect compression format: $tarball_path is empty")

    # Compare against a signature without allocating temporary slices.
    has_signature(sig) =
        length(magic) >= length(sig) && all(magic[i] == sig[i] for i in eachindex(sig))

    # Zstd magic number: 0x28 0xB5 0x2F 0xFD
    has_signature((0x28, 0xB5, 0x2F, 0xFD)) && return "zstd"
    # Gzip magic number: 0x1F 0x8B
    has_signature((0x1F, 0x8B)) && return "gzip"

    # Show hex dump of magic bytes for debugging
    hex_dump = join((@sprintf("0x%02X", b) for b in magic), " ")
    error("unknown compression format for $tarball_path (magic bytes: $hex_dump, expected zstd [0x28 0xB5 0x2F 0xFD] or gzip [0x1F 0x8B])")
end
471+
472+
"""
    get_extract_cmd(tarball_path::AbstractString)

Return the command that decompresses `tarball_path` to standard output.

The tool is chosen from the file's leading magic bytes:

- zstd (`0x28 0xB5 0x2F 0xFD`) and gzip (`0x1F 0x8B`) archives are decompressed
  with `zstd -d -c`.
- Anything else (for example the `bz2`, `xz` or plain `tar` archives that
  `download_verify_unpack` still accepts) falls back to `7z x … -so`, the command
  used for all archives before zstd support was added.

This function never throws on an unreadable or missing file; in that case the 7z
command is returned and the failure surfaces when the command is run.
"""
function get_extract_cmd(tarball_path::AbstractString)
    # Sniff at most 4 bytes; a missing file yields an empty header.
    magic = isfile(tarball_path) ? open(io -> read(io, 4), tarball_path, "r") : UInt8[]

    is_zstd = length(magic) >= 4 &&
        magic[1] == 0x28 && magic[2] == 0xB5 && magic[3] == 0x2F && magic[4] == 0xFD
    is_gzip = length(magic) >= 2 && magic[1] == 0x1F && magic[2] == 0x8B

    if is_zstd || is_gzip
        # Per the original author's note, zstd handles both formats and is
        # roughly 3x faster than 7z on gzip input.
        return `$(exezstd()) -d -c $tarball_path`
    end

    # Formats zstd is not known to handle here: keep the pre-existing 7z path.
    return `$(exe7z()) x $tarball_path -so`
end
482+
406483
"""
    unpack(tarball_path::AbstractString, dest::AbstractString; verbose::Bool = false)

Decompress `tarball_path` and extract the contained tar archive into `dest`.

The decompression command comes from `get_extract_cmd`; symlink handling follows
`copy_symlinks()`. `verbose` is accepted for interface compatibility and is not
used by this function.
"""
function unpack(
        tarball_path::AbstractString,
        dest::AbstractString;
        verbose::Bool = false,
    )
    decompress_cmd = get_extract_cmd(tarball_path)
    symlink_policy = copy_symlinks()
    return Tar.extract(decompress_cmd, dest, copy_symlinks = symlink_policy)
end
413490

414491
"""
415492
package(src_dir::AbstractString, tarball_path::AbstractString)
416493
417494
Compress `src_dir` into a tarball located at `tarball_path`.
495+
Supports both gzip and zstd compression based on file extension.
418496
"""
419497
function package(src_dir::AbstractString, tarball_path::AbstractString; io = stderr_f())
420498
rm(tarball_path, force = true)
421-
cmd = `$(exe7z()) a -si -tgzip -mx9 $tarball_path`
499+
# Choose compression based on file extension (case-insensitive)
500+
tarball_lower = lowercase(tarball_path)
501+
if endswith(tarball_lower, ".zst") || endswith(tarball_lower, ".tar.zst")
502+
# Use zstd compression (level 19 for good compression)
503+
cmd = `$(exezstd()) -19 -c -T -o $tarball_path`
504+
else
505+
# Use gzip compression (default)
506+
cmd = `$(exe7z()) a -si -tgzip -mx9 $tarball_path`
507+
end
422508
return open(pipeline(cmd, stdout = devnull, stderr = io), write = true) do io
423509
Tar.create(src_dir, io)
424510
end
@@ -496,8 +582,8 @@ function download_verify_unpack(
496582
end
497583

498584
# If extension of url contains a recognized extension, use it, otherwise use ".gz"
499-
ext = url_ext(url)
500-
if !(ext in ["tar", "gz", "tgz", "bz2", "xz"])
585+
ext = lowercase(url_ext(url))
586+
if !(ext in ["tar", "gz", "tgz", "bz2", "xz", "zst"])
501587
ext = "gz"
502588
end
503589

@@ -538,7 +624,7 @@ function download_verify_unpack(
538624
@info("Unpacking $(tarball_path) into $(dest)...")
539625
end
540626
isnothing(progress) || progress(10000, 10000; status = "unpacking")
541-
open(`$(exe7z()) x $tarball_path -so`) do io
627+
open(get_extract_cmd(tarball_path)) do io
542628
Tar.extract(io, dest, copy_symlinks = copy_symlinks())
543629
end
544630
finally
@@ -690,7 +776,7 @@ function verify_archive_tree_hash(tar_gz::AbstractString, expected_hash::Base.SH
690776
# tarball, tree hash verification requires that the file can i) be
691777
# decompressed and ii) is a proper archive.
692778
calc_hash = try
693-
Base.SHA1(open(Tar.tree_hash, `$(exe7z()) x $tar_gz -so`))
779+
Base.SHA1(open(Tar.tree_hash, get_extract_cmd(tar_gz)))
694780
catch err
695781
@warn "unable to decompress and read archive" exception = err
696782
return false

src/Registry/Registry.jl

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ module Registry
4141
import ..Pkg
4242
using ..Pkg: depots, depots1, printpkgstyle, stderr_f, isdir_nothrow, pathrepr, pkg_server,
4343
GitTools, atomic_toml_write, create_cachedir_tag
44-
using ..Pkg.PlatformEngines: download_verify_unpack, download, download_verify, exe7z, verify_archive_tree_hash
44+
using ..Pkg.PlatformEngines: download_verify_unpack, download, download_verify, verify_archive_tree_hash, get_extract_cmd, detect_archive_format
4545
using UUIDs, LibGit2, TOML, Dates
4646
import FileWatching
4747

@@ -240,6 +240,15 @@ function check_registry_state(reg)
240240
return nothing
241241
end
242242

243+
# Map the compression format detected for `filepath` to the file extension used
# when storing a registry tarball: ".tar.zst" for zstd, ".tar.gz" otherwise.
# Errors raised by `detect_archive_format` propagate unchanged.
function detect_compression_format(filepath::AbstractString)::String
    is_zstd = detect_archive_format(filepath) == "zstd"
    return is_zstd ? ".tar.zst" : ".tar.gz"
end
251+
243252
function download_registries(io::IO, regs::Vector{RegistrySpec}, depots::Union{String, Vector{String}} = depots())
244253
# Use the first depot as the target
245254
target_depot = depots1(depots)
@@ -282,8 +291,10 @@ function download_registries(io::IO, regs::Vector{RegistrySpec}, depots::Union{S
282291
reg_unc = uncompress_registry(tmp)
283292
reg.name = TOML.parse(reg_unc["Registry.toml"])["name"]::String
284293
end
285-
mv(tmp, joinpath(regdir, reg.name * ".tar.gz"); force = true)
286-
reg_info = Dict("uuid" => string(reg.uuid), "git-tree-sha1" => string(_hash), "path" => reg.name * ".tar.gz")
294+
# Detect what we actually got from the server (defensive against servers that don't support zstd yet)
295+
ext = detect_compression_format(tmp)
296+
mv(tmp, joinpath(regdir, reg.name * ext); force = true)
297+
reg_info = Dict("uuid" => string(reg.uuid), "git-tree-sha1" => string(_hash), "path" => reg.name * ext)
287298
atomic_toml_write(joinpath(regdir, reg.name * ".toml"), reg_info)
288299
registry_update_log[string(reg.uuid)] = now()
289300
printpkgstyle(io, :Added, "`$(reg.name)` registry to $(Base.contractuser(regdir))")
@@ -546,8 +557,10 @@ function update(regs::Vector{RegistrySpec}; io::IO = stderr_f(), force::Bool = t
546557
Base.rm(reg.path; recursive = true, force = true)
547558
end
548559
registry_path = dirname(reg.path)
549-
mv(tmp, joinpath(registry_path, reg.name * ".tar.gz"); force = true)
550-
reg_info = Dict("uuid" => string(reg.uuid), "git-tree-sha1" => string(hash), "path" => reg.name * ".tar.gz")
560+
# Detect what we actually got from the server (defensive against servers that don't support zstd yet)
561+
ext = detect_compression_format(tmp)
562+
mv(tmp, joinpath(registry_path, reg.name * ext); force = true)
563+
reg_info = Dict("uuid" => string(reg.uuid), "git-tree-sha1" => string(hash), "path" => reg.name * ext)
551564
atomic_toml_write(joinpath(registry_path, reg.name * ".toml"), reg_info)
552565
registry_update_log[string(reg.uuid)] = now()
553566
@label done_tarball_read

src/Registry/registry_instance.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -279,7 +279,7 @@ function uncompress_registry(tar_gz::AbstractString)
279279
data = Dict{String, String}()
280280
buf = Vector{UInt8}(undef, Tar.DEFAULT_BUFFER_SIZE)
281281
io = IOBuffer()
282-
open(`$(exe7z()) x $tar_gz -so`) do tar
282+
open(get_extract_cmd(tar_gz)) do tar
283283
Tar.read_tarball(x -> true, tar; buf = buf) do hdr, _
284284
if hdr.type == :file
285285
Tar.read_data(tar, io; size = hdr.size, buf = buf)

src/precompile.jl

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -82,15 +82,13 @@ function _run_precompilation_script_setup()
8282
repo = "$(escape_string(tmp))/TestPkg.jl"
8383
""",
8484
)
85-
Tar.create("registries/Registry", "registries/Registry.tar")
86-
cmd = `$(Pkg.PlatformEngines.exe7z()) a "registries/Registry.tar.gz" -tgzip "registries/Registry.tar"`
87-
run(pipeline(cmd, stdout = stdout_f(), stderr = stderr_f()))
85+
Pkg.PlatformEngines.package("registries/Registry", "registries/Registry.tar.zst")
8886
write(
8987
"registries/Registry.toml",
9088
"""
9189
git-tree-sha1 = "11b5fad51c4f98cfe0c145ceab0b8fb63fed6f81"
9290
uuid = "37c07fec-e54c-4851-934c-2e3885e4053e"
93-
path = "Registry.tar.gz"
91+
path = "Registry.tar.zst"
9492
""",
9593
)
9694
Base.rm("registries/Registry"; recursive = true)

0 commit comments

Comments
 (0)