Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pure dec enc with gz #143

Merged
merged 26 commits into from
Aug 4, 2024
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
6c54ca0
revise a decoder and encoder, being pure
hannesm Feb 1, 2024
c67f945
remove stuff
hannesm Feb 2, 2024
9ccc73b
wip
hannesm Feb 3, 2024
ebabd3c
fix
hannesm Feb 3, 2024
ce9337b
proposed API
hannesm Feb 3, 2024
50f6659
add filter
hannesm Feb 3, 2024
1b4ae55
initial compiling tar_unix
hannesm Feb 3, 2024
984ffe0
remove offset nonsense
hannesm Feb 3, 2024
9c1c120
lwt-unix
hannesm Feb 4, 2024
29d884e
further work, get tests a bit more up to speed
hannesm Feb 4, 2024
281883b
more tests are working now
hannesm Feb 4, 2024
60d6faa
revive transform test
hannesm Feb 4, 2024
462063b
test tar_unix, use fold for list
hannesm Feb 4, 2024
2b49b1f
document write_header
hannesm Feb 4, 2024
2388f62
Purify fold and move it into Tar with a GADT, use it then for Tar_gz …
dinosaure Feb 7, 2024
8b308a9
Keep the bind as is and Tar_gz does not require the run function (/cc…
dinosaure Feb 7, 2024
14681fe
Implement Tar_gz.gzipped : _ Tar.t -> _ Tar.t
reynir Feb 7, 2024
d5ad1df
Fix the otar binary
dinosaure Feb 7, 2024
c7c81d2
Implement the high kind polymorphism to fix the lwt_unix layer
dinosaure Feb 7, 2024
906d6dc
Add a comment to explain the hkp trick
dinosaure Feb 21, 2024
b8b4ff6
Minor: qualify opens, fix tests
reynir May 7, 2024
0cfd771
Partially implement tar_eio, stub out remainder
reynir May 7, 2024
c24cd1b
Seek returns unit, improve documentation
robur-team May 9, 2024
576dcff
Remove [`Msg of string] from Tar_unix.decode_error
robur-team May 9, 2024
b1c10d0
Document Tar.fold
reynir May 9, 2024
890c1fe
Fixups
reynir May 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion bin/otar.ml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*)

(*
let () = Printexc.record_backtrace true

module Tar_gz = Tar_gz.Make
Expand Down Expand Up @@ -129,3 +129,4 @@ let () = match Sys.argv with
| _ ->
let cmd = Filename.basename Sys.argv.(0) in
Format.eprintf "%s <directory> [<filename.tar.gz>]\n%s list <filename.tar.gz>\n" cmd cmd
*)
412 changes: 190 additions & 222 deletions lib/tar.ml

Large diffs are not rendered by default.

86 changes: 46 additions & 40 deletions lib/tar.mli
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
{e %%VERSION%% - {{:%%PKG_HOMEPAGE%% }homepage}} *)

(** The type of errors that may occur. *)
type error = [`Checksum_mismatch | `Corrupt_pax_header | `Zero_block | `Unmarshal of string]
type error = [ `Checksum_mismatch | `Corrupt_pax_header | `Zero_block | `Unmarshal of string ]

(** [pp_error ppf e] pretty prints the error [e] on the formatter [ppf]. *)
val pp_error : Format.formatter -> [< error] -> unit
Expand Down Expand Up @@ -123,7 +123,7 @@ module Header : sig
(** Unmarshal a header block, returning [Error `Zero_block] if it's all zeroes.
This header block may be preceded by an [?extended] block which
will override some fields. *)
val unmarshal : ?extended:Extended.t -> string -> (t, [`Zero_block | `Checksum_mismatch | `Unmarshal of string]) result
val unmarshal : ?extended:Extended.t -> string -> (t, [> `Zero_block | `Checksum_mismatch | `Unmarshal of string]) result

(** Marshal a header block, computing and inserting the checksum. *)
val marshal : ?level:compatibility -> bytes -> t -> (unit, [> `Msg of string ]) result
Expand All @@ -139,47 +139,53 @@ module Header : sig
val to_sectors: t -> int64
end

module type ASYNC = sig
type 'a t
val ( >>= ): 'a t -> ('a -> 'b t) -> 'b t
val return: 'a -> 'a t
end
(** {1 Decoding and encoding of a whole archive} *)

module type READER = sig
type in_channel
type 'a io
val really_read: in_channel -> bytes -> unit io
val skip: in_channel -> int -> unit io
end
(** The type of the decode state. *)
type decode_state

module type WRITER = sig
type out_channel
type 'a io
val really_write: out_channel -> string -> unit io
end
(** [decode_state ~global ()] constructs a decode_state. *)
val decode_state : ?global:Header.Extended.t -> unit -> decode_state

module type HEADERREADER = sig
type in_channel
type 'a io

(** Returns the next header block or error [`Eof] if two consecutive
zero-filled blocks are discovered. Assumes stream is positioned at the
possible start of a header block.
@param global Holds the current global pax extended header, if
any. Needs to be given to the next call to [read]. *)
val read : global:Header.Extended.t option -> in_channel ->
(Header.t * Header.Extended.t option, [ `Eof | `Fatal of [ `Checksum_mismatch | `Corrupt_pax_header | `Unmarshal of string ] ]) result io
end
(** [decode t data] decodes [data] taking the current state [t] into account.
On success it returns a new state, optionally either an action the caller
should perform ([`Read] or [`Skip]) or a decoded [`Header], and optionally a
new global PAX header.

module type HEADERWRITER = sig
type out_channel
type 'a io
val write : ?level:Header.compatibility -> Header.t -> out_channel -> (unit, [> `Msg of string ]) result io
val write_global_extended_header : Header.Extended.t -> out_channel -> (unit, [> `Msg of string ]) result io
end
If no [`Read] or [`Skip] is returned, the new state should be used with
[decode] with the next [Header.length] sized string, which will lead to
further decoding until [`Eof] (or an error) occurs. *)
val decode : decode_state -> string ->
(decode_state * [ `Read of int | `Skip of int | `Header of Header.t ] option * Header.Extended.t option,
[ `Eof | `Fatal of error ])
result

(** [encode_header ~level hdr] encodes the header with the provided [level]
(defaults to [V7]) into a list of strings to be written to the disk.
Once a header is written, the payload (padded to multiples of
[Header.length]) should follow. *)
val encode_header : ?level:Header.compatibility ->
Header.t -> (string list, [> `Msg of string ]) result

(** [encode_global_extended_header hdr] encodes the global extended header as
a list of strings. *)
val encode_global_extended_header : ?level:Header.compatibility -> Header.Extended.t -> (string list, [> `Msg of string ]) result

(** {1 Pure implementation of [fold].} *)

(* A description of an I/O computation producing an ['a] or failing with
   ['err]. Values of this type perform no I/O themselves; an interpreter
   supplied elsewhere is expected to give each constructor its effect.
   NOTE(review): the per-constructor notes below are inferred from the
   constructor names and result types -- confirm against the interpreter. *)
type ('a, 'err) t =
(* Takes a byte count and yields a string; presumably exactly that many
   bytes -- TODO confirm. *)
| Really_read : int -> (string, 'err) t
(* Takes a byte count and yields a string; presumably at most that many
   bytes -- TODO confirm. *)
| Read : int -> (string, 'err) t
(* Takes an int and yields an int. *)
| Seek : int -> (int, 'err) t
(* Sequencing: a computation paired with a continuation consuming its
   result. *)
| Bind : ('a, 'err) t * ('a -> ('b, 'err) t) -> ('b, 'err) t
(* Lifts an already-known [result] into a computation. *)
| Return : ('a, 'err) result -> ('a, 'err) t

val really_read : int -> (string, _) t
val read : int -> (string, _) t
val seek : int -> (int, _) t
val ( let* ) : ('a, 'err) t -> ('a -> ('b, 'err) t) -> ('b, 'err) t
val return : ('a, 'err) result -> ('a, 'err) t

module HeaderReader(Async: ASYNC)(Reader: READER with type 'a io = 'a Async.t) :
HEADERREADER with type in_channel = Reader.in_channel and type 'a io = 'a Async.t
type ('a, 'err) fold = (?global:Header.Extended.t -> Header.t -> 'a -> ('a, 'err) result) -> 'a -> ('a, 'err) t

module HeaderWriter(Async: ASYNC)(Writer: WRITER with type 'a io = 'a Async.t) :
HEADERWRITER with type out_channel = Writer.out_channel and type 'a io = 'a Async.t
val fold : ('a, [> `Fatal of error ]) fold
161 changes: 86 additions & 75 deletions lib/tar_gz.ml
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,10 @@
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*)

module type READER = sig
type in_channel
type 'a io
val read : in_channel -> bytes -> int io
end

(* 32-bit read/write accessors on a [De.bigstring], bound directly to the
   compiler's bigstring primitives. The [int] argument is the offset passed
   to the primitive. NOTE(review): the [_ne] suffix presumably stands for
   "native endian" -- confirm against the primitives' documentation. *)
external ba_get_int32_ne : De.bigstring -> int -> int32 = "%caml_bigstring_get32"
external ba_set_int32_ne : De.bigstring -> int -> int32 -> unit = "%caml_bigstring_set32"

(*
let bigstring_to_string ?(off= 0) ?len ba =
let len = match len with
| Some len -> len
Expand All @@ -41,6 +36,7 @@ let bigstring_to_string ?(off= 0) ?len ba =
Bytes.set res i v
done;
Bytes.unsafe_to_string res
*)

let bigstring_blit_string src ~src_off dst ~dst_off ~len =
let len0 = len land 3 in
Expand Down Expand Up @@ -71,6 +67,89 @@ let bigstring_blit_bytes src ~src_off dst ~dst_off ~len =
Bytes.set dst (dst_off + i) v
done

(* Mutable state carried across successive reads of a gzip-compressed tar
   stream. *)
type decoder =
(* Current inflate decoder; replaced with the decoder returned by each
   decoding step. *)
{ mutable gz : Gz.Inf.decoder
(* Staging buffer: compressed input is copied here before being handed to
   the inflate decoder. *)
; ic_buffer : De.bigstring
(* Output buffer the inflate decoder writes into; decompressed bytes are
   copied out of it to the caller. *)
; oc_buffer : De.bigstring
(* Number of bytes requested from the underlying source each time the
   decoder needs more input. *)
; tp_length : int
(* Offset in [oc_buffer] of the first decompressed byte not yet handed to
   the caller. *)
; mutable pos : int }

(* [really_read_through_gz state res] fills [res] completely with
   decompressed bytes.

   It first copies whatever is still pending in [oc_buffer] (starting at
   [state.pos]). If that does not fill [res], it flushes the decoder and
   keeps decoding, requesting [tp_length] bytes of compressed input with
   [Tar.read] whenever the decoder awaits more.

   The computation yields [Error `Eof] when the gzip stream ends before
   [res] is full, and [Error (`Gz err)] when the decoder reports malformed
   input. [state.gz] and [state.pos] are updated so that a later call
   resumes where this one stopped. *)
let really_read_through_gz
: decoder -> bytes -> (unit, 'err) Tar.t
= fun ({ ic_buffer; oc_buffer; tp_length; _ } as state) res ->
let open Tar in
(* Decode until [res_len] more bytes have been written to [res] at
   [res_off], or the stream ends. *)
let rec until_full_or_end gz (res, res_off, res_len) =
match Gz.Inf.decode gz with
| `Flush gz ->
(* [max]: bytes the decoder has produced into [oc_buffer] (buffer length
   minus the remaining room reported by [Gz.Inf.dst_rem]). *)
let max = De.bigstring_length oc_buffer - Gz.Inf.dst_rem gz in
let len = min res_len max in
bigstring_blit_bytes oc_buffer ~src_off:0 res ~dst_off:res_off ~len;
(* [len < max] means [res] is full and [oc_buffer] still holds unread
   bytes: remember where they start. Otherwise the whole output buffer was
   consumed, so flush it and continue decoding. *)
if len < max
then ( state.pos <- len
; state.gz <- gz
; return (Ok ()) )
else until_full_or_end (Gz.Inf.flush gz) (res, res_off + len, res_len - len)
| `End gz ->
let max = De.bigstring_length oc_buffer - Gz.Inf.dst_rem gz in
let len = min res_len max in
bigstring_blit_bytes oc_buffer ~src_off:0 res ~dst_off:res_off ~len;
(* The stream is finished: if the final output could not satisfy the
   request, report end of file. *)
if res_len > len
then return (Error `Eof)
else ( state.pos <- len
; state.gz <- gz
; return (Ok ()) )
| `Await gz ->
(* The decoder needs more compressed input: read it, stage it in
   [ic_buffer] and hand that region to the decoder. *)
let* tp_buffer = Tar.read tp_length in
let len = String.length tp_buffer in
bigstring_blit_string tp_buffer ~src_off:0 ic_buffer ~dst_off:0 ~len;
let gz = Gz.Inf.src gz ic_buffer 0 len in
until_full_or_end gz (res, res_off, res_len)
| `Malformed err -> return (Error (`Gz err)) in
(* Serve the request from bytes left over in [oc_buffer] by a previous
   call before decoding anything new. *)
let max = (De.bigstring_length oc_buffer - Gz.Inf.dst_rem state.gz) - state.pos in
let len = min (Bytes.length res) max in
bigstring_blit_bytes oc_buffer ~src_off:state.pos res ~dst_off:0 ~len;
if len < max
then ( state.pos <- state.pos + len
; return (Ok ()) )
else until_full_or_end (Gz.Inf.flush state.gz) (res, len, Bytes.length res - len)

(* [really_read_through_gz decoder len] shadows the bytes-based function of
   the same name: it allocates a fresh [len]-byte buffer, has the earlier
   definition fill it through [decoder], and yields the buffer's contents
   as a string. *)
let really_read_through_gz decoder len =
let open Tar in
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here we locally open Tar to get let* only? Then maybe

Suggested change
let open Tar in
let ( let* ) = Tar.( let* ) in

let res = Bytes.create len in
let* () = really_read_through_gz decoder res in
Tar.return (Ok (Bytes.unsafe_to_string res))

(* Errors that reading a gzipped archive can produce: a fatal tar error
   ([`Fatal]), a gzip stream that ends before the requested bytes are
   available ([`Eof]), or a message reported by the gzip decoder for
   malformed input ([`Gz]). *)
type error = [ `Fatal of Tar.error | `Eof | `Gz of string ]

(* [seek_through_gz state len] skips [len] decompressed bytes.

   A gzip stream cannot be seeked, so the bytes are decompressed and
   discarded. This is done in chunks of at most 64 KiB rather than in a
   single [len]-byte read, so skipping a large archive entry neither
   allocates a buffer the size of that entry nor trips the maximum string
   length. The first read is always issued, so for [len] no larger than one
   chunk (including [0]) the behaviour is exactly that of a single
   [really_read_through_gz state len].

   Errors from the underlying reads ([`Eof], [`Gz _]) are propagated. *)
let seek_through_gz : decoder -> int -> (int, [> error ]) Tar.t = fun state len ->
  let ( let* ) = Tar.( let* ) in
  let chunk_size = 0x10000 in
  let rec skip remaining =
    let chunk = min remaining chunk_size in
    let* _buf = really_read_through_gz state chunk in
    if remaining > chunk
    then skip (remaining - chunk)
    else Tar.return (Ok 0 (* XXX(dinosaure): actually, [fold] ignores the result. *)) in
  skip len

(* An interpreter for [Tar.t] computations: [run] evaluates a computation
   to its [result]. The field is universally quantified over both the value
   type and the error type, so one [run] value works for every
   computation. *)
type 'err run = { run : 'a 'err. ('a, 'err) Tar.t -> ('a, 'err) result } [@@unboxed]

(* [fold_with_gz ~run f init] is [Tar.fold f init] rewritten so that every
   read and seek it issues is served from a gzip-decompressed view of the
   input.

   [go] walks the computation produced by [Tar.fold]:
   - [Really_read] and [Seek] are replaced by their through-gzip
     counterparts, which obtain compressed input with [Tar.read];
   - [Bind] is rewritten on BOTH sides. Evaluating the left-hand side
     directly with the caller's interpreter would execute its
     [Really_read]/[Seek] against the raw compressed input and bypass the
     decoder, so the left-hand side goes through [go] as well and the
     continuation is rewritten when it is applied;
   - [Return] is passed through.

   The result is still a [Tar.t] for the caller to interpret. The [~run]
   argument is kept for interface compatibility and is not needed by the
   rewriting itself. A fresh decoder with 4 KiB input and output buffers is
   created per call. *)
let fold_with_gz
  : run:[> error ] run -> _ -> _ -> _
  = fun ~run:_ f init ->
  let rec go : type a. decoder -> (a, [> error ] as 'err) Tar.t -> (a, 'err) Tar.t = fun decoder -> function
    | Tar.Really_read len -> really_read_through_gz decoder len
    | Tar.Read _len -> assert false (* XXX(dinosaure): actually does not emit [Tar.Read]. *)
    | Tar.Seek len -> seek_through_gz decoder len
    | Tar.Return v -> Tar.return v
    | Tar.Bind (x, f) ->
      Tar.Bind (go decoder x, fun value -> go decoder (f value)) in
  let decoder =
    let oc_buffer = De.bigstring_create 0x1000 in
    { gz= Gz.Inf.decoder `Manual ~o:oc_buffer
    ; oc_buffer
    ; ic_buffer= De.bigstring_create 0x1000
    ; tp_length= 0x1000
    ; pos= 0 } in
  go decoder (Tar.fold f init)

(*
module Make
(Async : Tar.ASYNC)
(Writer : Tar.WRITER with type 'a io = 'a Async.t)
Expand Down Expand Up @@ -108,75 +187,6 @@ module Make
go gz (str, 0, String.length str)
end

module Gz_reader = struct
type in_channel =
{ mutable gz : Gz.Inf.decoder
; ic_buffer : De.bigstring
; oc_buffer : De.bigstring
; tp_buffer : bytes
; in_channel : Reader.in_channel
; mutable pos : int }

type 'a io = 'a Async.t

let really_read
: in_channel -> bytes -> unit io
= fun ({ ic_buffer; oc_buffer; in_channel; tp_buffer; _ } as state) res ->
let rec until_full_or_end gz (res, res_off, res_len) =
match Gz.Inf.decode gz with
| `Flush gz ->
let max = De.bigstring_length oc_buffer - Gz.Inf.dst_rem gz in
let len = min res_len max in
bigstring_blit_bytes oc_buffer ~src_off:0 res ~dst_off:res_off ~len;
if len < max
then ( state.pos <- len
; state.gz <- gz
; Async.return () )
else until_full_or_end (Gz.Inf.flush gz) (res, res_off + len, res_len - len)
| `End gz ->
let max = De.bigstring_length oc_buffer - Gz.Inf.dst_rem gz in
let len = min res_len max in
bigstring_blit_bytes oc_buffer ~src_off:0 res ~dst_off:res_off ~len;
if res_len > len
then raise End_of_file
else ( state.pos <- len
; state.gz <- gz
; Async.return () )
| `Await gz ->
Reader.read in_channel tp_buffer >>= fun len ->
bigstring_blit_string (Bytes.unsafe_to_string tp_buffer) ~src_off:0 ic_buffer ~dst_off:0 ~len;
let gz = Gz.Inf.src gz ic_buffer 0 len in
until_full_or_end gz (res, res_off, res_len)
| `Malformed err -> failwith ("gzip: " ^ err) in
let max = (De.bigstring_length oc_buffer - Gz.Inf.dst_rem state.gz) - state.pos in
let len = min (Bytes.length res) max in
bigstring_blit_bytes oc_buffer ~src_off:state.pos res ~dst_off:0 ~len;
if len < max
then ( state.pos <- state.pos + len
; Async.return () )
else until_full_or_end (Gz.Inf.flush state.gz) (res, len, Bytes.length res - len)

let skip : in_channel -> int -> unit io = fun state len ->
let res = Bytes.create len in
really_read state res
end

module HeaderWriter = Tar.HeaderWriter (Async) (Gz_writer)
module HeaderReader = Tar.HeaderReader (Async) (Gz_reader)

type in_channel = Gz_reader.in_channel

let of_in_channel ~internal:oc_buffer in_channel =
{ Gz_reader.gz= Gz.Inf.decoder `Manual ~o:oc_buffer
; oc_buffer
; ic_buffer= De.bigstring_create 0x1000
; tp_buffer= Bytes.create 0x1000
; in_channel
; pos= 0 }

let really_read = Gz_reader.really_read
let skip = Gz_reader.skip

type out_channel = Gz_writer.out_channel

let of_out_channel ?bits:(w_bits= 15) ?q:(q_len= 0x1000) ~level ~mtime os out_channel =
Expand Down Expand Up @@ -230,3 +240,4 @@ module Make
| `End _gz -> Async.return () in
until_end (Gz.Def.src state.gz De.bigstring_empty 0 0)
end
*)
8 changes: 8 additions & 0 deletions lib/tar_gz.mli
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,13 @@
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*)

type error = [ `Fatal of Tar.error | `Eof | `Gz of string ]

type 'err run = { run : 'a 'err. ('a, 'err) Tar.t -> ('a, 'err) result } [@@unboxed]

val fold_with_gz : run:[> error ] run -> ('a, [> error]) Tar.fold

(*
module type READER = sig
type in_channel
type 'a io
Expand Down Expand Up @@ -72,3 +79,4 @@ module Make
module HeaderWriter :
Tar.HEADERWRITER with type out_channel = out_channel and type 'a io = 'a Async.t
end
*)
3 changes: 2 additions & 1 deletion lib_test/dune
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@
alcotest-lwt
lwt
tar-unix
tar-mirage))
tar-mirage
))
Loading
Loading