diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ff4788b..225c86d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -59,10 +59,10 @@ jobs: steps: - name: Install optional tools macOS if: runner.os == 'macOS' && matrix.optional-deps - run: brew install pigz pbzip2 isa-l zstd + run: brew install pigz pbzip2 isa-l zstd lz4 - name: Install optional tools Linux if: runner.os == 'Linux' && matrix.optional-deps - run: sudo apt-get install pigz pbzip2 isal zstd + run: sudo apt-get install pigz pbzip2 isal zstd lz4 - name: Remove xz if: runner.os == 'Linux' && !matrix.optional-deps run: while which xz; do sudo rm $(which xz); done diff --git a/README.rst b/README.rst index 2e5c268..cb3ea91 100644 --- a/README.rst +++ b/README.rst @@ -26,6 +26,7 @@ Supported compression formats are: - gzip (``.gz``) - bzip2 (``.bz2``) - xz (``.xz``) +- lz4 (``.lz4``) - Zstandard (``.zst``) (optional) @@ -71,7 +72,7 @@ The function opens the file using a function suitable for the detected file format and returns an open file-like object. When writing, the file format is chosen based on the file name extension: -``.gz``, ``.bz2``, ``.xz``, ``.zst``. This can be overriden with ``format``. +``.gz``, ``.bz2``, ``.xz``, ``.zst``, ``.lz4``. This can be overridden with ``format``. If the extension is not recognized, no compression is used. When reading and a file name extension is available, the format is detected @@ -99,13 +100,13 @@ preferred locale encoding. **compresslevel**: The compression level for writing to gzip, xz and Zstandard files. If set to None, a default depending on the format is used: -gzip: 1, xz: 6, Zstandard: 3. +gzip: 1, xz: 6, Zstandard: 3, lz4: 0. This parameter is ignored for other compression formats. **format**: Override the autodetection of the input or output format. -Possible values are: ``"gz"``, ``"xz"``, ``"bz2"``, ``"zst"``. +Possible values are: ``"gz"``, ``"xz"``, ``"bz2"``, ``"zst"``, ``"lz4"``. 
**threads**: Set the number of additional threads spawned for compression or decompression. @@ -138,6 +139,9 @@ built-in support for multithreaded compression. For bz2 files, `pbzip2 (parallel bzip2) `_ is used. +For lz4 files, the `python lz4 <https://python-lz4.readthedocs.io/en/stable/index.html>`_ +package is used. + ``xopen`` falls back to Python’s built-in functions (``gzip.open``, ``lzma.open``, ``bz2.open``) if none of the other methods can be used. diff --git a/pyproject.toml b/pyproject.toml index c0f5bc2..cf99591 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,8 @@ requires-python = ">=3.9" dynamic = ["version"] dependencies = [ 'isal>=1.6.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"', - 'zlib-ng>=0.4.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"' + 'zlib-ng>=0.4.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"', + 'lz4>4.3.1; platform_python_implementation != "PyPy"', ] [project.urls] diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index 89f5137..68bf155 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -43,6 +43,7 @@ XOPEN_DEFAULT_BZ2_COMPRESSION = 9 XOPEN_DEFAULT_XZ_COMPRESSION = 6 XOPEN_DEFAULT_ZST_COMPRESSION = 3 +XOPEN_DEFAULT_LZ4_COMPRESSION = 0 igzip: Optional[ModuleType] isal_zlib: Optional[ModuleType] @@ -70,6 +71,11 @@ except ImportError: zstandard = None # type: ignore +try: + import lz4.frame # type: ignore +except ImportError: + lz4 = None + try: import fcntl @@ -120,6 +126,7 @@ class _ProgramSettings: "zstd": _ProgramSettings(("zstd",), tuple(range(1, 20)), "-T"), "pigz": _ProgramSettings(("pigz", "--no-name"), tuple(range(0, 10)) + (11,), "-p"), "gzip": _ProgramSettings(("gzip", "--no-name"), tuple(range(1, 10))), + "lz4": _ProgramSettings(("lz4",), tuple(range(0, 17))), } @@ -551,6 +558,57 @@ def _open_zst( return io.BufferedWriter(f) # mode "ab" and "wb" 
+def _open_lz4(
+    filename: FileOrPath,
+    mode: str,
+    compresslevel: Optional[int],
+    threads: Optional[int],
+):
+    """
+    Open an lz4 file in mode "rb", "ab" or "wb" and return a file-like object.
+
+    If python-lz4 is installed, it is used for reading and for threads == 0.
+    Otherwise the lz4 program is run, with python-lz4 as the fallback if the
+    program is missing (FileNotFoundError is re-raised if neither is available).
+    """
+    assert mode in ("rb", "ab", "wb")
+    if compresslevel is None:
+        compresslevel = XOPEN_DEFAULT_LZ4_COMPRESSION
+
+    if lz4 is not None and (mode == "rb" or threads == 0):
+        return lz4.frame.LZ4FrameFile(filename, mode, compression_level=compresslevel)
+
+    # Notes on the CLI program:
+    # - Multithreading in lz4 is only supported for compression, not for decompression.
+    # - Older versions of lz4 (such as v1.9.4, which comes with Ubuntu 24.04) do not support
+    #   multithreading. They fail if one tries to pass the -T option.
+    # - The newer versions use a default of -T0, which chooses the number of threads
+    #   automatically (presumably the number of available cores).
+    import copy
+
+    # Enable -T on a copy so that the shared _PROGRAM_SETTINGS entry stays unchanged.
+    threaded_settings = copy.copy(_PROGRAM_SETTINGS["lz4"])
+    threaded_settings.threads_flag = "-T"
+    try:
+        return _PipedCompressionProgram(
+            filename, mode, compresslevel, threads, program_settings=threaded_settings
+        )
+    except FileNotFoundError:
+        # Binary not found, use Python bindings if available
+        if lz4 is None:
+            raise
+        return lz4.frame.LZ4FrameFile(filename, mode, compression_level=compresslevel)
+    except OSError:
+        # Assume the problem is that the -T option is not supported and re-try without it:
+        return _PipedCompressionProgram(
+            filename,
+            mode,
+            compresslevel,
+            threads,
+            program_settings=_PROGRAM_SETTINGS["lz4"],
+        )
+
+
 def _open_gz( filename: FileOrPath, mode: str, @@ -683,6 +741,10 @@ def _detect_format_from_content(filename: FileOrPath) -> Optional[str]: elif bs[:4] == b"\x28\xb5\x2f\xfd": # https://datatracker.ietf.org/doc/html/rfc8478#section-3.1.1 return "zst" + elif bs[:4] == b"\x04\x22\x4d\x18": + # https://github.com/lz4/lz4/blob/dev/doc/lz4_Frame_format.md + return "lz4" + + return None finally: if closefd: @@ -694,7 +756,7 @@ def 
_detect_format_from_extension(filename: Union[str, bytes]) -> Optional[str]: Attempt to detect file format from the filename extension. Return None if no format could be detected. """ - for ext in ("bz2", "xz", "gz", "zst"): + for ext in ("bz2", "xz", "gz", "zst", "lz4"): if isinstance(filename, bytes): if filename.endswith(b"." + ext.encode()): return ext @@ -717,7 +779,7 @@ def _file_or_path_to_binary_stream( # object is not binary, this will crash at a later point. return file_or_path, False # type: ignore raise TypeError( - f"Unsupported type for {file_or_path}, " f"{file_or_path.__class__.__name__}." + f"Unsupported type for {file_or_path}, {file_or_path.__class__.__name__}." ) @@ -797,6 +859,7 @@ def xopen( # noqa: C901 - .bz2 uses bzip2 compression - .xz uses xz/lzma compression - .zst uses zstandard compression + - .lz4 uses lz4 compression - otherwise, no compression is used When reading, if a file name extension is available, the format is detected @@ -808,7 +871,7 @@ def xopen( # noqa: C901 compresslevel is the compression level for writing to gzip, xz and zst files. This parameter is ignored for the other compression formats. If set to None, a default depending on the format is used: - gzip: 6, xz: 6, zstd: 3. + gzip: 6, xz: 6, zstd: 3, lz4: 0. When threads is None (the default), compressed file formats are read or written using a pipe to a subprocess running an external tool such as, @@ -828,7 +891,7 @@ def xopen( # noqa: C901 format overrides the autodetection of input and output formats. This can be useful when compressed output needs to be written to a file without an - extension. Possible values are "gz", "xz", "bz2", "zst". + extension. Possible values are "gz", "xz", "bz2", "zst", "lz4". 
""" if mode in ("r", "w", "a"): mode += "t" # type: ignore @@ -844,10 +907,10 @@ def xopen( # noqa: C901 elif _file_is_a_socket_or_pipe(filename): filename = open(filename, binary_mode) # type: ignore - if format not in (None, "gz", "xz", "bz2", "zst"): + if format not in (None, "gz", "xz", "bz2", "zst", "lz4"): raise ValueError( f"Format not supported: {format}. " - f"Choose one of: 'gz', 'xz', 'bz2', 'zst'" + f"Choose one of: 'gz', 'xz', 'bz2', 'zst', 'lz4'." ) detected_format = format or _detect_format_from_extension(filepath) if detected_format is None and "r" in mode: @@ -861,6 +924,8 @@ def xopen( # noqa: C901 opened_file = _open_bz2(filename, binary_mode, compresslevel, threads) elif detected_format == "zst": opened_file = _open_zst(filename, binary_mode, compresslevel, threads) + elif detected_format == "lz4": + opened_file = _open_lz4(filename, binary_mode, compresslevel, threads) else: opened_file, _ = _file_or_path_to_binary_stream(filename, binary_mode) diff --git a/tests/file.txt.lz4 b/tests/file.txt.lz4 new file mode 100644 index 0000000..5b2ed80 Binary files /dev/null and b/tests/file.txt.lz4 differ diff --git a/tests/test_piped.py b/tests/test_piped.py index 9f8afbe..eba903f 100644 --- a/tests/test_piped.py +++ b/tests/test_piped.py @@ -18,7 +18,7 @@ _ProgramSettings, ) -extensions = ["", ".gz", ".bz2", ".xz", ".zst"] +extensions = ["", ".gz", ".bz2", ".xz", ".zst", ".lz4"] try: import fcntl @@ -57,16 +57,24 @@ def available_zstd_programs(): return []
+def available_lz4_programs():
+    """Return the lz4 program settings in a list, or [] if no lz4 binary is on PATH."""
+    lz4_binary = shutil.which("lz4")
+    return [_PROGRAM_SETTINGS["lz4"]] if lz4_binary else []
+
+
 PIPED_GZIP_PROGRAMS = available_gzip_programs() PIPED_BZIP2_PROGRAMS = available_bzip2_programs() PIPED_XZ_PROGRAMS = available_xz_programs() PIPED_ZST_PROGRAMS = available_zstd_programs() +PIPED_LZ4_PROGRAMS = available_lz4_programs() ALL_PROGRAMS_WITH_EXTENSION = ( list(zip(PIPED_GZIP_PROGRAMS, cycle([".gz"]))) + list(zip(PIPED_BZIP2_PROGRAMS, cycle([".bz2"]))) + list(zip(PIPED_XZ_PROGRAMS, 
cycle([".xz"]))) + list(zip(PIPED_ZST_PROGRAMS, cycle([".zst"]))) + + list(zip(PIPED_LZ4_PROGRAMS, cycle([".lz4"]))) ) diff --git a/tests/test_xopen.py b/tests/test_xopen.py index 9e8f816..aed77cd 100644 --- a/tests/test_xopen.py +++ b/tests/test_xopen.py @@ -2,28 +2,32 @@ Tests for the xopen.xopen function """ import bz2 -import subprocess -import sys -import tempfile -from contextlib import contextmanager import functools import gzip import io import lzma import os -from pathlib import Path import shutil +import subprocess +import sys +import tempfile +from contextlib import contextmanager +from pathlib import Path + import pytest -from xopen import xopen, _detect_format_from_content +from xopen import _detect_format_from_content, xopen +try: + import lz4.frame +except ImportError: + lz4 = None try: import zstandard except ImportError: zstandard = None - # TODO this is duplicated in test_piped.py TEST_DIR = Path(__file__).parent CONTENT_LINES = ["Testing, testing ...\n", "The second line.\n"] @@ -31,6 +35,8 @@ extensions = ["", ".gz", ".bz2", ".xz"] if shutil.which("zstd") or zstandard: extensions += [".zst"] +if shutil.which("lz4") or lz4: + extensions += [".lz4"] base = os.path.join(os.path.dirname(__file__), "file.txt") files = [base + ext for ext in extensions] @@ -369,6 +375,10 @@ def test_read_no_threads(ext): } if ext == ".zst" and zstandard is None: return + if ext == ".lz4" and lz4 is None: + return + if ext == ".lz4" and lz4.frame is not None: + klasses[".lz4"] = lz4.frame.LZ4FrameFile klass = klasses[ext] with xopen(TEST_DIR / f"file.txt{ext}", "rb", threads=0) as f: assert isinstance(f, klass), f @@ -401,6 +411,10 @@ def test_write_no_threads(tmp_path, ext): # Skip zst because if python-zstandard is not installed, # we fall back to an external process even when threads=0 return + if ext == ".lz4" and lz4 is None: + return + if ext == ".lz4" and lz4.frame is not None: + klasses[".lz4"] = lz4.frame.LZ4FrameFile klass = klasses[ext] with xopen(tmp_path / 
f"out{ext}", "wb", threads=0) as f: if isinstance(f, io.BufferedWriter): @@ -613,7 +627,6 @@ def test_xopen_zst_long_window_size(threads): def test_pass_file_object_for_reading(ext, threads): if ext == ".zst" and zstandard is None: return - with open(TEST_DIR / f"file.txt{ext}", "rb") as fh: with xopen(fh, mode="rb", threads=threads) as f: assert f.readline() == CONTENT_LINES[0].encode("utf-8") @@ -641,6 +654,11 @@ def test_pass_bytesio_for_reading_and_writing(ext, threads): format = None if ext == ".zst" and zstandard is None: return + if ext == ".lz4" and lz4 is None and threads == 0: + pytest.skip("lz4 not working for BytesIO in piped write mode") + if ext == ".lz4" and threads != 0: + # _PipedCompressionProgram not working on write mode + pytest.skip("lz4 not working for BytesIO in piped write mode") first_line = CONTENT_LINES[0].encode("utf-8") writer = xopen(filelike, "wb", format=format, threads=threads) writer.write(first_line)