diff --git a/pypdf/_codecs/_codecs.py b/pypdf/_codecs/_codecs.py index 9b7fd05b7..ad75f0c66 100644 --- a/pypdf/_codecs/_codecs.py +++ b/pypdf/_codecs/_codecs.py @@ -9,6 +9,8 @@ from abc import ABC, abstractmethod from typing import Dict, List +from pypdf._utils import logger_warning + class Codec(ABC): """Abstract base class for all codecs.""" @@ -209,6 +211,7 @@ def decode(self, data: bytes) -> bytes: self._byte_pointer = 0 self._next_data = 0 self._next_bits = 0 + self.max_code_value = (1 << self.MAX_BITS_PER_CODE) - 1 output_stream = io.BytesIO() @@ -250,6 +253,9 @@ def decode(self, data: bytes) -> bytes: def _add_entry_decode(self, old_string: bytes, new_char: int) -> None: new_string = old_string + bytes([new_char]) + if self._table_index > self.max_code_value: + logger_warning("Ignoring too large LZW table index.", __name__) + return self.decoding_table[self._table_index] = new_string self._table_index += 1 diff --git a/resources/lzw_decoder_table_overflow.bin b/resources/lzw_decoder_table_overflow.bin new file mode 100644 index 000000000..cd79ae33f Binary files /dev/null and b/resources/lzw_decoder_table_overflow.bin differ diff --git a/tests/test_codecs.py b/tests/test_codecs.py index e15341e83..38aaeb550 100644 --- a/tests/test_codecs.py +++ b/tests/test_codecs.py @@ -1,9 +1,14 @@ """Test LZW-related code.""" +from pathlib import Path import pytest from pypdf._codecs._codecs import LzwCodec +TESTS_ROOT = Path(__file__).parent.resolve() +PROJECT_ROOT = TESTS_ROOT.parent +RESOURCE_ROOT = PROJECT_ROOT / "resources" + test_cases = [ pytest.param(b"", id="Empty input"), pytest.param(b"A", id="Single character"), @@ -56,3 +61,12 @@ def test_decode_lzw(encoded, expected_decoded): codec = LzwCodec() actual_decoded = codec.decode(encoded) assert actual_decoded == expected_decoded + + +def test_lzw_decoder_table_overflow(caplog): + path = RESOURCE_ROOT / "lzw_decoder_table_overflow.bin" + codec = LzwCodec() + assert codec.decode(path.read_bytes()).startswith( + b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@' + ) + assert "Ignoring too large LZW table index." in caplog.text