Skip to content

Commit

Permalink
ROB: Improve handling of LZW decoder table overflow
Browse files: browse the repository at this point in the history
Closes #3032.
  • Loading branch information
stefan6419846 committed Feb 26, 2025
1 parent 6003a1e commit 4ba5a6c
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 0 deletions.
6 changes: 6 additions & 0 deletions pypdf/_codecs/_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from abc import ABC, abstractmethod
from typing import Dict, List

from pypdf._utils import logger_warning


class Codec(ABC):
"""Abstract base class for all codecs."""
Expand Down Expand Up @@ -209,6 +211,7 @@ def decode(self, data: bytes) -> bytes:
self._byte_pointer = 0
self._next_data = 0
self._next_bits = 0
self.max_code_value = (1 << self.MAX_BITS_PER_CODE) - 1

output_stream = io.BytesIO()

Expand Down Expand Up @@ -250,6 +253,9 @@ def decode(self, data: bytes) -> bytes:

def _add_entry_decode(self, old_string: bytes, new_char: int) -> None:
    """
    Append ``old_string + bytes([new_char])`` to the decoding table.

    The entry is stored at ``self._table_index``, which is then advanced
    by one. If ``self._table_index`` already exceeds
    ``self.max_code_value``, the entry is dropped and a warning is logged
    instead of growing the table further.

    Args:
        old_string: The byte sequence the new entry extends.
        new_char: The byte value (as an integer) appended to ``old_string``.

    """
    # Check for overflow first, so that no byte string is built for an
    # entry which would be discarded anyway.
    if self._table_index > self.max_code_value:
        logger_warning("Ignoring too large LZW table index.", __name__)
        return
    self.decoding_table[self._table_index] = old_string + bytes([new_char])
    self._table_index += 1

Expand Down
Binary file added resources/lzw_decoder_table_overflow.bin
Binary file not shown.
14 changes: 14 additions & 0 deletions tests/test_codecs.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
"""Test LZW-related code."""
from pathlib import Path

import pytest

from pypdf._codecs._codecs import LzwCodec

TESTS_ROOT = Path(__file__).parent.resolve()
PROJECT_ROOT = TESTS_ROOT.parent
RESOURCE_ROOT = PROJECT_ROOT / "resources"

test_cases = [
pytest.param(b"", id="Empty input"),
pytest.param(b"A", id="Single character"),
Expand Down Expand Up @@ -56,3 +61,12 @@ def test_decode_lzw(encoded, expected_decoded):
codec = LzwCodec()
actual_decoded = codec.decode(encoded)
assert actual_decoded == expected_decoded


def test_lzw_decoder_table_overflow(caplog):
    """Decoding the overflow sample must still yield the expected prefix and log a warning."""
    expected_prefix = (
        b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@'
    )
    encoded = (RESOURCE_ROOT / "lzw_decoder_table_overflow.bin").read_bytes()

    decoded = LzwCodec().decode(encoded)

    # The leading part of the stream has to be decoded as expected ...
    assert decoded.startswith(expected_prefix)
    # ... and the dropped table entries have to be reported.
    assert "Ignoring too large LZW table index." in caplog.text

0 comments on commit 4ba5a6c

Please sign in to comment.