From 4ba5a6c52fff8af9166ed3793fe672b20c9b1342 Mon Sep 17 00:00:00 2001 From: stefan6419846 <96178532+stefan6419846@users.noreply.github.com> Date: Wed, 26 Feb 2025 17:51:36 +0100 Subject: [PATCH] ROB: Improve handling of LZW decoder table overflow Closes #3032. --- pypdf/_codecs/_codecs.py | 6 ++++++ resources/lzw_decoder_table_overflow.bin | Bin 0 -> 5421 bytes tests/test_codecs.py | 14 ++++++++++++++ 3 files changed, 20 insertions(+) create mode 100644 resources/lzw_decoder_table_overflow.bin diff --git a/pypdf/_codecs/_codecs.py b/pypdf/_codecs/_codecs.py index 9b7fd05b7..ad75f0c66 100644 --- a/pypdf/_codecs/_codecs.py +++ b/pypdf/_codecs/_codecs.py @@ -9,6 +9,8 @@ from abc import ABC, abstractmethod from typing import Dict, List +from pypdf._utils import logger_warning + class Codec(ABC): """Abstract base class for all codecs.""" @@ -209,6 +211,7 @@ def decode(self, data: bytes) -> bytes: self._byte_pointer = 0 self._next_data = 0 self._next_bits = 0 + self.max_code_value = (1 << self.MAX_BITS_PER_CODE) - 1 output_stream = io.BytesIO() @@ -250,6 +253,9 @@ def decode(self, data: bytes) -> bytes: def _add_entry_decode(self, old_string: bytes, new_char: int) -> None: new_string = old_string + bytes([new_char]) + if self._table_index > self.max_code_value: + logger_warning("Ignoring too large LZW table index.", __name__) + return self.decoding_table[self._table_index] = new_string self._table_index += 1 diff --git a/resources/lzw_decoder_table_overflow.bin b/resources/lzw_decoder_table_overflow.bin new file mode 100644 index 0000000000000000000000000000000000000000..cd79ae33fa274e59064ea8425f0e9b128cfa0ded GIT binary patch literal 5421 zcmV+|71HW}32*iR>Fq}{+6$-_I(Qv$UJ01^5!}0-a zxLq$7%m=gK^!y+Mf`Op0U@#U91p-6||{o=zr{31p&4AdW^NkqBfW3Fz{8JDrZFLz&3pa5tNc zro&mN(r7c8jHW{wh{8D>4aUP!pv+`37(k$qXhbR*4v0jek!ZwfIUbNmq>^bws#z|W zOs11*#OirIpirokYDFrUPN-C>m1@Omxn8hXtd?s&om7aNYgb9Q&`tF4U=fwHx1Kx-#88vh~qg9QVN5iC|Vkaq9~dw zi=!yoI*+6&8cLI;DOy^WrYV|go2Mz-dY`B&8j7Q-Dq5PSsw$eQtE(#7y05G&8p^Y+ zD_YvOt}B}AyRR$S`oFL&8w$g*EL$4KvMie_%d;%oI?uE%8%oo)En8aGwk?}#+qW&- zdf&J%8;aw(C>&nq#!iH4>BkPVcI?N_4)lib$j>B_^U04?p7zR5gsS<=545)Y%g^M(|ICmS&jQU*MB4?<5On7V&d{Xu z3(t_${|?Ym1q%_;5j7VS(a~ih<;D(lX6VMwq;l)Wj?{kc$WH`{@yQP~mh{QbWTN-V zkCd+Z%TL6*{mc*a#{kUGB+~=UkW}9W&QOHw3C|F;_YBX_Q2eSv7!77e9X*jAlo zVcM0QiE7rCEv;iTt+S5PHts`C+&L{na^*EG)0WjYZev#7Ijv)Q=QXYKp4T_-gI?e| zErWvTHZBvU**I>aX5u=nql)V`uN!P*l*PG_QM1oU}Jc7Z+Zx~*I@YIvx0W7q7Arrbo*fgjD9eY zuQ?Zp{o5Osc`xy_Ik%kvTWg?ta2dHeSFr-yyQ+JzLB2Zo#RXgow0!Vs!#kJR2i#k} ziEVYrJU8svoU6!xZJo_L*Z|uc`_h1|#nL@^6y2Rm+=1=w*FG31;2oRhgDw@^KDa#M zo$K&~ana#FR>I{TJNj(SW#&Kj*5{s#1a1zE>p+(3>K@zlrt(8MOghQ=7chfauQa}XmJ>1RI{6q`ZLK1b2y?TZIaUdbVR8)F(+&%iV@~f zOt@PWqx`6vFwQPWSzR^Z%wmyp(r&}}(IjQm2lhYm)#VGeFCTuT@ z5(ZUGIBOTAJg}QE?lMT(Z8zc!YLasDa>RJyCT0}tgt0n1$_bMS;-r0)(>@r*sRt`2 ztT&7jCRt6mdl{sBxSTN#Hc451IpR!msr0_eQ3~rkYPB}AbY8wsin~v36&|p2{<2g{ zQ%P)92Ca22*iy?QLu-`os&vlGQA+bYYSlWk^lrdU%EM4?H6pO|4zyH@TS{zo3a#P#RVJ}?9=24=V@qt64z6>)s#8ldMQhad zt90(rQOg5AYZXGY^p3<(O4CwpbtDJ58oXJ{9bhY!ma!G;&sZkt<{VReZO%!;H)k~C8}o2-jv2f;#~k1rla_G} z>CZU_N@fgEd@aUE!WUySV+-+cGKLtu8N(c43z3#F1?bNi0|Z?SU3=}mN5S`BGvfR2 zZ}NT@yZOHx-}{f1@crk``2Pdk{J)9zMn=KcTQg$o?QgPn7Q5NI8(-^bzo^8#GVxrr}jw8)q;>Sj(YZt`9eyLqn}-@AvD@ZICic<%w+ytjz& zU9-q|F6rhxSA6+~gVVa-XzsoX$NA2T)Y}7XZ%!S`Iu?A=+QVgTP5s6>mW9*$=V|WV z56F4mj@26rZf|ZO%DN_g(;CZWZf*g_xu%EII`3-k{u9Xg?vT}66K`*hCCfTCfYVyj zXKs!K$2qo%)Oz=8?;aP)dH#~sn;UR%t|`m9MuO9t+h}gC3CFp{i`2URZ0|lB$@&g^ z(z?H7?mhd(`HqCs+T&?%&JD;p){WHr2W{_OAIf?jebV~JW$s=7#(AEG)Eet*ZtfAt zx#o}6I}dK}ek02IE`QTH&t~p^1IGETh}2s1Yi|w}$vO6s)q59j@17^idOm^Ede>*} zo(AKtZqZFh*UhlrP8j6QfYNU`{?8EnPU!j0!1qp=^w0qEPuTCzud#>TPwMdR z!tQUH>+f>uZ%Y5~BK>aq`tH*BZmRY067z6A{0`*#j=c8{l=P4M@(=`TkAuO9s__WG{Q_%5dPunzMt=J2n+?l6+}&R+CS`0>xs?@y@g z&w%PrSpd&4{!a+|&fxh@xeL!i|8E-n?(+I>%J=S~^>7OE@8a-py6x|h>u+k}u;%fP z%CaIBPbmG*0Q*kZ_|CBQO^)-=@bOQ`?$4nXPMY-a0`hO#@9(nh zZ;I;gV%4uY{_hg}ZtD5&!uN5H0WVF<9b4e<#RFw#}2aux-(DxD%OBYd}1F&TX@VO2#4HI!;7_qei zkW&ZHrwtJM5>ZbVk)r`{K?bmu3-IX?F+CP>kpj?H2oS3cQ2!EbJktP!vBog@|64@XU zxgHXk91?jm5?L`4IY1K6A9Al9a*rEwZ!&UEFmf+0at|zV?<#T6C~*%aa~~vf{~~hV zAacJRa-SS>e+F`2F>*gIavv>m|0{9dDRI9ib1x-x4q z^R+>=fj5)iL{zIY^QAzvaW<3BLsXkNGxIGpRuIbvDz}LzJI6bNxm%$u-l% zLX?j5MHII*v#~*ReK%9vL=>kpv!OtAZ8lTOLll=d^Yulv!8KF6 zLKKHL^W{Xeu{2YwK@@j3^VLJNp*fXdU)2d(RzXzN=}%RWV%0faRN+_F8B$fjOx0;% zRS{a&NmW(xPt}=YRY6_U=~z}FQq{>#RdHa|8CzCSRn_@WRiR_mNnTX(Sk@_1Rnbk= zd0|x{ThwJ%Rsm4esbp1AUex(nRxwl6*-lk~Vbv*IRMA$}2~kzCWYt+;RRLMnIaF2Q zPSuHGRWV%D*;iH(QPsI+Rr^qNqhs|;UUc(V_A66$(@phzVRa*0^vhOu15ov=WOY+s z^!r(MGgI~3PIZG}^($O-(^mEiQFXIq^;=(c16lSvRCVJ{^^0P4GhFoBS9TLo^}9dy zV_)?PT6RNJ_3KY{lVbHdU3BAD_8U@l!%X#SV09B(_DfZD^H24gV|7Da^y^r5BU1It zO?7i%^&4AuQ&smLYWI0?m(yvtWp7u%Xm?R=7prGCJ#5#PW%ns-mxFOP-D+2FZ}-V* z7h7((wP)8yZFiw&moscPjd9kkcXtPU7MXT7^Ly5XboSqRmT_~o%XwB^a(1_P7ZH87 zqjuK)e0Gm@mg##odvsRSdG=>>7QuM88-7?fa97W1cVTZAyJ$B}ZP%w}_c?5rlV!Ih zYgd18_t|O}Yj3y3XxCS6cd=)eLv1&mW>+t3_la?r<7&5ch?bRg_V0U^fpoUpdRA?7 zcE@=ZS#mbBch?PlcAs{Z`Fu8ubynqi_IGp^(RsFGa@M_gcNcyZsdqO6eAwG=*SBZ) zNo|*-X16tLSC3_PA!`?VaJSWI*Jp2c!DyFLZa1xGS3hj`nPwL&Yqy1QSKn%Pac~#Q zX*XSx*d>Q}$&tBjfmmmYnG2QJ1%-I2kGV~MSVxJN>yy|ujd>}TxFd&H$B~(9fjDK0 zc?p%c1BF65rKh*_zZm?4Ka#gTbwff!?pSqGJv0fjiFk9kRd7(|AiQ%k6A~4_(6#{<&$_Xh#1q7S$Bc@5s^7% zfY@J)83UD9{e<|Tk2ytt*guIF8b(X^XiHmG}pRn5&Q4;hFi< zpn5x|`lGFT3z>P#pZYVVdYi2K1DN^4pL#2$`jf1B`xg-)o%$1`dV8w+51F~o zpgK3Ex}U8(2bnp?pSmxmI+v`v|CqVIpE@U{x{s_n_n0}ip1L2UI)|&e?~=K%o;nw# zx__%W=bBmBpxQyFTB)tu5t-T1pjtVm+M%sm37J{RpV~2|TA8fd0hrmrpIRxU+L5eU z`IuR`p4uU$T8XRL@t4`Lo?02C+JUQD>6&~wz1xAZTv@-{p|o6i!P~*MS`Wk9;ka8V z#oPh9TS>>gy_W1YM^z59c*d|SW!qqKZ` z!TZCudPBqewETm@d&{=_6U2M#xcf84 zd<(k!Q^S)3wdo&(Z-@zGu1&^^u19lOo_sm$G%%e{rlopZ^3VbtCi z)4m1Lo%7Ls;m}>r&po})9jna!nakaW%Dr{TonzGg8Pnbe(!KT3o#W7b(a&AK&ONQo z9h=MjiOSt~$-QOO9vjpC3DVv7(Y@u+ozu^K!OmT;%{`sW9gE8SdCA>p)jl269t+a_ z`O)3y(7n~qox{$3vCUnd%sq|E9ec_BY1LjI)IJT<9sAM!>CoNR&%MRYowLn-q0C*6 z%RPO{9c$HoA=F+E(>?vt9qZ8j+0Wg_&b_tGoukZsk;`3w$~|q>o+H$L5z}4&(mn0b z9ox_S$Fgev=N?J>ABpxKIsP8M X>3(nIf3Nsp&J4iH41mTAK*As(2ch48 literal 0 HcmV?d00001 diff --git a/tests/test_codecs.py b/tests/test_codecs.py index e15341e83..38aaeb550 100644 --- a/tests/test_codecs.py +++ b/tests/test_codecs.py @@ -1,9 +1,14 @@ """Test LZW-related code.""" +from pathlib import Path import pytest from pypdf._codecs._codecs import LzwCodec +TESTS_ROOT = Path(__file__).parent.resolve() +PROJECT_ROOT = TESTS_ROOT.parent +RESOURCE_ROOT = PROJECT_ROOT / "resources" + test_cases = [ pytest.param(b"", id="Empty input"), pytest.param(b"A", id="Single character"), @@ -56,3 +61,12 @@ def test_decode_lzw(encoded, expected_decoded): codec = LzwCodec() actual_decoded = codec.decode(encoded) assert actual_decoded == expected_decoded + + +def test_lzw_decoder_table_overflow(caplog): + path = RESOURCE_ROOT / "lzw_decoder_table_overflow.bin" + codec = LzwCodec() + assert codec.decode(path.read_bytes()).startswith( + b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@' + ) + assert "Ignoring too large LZW table index." in caplog.text