From fa079f631e92afebbd01839acef1926ab754542b Mon Sep 17 00:00:00 2001 From: James Funk Date: Wed, 1 Jan 2025 12:29:26 -0700 Subject: [PATCH 1/9] Introduce PAX header support --- src/Parser/Header.php | 30 ++++ src/StreamReader.php | 26 +++- tests/Parser/PaxHeaderTest.php | 130 ++++++++++++++++++ .../MassiveFileStreamWrapper.php | 81 +++++++++++ 4 files changed, 265 insertions(+), 2 deletions(-) create mode 100644 tests/Parser/PaxHeaderTest.php create mode 100644 tests/ResourceGenerators/MassiveFileStreamWrapper.php diff --git a/src/Parser/Header.php b/src/Parser/Header.php index 26d1a0f..5f5d2e1 100644 --- a/src/Parser/Header.php +++ b/src/Parser/Header.php @@ -36,6 +36,7 @@ class Header { private string $content; + private array $pax = []; public function __construct(string $content) { @@ -47,6 +48,26 @@ public function __construct(string $content) $this->content = $content; } + public function harvestPaxData(string $paxData): void + { + foreach (explode(PHP_EOL, $paxData) as $record) { + if ($record === '') { + continue; + } + preg_match_all('/^(\d+) ([^=]+)=(.*)$/', $record, $matches); + if (count($matches) !== 4) { + throw new InvalidArchiveFormatException( + sprintf('Invalid Pax header record format: %s', $record) + ); + } + + $key = $matches[2][0]; + $value = $matches[3][0]; + + $this->pax[$key] = $value; + } + } + public function isValid(): bool { return $this->getMagic() === 'ustar'; @@ -60,6 +81,10 @@ public function getName(): string public function getSize(): int { + if (array_key_exists('size', $this->pax)) { + return (int)$this->pax['size']; + } + $str = rtrim(substr($this->content, 124, 12)); if (preg_match('/^[0-7]+$/D', $str) !== 1) { throw new InvalidArchiveFormatException( @@ -73,6 +98,11 @@ public function getSize(): int return (int)octdec($str); } + public function mergePaxHeader(Header $header): void + { + $this->pax = array_merge($header->pax, $this->pax); + } + /* * Values used in typeflag field * #define REGTYPE '0' // regular file diff 
--git a/src/StreamReader.php b/src/StreamReader.php index 895ef91..7a3dcb2 100644 --- a/src/StreamReader.php +++ b/src/StreamReader.php @@ -22,6 +22,8 @@ class StreamReader implements IteratorAggregate /** @var resource */ private $stream; + private ?Header $globalPaxHeader = null; + /** * @param resource $stream Stream resource of TAR file */ @@ -51,7 +53,7 @@ public function getIterator(): Iterator if (!$header->isValid()) { throw new InvalidArchiveFormatException( sprintf( - 'Invalid TAR archive format: Invalid Tar header format: at %s. bytes', + 'Invalid TAR archive format: Invalid Tar header format: at byte %s', $blockStart ) ); @@ -165,7 +167,27 @@ private function readHeader(): Header } while (self::isNullFilled($header)); // ↑↑↑ TAR format inserts few blocks of nulls to EOF - just skip it - return new Header($header); + $header = new Header($header); + + // Handle PAX header block + $paxHeader = null; + switch($header->getType()) { + case 'x': + $paxHeader = $header; + $paxData = fread($this->stream, $paxHeader->getSize()); + $paxHeader->harvestPaxData($paxData); + $header = $this->readHeader(); + break; + } + + if ($this->globalPaxHeader) { + $header->mergePaxHeader($this->globalPaxHeader); + } + if ($paxHeader) { + $header->mergePaxHeader($paxHeader); + } + + return $header; } private static function isNullFilled(string $string): bool diff --git a/tests/Parser/PaxHeaderTest.php b/tests/Parser/PaxHeaderTest.php new file mode 100644 index 0000000..47fcdb8 --- /dev/null +++ b/tests/Parser/PaxHeaderTest.php @@ -0,0 +1,130 @@ +markTestSkipped('Massive file test is skipped. 
Set TEST_MASSIVE_FILE=true to run it.'); + } + parent::setUp(); + } + + protected function generateSingleFile(?int $chunkSize = null, ?int $totalSize = null): Generator + { + $chunkSize ??= 1024 * 1024; // 1MB chunks + $totalSize ??= 10 * 1024 * $chunkSize; // 10GB + assert($totalSize > $chunkSize); + $chunks = floor($totalSize / $chunkSize); + $extra = $totalSize % $chunkSize; + + if ($totalSize > self::MAX_USTAR_SIZE) { + // Generate a pax header and data record for a file larger than 8GB + $data = implode(PHP_EOL, [ + implode(' ', array_reverse([$data = 'size=' . $totalSize, strlen($data)])), + ]) . PHP_EOL; + + yield str_pad(implode('', [ + /* 0 */ 'name' => str_pad('massive_file.txt', 100, "\0"), + /*100*/ 'mode' => str_pad('', 8, "\0"), + /*108*/ 'uid' => str_pad('', 8, "\0"), + /*116*/ 'gid' => str_pad('', 8, "\0"), + /*100*/ 'size' => str_pad(decoct(strlen($data)), 12, "\0"), + /*136*/ 'mtime' => str_pad('', 12, "\0"), + /*148*/ 'chksum' => str_pad('', 8, "\0"), + /*156*/ 'typeflag' => 'x', + /*157*/ 'linkname' => str_pad('', 100, "\0"), + /*257*/ 'magic' => "ustar\0", + /*263*/ 'version' => str_pad('', 2, "\0"), + /*265*/ 'uname' => str_pad('', 32, "\0"), + /*297*/ 'gname' => str_pad('', 32, "\0"), + /*329*/ 'devmajor' => str_pad('', 8, "\0"), + /*337*/ 'devminor' => str_pad('', 8, "\0"), + /*345*/ 'prefix' => str_pad('PaxHeaders.0', 155, "\0"), + ]), 512, "\0"); + yield $data; + } + + yield str_pad(implode('', [ + /* 0 */ 'name' => str_pad('massive_file.txt', 100, "\0"), + /*100*/ 'mode' => str_pad('', 8, "\0"), + /*108*/ 'uid' => str_pad('', 8, "\0"), + /*116*/ 'gid' => str_pad('', 8, "\0"), + /*100*/ 'size' => str_pad(decoct(min(octdec('77777777777'), $totalSize)), 12, "\0"), + /*136*/ 'mtime' => str_pad('', 12, "\0"), + /*148*/ 'chksum' => str_pad('', 8, "\0"), + /*156*/ 'typeflag' => '0', + /*157*/ 'linkname' => str_pad('', 100, "\0"), + /*257*/ 'magic' => "ustar\0", + /*263*/ 'version' => str_pad('', 2, "\0"), + /*265*/ 'uname' => str_pad('', 32, 
"\0"), + /*297*/ 'gname' => str_pad('', 32, "\0"), + /*329*/ 'devmajor' => str_pad('', 8, "\0"), + /*337*/ 'devminor' => str_pad('', 8, "\0"), + /*345*/ 'prefix' => str_pad('', 155, "\0"), + ]), 512, "\0"); + + $alphabet = range('A', 'Z'); + for ($i = 0; $i < $chunks; $i++) { + yield str_repeat($alphabet[$i % 26], $chunkSize); + } + + if ($extra > 0) { + yield str_repeat('#', $extra); + } + } + + public function fileCountProvider() + { + return [ + 'one' => [1], + 'two' => [2], + 'three' => [3], + ]; + } + + /** + * @dataProvider fileCountProvider + */ + public function testPaxHeader(int $fileCount): void + { + $fileGenerator = (function () use ($fileCount) { + for ($i = 0; $i < $fileCount; $i++) { + yield from $this->generateSingleFile(); + } + })(); + $resource = fopen('massive-file://', 'r', context: stream_context_create([ + 'massive-file' => ['generator' => $fileGenerator], + ])); + $this->assertIsResource($resource); + + try { + $streamReader = new StreamReader($resource); + + $iterator = $streamReader->getIterator(); + $count = 0; + foreach ($iterator as $file) { + $count++; + $fileResource = $file->getContent()->detach(); + $bytesRead = 0; + while (!feof($fileResource)) { + $bytesRead += strlen(fread($fileResource, 1024 ** 2)); + } + $this->assertSame($file->getSize(), $bytesRead); + } + $this->assertSame($fileCount, $count); + } finally { + fclose($resource); + } + } +} diff --git a/tests/ResourceGenerators/MassiveFileStreamWrapper.php b/tests/ResourceGenerators/MassiveFileStreamWrapper.php new file mode 100644 index 0000000..4621ef0 --- /dev/null +++ b/tests/ResourceGenerators/MassiveFileStreamWrapper.php @@ -0,0 +1,81 @@ +context); + if (!array_key_exists('massive-file', $options)) { + return false; + } + $options = $options['massive-file']; + + if (!array_key_exists('generator', $options)) { + return false; + } + $this->generator = $options['generator']; + + return true; + } + + public function stream_eof(): bool + { + return !$this->generator->valid() 
+ && strlen($this->buffer) === 0; + } + + public function stream_read(int $count): string + { + while ($this->generator->valid() && strlen($this->buffer) < $count) { + $this->buffer .= $this->generator->current(); + $this->generator->next(); + } + $data = substr($this->buffer, 0, $count); + $this->buffer = substr($this->buffer, $count); + $this->position += strlen($data); + return $data; + } + + public function stream_tell(): int + { + return $this->position; + } + + public function stream_seek(int $offset, int $whence): bool + { + if ($whence === SEEK_END) { + throw new RuntimeException('Seeking from the end is not supported'); + } + + if ( + ($whence === SEEK_CUR && $offset < 0) + || ($whence === SEEK_SET && $offset < $this->position) + ) { + throw new RuntimeException('Seeking backwards is not supported'); + } + + $newPosition = $whence === SEEK_CUR ? $this->position + $offset : $offset; + while ($this->position < $newPosition) { + // Limit read to 1 MiB, to reduce memory usage + $this->stream_read(min($newPosition - $this->position, 1024 ** 2)); + } + return true; + } + + public function stream_stat(): array + { + return []; + } +} + +stream_wrapper_register('massive-file', MassiveFileStreamWrapper::class) +|| throw new \RuntimeException('Failed to register stream wrapper ' .
MassiveFileStreamWrapper::class); From 095a6ba0735929f7fb20c6ac672bccee11ecb1b5 Mon Sep 17 00:00:00 2001 From: James Funk Date: Thu, 2 Jan 2025 10:28:25 -0700 Subject: [PATCH 2/9] Remove unimplemented stub --- src/StreamReader.php | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/StreamReader.php b/src/StreamReader.php index 7a3dcb2..abc5600 100644 --- a/src/StreamReader.php +++ b/src/StreamReader.php @@ -22,8 +22,6 @@ class StreamReader implements IteratorAggregate /** @var resource */ private $stream; - private ?Header $globalPaxHeader = null; - /** * @param resource $stream Stream resource of TAR file */ @@ -180,9 +178,6 @@ private function readHeader(): Header break; } - if ($this->globalPaxHeader) { - $header->mergePaxHeader($this->globalPaxHeader); - } if ($paxHeader) { $header->mergePaxHeader($paxHeader); } From b16b6e50e2eade37a097ac46d719e1fe9fea309c Mon Sep 17 00:00:00 2001 From: James Funk Date: Thu, 2 Jan 2025 10:28:34 -0700 Subject: [PATCH 3/9] Ensure read data doesn't return false --- src/StreamReader.php | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/StreamReader.php b/src/StreamReader.php index abc5600..10a3c6e 100644 --- a/src/StreamReader.php +++ b/src/StreamReader.php @@ -173,6 +173,11 @@ private function readHeader(): Header case 'x': $paxHeader = $header; $paxData = fread($this->stream, $paxHeader->getSize()); + if ($paxData === false) { + throw new InvalidArchiveFormatException( + 'Invalid TAR archive format: Unexpected end of file, expected PAX header data' + ); + } $paxHeader->harvestPaxData($paxData); $header = $this->readHeader(); break; From bf20e3590e749eb602d3872448c1611f46e40fb8 Mon Sep 17 00:00:00 2001 From: James Funk Date: Thu, 2 Jan 2025 10:28:50 -0700 Subject: [PATCH 4/9] Use `\n` instead of PHP_EOL --- src/Parser/Header.php | 2 +- tests/Parser/PaxHeaderTest.php | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Parser/Header.php b/src/Parser/Header.php index 5f5d2e1..b82a524 100644 --- 
a/src/Parser/Header.php +++ b/src/Parser/Header.php @@ -50,7 +50,7 @@ public function __construct(string $content) public function harvestPaxData(string $paxData): void { - foreach (explode(PHP_EOL, $paxData) as $record) { + foreach (explode("\n", $paxData) as $record) { if ($record === '') { continue; } diff --git a/tests/Parser/PaxHeaderTest.php b/tests/Parser/PaxHeaderTest.php index 47fcdb8..8ce7620 100644 --- a/tests/Parser/PaxHeaderTest.php +++ b/tests/Parser/PaxHeaderTest.php @@ -30,9 +30,9 @@ protected function generateSingleFile(?int $chunkSize = null, ?int $totalSize = if ($totalSize > self::MAX_USTAR_SIZE) { // Generate a pax header and data record for a file larger than 8GB - $data = implode(PHP_EOL, [ + $data = implode("\n", [ implode(' ', array_reverse([$data = 'size=' . $totalSize, strlen($data)])), - ]) . PHP_EOL; + ]) . "\n"; yield str_pad(implode('', [ /* 0 */ 'name' => str_pad('massive_file.txt', 100, "\0"), From d64d4113ec9b6dd08520a50a6ee9cfda77718a94 Mon Sep 17 00:00:00 2001 From: James Funk Date: Thu, 2 Jan 2025 10:29:03 -0700 Subject: [PATCH 5/9] Teach PHPStan the format of an array variable --- src/Parser/Header.php | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Parser/Header.php b/src/Parser/Header.php index b82a524..edaa743 100644 --- a/src/Parser/Header.php +++ b/src/Parser/Header.php @@ -36,6 +36,7 @@ class Header { private string $content; + /** @var array */ private array $pax = []; public function __construct(string $content) From 9b57fffbd0a9cae3cbe3ed6713dd876e50572331 Mon Sep 17 00:00:00 2001 From: James Funk Date: Thu, 2 Jan 2025 10:33:23 -0700 Subject: [PATCH 6/9] Fix PHPStan error --- src/Parser/Header.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Parser/Header.php b/src/Parser/Header.php index edaa743..27e6c3c 100644 --- a/src/Parser/Header.php +++ b/src/Parser/Header.php @@ -55,8 +55,8 @@ public function harvestPaxData(string $paxData): void if ($record === '') { continue; } - 
preg_match_all('/^(\d+) ([^=]+)=(.*)$/', $record, $matches); - if (count($matches) !== 4) { + $matchesFound = preg_match_all('/^(\d+) ([^=]+)=(.*)$/', $record, $matches); + if (!$matchesFound) { throw new InvalidArchiveFormatException( sprintf('Invalid Pax header record format: %s', $record) ); From a88ca916d4742350a401f6ddeed9c2aba2f5ca72 Mon Sep 17 00:00:00 2001 From: James Funk Date: Tue, 14 Jan 2025 11:06:14 -0700 Subject: [PATCH 7/9] Ignore PHPStan error --- src/StreamReader.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/StreamReader.php b/src/StreamReader.php index 10a3c6e..94670f6 100644 --- a/src/StreamReader.php +++ b/src/StreamReader.php @@ -172,7 +172,7 @@ private function readHeader(): Header switch($header->getType()) { case 'x': $paxHeader = $header; - $paxData = fread($this->stream, $paxHeader->getSize()); + $paxData = fread($this->stream, $paxHeader->getSize()); // @phpstan-ignore argument.type if ($paxData === false) { throw new InvalidArchiveFormatException( 'Invalid TAR archive format: Unexpected end of file, expected PAX header data' From e15eac204be5c90eb664f19ea45bbd3ea4afadeb Mon Sep 17 00:00:00 2001 From: James Funk Date: Tue, 14 Jan 2025 11:12:23 -0700 Subject: [PATCH 8/9] Skip null byte padding --- src/StreamReader.php | 1 + 1 file changed, 1 insertion(+) diff --git a/src/StreamReader.php b/src/StreamReader.php index 94670f6..3b1ff36 100644 --- a/src/StreamReader.php +++ b/src/StreamReader.php @@ -173,6 +173,7 @@ private function readHeader(): Header case 'x': $paxHeader = $header; $paxData = fread($this->stream, $paxHeader->getSize()); // @phpstan-ignore argument.type + fseek($this->stream, (512 - ($paxHeader->getSize() % 512)) % 512, SEEK_CUR); // Skip null byte padding (0 bytes when block-aligned) if ($paxData === false) { throw new InvalidArchiveFormatException( 'Invalid TAR archive format: Unexpected end of file, expected PAX header data' From 74224d3c157d19ec33344c02916313da89933cef Mon Sep 17 00:00:00 2001 From: James Funk Date: Sat, 18 Jan
2025 00:26:07 -0700 Subject: [PATCH 9/9] Fix padded PAX content handling --- tests/Parser/PaxHeaderTest.php | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/Parser/PaxHeaderTest.php b/tests/Parser/PaxHeaderTest.php index 8ce7620..447ab89 100644 --- a/tests/Parser/PaxHeaderTest.php +++ b/tests/Parser/PaxHeaderTest.php @@ -33,13 +33,15 @@ protected function generateSingleFile(?int $chunkSize = null, ?int $totalSize = $data = implode("\n", [ implode(' ', array_reverse([$data = 'size=' . $totalSize, strlen($data)])), ]) . "\n"; + $dataLength = strlen($data); + $data = str_pad($data, 512 * ceil($dataLength / 512), "\0"); yield str_pad(implode('', [ /* 0 */ 'name' => str_pad('massive_file.txt', 100, "\0"), /*100*/ 'mode' => str_pad('', 8, "\0"), /*108*/ 'uid' => str_pad('', 8, "\0"), /*116*/ 'gid' => str_pad('', 8, "\0"), - /*100*/ 'size' => str_pad(decoct(strlen($data)), 12, "\0"), + /*100*/ 'size' => str_pad(decoct($dataLength), 12, "\0"), /*136*/ 'mtime' => str_pad('', 12, "\0"), /*148*/ 'chksum' => str_pad('', 8, "\0"), /*156*/ 'typeflag' => 'x',