Skip to content

Introduce PAX header support #5

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions src/Parser/Header.php
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
class Header
{
private string $content;
/** @var array<string, string|int> */
private array $pax = [];

public function __construct(string $content)
{
Expand All @@ -47,6 +49,26 @@ public function __construct(string $content)
$this->content = $content;
}

/**
 * Parses the payload of a pax extended header and stores its keyword/value pairs.
 *
 * Each record has the form "<length> <keyword>=<value>\n", where <length> is the
 * decimal size of the whole record (including the length digits and the trailing
 * newline). When the declared length is consistent with the data it is honoured,
 * which allows values containing newlines. Otherwise the record is assumed to end
 * at the next newline, so archives with an inaccurate length are still readable.
 *
 * @param string $paxData Raw payload of the extended header (without block padding)
 *
 * @throws InvalidArchiveFormatException When a record does not match the expected format
 */
public function harvestPaxData(string $paxData): void
{
    $total = strlen($paxData);
    $offset = 0;

    while ($offset < $total) {
        // Tolerate empty lines between records
        if ($paxData[$offset] === "\n") {
            $offset++;
            continue;
        }

        // Default: the record runs up to the next newline (or the end of the data)
        $lineEnd = strpos($paxData, "\n", $offset);
        $recordEnd = $lineEnd === false ? $total : $lineEnd;

        // A declared length reaching past that newline means the value itself contains
        // newlines; trust it only when it lands exactly on a record-terminating newline.
        if (preg_match('/\G(\d+) /', $paxData, $prefix, 0, $offset) === 1) {
            $declaredEnd = $offset + (int)$prefix[1] - 1;
            if ($declaredEnd > $recordEnd && $declaredEnd < $total && $paxData[$declaredEnd] === "\n") {
                $recordEnd = $declaredEnd;
            }
        }

        $record = substr($paxData, $offset, $recordEnd - $offset);
        // "s" lets the value span several lines, "D" anchors "$" to the very end
        if (preg_match('/^(\d+) ([^=]+)=(.*)$/sD', $record, $matches) !== 1) {
            throw new InvalidArchiveFormatException(
                sprintf('Invalid Pax header record format: %s', $record)
            );
        }

        $this->pax[$matches[2]] = $matches[3];
        $offset = $recordEnd + 1;
    }
}

public function isValid(): bool
{
return $this->getMagic() === 'ustar';
Expand All @@ -60,6 +82,10 @@ public function getName(): string

public function getSize(): int
{
if (array_key_exists('size', $this->pax)) {
return (int)$this->pax['size'];
}

$str = rtrim(substr($this->content, 124, 12));
if (preg_match('/^[0-7]+$/D', $str) !== 1) {
throw new InvalidArchiveFormatException(
Expand All @@ -73,6 +99,11 @@ public function getSize(): int
return (int)octdec($str);
}

/**
 * Adopts the pax records collected by another header.
 *
 * Records already present on this header take precedence over the ones coming
 * from $header. array_replace() is used instead of array_merge() so that keywords
 * which PHP stores as integer keys are not renumbered.
 *
 * @param Header $header Header whose pax records should be inherited
 */
public function mergePaxHeader(Header $header): void
{
    $this->pax = array_replace($header->pax, $this->pax);
}

/*
* Values used in typeflag field
* #define REGTYPE '0' // regular file
Expand Down
27 changes: 25 additions & 2 deletions src/StreamReader.php
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ public function getIterator(): Iterator
if (!$header->isValid()) {
throw new InvalidArchiveFormatException(
sprintf(
'Invalid TAR archive format: Invalid Tar header format: at %s. bytes',
'Invalid TAR archive format: Invalid Tar header format: at byte %s',
$blockStart
)
);
Expand Down Expand Up @@ -165,7 +165,30 @@ private function readHeader(): Header
} while (self::isNullFilled($header));
// ↑↑↑ TAR format inserts few blocks of nulls to EOF - just skip it

return new Header($header);
$header = new Header($header);

// Handle PAX header block
$paxHeader = null;
switch($header->getType()) {
case 'x':
$paxHeader = $header;
$paxData = fread($this->stream, $paxHeader->getSize()); // @phpstan-ignore argument.type
fseek($this->stream, 512 - ($paxHeader->getSize() % 512), SEEK_CUR); // Skip null byte padding
if ($paxData === false) {
throw new InvalidArchiveFormatException(
'Invalid TAR archive format: Unexpected end of file, expected PAX header data'
);
}
$paxHeader->harvestPaxData($paxData);
$header = $this->readHeader();
break;
}

if ($paxHeader) {
$header->mergePaxHeader($paxHeader);
}

return $header;
}

private static function isNullFilled(string $string): bool
Expand Down
132 changes: 132 additions & 0 deletions tests/Parser/PaxHeaderTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
<?php

namespace JakubBoucek\Tar\Tests\Parser;

use Generator;
use JakubBoucek\Tar\StreamReader;
use PHPUnit\Framework\TestCase;

require_once __DIR__ . '/../ResourceGenerators/MassiveFileStreamWrapper.php';

/**
 * Streams synthetic TAR archives containing files larger than the ustar size
 * limit and checks that StreamReader reports and delivers their full size.
 */
class PaxHeaderTest extends TestCase
{
    /** Largest value that fits the 11-digit octal ustar size field. */
    public const MAX_USTAR_SIZE = 8589934591; // 8GB - 1

    /** Size of a TAR block in bytes. */
    private const BLOCK_SIZE = 512;

    protected function setUp(): void
    {
        // The test streams gigabytes of data, so it only runs when explicitly requested
        if (getenv('CI') !== 'true' && getenv('TEST_MASSIVE_FILE') !== 'true') {
            $this->markTestSkipped('Massive file test is skipped. Set TEST_MASSIVE_FILE=true to run it.');
        }
        parent::setUp();
    }

    /**
     * Yields the raw bytes of a single archive entry, piece by piece.
     *
     * When the file exceeds MAX_USTAR_SIZE, a pax extended header carrying the real
     * size is emitted in front of the regular header.
     *
     * @param int|null $chunkSize Size of each yielded data chunk (defaults to 1 MiB)
     * @param int|null $totalSize Size of the generated file (defaults to 10240 chunks)
     *
     * @return Generator<int, string>
     */
    protected function generateSingleFile(?int $chunkSize = null, ?int $totalSize = null): Generator
    {
        $chunkSize ??= 1024 * 1024; // 1MB chunks
        $totalSize ??= 10 * 1024 * $chunkSize; // 10GB
        assert($totalSize > $chunkSize);
        $chunks = intdiv($totalSize, $chunkSize);
        $extra = $totalSize % $chunkSize;

        if ($totalSize > self::MAX_USTAR_SIZE) {
            // Generate a pax header and data record for a file larger than 8GB
            $paxData = self::buildPaxRecord('size', (string)$totalSize);
            $paxLength = strlen($paxData);

            yield self::buildHeaderBlock('massive_file.txt', $paxLength, 'x', 'PaxHeaders.0');
            yield $paxData . self::blockPadding($paxLength);
        }

        // The ustar size field is capped; the real size comes from the pax record
        yield self::buildHeaderBlock('massive_file.txt', min(self::MAX_USTAR_SIZE, $totalSize), '0');

        $alphabet = range('A', 'Z');
        for ($i = 0; $i < $chunks; $i++) {
            yield str_repeat($alphabet[$i % 26], $chunkSize);
        }

        if ($extra > 0) {
            yield str_repeat('#', $extra);
        }

        // Entries must end on a block boundary so that a following header stays aligned
        $padding = self::blockPadding($totalSize);
        if ($padding !== '') {
            yield $padding;
        }
    }

    /**
     * Builds one pax record "<length> <keyword>=<value>\n", where <length> counts
     * the whole record including its own digits.
     */
    private static function buildPaxRecord(string $keyword, string $value): string
    {
        $payload = ' ' . $keyword . '=' . $value . "\n";
        $payloadLength = strlen($payload);

        // Adding the length digits may itself add a digit, hence the second pass
        $estimate = $payloadLength + strlen((string)$payloadLength);
        $length = $payloadLength + strlen((string)$estimate);

        return $length . $payload;
    }

    /**
     * Returns the NUL bytes needed to round $length up to a whole block.
     */
    private static function blockPadding(int $length): string
    {
        $remainder = $length % self::BLOCK_SIZE;

        return $remainder === 0 ? '' : str_repeat("\0", self::BLOCK_SIZE - $remainder);
    }

    /**
     * Builds a 512-byte ustar header block. The checksum field is left empty.
     *
     * @param string $name     Entry name
     * @param int    $size     Value for the octal size field
     * @param string $typeflag Single-character type flag ('0' regular file, 'x' pax header)
     * @param string $prefix   Value for the prefix field
     */
    private static function buildHeaderBlock(string $name, int $size, string $typeflag, string $prefix = ''): string
    {
        return str_pad(implode('', [
            /*  0*/ 'name' => str_pad($name, 100, "\0"),
            /*100*/ 'mode' => str_pad('', 8, "\0"),
            /*108*/ 'uid' => str_pad('', 8, "\0"),
            /*116*/ 'gid' => str_pad('', 8, "\0"),
            /*124*/ 'size' => str_pad(decoct($size), 12, "\0"),
            /*136*/ 'mtime' => str_pad('', 12, "\0"),
            /*148*/ 'chksum' => str_pad('', 8, "\0"),
            /*156*/ 'typeflag' => $typeflag,
            /*157*/ 'linkname' => str_pad('', 100, "\0"),
            /*257*/ 'magic' => "ustar\0",
            /*263*/ 'version' => str_pad('', 2, "\0"),
            /*265*/ 'uname' => str_pad('', 32, "\0"),
            /*297*/ 'gname' => str_pad('', 32, "\0"),
            /*329*/ 'devmajor' => str_pad('', 8, "\0"),
            /*337*/ 'devminor' => str_pad('', 8, "\0"),
            /*345*/ 'prefix' => str_pad($prefix, 155, "\0"),
        ]), self::BLOCK_SIZE, "\0");
    }

    /**
     * @return array<string, array{int}>
     */
    public static function fileCountProvider(): array
    {
        return [
            'one' => [1],
            'two' => [2],
            'three' => [3],
        ];
    }

    /**
     * @dataProvider fileCountProvider
     */
    public function testPaxHeader(int $fileCount): void
    {
        $fileGenerator = (function () use ($fileCount) {
            for ($i = 0; $i < $fileCount; $i++) {
                yield from $this->generateSingleFile();
            }
        })();
        $resource = fopen('massive-file://', 'r', context: stream_context_create([
            'massive-file' => ['generator' => $fileGenerator],
        ]));
        $this->assertIsResource($resource);

        try {
            $streamReader = new StreamReader($resource);

            $iterator = $streamReader->getIterator();
            $count = 0;
            foreach ($iterator as $file) {
                $count++;
                $fileResource = $file->getContent()->detach();
                $bytesRead = 0;
                while (!feof($fileResource)) {
                    $chunk = fread($fileResource, 1024 ** 2);
                    if ($chunk === false) {
                        $this->fail('Reading the file content failed');
                    }
                    $bytesRead += strlen($chunk);
                }
                $this->assertSame($file->getSize(), $bytesRead);
            }
            $this->assertSame($fileCount, $count);
        } finally {
            fclose($resource);
        }
    }
}
81 changes: 81 additions & 0 deletions tests/ResourceGenerators/MassiveFileStreamWrapper.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
<?php

namespace JakubBoucek\Tar\Tests\ResourceGenerators;

use Generator;
use JakubBoucek\Tar\Exception\RuntimeException;

/**
 * Read-only, forward-only stream wrapper that serves the strings produced by a
 * generator, so that very large streams can be simulated without holding them
 * in memory or on disk.
 *
 * The generator is passed through the stream context:
 * stream_context_create(['massive-file' => ['generator' => $generator]]).
 */
class MassiveFileStreamWrapper
{
    /** Largest single read performed while skipping forward (1 MiB). */
    private const SEEK_CHUNK_SIZE = 1024 ** 2;

    /** @var resource|null Stream context; read by stream_open() */
    public $context;
    protected Generator $generator;
    /** Bytes already pulled from the generator but not yet handed out. */
    protected string $buffer = '';
    /** Number of bytes handed out so far. */
    protected int $position = 0;

    /**
     * Picks up the generator from the "massive-file" context options.
     *
     * @return bool False when the context does not provide a generator
     */
    public function stream_open(string $path, string $mode, int $options, ?string &$opened_path): bool
    {
        $contextOptions = stream_context_get_options($this->context);
        $generator = $contextOptions['massive-file']['generator'] ?? null;
        if (!$generator instanceof Generator) {
            return false;
        }
        $this->generator = $generator;

        return true;
    }

    /**
     * Reports end-of-file once the generator is exhausted and the buffer is drained.
     */
    public function stream_eof(): bool
    {
        return !$this->generator->valid() && strlen($this->buffer) === 0;
    }

    /**
     * Returns up to $count bytes, pulling from the generator until enough are
     * buffered or the generator is exhausted.
     */
    public function stream_read(int $count): string
    {
        while ($this->generator->valid() && strlen($this->buffer) < $count) {
            $this->buffer .= $this->generator->current();
            $this->generator->next();
        }
        $data = substr($this->buffer, 0, $count);
        $this->buffer = substr($this->buffer, $count);
        $this->position += strlen($data);
        return $data;
    }

    public function stream_tell(): int
    {
        return $this->position;
    }

    /**
     * Moves forward by reading and discarding data.
     *
     * @return bool False when the data ends before the requested position is reached
     *
     * @throws RuntimeException When seeking backwards or relative to the end is requested
     */
    public function stream_seek(int $offset, int $whence): bool
    {
        if ($whence === SEEK_END) {
            throw new RuntimeException('Seeking from the end is not supported');
        }

        if (
            ($whence === SEEK_CUR && $offset < 0)
            || ($whence === SEEK_SET && $offset < $this->position)
        ) {
            throw new RuntimeException('Seeking backwards is not supported');
        }

        $newPosition = $whence === SEEK_CUR ? $this->position + $offset : $offset;
        while ($this->position < $newPosition) {
            // Limit read to 1 MiB, to reduce memory usage
            $skipped = $this->stream_read(min($newPosition - $this->position, self::SEEK_CHUNK_SIZE));
            if ($skipped === '') {
                // Out of data: without this check the loop would never terminate
                return false;
            }
        }
        return true;
    }

    /**
     * No stat information is available for generated data.
     */
    public function stream_stat(): array
    {
        return [];
    }
}

// Make the "massive-file://" scheme available as soon as this file is loaded
if (!stream_wrapper_register('massive-file', MassiveFileStreamWrapper::class)) {
    throw new \RuntimeException('Failed to register stream wrapper ' . MassiveFileStreamWrapper::class);
}
Loading