From 247e9443068bfa1b8f67525543cbb0949f980630 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Fri, 23 May 2025 18:58:35 +0200 Subject: [PATCH 1/2] [HTTPClient] Support HTTP Cache via SeekableRequestReadStream Exploratory PR, do not merge --- .../ByteStream/SeekableRequestReadStream.php | 188 ++++++++++++++---- components/HttpClient/FilesystemCache.php | 10 +- 2 files changed, 155 insertions(+), 43 deletions(-) diff --git a/components/HttpClient/ByteStream/SeekableRequestReadStream.php b/components/HttpClient/ByteStream/SeekableRequestReadStream.php index 2db0b7a4..32625dff 100644 --- a/components/HttpClient/ByteStream/SeekableRequestReadStream.php +++ b/components/HttpClient/ByteStream/SeekableRequestReadStream.php @@ -5,33 +5,82 @@ use WordPress\ByteStream\FileReadWriteStream; use WordPress\ByteStream\ReadStream\BaseByteReadStream; use WordPress\ByteStream\ReadStream\ByteReadStream; +use WordPress\Filesystem\LocalFilesystem; +use WordPress\HttpClient\CacheEntry; +use WordPress\HttpClient\CachePolicy; +use WordPress\HttpClient\FilesystemCache; use WordPress\HttpClient\Request; +use WordPress\HttpClient\Response; /** - * HTTP reader that can seek() within the stream. + * HTTP reader that can seek() within the stream with transparent persistent caching. * - * Downloaded bytes are stored in a temporary file. All the read operations are delegated to that file. - * - * – Seek()-ing forward is done by fetching all the bytes up to the target offset. - * – Seek()-ing backwards is done by seeking within the temporary file. + * Behaviour changes compared to the original implementation: + * – Before issuing the HTTP request, it checks a on-disk cache (FilesystemCache). + * If a fresh 200 response is found (per CachePolicy) the body is replayed with + * zero network overhead. + * – After a successful complete download of a cache-able 200 response, the body + * and headers are written to the cache. Partial / cut-off downloads are never + * stored, preventing corrupted bodies from being reused in the future. */ class SeekableRequestReadStream implements ByteReadStream { - private $remote; // RequestReadStream - private $cache; // FileReadWriteStream - private $temp; + /** @var RequestReadStream|null */ + private $remote; + /** @var FileReadWriteStream */ + private $cache; // local working file (either cached body or tmp during download) + /** @var string */ + private $temp; // absolute path to working file + + /** @var FilesystemCache */ + private $store; + /** @var CacheEntry|null */ + private $cacheEntry = null; + /** @var bool */ + private $cacheHit = false; + /** @var bool */ + private $stored = false; // did we write the cache already? + private $length_resolved = false; public function __construct( $request, array $options = array() ) { if ( is_string( $request ) ) { $request = new Request( $request ); } + + $url = $request->url; + $cache_dir = $options['cache_dir'] ?? sys_get_temp_dir() . '/wp_http_cache'; + $this->store = $options['cache_store'] ?? new FilesystemCache( LocalFilesystem::create( $cache_dir ) ); + + // ------------------------------------------------------------------ cache lookup + $hit = $this->store->lookup( $url ); + if ( $hit && CachePolicy::is_fresh( $hit ) && $hit->status === 200 ) { + // Serve from cache – no network + $this->cacheHit = true; + $this->cacheEntry = $hit; + $this->temp = $cache_dir . '/' . $this->store->get_body_path( $url ); + $this->cache = FileReadWriteStream::from_path( $this->temp ); + // rewind (a+b opens at EOF) + $this->cache->seek( 0 ); + + return; + } + + // ------------------------------------------------------------------ fetch remotely + // Note: we purposely skip conditional GET for simplicity. Stale entries + // fall through to a normal request and will be refreshed. $this->remote = new RequestReadStream( $request, $options ); - $this->temp = $options['cache_path'] ?? tempnam( sys_get_temp_dir(), 'wp_http_cache_' ); - $this->cache = FileReadWriteStream::from_path( $this->temp, true ); + + $this->temp = $options['cache_path'] ?? tempnam( sys_get_temp_dir(), 'wp_http_tmp_' ); + $this->cache = FileReadWriteStream::from_path( $this->temp, true ); } + /* ------------------------------------------------ internal helpers */ + private function pipe_until( int $offset ): void { + if ( $this->cacheHit ) { + return; // data already fully available locally + } while ( $this->cache->length() === null || $this->cache->length() < $offset ) { $pulled = $this->remote->pull( BaseByteReadStream::CHUNK_SIZE ); if ( 0 === $pulled ) { @@ -41,35 +90,63 @@ private function pipe_until( int $offset ): void { } } + private function maybe_store_cache(): void { + if ( $this->cacheHit || $this->stored || ! $this->remote ) { + return; // nothing to do + } + if ( ! $this->remote->reached_end_of_data() ) { + return; // not finished – do not store partial responses + } + + $response = $this->remote->await_response(); + if ( $response->status_code !== 200 ) { + return; // only store 200 OK bodies + } + if ( ! CachePolicy::response_is_cacheable( $response ) ) { + return; // respect Cache-Control / Expires rules + } + + // --- write body into cache storage -------------------------------- + $writer = $this->store->open_body_write_stream( $response->request->url ); + $src = fopen( $this->temp, 'rb' ); + while ( ! feof( $src ) ) { + $chunk = fread( $src, 8192 ); + if ( '' !== $chunk && $chunk !== false ) { + $writer->append_bytes( $chunk ); + } + } + fclose( $src ); + $writer->close_writing(); + + // --- meta --------------------------------------------------------- + $entry = new CacheEntry(); + $entry->url = $response->request->url; + $entry->status = $response->status_code; + $entry->headers = $response->headers; + $entry->stored_at = time(); + $entry->etag = $response->get_header( 'etag' ); + $entry->last_modified = $response->get_header( 'last-modified' ); + $this->store->store( $entry ); + + $this->stored = true; + } + + /* ------------------------------------------------ interface ByteReadStream */ + public function length(): ?int { + if ( $this->cacheHit ) { + return $this->cache->length(); + } + if ( ! $this->length_resolved && null === $this->remote->length() ) { - /** - * Wait for the remote headers before returning the length. - * - * This is an inconsistency between RequestReadStream::length(): - * - * * RequestReadStream returns null until the remote headers are known. - * * SeekableRequestReadStream proactively waits for the remote headers. - * - * That's because: - * - * * RequestReadStream class is a lower-level utility where we simply - * expose what's available at the moment. The developer is responsible - * for awaiting the response headers. - * * SeekableRequestReadStream is a higher-level tool meant for usage - * when knowing the length is vital, e.g. reading from a remote ZIP file. - */ $this->remote->await_response(); if ( null === $this->remote->length() ) { - // The server did not send the Content-Length header. - // We need to consume the entire stream to infer the length. - $position = $this->tell(); + $pos = $this->tell(); $this->consume_all(); - $this->seek( $position ); + $this->seek( $pos ); } $this->length_resolved = true; } - return $this->remote->length(); } @@ -83,18 +160,24 @@ public function seek( int $offset ) { } public function reached_end_of_data(): bool { - return $this->remote->reached_end_of_data() && $this->cache->reached_end_of_data(); + if ( $this->cacheHit ) { + return $this->cache->reached_end_of_data(); + } + + $ended = $this->remote->reached_end_of_data() && $this->cache->reached_end_of_data(); + if ( $ended ) { + $this->maybe_store_cache(); + } + return $ended; } public function pull( $n, $mode = self::PULL_NO_MORE_THAN ): int { $this->pipe_until( $this->tell() + $n ); - return $this->cache->pull( $n, $mode ); } public function peek( $n ): string { $this->pipe_until( $this->tell() + $n ); - return $this->cache->peek( $n ); } @@ -103,25 +186,46 @@ public function consume( $n ): string { } public function consume_all(): string { - while ( ! $this->remote->reached_end_of_data() ) { - $pulled = $this->remote->pull( BaseByteReadStream::CHUNK_SIZE ); - if ( 0 === $pulled ) { - break; + if ( ! $this->cacheHit ) { + while ( ! $this->remote->reached_end_of_data() ) { + $pulled = $this->remote->pull( BaseByteReadStream::CHUNK_SIZE ); + if ( 0 === $pulled ) { + break; + } + $this->cache->append_bytes( $this->remote->consume( $pulled ) ); } - $this->cache->append_bytes( $this->remote->consume( $pulled ) ); + $this->cache->close_writing(); + $this->maybe_store_cache(); } - $this->cache->close_writing(); return $this->cache->consume_all(); } + /** + * Returns the HTTP response associated with this stream. + * For cache hits a synthetic Response object is created from the stored metadata. + */ public function await_response() { + if ( $this->cacheHit && $this->cacheEntry ) { + $resp = new Response(); + $resp->status_code = $this->cacheEntry->status; + $resp->headers = $this->cacheEntry->headers; + $resp->request = new Request( $this->cacheEntry->url ); + return $resp; + } return $this->remote->await_response(); } public function close_reading(): void { - $this->remote->close_reading(); + if ( $this->remote ) { + $this->remote->close_reading(); + } $this->cache->close_reading(); - @unlink( $this->temp ); + $this->maybe_store_cache(); + + // Clean up temporary file when it was only a download buffer + if ( ! $this->cacheHit && is_file( $this->temp ) ) { + @unlink( $this->temp ); + } } } diff --git a/components/HttpClient/FilesystemCache.php b/components/HttpClient/FilesystemCache.php index bc4ca6d3..613f32d3 100644 --- a/components/HttpClient/FilesystemCache.php +++ b/components/HttpClient/FilesystemCache.php @@ -20,7 +20,7 @@ public function __construct( Filesystem $fs ) { $this->fs = $fs; } - private function get_body_path( string $url ): string { + public function get_body_path( string $url ): string { $key = hash( 'sha256', $url ); return "$key.bin"; @@ -109,4 +109,12 @@ public function invalidate( string $url ): void { // Also remove from temporary tracking if invalidate is called mid-stream unset( $this->body_paths[ $url ] ); } + + public function tmp_path( string $url ): string { + $body_path = $this->get_body_path( $url ); + if ( ! $this->fs->exists( 'tmp' ) ) { + $this->fs->mkdir( 'tmp' ); + } + return $this->fs->get_meta()['root'] . '/tmp/' . $body_path; + } } From 527237fa13d872ab55f7c76bea2b1d627d7a912b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sat, 24 May 2025 01:27:35 +0200 Subject: [PATCH 2/2] Push forward the "HTTP Cache as a special read stream" approach --- .../ReadStream/BaseByteReadStream.php | 2 +- .../ByteStream/SeekableRequestReadStream.php | 164 +++++------ components/HttpClient/CacheEntry.php | 43 +++ components/HttpClient/CacheStorage.php | 34 --- components/HttpClient/Client.php | 4 + components/HttpClient/FilesystemCache.php | 274 ++++++++++-------- components/HttpClient/Request.php | 1 + 7 files changed, 286 insertions(+), 236 deletions(-) delete mode 100644 components/HttpClient/CacheStorage.php diff --git a/components/ByteStream/ReadStream/BaseByteReadStream.php b/components/ByteStream/ReadStream/BaseByteReadStream.php index 56d7bc05..e2b36058 100644 --- a/components/ByteStream/ReadStream/BaseByteReadStream.php +++ b/components/ByteStream/ReadStream/BaseByteReadStream.php @@ -7,7 +7,7 @@ abstract class BaseByteReadStream implements ByteReadStream { - const CHUNK_SIZE = 100;// 64 * 1024; + const CHUNK_SIZE = 64 * 1024; protected $buffer_size = 2048; diff --git a/components/HttpClient/ByteStream/SeekableRequestReadStream.php b/components/HttpClient/ByteStream/SeekableRequestReadStream.php index 32625dff..4ee18160 100644 --- a/components/HttpClient/ByteStream/SeekableRequestReadStream.php +++ b/components/HttpClient/ByteStream/SeekableRequestReadStream.php @@ -5,12 +5,10 @@ use WordPress\ByteStream\FileReadWriteStream; use WordPress\ByteStream\ReadStream\BaseByteReadStream; use WordPress\ByteStream\ReadStream\ByteReadStream; -use WordPress\Filesystem\LocalFilesystem; use WordPress\HttpClient\CacheEntry; use WordPress\HttpClient\CachePolicy; use WordPress\HttpClient\FilesystemCache; use WordPress\HttpClient\Request; -use WordPress\HttpClient\Response; /** * HTTP reader that can seek() within the stream with transparent persistent caching. @@ -25,21 +23,20 @@ */ class SeekableRequestReadStream implements ByteReadStream { + private $request; /** @var RequestReadStream|null */ private $remote; /** @var FileReadWriteStream */ - private $cache; // local working file (either cached body or tmp during download) - /** @var string */ - private $temp; // absolute path to working file + private $cacheStream; // local working file (either cached body or tmp during download) /** @var FilesystemCache */ private $store; /** @var CacheEntry|null */ private $cacheEntry = null; /** @var bool */ - private $cacheHit = false; + private $cacheHit = false; /** @var bool */ - private $stored = false; // did we write the cache already? + private $stored = false; // did we write the cache already? private $length_resolved = false; @@ -48,31 +45,24 @@ public function __construct( $request, array $options = array() ) { $request = new Request( $request ); } - $url = $request->url; - $cache_dir = $options['cache_dir'] ?? sys_get_temp_dir() . '/wp_http_cache'; - $this->store = $options['cache_store'] ?? new FilesystemCache( LocalFilesystem::create( $cache_dir ) ); + $this->request = $request; + $url = $request->url; + $cache_dir = $options['cache_dir'] ?? sys_get_temp_dir() . '/wp_http_cache2'; + $this->store = new FilesystemCache( $cache_dir ); - // ------------------------------------------------------------------ cache lookup + // Cache lookup $hit = $this->store->lookup( $url ); if ( $hit && CachePolicy::is_fresh( $hit ) && $hit->status === 200 ) { - // Serve from cache – no network - $this->cacheHit = true; - $this->cacheEntry = $hit; - $this->temp = $cache_dir . '/' . $this->store->get_body_path( $url ); - $this->cache = FileReadWriteStream::from_path( $this->temp ); - // rewind (a+b opens at EOF) - $this->cache->seek( 0 ); + // Cache hit – serve from cache + $this->cacheHit = true; + $this->cacheEntry = $hit; + $this->cacheStream = FileReadWriteStream::from_path( $this->store->get_body_path( $url ) ); return; } - // ------------------------------------------------------------------ fetch remotely - // Note: we purposely skip conditional GET for simplicity. Stale entries - // fall through to a normal request and will be refreshed. + // Cache miss – fetch remotely $this->remote = new RequestReadStream( $request, $options ); - - $this->temp = $options['cache_path'] ?? tempnam( sys_get_temp_dir(), 'wp_http_tmp_' ); - $this->cache = FileReadWriteStream::from_path( $this->temp, true ); } /* ------------------------------------------------ internal helpers */ @@ -81,61 +71,41 @@ private function pipe_until( int $offset ): void { if ( $this->cacheHit ) { return; // data already fully available locally } - while ( $this->cache->length() === null || $this->cache->length() < $offset ) { - $pulled = $this->remote->pull( BaseByteReadStream::CHUNK_SIZE ); + while ( !$this->cacheStream || $this->cacheStream->length() === null || $this->cacheStream->length() < $offset ) { + $pulled = $this->pull_remote(); if ( 0 === $pulled ) { break; } - $this->cache->append_bytes( $this->remote->consume( $pulled ) ); + $this->cacheStream->append_bytes( $this->remote->consume( $pulled ) ); } } - private function maybe_store_cache(): void { - if ( $this->cacheHit || $this->stored || ! $this->remote ) { - return; // nothing to do - } - if ( ! $this->remote->reached_end_of_data() ) { - return; // not finished – do not store partial responses - } - - $response = $this->remote->await_response(); - if ( $response->status_code !== 200 ) { - return; // only store 200 OK bodies - } - if ( ! CachePolicy::response_is_cacheable( $response ) ) { - return; // respect Cache-Control / Expires rules - } - - // --- write body into cache storage -------------------------------- - $writer = $this->store->open_body_write_stream( $response->request->url ); - $src = fopen( $this->temp, 'rb' ); - while ( ! feof( $src ) ) { - $chunk = fread( $src, 8192 ); - if ( '' !== $chunk && $chunk !== false ) { - $writer->append_bytes( $chunk ); - } - } - fclose( $src ); - $writer->close_writing(); - - // --- meta --------------------------------------------------------- - $entry = new CacheEntry(); - $entry->url = $response->request->url; - $entry->status = $response->status_code; - $entry->headers = $response->headers; - $entry->stored_at = time(); - $entry->etag = $response->get_header( 'etag' ); - $entry->last_modified = $response->get_header( 'last-modified' ); - $this->store->store( $entry ); - - $this->stored = true; + private function pull_remote() { + $pulled = $this->remote->pull( BaseByteReadStream::CHUNK_SIZE ); + /** + * Wait for the first response bytes from the remote server to + * ensure we're caching the terminal response after all the + * redirects have been followed. + */ + if ( $pulled > 0 && !$this->cacheStream ) { + $latest_request = $this->request->latest_redirect(); + $this->cacheStream = FileReadWriteStream::from_path( + $this->store->get_body_path( $latest_request->url ), + true + ); + } + return $pulled; } /* ------------------------------------------------ interface ByteReadStream */ public function length(): ?int { + if ( ! $this->cacheStream ) { + return $this->remote->length(); + } + if ( $this->cacheHit ) { - return $this->cache->length(); + return $this->cacheStream->length(); } if ( ! $this->length_resolved && null === $this->remote->length() ) { @@ -147,42 +117,49 @@ public function length(): ?int { } $this->length_resolved = true; } + return $this->remote->length(); } public function tell(): int { - return $this->cache->tell(); + if ( ! $this->cacheStream ) { + return $this->remote->tell(); + } + return $this->cacheStream->tell(); } public function seek( int $offset ) { $this->pipe_until( $offset ); - $this->cache->seek( $offset ); + $this->cacheStream->seek( $offset ); } public function reached_end_of_data(): bool { if ( $this->cacheHit ) { - return $this->cache->reached_end_of_data(); + return $this->cacheStream->reached_end_of_data(); } - $ended = $this->remote->reached_end_of_data() && $this->cache->reached_end_of_data(); + $ended = $this->remote->reached_end_of_data() && $this->cacheStream->reached_end_of_data(); if ( $ended ) { $this->maybe_store_cache(); } + return $ended; } public function pull( $n, $mode = self::PULL_NO_MORE_THAN ): int { $this->pipe_until( $this->tell() + $n ); - return $this->cache->pull( $n, $mode ); + + return $this->cacheStream->pull( $n, $mode ); } public function peek( $n ): string { $this->pipe_until( $this->tell() + $n ); - return $this->cache->peek( $n ); + + return $this->cacheStream->peek( $n ); } public function consume( $n ): string { - return $this->cache->consume( $n ); + return $this->cacheStream->consume( $n ); } public function consume_all(): string { @@ -192,13 +169,13 @@ public function consume_all(): string { if ( 0 === $pulled ) { break; } - $this->cache->append_bytes( $this->remote->consume( $pulled ) ); + $this->cacheStream->append_bytes( $this->remote->consume( $pulled ) ); } - $this->cache->close_writing(); + $this->cacheStream->close_writing(); $this->maybe_store_cache(); } - return $this->cache->consume_all(); + return $this->cacheStream->consume_all(); } /** @@ -207,25 +184,38 @@ public function consume_all(): string { */ public function await_response() { if ( $this->cacheHit && $this->cacheEntry ) { - $resp = new Response(); - $resp->status_code = $this->cacheEntry->status; - $resp->headers = $this->cacheEntry->headers; - $resp->request = new Request( $this->cacheEntry->url ); - return $resp; + return $this->cacheEntry->to_response( $this->request ); } + return $this->remote->await_response(); } public function close_reading(): void { + $this->maybe_store_cache(); if ( $this->remote ) { $this->remote->close_reading(); } - $this->cache->close_reading(); - $this->maybe_store_cache(); + $this->cacheStream->close_reading(); + } - // Clean up temporary file when it was only a download buffer - if ( ! $this->cacheHit && is_file( $this->temp ) ) { - @unlink( $this->temp ); + private function maybe_store_cache(): void { + if ( $this->cacheHit || $this->stored || ! $this->remote ) { + return; // nothing to do } + if ( ! $this->remote->reached_end_of_data() ) { + return; // not finished – do not store partial responses + } + + $response = $this->remote->await_response(); + if ( $response->status_code !== 200 ) { + return; // only store 200 OK bodies + } + if ( ! CachePolicy::response_is_cacheable( $response ) ) { + return; // respect Cache-Control / Expires rules + } + + $this->store->commit( $response ); + $this->stored = true; } + } diff --git a/components/HttpClient/CacheEntry.php b/components/HttpClient/CacheEntry.php index 74a57597..49f95159 100644 --- a/components/HttpClient/CacheEntry.php +++ b/components/HttpClient/CacheEntry.php @@ -31,4 +31,47 @@ final class CacheEntry { * @var string|null */ public $last_modified; + /** + * @var string|null Vary header value from the response that determined the cache entry variant. + */ + public $vary; + /** + * @var array|null Stores the original request header values for every header listed in `Vary`. + * This allows later look-ups to verify the variant matches the current request. + */ + public $vary_headers; + + static public function from_response( Response $response ) { + $entry = new self(); + $entry->url = $response->request->url; + $entry->status = $response->status_code; + $entry->headers = $response->headers; + $entry->stored_at = time(); + $entry->etag = $response->get_header( 'etag' ); + $entry->last_modified = $response->get_header( 'last-modified' ); + + // Capture Vary header information to support correct cache variant matching. + $vary_header = $response->get_header( 'vary' ); + $entry->vary = $vary_header; + if ( null !== $vary_header && trim( $vary_header ) !== '*' ) { + $vary_headers = []; + foreach ( explode( ',', $vary_header ) as $h ) { + $h = strtolower( trim( $h ) ); + if ( '' === $h ) { + continue; + } + $vary_headers[ $h ] = $response->request->get_header( $h ) ?? null; + } + $entry->vary_headers = $vary_headers; + } + + return $entry; + } + + public function to_response( Request $request ) { + $response = new Response( $request ); + $response->status_code = $this->status; + $response->headers = $this->headers; + return $response; + } } diff --git a/components/HttpClient/CacheStorage.php b/components/HttpClient/CacheStorage.php deleted file mode 100644 index 5a95584f..00000000 --- a/components/HttpClient/CacheStorage.php +++ /dev/null @@ -1,34 +0,0 @@ -client_can_follow_redirects ) { + continue; + } + $response = $request->response; if ( ! $response ) { continue; diff --git a/components/HttpClient/FilesystemCache.php b/components/HttpClient/FilesystemCache.php index 613f32d3..c2e79f23 100644 --- a/components/HttpClient/FilesystemCache.php +++ b/components/HttpClient/FilesystemCache.php @@ -2,119 +2,165 @@ namespace WordPress\HttpClient; -use Exception; use RuntimeException; -use WordPress\ByteStream\WriteStream\ByteWriteStream; -use WordPress\Filesystem\Filesystem; -use WordPress\Filesystem\FilesystemException; - -final class FilesystemCache implements CacheStorage { - /** - * @var Filesystem - */ - private $fs; - /** @var array Maps URL to temporary body file path during streaming */ - private $body_paths = []; - - public function __construct( Filesystem $fs ) { - $this->fs = $fs; - } - - public function get_body_path( string $url ): string { - $key = hash( 'sha256', $url ); - - return "$key.bin"; - } - - private function get_meta_path( string $url ): string { - $key = hash( 'sha256', $url ); - - return "$key.json"; - } - - public function lookup( string $url ): ?CacheEntry { - $meta_path = $this->get_meta_path( $url ); - $body_path = $this->get_body_path( $url ); - - // Check for metadata first, as body without metadata is useless. - if ( ! $this->fs->exists( $meta_path ) ) { - return null; - } - - // If metadata exists, but body doesn't, invalidate and return null. - if ( ! $this->fs->exists( $body_path ) ) { - $this->invalidate( $url ); - - return null; - } - - $data = json_decode( $this->fs->get_contents( $meta_path ), true ); - $entry = new CacheEntry(); - foreach ( $data as $k => $v ) { - // Skip body_path if it somehow exists in old cache files - if ( $k === 'body_path' ) { - continue; - } - $entry->$k = $v; - } - - // Re-check URL consistency in case of hash collisions (unlikely but possible) - if ( $entry->url !== $url ) { - // Log potential hash collision - $this->invalidate( $url ); // Invalidate the conflicting entry - - return null; - } - - return $entry; - } - - public function open_body_write_stream( string $url ): ByteWriteStream { - $body_path = $this->get_body_path( $url ); - $this->body_paths[ $url ] = $body_path; - - return $this->fs->open_write_stream( $body_path ); - } - - public function get_body( CacheEntry $entry ): string { - $body_path = $this->get_body_path( $entry->url ); - if ( ! $this->fs->exists( $body_path ) ) { - // Invalidate metadata if body is missing - $this->invalidate( $entry->url ); - throw new RuntimeException( "Cache body file not found for URL: {$entry->url}" ); - } - - return $this->fs->get_contents( $body_path ); - } - - public function store( CacheEntry $e ): void { - $meta_path = $this->get_meta_path( $e->url ); - - $jsonData = json_encode( $e, JSON_PRETTY_PRINT ); - if ( json_last_error() !== JSON_ERROR_NONE ) { - throw new Exception( json_last_error_msg() ); - } - $this->fs->put_contents( $meta_path, $jsonData ); - } - - public function invalidate( string $url ): void { - $meta_path = $this->get_meta_path( $url ); - $body_path = $this->get_body_path( $url ); - try { - $this->fs->rm( $meta_path ); - $this->fs->rm( $body_path ); - } catch ( FilesystemException $e ) { - // Ignore - } - // Also remove from temporary tracking if invalidate is called mid-stream - unset( $this->body_paths[ $url ] ); - } - - public function tmp_path( string $url ): string { - $body_path = $this->get_body_path( $url ); - if ( ! $this->fs->exists( 'tmp' ) ) { - $this->fs->mkdir( 'tmp' ); - } - return $this->fs->get_meta()['root'] . '/tmp/' . $body_path; - } +use Exception; + +/** + * Simple, process-safe cache that avoids the Filesystem abstraction and + * relies on atomic renames. All writes go to .partial first, then + * get renamed into place. Every critical section is wrapped in flock(). + */ +final class FilesystemCache +{ + /** @var string */ + private string $dir; + + /** @var array */ + private array $partials = []; + + public function __construct(string $dir) + { + $this->dir = rtrim($dir, '/'); + if (!is_dir($this->dir) && !mkdir($this->dir, 0755, true)) { + throw new RuntimeException("Cannot create cache dir {$this->dir}"); + } + } + + private function key(string $url): string + { + return hash('sha256', $url); + } + + public function get_body_path(string $url): string + { + return "{$this->dir}/{$this->key($url)}.bin"; + } + + private function get_meta_path(string $url): string + { + return "{$this->dir}/{$this->key($url)}.json"; + } + + /** @return resource */ + public function open_body_write_stream(string $url) + { + $partial = $this->get_body_path($url) . '.partial'; + $h = fopen($partial, 'wb'); + if (!$h) { + throw new RuntimeException("Cannot open {$partial} for writing"); + } + if (!flock($h, LOCK_EX)) { + fclose($h); + throw new RuntimeException("Cannot get exclusive lock on {$partial}"); + } + $this->partials[$url] = $partial; + return $h; // caller must fclose(); promotion happens in store() + } + + public function commit(Response $response): void + { + $e = CacheEntry::from_response( $response ); + + /* promote body if it was streamed */ + if (isset($this->partials[$e->url])) { + $partial = $this->partials[$e->url]; + $final = $this->get_body_path($e->url); + fclose(fopen($partial, 'rb+')); // flush & unlock + rename($partial, $final); // atomic within fs + chmod($final, 0644); + unset($this->partials[$e->url]); + } + + $json = json_encode($e, JSON_PRETTY_PRINT); + if (json_last_error() !== JSON_ERROR_NONE) { + throw new Exception(json_last_error_msg()); + } + + $meta = $this->get_meta_path($e->url); + $tmp = $meta . '.partial'; + $h = fopen($tmp, 'wb'); + if (!$h) { + throw new RuntimeException("Cannot write {$tmp}"); + } + flock($h, LOCK_EX); + fwrite($h, $json); + fflush($h); + if (function_exists('fsync')) { + fsync($h); + } + flock($h, LOCK_UN); + fclose($h); + rename($tmp, $meta); + chmod($meta, 0644); + } + + public function lookup(string $url): ?CacheEntry + { + $meta = $this->get_meta_path($url); + $body = $this->get_body_path($url); + + if (!is_file($meta) || !is_file($body)) { + $this->invalidate($url); + return null; + } + + $h = fopen($meta, 'rb'); + if (!$h) { + $this->invalidate($url); + return null; + } + flock($h, LOCK_SH); + $json = stream_get_contents($h); + flock($h, LOCK_UN); + fclose($h); + + $data = json_decode($json, true); + if (!is_array($data)) { + $this->invalidate($url); + return null; + } + + $entry = new CacheEntry(); + foreach ($data as $k => $v) { + if ($k === 'body_path') { // legacy field + continue; + } + $entry->$k = $v; + } + + if ($entry->url !== $url) { // hash collision guard + $this->invalidate($url); + return null; + } + + return $entry; + } + + public function get_body(CacheEntry $e): string + { + $body = $this->get_body_path($e->url); + if (!is_file($body)) { + $this->invalidate($e->url); + throw new RuntimeException("Cache body missing for {$e->url}"); + } + + $h = fopen($body, 'rb'); + if (!$h) { + throw new RuntimeException("Cannot open body for {$e->url}"); + } + flock($h, LOCK_SH); + $data = stream_get_contents($h); + flock($h, LOCK_UN); + fclose($h); + + return $data; + } + + public function invalidate(string $url): void + { + @unlink($this->get_meta_path($url)); + @unlink($this->get_body_path($url)); + @unlink($this->get_body_path($url) . '.partial'); + unset($this->partials[$url]); + } } diff --git a/components/HttpClient/Request.php b/components/HttpClient/Request.php index 86d979b7..4453550a 100644 --- a/components/HttpClient/Request.php +++ b/components/HttpClient/Request.php @@ -33,6 +33,7 @@ class Request { public $upload_body_stream; public $redirected_from; public $redirected_to; + public $client_can_follow_redirects = true; /** * @var HttpError