Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .github/workflows/build_deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ jobs:
uv run pytest --ignore=tests/extras
- name: Smoke Test Extras - Edgar and HTTP2
run: |
uv pip install edgartools h2
uv run pytest tests/extras/test_edgartools.py tests/extras/test_http2.py
- name: Upload package to PyPI
if: startsWith(github.ref, 'refs/tags/v')
Expand Down
76 changes: 1 addition & 75 deletions httpxthrottlecache/controller.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
import logging
import re
from typing import Any, Callable, Optional, Union

import hishel
import httpcore
from typing import Optional, Union

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -39,74 +36,3 @@ def get_rule_for_request(

return None


def get_cache_controller(
    key_generator: Callable[[httpcore.Request, Optional[bytes]], str],
    cache_rules: dict[str, dict[str, Union[bool, int]]],
    **kwargs: dict[str, Any],
) -> hishel.Controller:
    """Build a hishel cache controller that applies per-host/per-target ``cache_rules``.

    The returned controller caches GET and POST responses with status 200 only,
    consults ``cache_rules`` (via ``get_rule_for_request``) before falling back to
    hishel's default HTTP caching policy, and uses ``key_generator`` for cache keys.

    A rule value of ``True`` means "cache forever", ``False``/``0`` means "never
    cache", a positive int is a max-age in seconds, and no matching rule defers to
    the default policy.  NOTE(review): the int-seconds interpretation is inferred
    from the ``max_age`` comparison below — confirm against ``get_rule_for_request``.

    :param key_generator: produces the cache key for a request (passed to hishel).
    :param cache_rules: mapping of host patterns to per-target cache directives.
    :param kwargs: forwarded verbatim to the ``hishel.Controller`` constructor.
    :return: a configured ``hishel.Controller`` subclass instance.
    """

    class EdgarController(hishel.Controller):
        def is_cachable(self, request: httpcore.Request, response: httpcore.Response) -> bool:
            """Decide whether this response may be stored in the cache."""
            # Only the status codes the controller was built with (200) are storable.
            if response.status not in self._cacheable_status_codes:
                return False

            cache_period = get_rule_for_request(
                request_host=request.url.host.decode(), target=request.url.target.decode(), cache_rules=cache_rules
            )

            # Branch order matters: ``True`` and any positive int are truthy, so the
            # first test covers both; the explicit False/0 test must come before the
            # ``else`` because ``None`` (no rule) must fall through to the default.
            if cache_period:  # True or an Int>0
                return True
            elif cache_period is False or cache_period == 0:  # Explicitly not cacheable
                return False
            else:
                # Fall through default caching policy
                super_is_cachable = super().is_cachable(request, response)
                logger.debug("%s is cacheable %s", request.url, super_is_cachable)
                return super_is_cachable

        def construct_response_from_cache(
            self, request: httpcore.Request, response: httpcore.Response, original_request: httpcore.Request
        ) -> Union[httpcore.Request, httpcore.Response, None]:
            """Decide how a cached entry is used on a later request.

            Returns the cached ``response`` (usable as-is), a conditional ``request``
            (revalidate with the origin), or ``None`` (ignore the cached entry).
            """
            if (
                response.status not in self._cacheable_status_codes
            ):  # pragma: no cover - would only occur if the cache was loaded then rules changed
                return None

            cache_period = get_rule_for_request(
                request_host=request.url.host.decode(), target=request.url.target.decode(), cache_rules=cache_rules
            )

            # ``is True`` / ``is False`` identity checks distinguish bool rule values
            # from int TTLs — ``True == 1`` in Python, so ``==`` would conflate them.
            if cache_period is True:
                # Cache forever, never recheck
                logger.debug("Cache hit for %s", request.url)
                return response
            elif (
                cache_period is False or cache_period == 0
            ):  # pragma: no cover - would only occur if the cache was loaded then rules changed
                return None
            elif cache_period:  # int
                max_age = cache_period

                # Private hishel API: computes the entry's age from response headers
                # and the controller clock.  May break across hishel versions.
                age_seconds = hishel._controller.get_age(response, self._clock)  # pyright: ignore[reportPrivateUsage]

                if age_seconds > max_age:
                    # Entry is stale per our rule: turn the outgoing request into a
                    # conditional one (If-None-Match / If-Modified-Since) and send it.
                    logger.debug(
                        "Request needs to be validated before using %s (age=%d, max_age=%d)",
                        request.url,
                        age_seconds,
                        max_age,
                    )
                    self._make_request_conditional(request=request, response=response)
                    return request
                else:
                    logger.debug("Cache hit for %s (age=%d, max_age=%d)", request.url, age_seconds, max_age)
                    return response
            else:
                # cache_period is None: no rule matched, defer to hishel's default.
                logger.debug("No rules applied to %s, using default", request.url)
                return super().construct_response_from_cache(request, response, original_request)

    controller = EdgarController(
        cacheable_methods=["GET", "POST"], cacheable_status_codes=[200], key_generator=key_generator, **kwargs
    )

    return controller
52 changes: 9 additions & 43 deletions httpxthrottlecache/httpxclientmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,12 @@
from pathlib import Path
from typing import Any, AsyncGenerator, Callable, Generator, Literal, Mapping, Optional, Sequence, Union

import hishel
import httpx
from httpx._types import ProxyTypes
from pyrate_limiter import Duration, Limiter

from .controller import get_cache_controller
from .filecache.transport import CachingTransport
from .key_generator import file_key_generator
from .ratelimiter import AsyncRateLimitingTransport, RateLimitingTransport, create_rate_limiter
from .serializer import JSONByteSerializer

logger = logging.getLogger(__name__)

Expand All @@ -33,8 +29,6 @@ class HttpxThrottleCache:
Rate Limiting is across all connections, whether via client & async_http_client, using pyrate_limiter. For multiprocessing, use pyrate_limiter's
MultiprocessBucket or SqliteBucket w/ a file lock.

Caching is implemented via Hishel, which allows a variety of configurations, including AWS storage.

This function is used for all synchronous requests.
"""

Expand All @@ -44,14 +38,12 @@ class HttpxThrottleCache:

cache_rules: dict[str, dict[str, Union[bool, int]]] = field(default_factory=lambda: {})
rate_limiter_enabled: bool = True
cache_mode: Literal[False, "Disabled", "Hishel-S3", "Hishel-File", "FileCache"] = "Hishel-File"
cache_mode: Literal[False, "Disabled", "FileCache"] = "FileCache"
request_per_sec_limit: int = 10
max_delay: Duration = field(default_factory=lambda: Duration.DAY)
_client: Optional[httpx.Client] = None

rate_limiter: Optional[Limiter] = None
s3_bucket: Optional[str] = None
s3_client: Optional[Any] = None
user_agent: Optional[str] = None
user_agent_factory: Optional[Callable[[], str]] = None

Expand All @@ -62,6 +54,9 @@ class HttpxThrottleCache:
proxy: Optional[ProxyTypes] = None

def __post_init__(self):
if self.cache_mode == "Hishel-File":
logger.debug("Hishel-File is deprecated and will be removed, due to breaking API changes")
self.cache_mode = "FileCache"
self.cache_dir = Path(self.cache_dir) if isinstance(self.cache_dir, str) else self.cache_dir
# self.lock = threading.Lock()

Expand All @@ -75,15 +70,14 @@ def __post_init__(self):

if self.cache_mode == "Disabled" or self.cache_mode is False:
pass
elif self.cache_mode == "Hishel-S3":
if self.s3_bucket is None:
raise ValueError("s3_bucket must be provided if using Hishel-S3 storage")
else: # Hishel-File or FileCache
elif self.cache_mode == "FileCache":
if self.cache_dir is None:
raise ValueError(f"cache_dir must be provided if using a file based cache: {self.cache_mode}")
else:
if not self.cache_dir.exists():
self.cache_dir.mkdir()
else:
raise ValueError(f"Unsupported cache_mode: {self.cache_mode}")

logger.debug(
"Initialized cache with cache_mode=%s, cache_dir=%s, rate_limiter_enabled=%s",
Expand Down Expand Up @@ -248,21 +242,7 @@ def _get_transport(self, bypass_cache: bool, httpx_transport_params: dict[str, A
assert self.cache_dir is not None
return CachingTransport(cache_dir=self.cache_dir, transport=next_transport, cache_rules=self.cache_rules)
else:
# either Hishel-S3 or Hishel-File
assert self.cache_mode == "Hishel-File" or self.cache_mode == "Hishel-S3"
controller = get_cache_controller(key_generator=file_key_generator, cache_rules=self.cache_rules)

if self.cache_mode == "Hishel-S3":
assert self.s3_bucket is not None
storage = hishel.S3Storage(
client=self.s3_client, bucket_name=self.s3_bucket, serializer=JSONByteSerializer()
)
else:
assert self.cache_mode == "Hishel-File"
assert self.cache_dir is not None
storage = hishel.FileStorage(base_path=Path(self.cache_dir), serializer=JSONByteSerializer())

return hishel.CacheTransport(transport=next_transport, storage=storage, controller=controller)
raise ValueError(f"Unsupported cache_mode: {self.cache_mode}")

def _get_async_transport(
self, bypass_cache: bool, httpx_transport_params: dict[str, Any]
Expand All @@ -286,21 +266,7 @@ def _get_async_transport(
assert self.cache_dir is not None
return CachingTransport(cache_dir=self.cache_dir, transport=next_transport, cache_rules=self.cache_rules) # pyright: ignore[reportArgumentType]
else:
# either Hishel-S3 or Hishel-File
assert self.cache_mode == "Hishel-File" or self.cache_mode == "Hishel-S3"
controller = get_cache_controller(key_generator=file_key_generator, cache_rules=self.cache_rules)

if self.cache_mode == "Hishel-S3":
assert self.s3_bucket is not None
storage = hishel.AsyncS3Storage(
client=self.s3_client, bucket_name=self.s3_bucket, serializer=JSONByteSerializer()
)
else:
assert self.cache_mode == "Hishel-File"
assert self.cache_dir is not None
storage = hishel.AsyncFileStorage(base_path=Path(self.cache_dir), serializer=JSONByteSerializer())

return hishel.AsyncCacheTransport(transport=next_transport, storage=storage, controller=controller)
raise ValueError(f"Unsupported cache_mode: {self.cache_mode}")

def __enter__(self):
return self
Expand Down
22 changes: 0 additions & 22 deletions httpxthrottlecache/key_generator.py

This file was deleted.

125 changes: 0 additions & 125 deletions httpxthrottlecache/serializer.py

This file was deleted.

6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,11 @@ keywords = [
"sec",
"httpx",
"pyrate_limiter",
"hishel"
]
dependencies = [
"aiofiles>=24.1.0",
"filelock>=3.18.0",
"hishel>=0.1.3",
"httpx>=0.28.1",
"pyrate-limiter>=3.9.0",
]
classifiers = [
Expand All @@ -31,7 +30,8 @@ classifiers = [

[dependency-groups]
dev = [
"hishel[s3]>=0.1.3",
"edgartools>=4.34.1; platform_python_implementation == 'CPython'",
"h2>=4.3.0",
"pre-commit-uv>=4.1.4",
"pylint>=3.3.8",
"pytest>=8.4.1",
Expand Down
Loading
Loading