 from typing import TYPE_CHECKING, Any, Callable, Generic

 from pydantic import ValidationError
-from typing_extensions import NotRequired, TypeVar
+from typing_extensions import NotRequired, TypedDict, TypeVar

 from crawlee import EnqueueStrategy, RequestTransformAction
 from crawlee._request import Request, RequestOptions
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
 from crawlee.errors import SessionError
 from crawlee.http_clients import HttpxHttpClient
+from crawlee.statistics import StatisticsState

 from ._http_crawling_context import HttpCrawlingContext, ParsedHttpCrawlingContext, TParseResult

 from ._abstract_http_parser import AbstractHttpParser

 TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext)
+TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)


-@docs_group('Data structures')
-class HttpCrawlerOptions(Generic[TCrawlingContext], BasicCrawlerOptions[TCrawlingContext]):
-    """Arguments for the `AbstractHttpCrawler` constructor.
-
-    It is intended for typing forwarded `__init__` arguments in the subclasses.
-    """
-
+class _HttpCrawlerAdditionalOptions(TypedDict):
     additional_http_error_status_codes: NotRequired[Iterable[int]]
     """Additional HTTP status codes to treat as errors, triggering automatic retries when encountered."""

     ignore_http_error_status_codes: NotRequired[Iterable[int]]
     """HTTP status codes that are typically considered errors but should be treated as successful responses."""


+@docs_group('Data structures')
+class HttpCrawlerOptions(
+    Generic[TCrawlingContext, TStatisticsState],
+    _HttpCrawlerAdditionalOptions,
+    BasicCrawlerOptions[TCrawlingContext, StatisticsState],
+):
+    """Arguments for the `AbstractHttpCrawler` constructor.
+
+    It is intended for typing forwarded `__init__` arguments in the subclasses.
+    """


 @docs_group('Abstract classes')
-class AbstractHttpCrawler(Generic[TCrawlingContext, TParseResult], BasicCrawler[TCrawlingContext], ABC):
+class AbstractHttpCrawler(
+    Generic[TCrawlingContext, TParseResult], BasicCrawler[TCrawlingContext, StatisticsState], ABC
+):
     """A web crawler for performing HTTP requests.

     The `AbstractHttpCrawler` builds on top of the `BasicCrawler`, inheriting all its features. Additionally,
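
This hunk moves the two HTTP-specific options into the `_HttpCrawlerAdditionalOptions` TypedDict and mixes it back into the now generic `HttpCrawlerOptions`, which exists purely to type forwarded `__init__` kwargs. A minimal sketch of that consumption pattern, mirroring the `_ParsedHttpCrawler` factory added further down in this commit; `MyStringCrawler` is a hypothetical name, and the imports assume the public re-exports in `crawlee.crawlers`:

from typing_extensions import Unpack

from crawlee.crawlers import (
    AbstractHttpCrawler,
    AbstractHttpParser,
    HttpCrawlerOptions,
    ParsedHttpCrawlingContext,
)


class MyStringCrawler(AbstractHttpCrawler[ParsedHttpCrawlingContext[str], str]):
    """Hypothetical crawler whose parser produces plain strings."""

    def __init__(
        self,
        parser: AbstractHttpParser[str],
        **kwargs: Unpack[HttpCrawlerOptions[ParsedHttpCrawlingContext[str]]],
    ) -> None:
        # Thanks to the TypedDict mix-in, a type checker accepts both the
        # HTTP-specific keys (additional_http_error_status_codes, ...) and
        # every BasicCrawlerOptions key in the forwarded kwargs.
        kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline()
        super().__init__(parser=parser, **kwargs)
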
@@ -65,7 +75,7 @@ def __init__(
         parser: AbstractHttpParser[TParseResult],
         additional_http_error_status_codes: Iterable[int] = (),
         ignore_http_error_status_codes: Iterable[int] = (),
-        **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext]],
+        **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
     ) -> None:
         self._parser = parser
         self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
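
The constructor keeps `additional_http_error_status_codes` and `ignore_http_error_status_codes` as explicit keyword arguments, so every concrete subclass exposes them directly. A hedged usage sketch with the stock `HttpCrawler` (assuming its re-export from `crawlee.crawlers`; the target URL is arbitrary):

import asyncio

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    crawler = HttpCrawler(
        # Retry 403s as if they were transient server errors.
        additional_http_error_status_codes=[403],
        # Treat 404s as successful responses instead of failures.
        ignore_http_error_status_codes=[404],
    )

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Fetched {context.request.url}')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
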
@@ -87,6 +97,32 @@ def __init__(
         kwargs.setdefault('_logger', logging.getLogger(__name__))
         super().__init__(**kwargs)

+    @classmethod
+    def create_parsed_http_crawler_class(
+        cls,
+        static_parser: AbstractHttpParser[TParseResult],
+    ) -> type[AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult]]:
+ """Convenience class factory that creates specific version of `AbstractHttpCrawler` class.
106
+
107
+ In general typing sense two generic types of `AbstractHttpCrawler` do not have to be dependent on each other.
108
+ This is convenience constructor for specific cases when `TParseResult` is used to specify both generic
109
+ parameters in `AbstractHttpCrawler`.
110
+ """
111
+
+        class _ParsedHttpCrawler(AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult]):
+            def __init__(
+                self,
+                parser: AbstractHttpParser[TParseResult] = static_parser,
+                **kwargs: Unpack[HttpCrawlerOptions[ParsedHttpCrawlingContext[TParseResult]]],
+            ) -> None:
+                kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline()
+                super().__init__(
+                    parser=parser,
+                    **kwargs,
+                )
+
+        return _ParsedHttpCrawler
+
     def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpCrawlingContext[TParseResult]]:
         """Create static content crawler context pipeline with expected pipeline steps."""
         return (
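
To see what the new factory buys, here is a hedged sketch that derives a ready-made crawler class from a custom parser. `PlainTextParser` is hypothetical, and the `AbstractHttpParser` method signatures are written from the interface as of this commit, so treat them as assumptions:

from __future__ import annotations

from typing import TYPE_CHECKING

from crawlee.crawlers import AbstractHttpCrawler, AbstractHttpParser

if TYPE_CHECKING:
    from collections.abc import Iterable

    from crawlee.http_clients import HttpResponse


class PlainTextParser(AbstractHttpParser[str]):
    """Hypothetical parser whose 'parse result' is the decoded response body."""

    async def parse(self, response: HttpResponse) -> str:
        # read() is synchronous here; newer crawlee releases may require awaiting it.
        return response.read().decode(errors='replace')

    def is_matching_selector(self, parsed_content: str, selector: str) -> bool:
        return selector in parsed_content

    def find_links(self, parsed_content: str, selector: str) -> Iterable[str]:
        return []  # Link extraction is out of scope for this sketch.


# One call yields a concrete, instantiable crawler class whose crawling context
# and parse result are both specialized to `str`, and whose kwargs are typed
# via HttpCrawlerOptions.
PlainTextCrawler = AbstractHttpCrawler.create_parsed_http_crawler_class(static_parser=PlainTextParser())
crawler = PlainTextCrawler(ignore_http_error_status_codes=[404])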