Skip to content

Commit

Permalink
name contexts differently
Browse files Browse the repository at this point in the history
honzajavorek committed Apr 17, 2024
1 parent 2d23156 commit 98e22fa
Showing 2 changed files with 6 additions and 9 deletions.
8 changes: 1 addition & 7 deletions jg/plucker/jobs_linkedin/spider.py
Original file line number Diff line number Diff line change
@@ -167,18 +167,12 @@ def verify_job(
def _retry(self, url: str, request: Request | None = None) -> Request:
    """Build a copy of ``request`` that retries ``url`` using the browser.

    The returned request is a ``request.replace(...)`` copy pointing at
    ``url``, sent with ``self.request_headers``, and carrying the original
    meta plus ``playwright=True``.

    Args:
        url: The URL to retry.
        request: The request being retried. Optional in the signature,
            but required in practice.

    Returns:
        The replacement request to schedule.

    Raises:
        ValueError: If ``request`` is missing (falsy).
    """
    if not request:
        raise ValueError(f"Request object is required to retry {url}")
    self.logger.warning(f"Retrying {url} using browser")

    # TODO proxy support
    # see https://docs.apify.com/sdk/python/docs/concepts/proxy-management#configuring-proxy-based-on-actor-input
    # see https://github.com/scrapy-plugins/scrapy-playwright?tab=readme-ov-file#proxy-support
    # see https://docs.scrapy.org/en/latest/topics/spiders.html#scrapy.Spider.update_settings
    return request.replace(
        url=url,
        # The same URL is being requested again on purpose; presumably this
        # keeps Scrapy's duplicate filter from dropping it — see Scrapy docs.
        dont_filter=True,
        headers=self.request_headers,
        # ``|`` builds a new mapping, so the original request's meta is left
        # untouched (assuming ``request.meta`` is a plain dict).
        meta=request.meta | dict(playwright=True),
    )

def _request(
7 changes: 5 additions & 2 deletions jg/plucker/scrapers.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import hashlib
from pathlib import Path
from typing import Any, Generator, Self, Type
from urllib.parse import urlparse
@@ -141,10 +142,12 @@ async def process_request(self, request: Request, spider: Spider):
)

proxy = url.geturl()
Actor.log.info(f"Creating a new Playwright context with proxy {proxy}")
proxy_hash = hashlib.sha1(proxy.encode()).hexdigest()[0:8]
context_name = f"proxy_{proxy_hash}"
Actor.log.info(f"Using Playwright context {context_name}")
request.meta.update(
{
"playwright_context": f"proxy_{urlparse(request.url).hostname}",
"playwright_context": f"proxy_{context_name}",
"playwright_context_kwargs": {
"proxy": {
"server": proxy,

0 comments on commit 98e22fa

Please sign in to comment.