Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion src/scrapingbee_cli/crawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,8 @@ def _params_for_discovery(params: dict[str, Any]) -> dict[str, Any]:
def _preferred_extension_from_scrape_params(params: dict[str, Any]) -> str | None:
"""Return extension when scrape params force a response type (skip detection).
Priority: screenshot+json_response -> json; screenshot -> png;
return_page_markdown -> md; return_page_text -> txt; json_response -> json.
return_page_markdown -> md; return_page_text -> txt;
json_response / extract_rules / ai_extract_rules / ai_query -> json.
"""
if _param_truthy(params, "screenshot") and _param_truthy(params, "json_response"):
return "json"
Expand All @@ -102,6 +103,11 @@ def _preferred_extension_from_scrape_params(params: dict[str, Any]) -> str | Non
return "txt"
if _param_truthy(params, "json_response"):
return "json"
# extract_rules, ai_extract_rules, and ai_query always return JSON, regardless of the URL.
# Without this check, a URL ending in .html would be saved as .html despite its JSON body
# (the URL-path heuristic in extension_for_crawl wins before the body is sniffed).
if params.get("extract_rules") or params.get("ai_extract_rules") or params.get("ai_query"):
return "json"
return None


Expand Down
55 changes: 55 additions & 0 deletions tests/unit/test_crawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,24 @@ def test_return_text(self):
def test_json_response_only(self):
    """json_response on its own selects the json extension."""
    params = {"json_response": True}
    extension = _preferred_extension_from_scrape_params(params)
    assert extension == "json"

def test_extract_rules(self):
    """A non-empty extract_rules value selects the json extension."""
    params = {"extract_rules": '{"title": "h1"}'}
    extension = _preferred_extension_from_scrape_params(params)
    assert extension == "json"

def test_ai_extract_rules(self):
    """A non-empty ai_extract_rules value selects the json extension."""
    params = {"ai_extract_rules": '{"title": "h1"}'}
    extension = _preferred_extension_from_scrape_params(params)
    assert extension == "json"

def test_ai_query(self):
    """A non-empty ai_query value selects the json extension."""
    params = {"ai_query": "What is the price?"}
    extension = _preferred_extension_from_scrape_params(params)
    assert extension == "json"

def test_ai_selector_alone_returns_none(self):
    """ai_selector by itself must not force an extension."""
    # ai_selector only modifies ai_query/ai_extract_rules; alone it does not produce JSON.
    params = {"ai_selector": "h1"}
    extension = _preferred_extension_from_scrape_params(params)
    assert extension is None

def test_none_when_no_match(self):
    """An empty params dict forces no extension."""
    extension = _preferred_extension_from_scrape_params({})
    assert extension is None

Expand Down Expand Up @@ -334,6 +352,43 @@ def test_save_response_manifest_has_required_fields(self, tmp_path):
for field in ("file", "fetched_at", "http_status", "credits_used", "latency_ms"):
assert field in entry, f"Missing field {field!r}"

def test_save_response_extract_rules_writes_json_for_html_url(self, tmp_path):
    """SCR-371: a JSON body fetched with --extract-rules is saved as .json.

    The page URL ends in .html, so this guards against the URL-path
    heuristic choosing the extension instead of the scrape params.
    """
    from scrapingbee_cli.crawl import GenericScrapingBeeSpider

    url = "https://books.toscrape.com/catalogue/libertarianism-for-beginners_982/index.html"
    body = b'{"title": "Libertarianism for Beginners", "price": "\\u00a351.33"}'

    spider = GenericScrapingBeeSpider(
        start_urls=["https://books.toscrape.com/"],
        scrape_params={"extract_rules": '{"title": "h1", "price": ".price_color"}'},
        output_dir=str(tmp_path),
    )
    spider._save_response(self._make_response(url, body))

    json_path = tmp_path / "1.json"
    html_path = tmp_path / "1.html"
    assert json_path.exists(), "Expected 1.json (JSON body), not .html"
    assert not html_path.exists(), "Must not save JSON body as .html"
    # The URL-to-file map must record the .json name as well.
    assert spider._url_file_map[url]["file"] == "1.json"

def test_save_response_ai_query_writes_json_for_html_url(self, tmp_path):
    """SCR-371: --ai-query forces the .json extension even for a .html URL path."""
    from scrapingbee_cli.crawl import GenericScrapingBeeSpider

    page_url = "https://example.com/products/widget.html"
    body = b'{"answer": "$9.99"}'

    spider = GenericScrapingBeeSpider(
        start_urls=["https://example.com/"],
        scrape_params={"ai_query": "What is the price?"},
        output_dir=str(tmp_path),
    )
    spider._save_response(self._make_response(page_url, body))

    json_path = tmp_path / "1.json"
    html_path = tmp_path / "1.html"
    assert json_path.exists()
    assert not html_path.exists()


class TestRequiresDiscoveryPhase:
"""Tests for _requires_discovery_phase()."""
Expand Down
Loading