ScrapingBee · kostas-jakeliunas-sb · Apr 17, 2026 · Apr 17, 2026 · Apr 17, 2026
diff --git a/src/scrapingbee_cli/crawl.py b/src/scrapingbee_cli/crawl.py
@@ -90,7 +90,8 @@ def _params_for_discovery(params: dict[str, Any]) -> dict[str, Any]:
 def _preferred_extension_from_scrape_params(params: dict[str, Any]) -> str | None:
     """Return extension when scrape params force a response type (skip detection).
     Priority: screenshot+json_response -> json; screenshot -> png;
-    return_page_markdown -> md; return_page_text -> txt; json_response -> json.
+    return_page_markdown -> md; return_page_text -> txt;
+    json_response / extract_rules / ai_extract_rules / ai_query -> json.
     """
     if _param_truthy(params, "screenshot") and _param_truthy(params, "json_response"):
         return "json"
@@ -102,6 +103,11 @@ def _preferred_extension_from_scrape_params(params: dict[str, Any]) -> str | Non
         return "txt"
     if _param_truthy(params, "json_response"):
         return "json"
+    # extract_rules, ai_extract_rules, ai_query always return JSON regardless of URL.
+    # Without this, URLs ending in .html would be saved as .html despite JSON body
+    # (the URL-path heuristic in extension_for_crawl wins before body sniff).
+    if params.get("extract_rules") or params.get("ai_extract_rules") or params.get("ai_query"):
+        return "json"
     return None
 
 

diff --git a/tests/unit/test_crawl.py b/tests/unit/test_crawl.py
@@ -104,6 +104,24 @@ def test_return_text(self):
     def test_json_response_only(self):
         assert _preferred_extension_from_scrape_params({"json_response": True}) == "json"
 
+    def test_extract_rules(self):
+        """extract_rules alone selects the json extension."""
+        params = {"extract_rules": '{"title": "h1"}'}
+        assert _preferred_extension_from_scrape_params(params) == "json"
+
+    def test_ai_extract_rules(self):
+        """ai_extract_rules alone selects the json extension."""
+        params = {"ai_extract_rules": '{"title": "h1"}'}
+        chosen = _preferred_extension_from_scrape_params(params)
+        assert chosen == "json"
+
+    def test_ai_query(self):  # ai_query on its own must map to the "json" extension
+        assert _preferred_extension_from_scrape_params({"ai_query": "What is the price?"}) == "json"
+
+    def test_ai_selector_alone_returns_none(self):
+        # ai_selector only modifies ai_query/ai_extract_rules; alone it forces no extension.
+        assert _preferred_extension_from_scrape_params({"ai_selector": "h1"}) is None
+
     def test_none_when_no_match(self):
         assert _preferred_extension_from_scrape_params({}) is None
 
@@ -334,6 +352,43 @@ def test_save_response_manifest_has_required_fields(self, tmp_path):
         for field in ("file", "fetched_at", "http_status", "credits_used", "latency_ms"):
             assert field in entry, f"Missing field {field!r}"
 
+    def test_save_response_extract_rules_writes_json_for_html_url(self, tmp_path):
+        """SCR-371: with --extract-rules the JSON body is stored as .json even
+        though the URL path ends with .html (the URL heuristic must not win)."""
+        from scrapingbee_cli.crawl import GenericScrapingBeeSpider
+
+        url = "https://books.toscrape.com/catalogue/libertarianism-for-beginners_982/index.html"
+        body = b'{"title": "Libertarianism for Beginners", "price": "\\u00a351.33"}'
+        rules = '{"title": "h1", "price": ".price_color"}'
+        spider = GenericScrapingBeeSpider(
+            start_urls=["https://books.toscrape.com/"],
+            scrape_params={"extract_rules": rules},
+            output_dir=str(tmp_path),
+        )
+        spider._save_response(self._make_response(url, body))
+        json_path = tmp_path / "1.json"
+        html_path = tmp_path / "1.html"
+        assert json_path.exists(), "Expected 1.json (JSON body), not .html"
+        assert not html_path.exists(), "Must not save JSON body as .html"
+        assert spider._url_file_map[url]["file"] == "1.json"
+
+    def test_save_response_ai_query_writes_json_for_html_url(self, tmp_path):
+        """SCR-371: an --ai-query response is saved as .json even for a .html URL path."""
+        from scrapingbee_cli.crawl import GenericScrapingBeeSpider
+
+        page_url = "https://example.com/products/widget.html"
+        answer_body = b'{"answer": "$9.99"}'
+        spider = GenericScrapingBeeSpider(
+            start_urls=["https://example.com/"],
+            scrape_params={"ai_query": "What is the price?"},
+            output_dir=str(tmp_path),
+        )
+        spider._save_response(self._make_response(page_url, answer_body))
+        saved_json = tmp_path / "1.json"
+        saved_html = tmp_path / "1.html"
+        assert saved_json.exists()
+        assert not saved_html.exists()
+
 
 class TestRequiresDiscoveryPhase:
     """Tests for _requires_discovery_phase()."""