From cc419f41d02f992de424a523a1a020539960ba14 Mon Sep 17 00:00:00 2001 From: Jean Kaddour Date: Wed, 12 Feb 2025 19:12:41 +0000 Subject: [PATCH] refactor: firecrawl scrape outputs --- .../integrations/firecrawl/firecrawl_scrape.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/backend/app/nodes/integrations/firecrawl/firecrawl_scrape.py b/backend/app/nodes/integrations/firecrawl/firecrawl_scrape.py index 3d408953..7e81b497 100644 --- a/backend/app/nodes/integrations/firecrawl/firecrawl_scrape.py +++ b/backend/app/nodes/integrations/firecrawl/firecrawl_scrape.py @@ -15,8 +15,8 @@ class Config: class FirecrawlScrapeNodeOutput(BaseNodeOutput): - scrape_result: str = Field( - ..., description="The scraped data in markdown or structured format." + markdown: str = Field( + ..., description="The scraped data in markdown format." ) @@ -26,7 +26,7 @@ class FirecrawlScrapeNodeConfig(BaseNodeConfig): description="The URL to scrape and convert into clean markdown or structured data.", ) output_schema: Dict[str, str] = Field( - default={"scrape_result": "string"}, + default={"markdown": "string"}, description="The schema for the output of the node", ) has_fixed_output: bool = True @@ -59,17 +59,14 @@ async def run(self, input: BaseModel) -> BaseModel: self.config.url_template, raw_input_dict, self.name ) - if not os.getenv("FIRECRAWL_API_KEY"): - raise ValueError("FIRECRAWL_API_KEY environment variable is not set") - app = FirecrawlApp() # type: ignore scrape_result = app.scrape_url( # type: ignore url_template, params={ - "formats": ["markdown", "html"], + "formats": ["markdown"], }, ) - return FirecrawlScrapeNodeOutput(scrape_result=json.dumps(scrape_result)) + return FirecrawlScrapeNodeOutput(markdown=scrape_result["markdown"]) except Exception as e: logging.error(f"Failed to scrape URL: {e}") - return FirecrawlScrapeNodeOutput(scrape_result="") + return FirecrawlScrapeNodeOutput(markdown="")