backend: Web scrape tool -> Remove co reader implementation and use aiohttp GET (#867)

* wip

* wip

* wip

* Use aiohttp request for web scrape. add timeout. add title
tianjing-li authored Dec 4, 2024
1 parent 81a6f32 commit c536bd6
Showing 2 changed files with 46 additions and 48 deletions.
4 changes: 4 additions & 0 deletions src/backend/tools/constants.py
@@ -0,0 +1,4 @@
+from aiohttp import ClientTimeout
+
+TIMEOUT_SECONDS = 15
+ASYNC_TIMEOUT = ClientTimeout(total=TIMEOUT_SECONDS)
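For reference, here is a minimal sketch (not part of the commit) of how a ClientTimeout configured this way is typically passed to an aiohttp session, so every request made through that session shares the same total time budget; the fetch helper and example URL are hypothetical:

import asyncio

import aiohttp
from aiohttp import ClientTimeout

TIMEOUT_SECONDS = 15
ASYNC_TIMEOUT = ClientTimeout(total=TIMEOUT_SECONDS)  # total wall-clock budget for a request


async def fetch(url: str) -> str:
    # Passing the timeout at session construction applies it to every request on the session.
    async with aiohttp.ClientSession(timeout=ASYNC_TIMEOUT) as session:
        async with session.get(url) as response:
            return await response.text()


if __name__ == "__main__":
    # Hypothetical URL, purely for illustration.
    print(asyncio.run(fetch("https://example.com"))[:200])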
90 changes: 42 additions & 48 deletions src/backend/tools/web_scrape.py
@@ -1,20 +1,19 @@
-import json
-from typing import Any, ClassVar, Dict, List
+from typing import Any, Dict, List
 
 import aiohttp
-from langchain_text_splitters import MarkdownHeaderTextSplitter
+from bs4 import BeautifulSoup
 
 from backend.schemas.tool import ToolCategory, ToolDefinition
 from backend.services.logger.utils import LoggerFactory
+from backend.services.utils import read_pdf
 from backend.tools.base import BaseTool
+from backend.tools.constants import ASYNC_TIMEOUT
 
 logger = LoggerFactory().get_logger()
 
 
 class WebScrapeTool(BaseTool):
     ID = "web_scrape"
-    ENDPOINT: ClassVar[str] = "http://co-reader"
-    ENABLE_CHUNKING: ClassVar[bool] = True
 
     @classmethod
     def is_available(cls) -> bool:
@@ -50,18 +49,9 @@ async def call(
     ) -> List[Dict[str, Any]]:
         url = parameters.get("url")
 
-        headers = {
-            "X-Respond-With": "markdown",
-            "x-no-cache": "true",
-            "Content-Type": "application/json",
-        }
-        data = {"url": url}
-
-        async with aiohttp.ClientSession() as session:
+        async with aiohttp.ClientSession(timeout=ASYNC_TIMEOUT) as session:
             try:
-                async with session.post(
-                    self.ENDPOINT, data=json.dumps(data), headers=headers
-                ) as response:
+                async with session.get(url) as response:
                     if response.status != 200:
                         error_message = f"HTTP {response.status} {response.reason}"
                         return [
@@ -71,39 +61,43 @@
                             }
                         ]
 
-                    content = await response.text()
-                    return self.parse_content(content, url, self.ENABLE_CHUNKING)
+                    return await self.handle_response(response, url)
 
             except aiohttp.ClientError as e:
-                return [
-                    {
-                        "text": f"Request failed: {str(e)}",
-                        "url": url,
-                    }
-                ]
+                return {
+                    "text": f"Client error using web scrape: {str(e)}",
+                    "url": url,
+                }
+            except Exception as e:
+                return {
+                    "text": f"Request failed using web scrape: {str(e)}",
+                    "url": url,
+                }
 
-    def parse_content(
-        self, content: str, url: str, enable_chunking: bool
-    ) -> list[dict]:
-        if enable_chunking:
-            splitter = MarkdownHeaderTextSplitter(
-                headers_to_split_on=[
-                    ("#", "Header 1"),
-                    ("##", "Header 2"),
-                    ("###", "Header 3"),
-                    ("####", "Header 4"),
-                    ("#####", "Header 5"),
-                ],
-                strip_headers=False,
-            )
-            docs = splitter.split_text(content)
-            return [
-                {"text": doc.page_content, "url": url, **doc.metadata} for doc in docs
-            ]
-
-        return [
-            {
-                "text": content,
-                "url": url,
-            }
-        ]
+    async def handle_response(self, response: aiohttp.ClientResponse, url: str):
+        content_type = response.headers.get("content-type")
+
+        # If URL is a PDF, read contents using helper function
+        if "application/pdf" in content_type:
+            return {
+                "text": read_pdf(response.content),
+                "url": url,
+            }
+        elif "text/html" in content_type:
+            content = await response.text()
+            soup = BeautifulSoup(content, "html.parser")
+
+            text = soup.get_text().replace("\n", "")
+            title = next((tag.text for tag in soup.find_all('h1')), None)
+
+            data = {
+                "text": text,
+                "url": url,
+            }
+
+            if title:
+                data["title"] = title
+
+            return data
+        else:
+            raise ValueError(f"Unsupported Content Type using web scrape: {content_type}")
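As a usage sketch (not code from this commit), the new flow can be approximated end to end with aiohttp plus BeautifulSoup: GET the page with a session-level timeout, branch on content type, strip the HTML to text, and use the first h1 as the title. The PDF branch is omitted here because read_pdf is a repository helper, and the example URL is hypothetical:

import asyncio

import aiohttp
from bs4 import BeautifulSoup

ASYNC_TIMEOUT = aiohttp.ClientTimeout(total=15)


async def scrape(url: str) -> dict:
    # GET the page directly instead of proxying through the old co-reader service.
    async with aiohttp.ClientSession(timeout=ASYNC_TIMEOUT) as session:
        async with session.get(url) as response:
            if response.status != 200:
                return {"text": f"HTTP {response.status} {response.reason}", "url": url}

            content_type = response.headers.get("content-type", "")
            if "text/html" in content_type:
                soup = BeautifulSoup(await response.text(), "html.parser")
                data = {"text": soup.get_text().replace("\n", ""), "url": url}
                # The first <h1>, when present, is used as the document title.
                title = next((tag.text for tag in soup.find_all("h1")), None)
                if title:
                    data["title"] = title
                return data
            raise ValueError(f"Unsupported Content Type: {content_type}")


if __name__ == "__main__":
    # Hypothetical URL, purely for illustration.
    print(asyncio.run(scrape("https://example.com")))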
