Skip to content

Commit 3a2d08f

Browse files
lc0rpsrvfalsecollijk
authored
Pass TestSearch benchmark consistently (Add browse_website TOKENS_TO_TRIGGER_SUMMARY) (Significant-Gravitas#5092)
* Added SUMMARIZATION_TRIGGER_LENGTH: browse_website won't summarize content that's shorter than SUMMARIZATION_TRIGGER_LENGTH. It defaults to 250 characters, which is approximately 50 tokens.
* Refactor BrowserOptions
* Use tokens instead of length to trigger summarization
* Bugfix
* fix: Always return links even if not summarizing
  feat: Increase the number of links returned from 5 to 20
---------
Co-authored-by: lc0rp <[email protected]>
Co-authored-by: James Collins <[email protected]>
1 parent a593c32 commit 3a2d08f

File tree

1 file changed

+23
-15
lines changed

1 file changed

+23
-15
lines changed

autogpt/commands/web_selenium.py

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,23 @@
22

33
from __future__ import annotations
44

5+
from autogpt.llm.utils.token_counter import count_string_tokens
6+
57
COMMAND_CATEGORY = "web_browse"
68
COMMAND_CATEGORY_TITLE = "Web Browsing"
79

810
import logging
911
from pathlib import Path
1012
from sys import platform
11-
from typing import Optional, Type
13+
from typing import Optional
1214

1315
from bs4 import BeautifulSoup
1416
from selenium.common.exceptions import WebDriverException
1517
from selenium.webdriver.chrome.options import Options as ChromeOptions
1618
from selenium.webdriver.chrome.service import Service as ChromeDriverService
1719
from selenium.webdriver.chrome.webdriver import WebDriver as ChromeDriver
1820
from selenium.webdriver.common.by import By
21+
from selenium.webdriver.common.options import ArgOptions as BrowserOptions
1922
from selenium.webdriver.edge.options import Options as EdgeOptions
2023
from selenium.webdriver.edge.service import Service as EdgeDriverService
2124
from selenium.webdriver.edge.webdriver import WebDriver as EdgeDriver
@@ -38,9 +41,9 @@
3841
from autogpt.processing.html import extract_hyperlinks, format_hyperlinks
3942
from autogpt.url_utils.validators import validate_url
4043

41-
BrowserOptions = ChromeOptions | EdgeOptions | FirefoxOptions | SafariOptions
42-
4344
FILE_DIR = Path(__file__).parent.parent
45+
TOKENS_TO_TRIGGER_SUMMARY = 50
46+
LINKS_TO_RETURN = 20
4447

4548

4649
@command(
@@ -64,25 +67,30 @@ def browse_website(url: str, question: str, agent: Agent) -> str:
6467
question (str): The question asked by the user
6568
6669
Returns:
67-
Tuple[str, WebDriver]: The answer and links to the user and the webdriver
70+
str: The answer and links to the user and the webdriver
6871
"""
72+
driver = None
6973
try:
7074
driver, text = scrape_text_with_selenium(url, agent)
75+
add_header(driver)
76+
if TOKENS_TO_TRIGGER_SUMMARY < count_string_tokens(text, agent.llm.name):
77+
text = summarize_memorize_webpage(url, text, question, agent, driver)
78+
79+
links = scrape_links_with_selenium(driver, url)
80+
81+
# Limit links to LINKS_TO_RETURN
82+
if len(links) > LINKS_TO_RETURN:
83+
links = links[:LINKS_TO_RETURN]
84+
85+
return f"Answer gathered from website: {text}\n\nLinks: {links}"
7186
except WebDriverException as e:
7287
# These errors are often quite long and include lots of context.
7388
# Just grab the first line.
7489
msg = e.msg.split("\n")[0]
7590
return f"Error: {msg}"
76-
77-
add_header(driver)
78-
summary = summarize_memorize_webpage(url, text, question, agent, driver)
79-
links = scrape_links_with_selenium(driver, url)
80-
81-
# Limit links to 5
82-
if len(links) > 5:
83-
links = links[:5]
84-
close_browser(driver)
85-
return f"Answer gathered from website: {summary}\n\nLinks: {links}"
91+
finally:
92+
if driver:
93+
close_browser(driver)
8694

8795

8896
def scrape_text_with_selenium(url: str, agent: Agent) -> tuple[WebDriver, str]:
@@ -96,7 +104,7 @@ def scrape_text_with_selenium(url: str, agent: Agent) -> tuple[WebDriver, str]:
96104
"""
97105
logging.getLogger("selenium").setLevel(logging.CRITICAL)
98106

99-
options_available: dict[str, Type[BrowserOptions]] = {
107+
options_available: dict[str, BrowserOptions] = {
100108
"chrome": ChromeOptions,
101109
"edge": EdgeOptions,
102110
"firefox": FirefoxOptions,

0 commit comments

Comments
 (0)