agent-smith/webgrep at main · KiriLev/agent-smith · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python
import argparse
from collections import deque
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright

import prompts
from llm import run_mistral_llm, run_mistral_tuned_llm, run_openai_llm


def get_html_and_snapshot(url):
    """Load *url* in a Playwright-launched Chromium and capture the page.

    Args:
        url: Address handed to ``page.goto``.

    Returns:
        A ``(html, snapshot)`` tuple: the value of ``page.content()`` and the
        value of ``page.accessibility.snapshot()``, taken after navigation.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            page = browser.new_page()
            page.goto(url)
            snapshot = page.accessibility.snapshot()
            return page.content(), snapshot
        finally:
            # Close the browser explicitly, including when navigation raises,
            # instead of leaving cleanup to Playwright's own teardown.
            browser.close()


def get_answers(texts: list[str], query: str) -> list[str]:
    """Look for answers to *query* in *texts* via ``run_openai_llm``.

    Only answers that occur verbatim in the space-joined *texts* are kept,
    so anything the model returns that is not literally on the page is
    discarded. Returns an empty list when the LLM call reports failure.
    """
    system_prompt = prompts.answer_finder_system_prompt
    user_prompt = prompts.get_answer_finder_user_prompt(texts, query)
    response = run_openai_llm(system_prompt, user_prompt)
    if not response.success:
        return []

    haystack = " ".join(texts)
    return [candidate for candidate in response.results if candidate in haystack]


def filter_link_with_texts(links_with_texts: list[tuple[str, str]], query: str, model: str) -> list[tuple[str, str]]:
    """Filter and rank links that may lead closer to an answer for *query*.

    Links with empty text are dropped first; the remaining ``(link, text)``
    pairs are sent to an LLM, whose results are used as indices into that
    filtered list.

    Args:
        links_with_texts: ``(href, text)`` pairs collected from a page.
        query: The user query the crawl is trying to answer.
        model: Backend selector. A value containing ``"mistral_tuned"`` or
            ``"mistral"`` (case-insensitive) picks the matching Mistral
            runner; anything else, including ``None`` or ``""``, uses
            ``run_openai_llm``.

    Returns:
        The selected pairs in the order the LLM returned them, or ``[]`` when
        the LLM call reports failure.
    """
    candidates = [(link, text) for link, text in links_with_texts if text != ""]
    system_prompt = prompts.links_filter_system_prompt
    user_prompt = prompts.get_links_filter_user_prompt(candidates, query)

    model_name = model.lower() if model else ""
    # "mistral" is a substring of "mistral_tuned", so test the tuned name first.
    if "mistral_tuned" in model_name:
        run_llm = run_mistral_tuned_llm
    elif "mistral" in model_name:
        run_llm = run_mistral_llm
    else:
        run_llm = run_openai_llm

    llm_result = run_llm(system_prompt, user_prompt)
    if not llm_result.success:
        return []

    # The indices come from model output: skip anything that is not a valid
    # position instead of raising IndexError/TypeError or letting a negative
    # value silently pick a link from the end of the list.
    return [
        candidates[idx]
        for idx in llm_result.results
        if isinstance(idx, int) and 0 <= idx < len(candidates)
    ]


def get_links_from_page(soup):
    """Collect ``(href, text)`` pairs for the ``<a>`` tags found in *soup*.

    Anchors without an ``href``, or whose ``href`` is at most one character
    long, are skipped. The text is taken with ``get_text(strip=True)``.
    """
    pairs = []
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if href is None or len(href) <= 1:
            continue
        pairs.append((href, anchor.get_text(strip=True)))
    return pairs


def postprocess_links(parent_url: str, next_links: list[str]) -> list[str]:
    """Turn raw hrefs into absolute same-host URLs, dropping everything else.

    Args:
        parent_url: Absolute URL of the page the links were found on.
        next_links: Raw ``href`` values taken from that page.

    Returns:
        Absolute URLs, in input order, for links that are root-relative
        (``/path``) or whose hostname equals the parent's hostname. Links to
        other hosts, links without a hostname (``mailto:``, ``#frag``,
        ``page.html``) and protocol-relative links to other hosts are dropped.

    Raises:
        ValueError: If *parent_url* has no scheme or hostname.
    """
    parent = urlparse(parent_url)
    if not parent.scheme or parent.hostname is None:
        raise ValueError(f"parent_url must be an absolute URL, got {parent_url!r}")

    # Use netloc rather than hostname so an explicit port (e.g. localhost:8000)
    # is preserved in the rebuilt URLs.
    site_root = f"{parent.scheme}://{parent.netloc}"

    result = []
    for link in next_links:
        if link.startswith("//"):
            # Protocol-relative link: it names its own host and only inherits
            # the scheme, so it must go through the same-host check below.
            link = f"{parent.scheme}:{link}"
        elif link.startswith("/"):
            result.append(site_root + link)
            continue
        if urlparse(link).hostname == parent.hostname:
            result.append(link)
    return result


def get_page_texts(soup):
    """Return the text fragments of *soup* that are longer than three characters.

    Fragments come from ``soup.stripped_strings``; the length check is applied
    to each fragment as yielded, and kept fragments are stripped once more.
    """
    kept = []
    for fragment in soup.stripped_strings:
        if len(fragment) > 3:
            kept.append(fragment.strip())
    return kept


def get_results_or_next_links(url: str, query: str, model: str) -> tuple[list[str], list[str]]:
    """Process one page: return answers if any were found, else links to follow.

    Returns:
        ``(answers, [])`` when ``get_answers`` finds something on the page,
        otherwise ``([], next_links)`` where ``next_links`` are the LLM-ranked
        links after ``postprocess_links``.
    """
    # The accessibility snapshot is fetched alongside the HTML but not used here.
    page_html, _snapshot = get_html_and_snapshot(url)
    soup = BeautifulSoup(page_html, 'html.parser')

    found = get_answers(get_page_texts(soup), query)
    if found:
        return found, []

    # Each entry is a (link, text) pair; only the link part is carried forward.
    ranked_pairs = filter_link_with_texts(get_links_from_page(soup), query, model)
    hrefs = [pair[0] for pair in ranked_pairs]
    return [], postprocess_links(url, hrefs)


def get_result(url: str, query: str, model: str) -> list[str]:
    """Breadth-first crawl starting at *url* until a page yields answers.

    Args:
        url: Start page of the crawl.
        query: The question to answer.
        model: Backend selector forwarded to ``get_results_or_next_links``.

    Returns:
        The answers from the first page that produces any, or ``[]`` once the
        queue runs dry. There is no depth or page-count limit.
    """
    # deque gives O(1) pops from the left; list.pop(0) shifts the whole list.
    queue = deque([url])
    visited = set()
    while queue:
        current = queue.popleft()
        # The same link can be queued by several pages before it is fetched.
        if current in visited:
            continue
        print("Current url:", current)
        results, next_links = get_results_or_next_links(current, query, model)
        visited.add(current)
        if results:
            return results
        # Do not enqueue pages that were already fetched.
        queue.extend(link for link in next_links if link not in visited)
    return []


if __name__ == '__main__':
    # Command-line entry point: positional query and url, optional --model,
    # then print whatever get_result returns.
    cli = argparse.ArgumentParser(
        description="Tiny helpful script to retrieve the data from the provided url based on provided query",
    )
    cli.add_argument('query', help='query')
    cli.add_argument('url', help='url')
    cli.add_argument('--model', help='model to run')

    options = cli.parse_args()
    found = get_result(options.url, options.query, options.model)
    print("Results:", found)