-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlink_validation.py
More file actions
113 lines (86 loc) · 3.48 KB
/
link_validation.py
File metadata and controls
113 lines (86 loc) · 3.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""
Just a simple script that isn't functionally connected to the app yet.
TODO: Change to use Google bc it is better with links.
"""
import requests
from bs4 import BeautifulSoup
from openai import OpenAI
def check_link_validity(url):
    """Check whether *url* resolves to a successful HTTP response.

    Issues a HEAD request (following redirects, 5 s timeout); if the server
    rejects HEAD (405/501), retries with a streamed GET so the body is not
    downloaded. Any 2xx status counts as valid, not just 200.

    Returns:
        (bool, str): (True, "Valid") on success, otherwise
        (False, "Invalid (HTTP <code>)") or (False, "Error: <exc>").
    """
    try:
        response = requests.head(url, allow_redirects=True, timeout=5)
        # Some servers disallow HEAD; fall back to a lightweight GET.
        if response.status_code in (405, 501):
            response = requests.get(
                url, allow_redirects=True, timeout=5, stream=True
            )
        if 200 <= response.status_code < 300:
            return True, "Valid"
        return False, f"Invalid (HTTP {response.status_code})"
    except requests.exceptions.RequestException as e:
        return False, f"Error: {e}"
# Smoke test: print the validity status for a couple of sample links.
urls = [
    "https://www.openai.com/research/gpt-3",
    "https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/10-ways-ai-can-improve-your-business",
]
for link in urls:
    _, status = check_link_validity(link)
    print(f"{link}: {status}")
def fetch_page_content(url):
    """Fetch *url* and return its visible text, truncated to 2000 chars.

    The truncation keeps the text small enough to embed in an LLM prompt.

    Returns:
        str | None: extracted page text on success, or None on any request
        failure (best-effort by design — callers treat None as "no content").
    """
    try:
        response = requests.get(url, timeout=5)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        return soup.get_text()[:2000]  # Limit content length for API input
    except requests.exceptions.RequestException:
        # Deliberately swallow fetch errors; None signals "unavailable".
        return None
def check_relevance(url, topic, api_key):
    """Ask an LLM whether the page at *url* matches *topic*.

    Args:
        url: Page to fetch and judge.
        topic: Topic description the page should match.
        api_key: OpenAI API key.

    Returns:
        (bool, str): (is_relevant, feedback) — feedback is the raw model
        reply, or an error string when the page could not be fetched.
    """
    content = fetch_page_content(url)
    if not content:
        return False, "Error fetching content"
    openai = OpenAI(api_key=api_key)
    prompt = f"Does the following content match the topic '{topic}'? Answer with Yes or No.\n\nContent:\n{content}"
    # The question is the actual query, so send it with the "user" role
    # (the "system" role is reserved for behavioral instructions).
    response = openai.chat.completions.create(
        model="gpt-4", messages=[{"role": "user", "content": prompt}]
    )
    answer = response.choices[0].message.content
    return "Yes" in answer, answer
# Demo: LLM-based relevance check against a fixed topic.
api_key = "YOUR_OPENAI_API_KEY"
urls = ["https://www.mckinsey.com", "https://www.openai.com"]
topic = "AI in marketing and storytelling"
for site in urls:
    relevant, feedback = check_relevance(site, topic, api_key)
    print(f"{site} Relevant? {relevant}: {feedback}")
def check_relevance_keywords(url, keywords):
    """Case-insensitively check whether the page at *url* mentions any keyword.

    Args:
        url: Page to fetch.
        keywords: Iterable of keyword strings to look for in the page text.

    Returns:
        (bool, list | str): (True, matched_keywords) when at least one keyword
        occurs, (False, []) when none do, or (False, "Error: <exc>") on failure.
    """
    try:
        # timeout keeps a hung server from blocking forever — consistent
        # with the other fetch helpers in this script.
        response = requests.get(url, timeout=5)
        soup = BeautifulSoup(response.text, "html.parser")
        page_text = soup.get_text().lower()
        keyword_matches = [
            keyword for keyword in keywords if keyword.lower() in page_text
        ]
        return len(keyword_matches) > 0, keyword_matches
    except Exception as e:  # broad on purpose: best-effort check
        return False, f"Error: {e}"
# Demo: keyword-based relevance check (no API key required).
keywords = ["AI", "storytelling", "marketing"]
urls = ["https://www.mckinsey.com", "https://www.openai.com"]
for site in urls:
    hit, matched = check_relevance_keywords(site, keywords)
    print(f"{site} Relevant? {hit}: Matches - {matched}")
def validate_and_check_relevance(url, topic, keywords, api_key=None):
    """Run the full pipeline: link validity first, then topical relevance.

    Uses the LLM-based check when *api_key* is given, otherwise falls back
    to the plain keyword scan.

    Returns:
        (bool, str): (True, "Valid and Relevant") when both checks pass,
        otherwise (False, <reason>).
    """
    ok, validity_status = check_link_validity(url)
    if not ok:
        return False, f"Invalid: {validity_status}"
    if api_key:
        relevant, feedback = check_relevance(url, topic, api_key)
    else:
        relevant, feedback = check_relevance_keywords(url, keywords)
    if not relevant:
        return False, f"Valid but Irrelevant: {feedback}"
    return True, "Valid and Relevant"
# Demo: combined validity + relevance pipeline over sample links.
topic = "AI in marketing and storytelling"
keywords = ["AI", "storytelling", "marketing"]
urls = ["https://www.openai.com/research/gpt-3", "https://www.brokenlinkexample.com"]
for site in urls:
    _, feedback = validate_and_check_relevance(
        site, topic, keywords, api_key="YOUR_OPENAI_API_KEY"
    )
    print(f"{site}: {feedback}")