-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlink_validation.py
More file actions
113 lines (86 loc) · 3.48 KB
/
link_validation.py
File metadata and controls
113 lines (86 loc) · 3.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""
Just a simple script that isn't functionally connected to the app yet.
TODO: Change to use Google bc it is better with links.
"""
import requests
from bs4 import BeautifulSoup
from openai import OpenAI
def check_link_validity(url):
    """Check whether *url* resolves to a successful HTTP response.

    Issues a HEAD request (following redirects, 5 s timeout); if the server
    rejects HEAD (405/501), retries with a streamed GET so the body is not
    downloaded. Any 2xx status counts as valid, not just 200.

    Returns:
        (bool, str): (True, "Valid") on success, otherwise
        (False, "Invalid (HTTP <code>)") or (False, "Error: <exc>").
    """
    try:
        response = requests.head(url, allow_redirects=True, timeout=5)
        # Some servers disallow HEAD; fall back to a lightweight GET.
        if response.status_code in (405, 501):
            response = requests.get(
                url, allow_redirects=True, timeout=5, stream=True
            )
        if 200 <= response.status_code < 300:
            return True, "Valid"
        return False, f"Invalid (HTTP {response.status_code})"
    except requests.exceptions.RequestException as e:
        return False, f"Error: {e}"
# Smoke test: print the validity status for a couple of sample links.
urls = [
    "https://www.openai.com/research/gpt-3",
    "https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/10-ways-ai-can-improve-your-business",
]
for link in urls:
    _, status = check_link_validity(link)
    print(f"{link}: {status}")
def fetch_page_content(url):
    """Fetch *url* and return its visible text, truncated to 2000 chars.

    The truncation keeps the text small enough to embed in an LLM prompt.

    Returns:
        str | None: extracted page text on success, or None on any request
        failure (best-effort by design — callers treat None as "no content").
    """
    try:
        response = requests.get(url, timeout=5)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        return soup.get_text()[:2000]  # Limit content length for API input
    except requests.exceptions.RequestException:
        # Deliberately swallow fetch errors; None signals "unavailable".
        return None
def check_relevance(url, topic, api_key):
    """Ask an LLM whether the page at *url* matches *topic*.

    Args:
        url: Page to fetch and judge.
        topic: Topic description the page should match.
        api_key: OpenAI API key.

    Returns:
        (bool, str): (is_relevant, feedback) — feedback is the raw model
        reply, or an error string when the page could not be fetched.
    """
    content = fetch_page_content(url)
    if not content:
        return False, "Error fetching content"
    openai = OpenAI(api_key=api_key)
    prompt = f"Does the following content match the topic '{topic}'? Answer with Yes or No.\n\nContent:\n{content}"
    # The question is the actual query, so send it with the "user" role
    # (the "system" role is reserved for behavioral instructions).
    response = openai.chat.completions.create(
        model="gpt-4", messages=[{"role": "user", "content": prompt}]
    )
    answer = response.choices[0].message.content
    return "Yes" in answer, answer
# Demo: LLM-based relevance check against a fixed topic.
api_key = "YOUR_OPENAI_API_KEY"
urls = ["https://www.mckinsey.com", "https://www.openai.com"]
topic = "AI in marketing and storytelling"
for site in urls:
    relevant, feedback = check_relevance(site, topic, api_key)
    print(f"{site} Relevant? {relevant}: {feedback}")
def check_relevance_keywords(url, keywords):
    """Case-insensitively check whether the page at *url* mentions any keyword.

    Args:
        url: Page to fetch.
        keywords: Iterable of keyword strings to look for in the page text.

    Returns:
        (bool, list | str): (True, matched_keywords) when at least one keyword
        occurs, (False, []) when none do, or (False, "Error: <exc>") on failure.
    """
    try:
        # timeout keeps a hung server from blocking forever — consistent
        # with the other fetch helpers in this script.
        response = requests.get(url, timeout=5)
        soup = BeautifulSoup(response.text, "html.parser")
        page_text = soup.get_text().lower()
        keyword_matches = [
            keyword for keyword in keywords if keyword.lower() in page_text
        ]
        return len(keyword_matches) > 0, keyword_matches
    except Exception as e:  # broad on purpose: best-effort check
        return False, f"Error: {e}"
# Demo: keyword-based relevance check (no API key required).
keywords = ["AI", "storytelling", "marketing"]
urls = ["https://www.mckinsey.com", "https://www.openai.com"]
for site in urls:
    hit, matched = check_relevance_keywords(site, keywords)
    print(f"{site} Relevant? {hit}: Matches - {matched}")
def validate_and_check_relevance(url, topic, keywords, api_key=None):
    """Run the full pipeline: link validity first, then topical relevance.

    Uses the LLM-based check when *api_key* is given, otherwise falls back
    to the plain keyword scan.

    Returns:
        (bool, str): (True, "Valid and Relevant") when both checks pass,
        otherwise (False, <reason>).
    """
    ok, validity_status = check_link_validity(url)
    if not ok:
        return False, f"Invalid: {validity_status}"
    if api_key:
        relevant, feedback = check_relevance(url, topic, api_key)
    else:
        relevant, feedback = check_relevance_keywords(url, keywords)
    if not relevant:
        return False, f"Valid but Irrelevant: {feedback}"
    return True, "Valid and Relevant"
# Demo: combined validity + relevance pipeline over sample links.
topic = "AI in marketing and storytelling"
keywords = ["AI", "storytelling", "marketing"]
urls = ["https://www.openai.com/research/gpt-3", "https://www.brokenlinkexample.com"]
for site in urls:
    _, feedback = validate_and_check_relevance(
        site, topic, keywords, api_key="YOUR_OPENAI_API_KEY"
    )
    print(f"{site}: {feedback}")