Skip to content

Commit a389463

Browse files
committed
add example and tests
1 parent b9a17d5 commit a389463

File tree

2 files changed

+140
-0
lines changed

2 files changed

+140
-0
lines changed

examples/local_scraper_example.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
from bs4 import BeautifulSoup
2+
import os
3+
4+
def scrape_local_html(file_path):
    """
    Scrape content from a local HTML file.

    Args:
        file_path (str): Path to the local HTML file

    Returns:
        dict: Extracted data from the HTML file, with keys
            'title', 'paragraphs', 'links', and 'headers'

    Raises:
        FileNotFoundError: If no file exists at file_path
    """
    # EAFP: open the file directly instead of a separate
    # os.path.exists() check — the check-then-open pattern is racy
    # (the file could disappear between the check and the open).
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()
    except FileNotFoundError:
        # Re-raise with the original, more descriptive message;
        # `from None` hides the redundant chained traceback.
        raise FileNotFoundError(f"HTML file not found at: {file_path}") from None

    # Parse HTML with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Example extraction - modify based on your HTML structure
    data = {
        'title': soup.title.string if soup.title else None,
        'paragraphs': [p.text for p in soup.find_all('p')],
        'links': [{'text': a.text, 'href': a.get('href')} for a in soup.find_all('a')],
        'headers': [h.text for h in soup.find_all(['h1', 'h2', 'h3'])]
    }

    return data
34+
35+
def main():
    """Demonstrate scrape_local_html() against a sample file and print the results."""
    try:
        # Expects a sample.html file in the current working directory.
        extracted = scrape_local_html('sample.html')

        # Dump each extracted section to stdout.
        print("Title:", extracted['title'])

        print("\nParagraphs:")
        for paragraph in extracted['paragraphs']:
            print(f"- {paragraph}")

        print("\nLinks:")
        for item in extracted['links']:
            print(f"- {item['text']}: {item['href']}")

        print("\nHeaders:")
        for heading in extracted['headers']:
            print(f"- {heading}")

    except FileNotFoundError as e:
        print(f"Error: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")
59+
60+
# Run the demo only when executed as a script, not on import.
if __name__ == "__main__":
    main()

tests/local_scraper_test.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
import unittest
2+
from unittest.mock import patch
3+
from scrapegraph_py.local_scraper import scrape_text
4+
from pydantic import BaseModel, Field
5+
import requests
6+
7+
class TestSchema(BaseModel):
    """Pydantic model used to exercise scrape_text's optional schema parameter."""

    # Field descriptions are part of the schema payload, so they are
    # kept exactly as the API tests expect them.
    title: str = Field(description="The title")
    content: str = Field(description="The content")
10+
11+
class TestLocalScraper(unittest.TestCase):
    """Unit tests for scrapegraph_py.local_scraper.scrape_text with mocked HTTP."""

    def _invoke(self, **kwargs):
        # Helper: every test calls scrape_text with the same three
        # positional arguments; only the optional kwargs vary.
        return scrape_text(
            "test_api_key",
            "Sample website text",
            "Extract information",
            **kwargs
        )

    @patch('scrapegraph_py.local_scraper.requests.post')
    def test_scrape_text_success(self, mock_post):
        """A successful 200 response body is returned verbatim (no schema)."""
        mock_post.return_value.status_code = 200
        mock_post.return_value.text = '{"title": "Test", "content": "Content"}'

        result = self._invoke()

        self.assertEqual(result, '{"title": "Test", "content": "Content"}')

    @patch('scrapegraph_py.local_scraper.requests.post')
    def test_scrape_text_with_schema(self, mock_post):
        """Passing a pydantic schema still yields the raw response body."""
        mock_post.return_value.status_code = 200
        mock_post.return_value.text = '{"title": "Test", "content": "Content"}'

        result = self._invoke(schema=TestSchema)

        self.assertEqual(result, '{"title": "Test", "content": "Content"}')

    @patch('scrapegraph_py.local_scraper.requests.post')
    def test_scrape_text_http_error(self, mock_post):
        """A generic HTTPError is reported in the returned message."""
        mock_post.side_effect = requests.exceptions.HTTPError("404 Client Error")

        result = self._invoke()

        self.assertIn("HTTP error occurred", result)

    @patch('scrapegraph_py.local_scraper.requests.post')
    def test_scrape_text_forbidden(self, mock_post):
        """A 403 HTTPError produces the dedicated forbidden message."""
        # NOTE(review): with side_effect set, return_value.status_code is
        # presumably never read by scrape_text — kept to mirror the
        # original mock configuration exactly.
        stub_response = mock_post.return_value
        stub_response.status_code = 403
        mock_post.side_effect = requests.exceptions.HTTPError("403 Forbidden")

        result = self._invoke()

        self.assertIn("Access forbidden (403)", result)

    @patch('scrapegraph_py.local_scraper.requests.post')
    def test_scrape_text_general_error(self, mock_post):
        """Non-HTTP request failures fall through to the generic message."""
        mock_post.side_effect = requests.exceptions.RequestException("Connection error")

        result = self._invoke()

        self.assertIn("An error occurred", result)
77+
78+
# Allow running this test module directly with the plain interpreter.
if __name__ == '__main__':
    unittest.main()

0 commit comments

Comments
 (0)