newspaper/api.py (10 changes: 8 additions & 2 deletions)
@@ -18,14 +18,20 @@
 from .utils import extend_config, print_available_languages


-def build(url='', dry=False, config=None, **kwargs) -> Source:
+def build(url='', dry=False, config=None, restrict_to_homepage_urls=False, **kwargs) -> Source:
     """Returns a constructed source object without
     downloading or parsing the articles
+
+    :param url: URL of the source (homepage)
+    :param dry: If True, don't build the source (download and parse)
+    :param config: Configuration object
+    :param restrict_to_homepage_urls: If True, only articles linked directly from the homepage will be processed
+    :param kwargs: Additional keyword arguments to pass to the Source constructor
     """
     config = config or Configuration()
     config = extend_config(config, kwargs)
     url = url or ''
-    s = Source(url, config=config)
+    s = Source(url, config=config, restrict_to_homepage_urls=restrict_to_homepage_urls)
     if not dry:
         s.build()
     return s
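
For context, a minimal usage sketch of the new parameter (assuming this branch is installed; the homepage URL below is only an example):

import newspaper

# Keep only articles whose links appear directly on the homepage.
paper = newspaper.build("https://www.reuters.com",
                        restrict_to_homepage_urls=True,
                        memoize_articles=False)
print(f"{len(paper.articles)} homepage articles queued")

The flag defaults to False, so existing callers of build() are unaffected.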
test_homepage_restriction.py (95 changes: 95 additions & 0 deletions)
@@ -0,0 +1,95 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Demonstration script for the restrict_to_homepage_urls feature.

This script shows how to use the new feature to scrape only articles
listed on a news site's homepage rather than crawling the entire site.
"""

import os
import sys
import time
import newspaper
from newspaper import Article


def print_article_info(article, index):
    """Print basic information about an article"""
    print(f"\n[{index}] {article.title}")
    print(f"URL: {article.url}")
    print(f"Published: {article.publish_date}")
    print(f"Summary: {article.summary[:150]}..." if article.summary else "No summary available")


def save_to_file(articles, filename):
    """Save article information to a file"""
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(f"Total articles: {len(articles)}\n\n")
        for i, article in enumerate(articles, 1):
            f.write(f"[{i}] {article.title}\n")
            f.write(f"URL: {article.url}\n")
            f.write(f"Published: {article.publish_date}\n")
            f.write(f"Summary: {article.summary[:200]}...\n" if article.summary else "No summary available\n")
            f.write("-" * 80 + "\n\n")
    print(f"Saved {len(articles)} articles to {filename}")


def main():
    # Set up output directory
    output_dir = "reuters_articles"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Get the URL from command line or use default
    url = sys.argv[1] if len(sys.argv) > 1 else "https://www.reuters.com"

    print(f"Scraping articles from {url}...")

    # First, demonstrate normal behavior (crawls entire site)
    start_time = time.time()
    print("\nBuilding source WITHOUT homepage restriction...")
    news_unrestricted = newspaper.build(url, memoize_articles=False, fetch_images=False, number_threads=1)
    print(f"Found {len(news_unrestricted.articles)} articles without restriction")
    print(f"Time taken: {time.time() - start_time:.2f} seconds")

    # Now demonstrate the new feature
    start_time = time.time()
    print("\nBuilding source WITH homepage restriction...")
    news_restricted = newspaper.build(
        url,
        restrict_to_homepage_urls=True,
        memoize_articles=False,
        fetch_images=False,
        number_threads=1
    )
    print(f"Found {len(news_restricted.articles)} articles with homepage restriction")
    print(f"Time taken: {time.time() - start_time:.2f} seconds")

    # Download and process restricted articles
    print("\nDownloading and processing homepage articles...")
    processed_count = 0
    successful_articles = []

    for i, article in enumerate(news_restricted.articles[:20], 1):  # Process up to 20 articles
        try:
            print(f"Processing article {i}/{min(20, len(news_restricted.articles))}...")
            article.download()
            article.parse()
            article.nlp()
            processed_count += 1
            successful_articles.append(article)
            print_article_info(article, i)
        except Exception as e:
            print(f"Error processing article {i}: {e}")

    print(f"\nSuccessfully processed {processed_count} articles")

    # Save results to file
    if successful_articles:
        save_to_file(successful_articles, os.path.join(output_dir, "homepage_articles.txt"))


if __name__ == "__main__":
    main()
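
A practical note on the demo script: article.nlp(), used above for summaries and keywords, depends on NLTK tokenizer data. If it fails with a missing-resource error, a one-time download along these lines is typically needed (a sketch, assuming NLTK is installed as a newspaper dependency):

import nltk

# One-time setup for newspaper's nlp() step (summaries and keywords).
nltk.download('punkt')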
tests/test_reuters.py (90 changes: 90 additions & 0 deletions)
@@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-

"""
Test the homepage URL restriction feature with Reuters website.
"""

import unittest
import re
import requests
from bs4 import BeautifulSoup
from newspaper import build
from newspaper.article import Article


class TestReutersScraper(unittest.TestCase):
    def test_restrict_to_homepage_urls(self):
        """Test that only URLs from the Reuters homepage are processed when restrict_to_homepage_urls=True"""
        # Skip this test if Reuters is not accessible
        try:
            requests.get("https://www.reuters.com", timeout=5)
        except (requests.exceptions.RequestException, requests.exceptions.Timeout):
            self.skipTest("Reuters website not accessible")

        # Build the source with restricted URLs
        news = build("https://www.reuters.com",
                     restrict_to_homepage_urls=True,
                     memoize_articles=False,
                     fetch_images=False,
                     number_threads=1)

        # Verify we have a reasonable number of articles (not too many, not too few)
        # Count may vary based on Reuters homepage changes
        self.assertLessEqual(news.size(), 500, "Too many articles scraped")
        self.assertGreater(news.size(), 50, "Too few articles scraped")

        # Check if article URLs look like Reuters article URLs
        article_pattern = re.compile(r'^https://www\.reuters\.com/.*')
        for article in news.articles[:10]:  # Check first 10 articles
            self.assertTrue(
                article_pattern.match(article.url),
                f"Invalid article URL: {article.url}"
            )

    def test_manual_homepage_extraction(self):
        """Test a manual process to extract and process homepage URLs"""
        # Skip this test if Reuters is not accessible
        try:
            resp = requests.get("https://www.reuters.com", timeout=5)
        except (requests.exceptions.RequestException, requests.exceptions.Timeout):
            self.skipTest("Reuters website not accessible")

        # Parse homepage HTML to extract article URLs
        soup = BeautifulSoup(resp.text, 'html.parser')
        homepage_urls = set()

        # Extract and normalize article URLs from <a> tags
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            if href.startswith('/'):
                href = "https://www.reuters.com" + href
            if re.match(r'^https://www\.reuters\.com/.*', href) and \
                    not re.search(r'/(video|gallery|slideshow)/', href):
                homepage_urls.add(href)

        # Verify we found a reasonable number of URLs
        self.assertGreater(len(homepage_urls), 50, "Too few URLs found on homepage")
        self.assertLess(len(homepage_urls), 500, "Too many URLs found on homepage")

        # Process a small sample of URLs
        sample_size = min(5, len(homepage_urls))
        processed = 0

        for url in list(homepage_urls)[:sample_size]:
            try:
                article = Article(url, language='en', fetch_images=False)
                article.download()
                article.parse()
                article.nlp()
                self.assertTrue(article.title, f"No title for {url}")
                self.assertTrue(article.text.strip(), f"No text for {url}")
                processed += 1
            except Exception as e:
                print(f"Error processing {url}: {e}")

        # Verify we processed the expected number of articles
        self.assertEqual(processed, sample_size, "Failed to process all sample articles")


if __name__ == '__main__':
    unittest.main()