newspaper/api.py (10 changes: 8 additions & 2 deletions)
@@ -18,14 +18,20 @@
 from .utils import extend_config, print_available_languages


-def build(url='', dry=False, config=None, **kwargs) -> Source:
+def build(url='', dry=False, config=None, restrict_to_homepage_urls=False, **kwargs) -> Source:
     """Returns a constructed source object without
     downloading or parsing the articles
+
+    :param url: URL of the source (homepage)
+    :param dry: If True, don't build the source (download and parse)
+    :param config: Configuration object
+    :param restrict_to_homepage_urls: If True, only articles linked directly from the homepage will be processed
+    :param kwargs: Additional keyword arguments to pass to the Source constructor
     """
     config = config or Configuration()
     config = extend_config(config, kwargs)
     url = url or ''
-    s = Source(url, config=config)
+    s = Source(url, config=config, restrict_to_homepage_urls=restrict_to_homepage_urls)
     if not dry:
         s.build()
     return s
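
For context, a minimal usage sketch of the new parameter (assuming this branch is installed; the homepage URL below is only an example):

import newspaper

# Keep only articles whose links appear directly on the homepage.
paper = newspaper.build("https://www.reuters.com",
                        restrict_to_homepage_urls=True,
                        memoize_articles=False)
print(f"{len(paper.articles)} homepage articles queued")

The flag defaults to False, so existing callers of build() are unaffected.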
test_homepage_restriction.py (95 changes: 95 additions & 0 deletions)
@@ -0,0 +1,95 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Demonstration script for the restrict_to_homepage_urls feature.

This script shows how to use the new feature to scrape only articles
listed on a news site's homepage rather than crawling the entire site.
"""

import os
import sys
import time
import newspaper
from newspaper import Article


def print_article_info(article, index):
    """Print basic information about an article"""
    print(f"\n[{index}] {article.title}")
    print(f"URL: {article.url}")
    print(f"Published: {article.publish_date}")
    print(f"Summary: {article.summary[:150]}..." if article.summary else "No summary available")


def save_to_file(articles, filename):
    """Save article information to a file"""
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(f"Total articles: {len(articles)}\n\n")
        for i, article in enumerate(articles, 1):
            f.write(f"[{i}] {article.title}\n")
            f.write(f"URL: {article.url}\n")
            f.write(f"Published: {article.publish_date}\n")
            f.write(f"Summary: {article.summary[:200]}...\n" if article.summary else "No summary available\n")
            f.write("-" * 80 + "\n\n")
    print(f"Saved {len(articles)} articles to {filename}")


def main():
    # Set up output directory
    output_dir = "reuters_articles"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Get the URL from command line or use default
    url = sys.argv[1] if len(sys.argv) > 1 else "https://www.reuters.com"

    print(f"Scraping articles from {url}...")

    # First, demonstrate normal behavior (crawls entire site)
    start_time = time.time()
    print("\nBuilding source WITHOUT homepage restriction...")
    news_unrestricted = newspaper.build(url, memoize_articles=False, fetch_images=False, number_threads=1)
    print(f"Found {len(news_unrestricted.articles)} articles without restriction")
    print(f"Time taken: {time.time() - start_time:.2f} seconds")

    # Now demonstrate the new feature
    start_time = time.time()
    print("\nBuilding source WITH homepage restriction...")
    news_restricted = newspaper.build(
        url,
        restrict_to_homepage_urls=True,
        memoize_articles=False,
        fetch_images=False,
        number_threads=1
    )
    print(f"Found {len(news_restricted.articles)} articles with homepage restriction")
    print(f"Time taken: {time.time() - start_time:.2f} seconds")

    # Download and process restricted articles
    print("\nDownloading and processing homepage articles...")
    processed_count = 0
    successful_articles = []

    for i, article in enumerate(news_restricted.articles[:20], 1):  # Process up to 20 articles
        try:
            print(f"Processing article {i}/{min(20, len(news_restricted.articles))}...")
            article.download()
            article.parse()
            article.nlp()
            processed_count += 1
            successful_articles.append(article)
            print_article_info(article, i)
        except Exception as e:
            print(f"Error processing article {i}: {e}")

    print(f"\nSuccessfully processed {processed_count} articles")

    # Save results to file
    if successful_articles:
        save_to_file(successful_articles, os.path.join(output_dir, "homepage_articles.txt"))


if __name__ == "__main__":
    main()
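
A practical note on the demo script: article.nlp(), used above for summaries and keywords, depends on NLTK tokenizer data. If it fails with a missing-resource error, a one-time download along these lines is typically needed (a sketch, assuming NLTK is installed as a newspaper dependency):

import nltk

# One-time setup for newspaper's nlp() step (summaries and keywords).
nltk.download('punkt')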
tests/test_reuters.py (90 changes: 90 additions & 0 deletions)
@@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-

"""
Test the homepage URL restriction feature with Reuters website.
"""

import unittest
import re
import requests
from bs4 import BeautifulSoup
from newspaper import build
from newspaper.article import Article


class TestReutersScraper(unittest.TestCase):
    def test_restrict_to_homepage_urls(self):
        """Test that only URLs from the Reuters homepage are processed when restrict_to_homepage_urls=True"""
        # Skip this test if Reuters is not accessible
        try:
            requests.get("https://www.reuters.com", timeout=5)
        except (requests.exceptions.RequestException, requests.exceptions.Timeout):
            self.skipTest("Reuters website not accessible")

        # Build the source with restricted URLs
        news = build("https://www.reuters.com",
                     restrict_to_homepage_urls=True,
                     memoize_articles=False,
                     fetch_images=False,
                     number_threads=1)

        # Verify we have a reasonable number of articles (not too many, not too few)
        # Count may vary based on Reuters homepage changes
        self.assertLessEqual(news.size(), 500, "Too many articles scraped")
        self.assertGreater(news.size(), 50, "Too few articles scraped")

        # Check if article URLs look like Reuters article URLs
        article_pattern = re.compile(r'^https://www\.reuters\.com/.*')
        for article in news.articles[:10]:  # Check first 10 articles
            self.assertTrue(
                article_pattern.match(article.url),
                f"Invalid article URL: {article.url}"
            )

    def test_manual_homepage_extraction(self):
        """Test a manual process to extract and process homepage URLs"""
        # Skip this test if Reuters is not accessible
        try:
            resp = requests.get("https://www.reuters.com", timeout=5)
        except (requests.exceptions.RequestException, requests.exceptions.Timeout):
            self.skipTest("Reuters website not accessible")

        # Parse homepage HTML to extract article URLs
        soup = BeautifulSoup(resp.text, 'html.parser')
        homepage_urls = set()

        # Extract and normalize article URLs from <a> tags
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            if href.startswith('/'):
                href = "https://www.reuters.com" + href
            if re.match(r'^https://www\.reuters\.com/.*', href) and \
                    not re.search(r'/(video|gallery|slideshow)/', href):
                homepage_urls.add(href)

        # Verify we found a reasonable number of URLs
        self.assertGreater(len(homepage_urls), 50, "Too few URLs found on homepage")
        self.assertLess(len(homepage_urls), 500, "Too many URLs found on homepage")

        # Process a small sample of URLs
        sample_size = min(5, len(homepage_urls))
        processed = 0

        for url in list(homepage_urls)[:sample_size]:
            try:
                article = Article(url, language='en', fetch_images=False)
                article.download()
                article.parse()
                article.nlp()
                self.assertTrue(article.title, f"No title for {url}")
                self.assertTrue(article.text.strip(), f"No text for {url}")
                processed += 1
            except Exception as e:
                print(f"Error processing {url}: {e}")

        # Verify we processed the expected number of articles
        self.assertEqual(processed, sample_size, "Failed to process all sample articles")


if __name__ == '__main__':
    unittest.main()