webscrape_playwright.py
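"""Crawl a website and save its pages as PDFs.

The script uses requests and BeautifulSoup to discover internal links,
Playwright (headless Chromium) to render each page to PDF, and PyPDF2 to
optionally merge the results into a single document. PDF files linked from
pages can also be downloaded directly.
"""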
import os
import requests
import argparse
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from playwright.sync_api import sync_playwright
from datetime import datetime
from PyPDF2 import PdfMerger
import time

visited_urls = set()
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"

def debug_message(message):
    """Prints a debug message with a timestamp."""
    print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] {message}")

def get_internal_links(url, base_domain):
    """Fetch internal links and PDF links from a page within the same domain."""
    internal_links = set()
    headers = {"User-Agent": USER_AGENT}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        debug_message(f"Started extracting links from: {url} | HTTP Status: {response.status_code}")
        if response.status_code != 200:
            debug_message(f"Failed to retrieve page: {url} | HTTP Status: {response.status_code}")
            return []
        soup = BeautifulSoup(response.text, 'html.parser')
        for link in soup.find_all('a', href=True):
            href = link.get('href')
            full_url = urljoin(url, href)
            link_domain = urlparse(full_url).netloc
            # Check if the link is within the same domain
            if link_domain == base_domain and full_url not in visited_urls:
                internal_links.add(full_url)
    except requests.exceptions.RequestException as e:
        debug_message(f"Error fetching links from {url}: {e}")
    return internal_links

def save_page_to_pdf(url, file_name, page):
    """Fetch page content and save it as a PDF using Playwright."""
    try:
        debug_message(f"Started processing URL: {url}")
        page.goto(url, timeout=30000, wait_until="load")
        page.pdf(path=file_name, format="A4", print_background=True)
        debug_message(f"Saved {url} to {file_name}")
        return file_name
    except Exception as e:
        debug_message(f"Error saving {url} to PDF: {e}")
        return None

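# Note: crawl_and_save below recurses one level of links per unit of depth, so
# --depth 2 covers the start page plus the pages it links to directly; the
# module-level visited_urls set tracks URLs that have already been crawled.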
def crawl_and_save(url, base_domain, page, output_dir, pdf_only=False, download_pdfs=False, depth=2):
    """Recursively crawl internal links, downloading PDFs if in pdf_only mode or if PDF links are encountered."""
    if depth == 0:
        return []
    url = url.rstrip('/')
    visited_urls.add(url)
    pdf_files = []
    if url.endswith(".pdf"):
        if pdf_only or download_pdfs:
            response = requests.get(url, headers={"User-Agent": USER_AGENT}, stream=True, timeout=30)
            pdf_path = os.path.join(output_dir, url.split("/")[-1])
            with open(pdf_path, 'wb') as f:
                f.write(response.content)
            debug_message(f"Downloaded PDF from {url} to {pdf_path}")
            pdf_files.append(pdf_path)
    elif not pdf_only:
        page_name = os.path.join(output_dir, url.replace("https://", "").replace("http://", "").replace("/", "_") + ".pdf")
        pdf_file = save_page_to_pdf(url, page_name, page)
        if pdf_file:
            pdf_files.append(pdf_file)
    internal_links = get_internal_links(url, base_domain)
    for link in internal_links:
        if link not in visited_urls:
            try:
                pdf_files.extend(crawl_and_save(link, base_domain, page, output_dir, pdf_only=pdf_only, download_pdfs=download_pdfs, depth=depth - 1) or [])
                time.sleep(1)
            except Exception as e:
                debug_message(f"Error processing link {link}: {e}")
    return pdf_files

def combine_pdfs(pdf_files, output_filename):
    """Combine a list of PDF files into a single PDF."""
    merger = PdfMerger()
    for pdf in pdf_files:
        merger.append(pdf)
    merger.write(output_filename)
    merger.close()
    debug_message(f"Combined PDF saved as {output_filename}")

def main():
    parser = argparse.ArgumentParser(description="Crawl a website and save pages as PDFs or download linked PDFs.")
    parser.add_argument("url", help="The starting URL to crawl")
    parser.add_argument("--depth", type=int, default=2, help="Depth of crawling")
    parser.add_argument("--single-pdf", action="store_true", help="Combine all pages into a single PDF")
    parser.add_argument("--download-pdfs", action="store_true", help="Download any PDF files linked on pages")
    parser.add_argument("--pdf-only", action="store_true", help="Only download PDF files and ignore HTML pages")
    args = parser.parse_args()

    target_url = args.url
    base_domain = urlparse(target_url).netloc
    output_dir = base_domain
    os.makedirs(output_dir, exist_ok=True)

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        context = browser.new_context(user_agent=USER_AGENT)
        page = context.new_page()
        pdf_files = crawl_and_save(target_url, base_domain, page, output_dir, pdf_only=args.pdf_only, download_pdfs=args.download_pdfs, depth=args.depth)
        if args.single_pdf and not args.pdf_only:
            combined_output = os.path.join(output_dir, "combined_document.pdf")
            combine_pdfs(pdf_files, combined_output)
        else:
            debug_message("Saved each page or downloaded PDF as a separate file.")
        browser.close()


if __name__ == "__main__":
    main()
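
# Example invocations (illustrative only; https://example.com is a placeholder,
# not a URL taken from this repository):
#   python webscrape_playwright.py https://example.com --depth 2 --single-pdf
#   python webscrape_playwright.py https://example.com --pdf-only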