index_page.py
import os
from datetime import datetime
from bs4 import BeautifulSoup
from whoosh.index import create_in, open_dir, exists_in
from whoosh.fields import Schema, TEXT, ID, DATETIME
from selenium import webdriver
from urllib.parse import urljoin, urlparse
import multiprocessing

# Define the schema for your search index
schema = Schema(
    title=TEXT(stored=True),
    content=TEXT,
    url=ID(stored=True, unique=True),
    date=DATETIME(stored=True)
)

# Create or open an index in a directory (change 'indexdir' to your preferred directory)
indexdir = "index_dir"
if not os.path.exists(indexdir):
    os.mkdir(indexdir)
# Re-open an existing index so documents from earlier runs are kept; otherwise create a new one
ix = open_dir(indexdir) if exists_in(indexdir) else create_in(indexdir, schema)

# External files that persist the crawler state (visited URLs and the crawl frontier) between runs
visited_urls_file = "visited_urls.txt"
links_to_visit_file = "links_to_visit.txt"


def load_links_to_visit():
    if os.path.exists(links_to_visit_file):
        with open(links_to_visit_file, 'r') as f:
            return set(line.strip() for line in f.readlines())
    return set()


def save_links_to_visit(links_to_visit):
    with open(links_to_visit_file, 'w') as f:
        f.write("\n".join(links_to_visit))


def load_visited_urls():
    if os.path.exists(visited_urls_file):
        with open(visited_urls_file, 'r') as f:
            return set(line.strip() for line in f.readlines())
    return set()


def save_visited_urls(visited_urls):
    with open(visited_urls_file, 'w') as f:
        f.write("\n".join(visited_urls))


# A single shared Chrome WebDriver instance used for all page fetches
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=options)


# Define a function to fetch and index a webpage using Selenium, then recurse into same-domain links
def index_webpage(url, depth=3, open_in_new_tab=False):
    visited_urls = load_visited_urls()
    links_to_visit = load_links_to_visit()
    # Always (re)index the start URL (depth == 3); skip other URLs that were already visited
    if url not in visited_urls or depth == 3:
        try:
            print(f"Indexing {url}")
            if open_in_new_tab:
                driver.execute_script("window.open();")
                driver.switch_to.window(driver.window_handles[-1])
            driver.get(url)
            driver.implicitly_wait(30)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            title = soup.title.text.strip() if soup.title else ""
            content = soup.get_text().strip()
            current_date = datetime.now()
            with ix.writer() as writer:
                print(f"Indexing new document for URL: {url}")
                writer.add_document(title=title, content=content, url=url, date=current_date)
            visited_urls.add(url)
            save_visited_urls(visited_urls)
            if depth > 0:
                print(f"Indexing links in {url}, depth={depth}")
                for link in soup.find_all('a', href=True):
                    print(f"Found link: {link['href']}")
                    next_url = urljoin(url, link['href'])
                    parsed_url = urlparse(next_url)
                    base_domain = parsed_url.netloc
                    print(f"Base domain: {base_domain}")
                    links_to_visit.add(next_url)
                    # Only follow links whose host contains the domain "ur.ac.rw"
                    if "ur.ac.rw" in base_domain:
                        index_webpage(next_url, depth - 1, open_in_new_tab=True)
                # Persist the collected frontier so it survives between runs
                save_links_to_visit(links_to_visit)
        except Exception as e:
            # Record the failed URL as visited so it is not retried endlessly
            visited_urls.add(url)
            save_visited_urls(visited_urls)
            print(f"Error indexing {url}: {e}")


if __name__ == '__main__':
    start_url = "https://ur.ac.rw/"
    # Split the crawling process among multiple processes
    num_processes = multiprocessing.cpu_count()
    url_chunks = [start_url]  # Modify this if you want to start from different URLs
    print("Number of processes available for crawling:", num_processes)

    processes = []
    for url_chunk in url_chunks:
        process = multiprocessing.Process(target=index_webpage, args=(url_chunk,))
        processes.append(process)
        process.start()

    # Wait for all processes to finish
    for process in processes:
        process.join()
    driver.quit()
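
# A minimal sketch of how the index built above could be queried with Whoosh's
# QueryParser; the index directory "index_dir" matches the script, while the query
# string "computer science" is only an illustrative placeholder:
#
#   from whoosh.index import open_dir
#   from whoosh.qparser import QueryParser
#
#   ix = open_dir("index_dir")
#   with ix.searcher() as searcher:
#       query = QueryParser("content", ix.schema).parse("computer science")
#       for hit in searcher.search(query, limit=10):
#           print(hit["title"], hit["url"])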