crawler.py
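"""Crawl the Cash Converters Belgium search results for PS3 controllers
("manette ps3", 2.99-40.99 EUR, sorted by price descending), remember every
listing seen in products.json, and write a Markdown table of newly discovered
listings to pr_body.md for use as a pull-request body.
"""
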
import requests
import json
import os
from bs4 import BeautifulSoup

URL_TEMPLATE = "https://www.cashconverters.be/fr/recherche?controller=search&s=manette+ps3&q=Prix-%E2%82%AC-2.99-40.99&order=product.price.desc&page={}"
HEADERS = {"User-Agent": "Mozilla/5.0"}
DATA_FILE = "products.json"
PR_BODY_FILE = "pr_body.md"


def load_existing_products():
    """Return the list of previously seen products, or [] on the first run."""
    if os.path.exists(DATA_FILE):
        with open(DATA_FILE, "r", encoding="utf-8") as f:
            return json.load(f)
    return []


def save_products(products):
    """Persist the full product list to DATA_FILE as pretty-printed JSON."""
    with open(DATA_FILE, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps French characters (é, €) readable on disk.
        json.dump(products, f, indent=4, ensure_ascii=False)
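
# products.json holds a flat list of product dicts; the values below are
# illustrative, only the keys come from the scraper:
# [
#     {"title": "Manette PS3 sans fil", "url": "https://...", "price": "9,99 €", "store": "Namur"}
# ]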


def generate_pr_body(new_products):
    """Write a Markdown table of the newly found products to PR_BODY_FILE."""
    with open(PR_BODY_FILE, "w", encoding="utf-8") as f:
        f.write("# New Products Available\n\n")
        f.write("| Title | Price | Store | URL |\n")
        f.write("|---|---|---|---|\n")
        for product in new_products:
            f.write(f"| {product['title']} | {product['price']} | {product['store']} | [Link]({product['url']}) |\n")


def scrape_page(page_number):
    """Fetch one results page and return the product listings found on it."""
    url = URL_TEMPLATE.format(page_number)
    response = requests.get(url, headers=HEADERS, timeout=30)
    if response.status_code != 200:
        return []
    soup = BeautifulSoup(response.text, "html.parser")
    product_elements = soup.select(".product-miniature")
    products = []
    for product in product_elements:
        link = product.select_one(".product-title a")
        price = product.select_one(".price")
        store = product.select_one(".magasin")
        # Skip miniatures missing any of the three fields we need.
        if link and price and store:
            products.append({
                "title": link.text.strip(),
                "url": link["href"],
                "price": price.text.strip(),
                # The store element embeds a "storefront" icon label; drop it
                # along with stray newlines before trimming.
                "store": store.get_text(strip=True).replace("storefront", "").replace("\n", "").strip(),
            })
    return products
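
# The selectors above assume PrestaShop-style markup along these lines (an
# illustrative sketch, not captured verbatim from the live site):
#
#   <article class="product-miniature">
#       <h3 class="product-title"><a href="https://...">Manette PS3 sans fil</a></h3>
#       <span class="price">9,99 €</span>
#       <span class="magasin">storefront Namur</span>
#   </article>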


def main():
    existing_products = load_existing_products()
    existing_urls = {p["url"] for p in existing_products}
    new_products = []
    page = 1
    while True:
        products = scrape_page(page)
        if not products:
            # An empty page means we have run past the last page of results.
            break
        for product in products:
            if product["url"] not in existing_urls:
                new_products.append(product)
                existing_products.append(product)
                # Track the URL so a listing repeated on a later page is not
                # appended twice within the same run.
                existing_urls.add(product["url"])
        page += 1
    if new_products:
        save_products(existing_products)
        generate_pr_body(new_products)
        print("New products found and PR body generated.")
    else:
        print("No new products found.")


if __name__ == "__main__":
    main()
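
# A sketch of how this is presumably wired into automation (an assumption,
# not part of this file): run the crawler, then open a pull request whose
# body is the generated table, e.g.
#
#   python crawler.py
#   gh pr create --title "New products" --body-file pr_body.md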