import os
import json

from bs4 import BeautifulSoup
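
# Parses locally saved HTML articles from ziarulromanesc.de (Germania
# section, judging by the default input directory name below) and writes
# the extracted per-article metadata to a single JSON file.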


def extract_metadata_from_file(file_path):
    """Extract metadata from an HTML file."""
    with open(file_path, "r", encoding="utf-8") as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, "html.parser")

    # Title
    title_element = soup.find("meta", property="og:title")
    title = title_element["content"].strip() if title_element else None

    # Text content
    article_content = soup.find("div", class_="nv-content-wrap entry-content")
    # Remove all <b> tags from the article content
    if article_content:
        for bold_tag in article_content.find_all("b"):
            bold_tag.decompose()
        text = article_content.get_text(separator="\n").strip()
    else:
        text = None

    # Remove unwanted parts from the text dynamically
    if text:
        # Drop lines that look like date stamps: "ACTUALIZAT" (Romanian for
        # "updated") markers, or any line with a digit in its first ten characters
        cleaned_lines = []
        for line in text.splitlines():
            if not ("ACTUALIZAT" in line or any(char.isdigit() for char in line[:10])):
                cleaned_lines.append(line)
        text = "\n".join(cleaned_lines).strip()

    # Region (specific to this site)
    region = "Germania"

    # Author
    author_element = soup.find("span", class_="author-name fn")
    author = author_element.get_text().strip() if author_element else None

    # Date of publication: on this site the date lives in a div identified
    # only by its inline style, so match that exact style string
    date_element = soup.find("div", style="border-top:1px solid #dedede;border-bottom:1px solid #dedede; font-size:13px;")
    date = None
    if date_element:
        raw_date = date_element.get_text(separator=" ").strip()
        date = raw_date.split(" ")[0] if raw_date else None

    # Category
    category_element = soup.find("a", rel="category tag")
    category = category_element.get_text().strip() if category_element else None

    # URL
    canonical_link = soup.find("link", rel="canonical")
    url = canonical_link["href"] if canonical_link else None

    return {
        "title": title,
        "text": text,
        "region": region,
        "author": author,
        "date": date,
        "category": category,
        "url": url,
    }
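
# A quick way to inspect a single article (the file name here is hypothetical):
#   metadata = extract_metadata_from_file("some_article.html")
#   print(metadata["title"], metadata["date"])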


def process_directory(input_dir, output_file):
    """Process all HTML files in a directory and save metadata to a JSON file."""
    articles_metadata = []
    for file_name in os.listdir(input_dir):
        if file_name.endswith(".html"):
            file_path = os.path.join(input_dir, file_name)
            metadata = extract_metadata_from_file(file_path)
            articles_metadata.append(metadata)
            print(f"Processed: {file_name}")

    # Save all metadata to a JSON file
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(articles_metadata, f, ensure_ascii=False, indent=4)
    print(f"Metadata saved to {output_file}")


if __name__ == "__main__":
    # Input directory containing HTML files
    INPUT_DIR = "germania_ziarulromanesc_de_articles"  # Replace with the actual directory path
    # Output JSON file
    OUTPUT_FILE = "germania.json"
    process_directory(INPUT_DIR, OUTPUT_FILE)