-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathUKParsare.py
77 lines (62 loc) · 2.54 KB
/
UKParsare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import os
import json
from bs4 import BeautifulSoup
def extract_metadata_from_file(file_path):
    """Extract article metadata from a single saved HTML page.

    Args:
        file_path: Path to the HTML file to parse.

    Returns:
        dict with keys ``title``, ``text``, ``region``, ``author``,
        ``date``, ``category`` and ``url``; any field that cannot be
        located in the markup is ``None`` (``region`` is always ``"UK"``).
    """
    with open(file_path, "r", encoding="utf-8") as file:
        html_content = file.read()
    soup = BeautifulSoup(html_content, "html.parser")

    # Title: Open Graph meta tag. Use Tag.get() so a tag that exists but
    # lacks a "content" attribute yields None instead of raising KeyError.
    title = None
    title_element = soup.find("meta", property="og:title")
    if title_element and title_element.get("content"):
        title = title_element["content"].strip()

    # Main article body text.
    article_content = soup.find("div", class_="entry-content")
    text = article_content.get_text(separator="\n").strip() if article_content else None

    # Region is fixed for this scraper (site is UK-specific).
    region = "UK"

    # Author, marked up with schema.org itemprop="name".
    author = None
    author_element = soup.find("strong", itemprop="name")
    if author_element:
        author = author_element.get_text().strip()

    # Date of publication.
    date_element = soup.find("time", class_="entry-date")
    date = date_element.get_text().strip() if date_element else None

    # Category: nested span with itemprop="articleSection".
    category = None
    category_element = soup.find("span", class_="entry-categories")
    if category_element:
        actual_category = category_element.find("span", itemprop="articleSection")
        if actual_category:
            category = actual_category.get_text().strip()

    # Canonical URL; .get() avoids KeyError on a malformed <link> tag.
    canonical_link = soup.find("link", rel="canonical")
    url = canonical_link.get("href") if canonical_link else None

    return {
        "title": title,
        "text": text,
        "region": region,
        "author": author,
        "date": date,
        "category": category,
        "url": url,
    }
def process_directory(input_dir, output_file):
    """Parse every ``.html`` file in *input_dir* and write the collected
    metadata records to *output_file* as a pretty-printed JSON array."""
    records = []
    for entry in os.listdir(input_dir):
        # Skip anything that is not a saved HTML page.
        if not entry.endswith(".html"):
            continue
        records.append(extract_metadata_from_file(os.path.join(input_dir, entry)))
        print(f"Processed: {entry}")

    # Persist everything in one pass, keeping non-ASCII characters readable.
    with open(output_file, "w", encoding="utf-8") as sink:
        json.dump(records, sink, ensure_ascii=False, indent=4)
    print(f"Metadata saved to {output_file}")
if __name__ == "__main__":
# Input directory containing HTML files
INPUT_DIR = "UK_ziarulromanesc_articles" # Replace with the actual directory path
# Output JSON file
OUTPUT_FILE = "UK.json"
process_directory(INPUT_DIR, OUTPUT_FILE)