# vocalize_json_v3_GOOD.py
# (Repository page listing: 71 lines, 59 loc, 3.01 KB.)
import json
import subprocess
import tempfile
import os
import re
# Matches a maximal run of characters in the Unicode Arabic block (U+0600-U+06FF).
# Whitespace is outside this range, so a run never spans a space.
_ARABIC_RUN = re.compile(r'[\u0600-\u06FF]+')


def _run_farasa(arabic_runs, jar_path):
    """Diacritize *arabic_runs* with the Farasa jar and return its output lines.

    Each run is written on its own line of a temporary input file; the jar is
    invoked as ``java -jar <jar_path> -i <input> -o <output>``. Both files live
    in a private temporary directory that is removed on exit, even when the
    subprocess fails.

    Returns:
        The lines of the tool's output file, each stripped of surrounding
        whitespace, in file order.

    Raises:
        subprocess.CalledProcessError: if the Java process exits non-zero.
        OSError: if the ``java`` executable cannot be launched.
    """
    with tempfile.TemporaryDirectory() as work_dir:
        input_path = os.path.join(work_dir, 'farasa_input.txt')
        output_path = os.path.join(work_dir, 'farasa_output.txt')

        with open(input_path, 'w', encoding='utf-8') as farasa_input:
            farasa_input.writelines(run + '\n' for run in arabic_runs)

        # Pre-create an empty output file (as the previous implementation did)
        # and close it straight away so no handle is held while Java writes.
        with open(output_path, 'w', encoding='utf-8'):
            pass

        subprocess.run(
            ["java", "-jar", jar_path, "-i", input_path, "-o", output_path],
            check=True
        )

        with open(output_path, 'r', encoding='utf-8') as farasa_output:
            return [line.strip() for line in farasa_output]


def vocalize_arabic_in_json(input_json_file, output_json_file, jar_path):
    """Diacritize the Arabic text in a flat JSON file using the Farasa jar.

    The input file must contain a JSON object. Only its top-level string
    values are processed; other values (numbers, lists, nested objects, ...)
    are copied through unchanged. Inside each string, every run of characters
    from the Arabic Unicode block is replaced by the corresponding line of the
    tool's output, while all other characters are left where they are.

    Runs are matched to output lines purely by position. If the tool returns
    fewer lines than there were runs, the remaining runs keep their original
    text.

    If the file contains no Arabic text at all, the Java tool is not started
    and the data is written out unchanged.

    Args:
        input_json_file: Path of the UTF-8 JSON file to read.
        output_json_file: Path of the JSON file to write (UTF-8, indent 4,
            non-ASCII characters kept as-is).
        jar_path: Path of the Farasa diacritizer jar passed to ``java -jar``.

    Raises:
        subprocess.CalledProcessError: if the Java process exits non-zero.
        OSError: if an input/output file or the ``java`` executable cannot be
            opened or launched.
    """
    with open(input_json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Collect every Arabic run in document order; this order is what ties a
    # run to its line in the tool's output.
    arabic_runs = []
    for value in data.values():
        if isinstance(value, str):
            arabic_runs.extend(_ARABIC_RUN.findall(value))

    if arabic_runs:
        replacements = iter(_run_farasa(arabic_runs, jar_path))

        def replace_arabic(match):
            # Consume the next diacritized line; fall back to the original
            # text once the tool's output is exhausted.
            return next(replacements, match.group(0))

        # Re-assigning existing keys does not resize the dict, so iterating
        # over items() while updating values is safe.
        for key, value in data.items():
            if isinstance(value, str):
                data[key] = _ARABIC_RUN.sub(replace_arabic, value)

    with open(output_json_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
if __name__ == "__main__":
    # Script entry point: diacritize ar_sa.json (read from the current working
    # directory) into diacritized_ar_sa.json using a locally installed Farasa
    # diacritizer jar.
    farasa_jar = "/home/reng/Downloads/QCRI/Dev/ArabicNLP/Farasa/FarasaDiacritizeJar/dist/FarasaDiacritizeJar.jar"
    vocalize_arabic_in_json(
        input_json_file="ar_sa.json",
        output_json_file="diacritized_ar_sa.json",
        jar_path=farasa_jar,
    )