Skip to content

Commit 3bca724

Browse files
feat: update standardize_quote()
1 parent ef1c85e commit 3bca724

File tree

1 file changed

+17
-3
lines changed

1 file changed

+17
-3
lines changed

unstructured/metrics/text_extraction.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,8 @@ def standardize_quotes(text: str) -> str:
192192
"〝": "U+301D", # REVERSED DOUBLE PRIME QUOTATION MARK
193193
"〞": "U+301E", # DOUBLE PRIME QUOTATION MARK
194194
"〟": "U+301F", # LOW DOUBLE PRIME QUOTATION MARK
195+
"＂": "U+FF02", # FULLWIDTH QUOTATION MARK
196+
",,": "U+275E", # LOW HEAVY DOUBLE COMMA ORNAMENT
195197
}
196198

197199
# Single Quotes Dictionary
@@ -213,7 +215,6 @@ def standardize_quotes(text: str) -> str:
213215
"﹂": "U+FE42", # PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
214216
"﹃": "U+FE43", # PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
215217
"﹄": "U+FE44", # PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
216-
"＂": "U+FF02", # FULLWIDTH QUOTATION MARK
217218
"＇": "U+FF07", # FULLWIDTH APOSTROPHE
218219
"「": "U+FF62", # HALFWIDTH LEFT CORNER BRACKET
219220
"」": "U+FF63", # HALFWIDTH RIGHT CORNER BRACKET
@@ -225,14 +226,27 @@ def standardize_quotes(text: str) -> str:
225226
# Apply double quote replacements
226227
# Apply double quote replacements
227228
for unicode_val in double_quotes.values():
228-
unicode_char = chr(int(unicode_val.replace("U+", ""), 16))
229+
unicode_char = unicode_to_char(unicode_val)
229230
if unicode_char in text:
230231
text = text.replace(unicode_char, double_quote_standard)
231232

232233
# Apply single quote replacements
233234
for unicode_val in single_quotes.values():
234-
unicode_char = chr(int(unicode_val.replace("U+", ""), 16))
235+
unicode_char = unicode_to_char(unicode_val)
235236
if unicode_char in text:
236237
text = text.replace(unicode_char, single_quote_standard)
237238

238239
return text
240+
241+
242+
def unicode_to_char(unicode_val: str) -> str:
    """Convert a code point written in "U+XXXX" notation to its character.

    The "U+" marker is matched case-insensitively, so "u+201c" is accepted
    as well as "U+201C". A bare hexadecimal string without the marker
    (for example "201C") is also accepted.

    Args:
        unicode_val: The code point as hexadecimal text, optionally carrying
            a "U+" marker.

    Returns:
        The one-character string for that code point.

    Raises:
        ValueError: If the text left after removing the marker is not valid
            hexadecimal, or the code point is outside the Unicode range.
    """
    # Upper-casing first lets a single replace() drop both "U+" and "u+";
    # int() parses hexadecimal digits case-insensitively, so the numeric
    # value is unaffected.
    hex_digits = unicode_val.upper().replace("U+", "")
    return chr(int(hex_digits, 16))

0 commit comments

Comments
 (0)