From 3a19533bf295edbc354bfaf7bb71ee5fa9f9b9d8 Mon Sep 17 00:00:00 2001
From: MarcDjandji
Date: Thu, 21 Nov 2024 10:39:10 +0100
Subject: [PATCH 1/3] Extend support to chinese, korean and spanish

Add 'zh', 'ko' and 'es' entries. They are inserted before the existing
'david' entry, which is left untouched.

---
Review notes (this section is not part of the commit message):
* zh/ko 'wrote_header': dropped the negative lookahead. It asserted the
  absence of the same text the capture group requires, so a header line
  could not match.
* ko 'wrote_header': `.*` before the suffix made lazy, as in 'zh'.
* es 'wrote_header': the lookahead equalled the capture for unquoted
  headers, and `\s escribió:` required two whitespace characters. It now
  uses a doubled-keyword guard and `\s?escribió:`.
* es 'from_header': removed a stray line-break-like character from the
  `[* ]` character class.
* The original hunk removed the 'david' entry; it is kept here.
* Indentation was rebuilt from a whitespace-flattened copy, and the blob
  ids on the `index` lines predate these edits. Verify context whitespace
  against the target tree and apply without a 3-way fallback.

 mailparser_reply/constants.py | 68 +++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/mailparser_reply/constants.py b/mailparser_reply/constants.py
index a45aafe..42bd69a 100644
--- a/mailparser_reply/constants.py
+++ b/mailparser_reply/constants.py
@@ -163,4 +163,72 @@
         ],
         'sent_from': 'Wysłano z'
     },
+    'zh': {
+        # No negative lookahead: the header opens with a date, not a fixed keyword that could be doubled.
+        'wrote_header': r'^('
+            + QUOTED_MATCH_INCLUDE
+            + r'\d{4}年\d{1,2}月\d{1,2}日.*?写道:)$',
+        'from_header': r'((?:(?:^|\n|\n'
+            + QUOTED_MATCH_INCLUDE
+            + r')[* ]*(?:发件人|发送时间|收件人|主题|抄送|组织):[ *]*(?:\s{,2}).*){2,}(?:\n.*){,1})',
+        'disclaimers': [
+            '免责声明:',
+            '注意:',
+            '重要信息:',
+        ],
+        'signatures': [
+            '此致,',
+            '敬礼,',
+            '谢谢,',
+            '谢谢您的关注,',
+            '祝好,',
+        ],
+        'sent_from': r'从我的.*发送',
+    },
+    'ko': {
+        # No negative lookahead: the header opens with a date, not a fixed keyword that could be doubled.
+        'wrote_header': r'^('
+            + QUOTED_MATCH_INCLUDE
+            + r'\d{4}년 \d{1,2}월 \d{1,2}일 .*?님이 작성하였습니다:)$',
+        'from_header': r'((?:(?:^|\n|\n'
+            + QUOTED_MATCH_INCLUDE
+            + r')[* ]*(?:보낸\s?사람|보낸\s?날짜|받는\s?사람|제목|참조):[ *]*(?:\s{,2}).*){2,}(?:\n.*){,1})',
+        'disclaimers': [
+            '주의:',
+            '면책 조항:',
+            '비밀정보:',
+        ],
+        'signatures': [
+            '감사합니다,',
+            '안부 전합니다,',
+            '좋은 하루 되세요,',
+            '고맙습니다,',
+            '감사합니다.',
+        ],
+        'sent_from': r'내 .*에서 보냄',
+    },
+    'es': {
+        # The lookahead only rejects a line that opens with a doubled "El" ahead of "escribió:".
+        'wrote_header': r'^(?!El[.\s]*El\s(.+?\s?.+?)\sescribió:)('
+            + QUOTED_MATCH_INCLUDE
+            + r'El\s(?:.+?\s?.+?)\s?escribió:)$',
+        'from_header': r'((?:(?:^|\n|\n'
+            + QUOTED_MATCH_INCLUDE
+            + r')[* ]*(?:De|Enviado|Para|Asunto|Fecha|CC|Organización):[ *]*(?:\s{,2}).*){2,}(?:\n.*){,1})',
+        'disclaimers': [
+            'Aviso:',
+            'Confidencialidad:',
+            'Advertencia:',
+            'Descargo de responsabilidad:',
+        ],
+        'signatures': [
+            'Saludos,',
+            'Atentamente,',
+            'Gracias,',
+            'Un saludo,',
+            'Cordialmente,',
+            'Muchas gracias,',
+        ],
+        'sent_from': r'Enviado desde mi.*',
+    },
     'david': {

From 10b56fd45b650aaf4973cec4499b5b4143dfc583 Mon Sep 17 00:00:00 2001
From: MarcDjandji
Date: Fri, 10 Jan 2025 11:06:01 +0100
Subject: [PATCH 2/3] Add cs-CZ to supported languages

---
Review notes (this section is not part of the commit message):
* The 'cs' entry itself is unchanged. The hunk position and trailing
  context changed because 'es' is now followed by the retained 'david'
  entry.

 mailparser_reply/constants.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/mailparser_reply/constants.py b/mailparser_reply/constants.py
index 42bd69a..0b3bc2f 100644
--- a/mailparser_reply/constants.py
+++ b/mailparser_reply/constants.py
@@ -231,4 +231,25 @@
         ],
         'sent_from': r'Enviado desde mi.*',
     },
+    'cs': {
+        'wrote_header': r'^(?!Dne[.\s]*Dne\s(.+?\s?.+?)\snapsal\(a\):)('
+            + QUOTED_MATCH_INCLUDE
+            + r'Dne\s(?:.+?\s?.+?)\s?napsal\(a\):)$',
+        'from_header': r'((?:(?:^|\n|\n'
+            + QUOTED_MATCH_INCLUDE
+            + r')[* ]*(?:Od|Odesláno|Komu|Předmět|Datum|Kopie):[ *]*(?:\s{,2}).*){2,}(?:\n.*){,1})',
+        'disclaimers': [
+            'Upozornění:',
+            'Důvěrné:',
+            'Varování:',
+        ],
+        'signatures': [
+            'S pozdravem,',
+            'S úctou,',
+            'Děkuji,',
+            'Děkujeme,',
+            'S přáním hezkého dne,',
+        ],
+        'sent_from': r'Odesláno z mého.*',
+    },
     'david': {

From 28647102ff70ec07ac05b631a4e5eba1ee633433 Mon Sep 17 00:00:00 2001
From: Marc Djandji <36219909+MarcDjandji@users.noreply.github.com>
Date: Tue, 14 Jan 2025 17:45:33 +0100
Subject: [PATCH 3/3] Update README.md

---
Review notes (this section is not part of the commit message):
* Removed a stray line-break-like character in front of the Spanish flag.
* The removed and re-added Polish lines presumably differ only in
  whitespace that the flattened copy does not show. Confirm against the
  README before applying.
* Trailing context is reduced to the two lines that could be reconstructed.

 README.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index f4ca6f5..cd4c3c9 100644
--- a/README.md
+++ b/README.md
@@ -19,6 +19,10 @@ Currently supported languages are:
 * German (`de`) 🇩🇪
 * Italian (`it`) 🇮🇹
 * Japanese (`ja`) 🇯🇵
-* Polish (`pl`) 🇵🇱
+* Polish (`pl`) 🇵🇱
+* Korean (`ko`) 🇰🇷
+* Chinese (`zh`) 🇨🇳
+* Spanish (`es`) 🇪🇸
+* Czech (`cs`) 🇨🇿
 
 🏳️‍🌈 **Adding more languages is quite easy!**