From 38d2ed11e07f0095eb8985547bd8481c7a82a591 Mon Sep 17 00:00:00 2001 From: Adam Dobrawy Date: Wed, 3 Jan 2018 03:36:09 +0100 Subject: [PATCH] Ensure UTF-8 surrogates escaped on save - fix #159 --- .gitignore | 3 +- django_mailbox/models.py | 9 ++-- .../messages/message_with_utf8_surrogates.eml | 46 +++++++++++++++++++ django_mailbox/tests/test_process_email.py | 36 ++++++++++++++- 4 files changed, 89 insertions(+), 5 deletions(-) create mode 100644 django_mailbox/tests/messages/message_with_utf8_surrogates.eml diff --git a/.gitignore b/.gitignore index 9b355bf6..2fca30e6 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,5 @@ include/* dummy_project/* .cache/ .tox/ -messages +/messages + diff --git a/django_mailbox/models.py b/django_mailbox/models.py index 6ca9884e..6e183dfb 100644 --- a/django_mailbox/models.py +++ b/django_mailbox/models.py @@ -346,7 +346,6 @@ def _get_dehydrated_message(self, msg, record): def _process_message(self, message): msg = Message() settings = utils.get_settings() - if settings['store_original_message']: self._process_save_original_message(message, msg) msg.mailbox = self @@ -386,10 +385,14 @@ def _process_message(self, message): def _process_save_original_message(self, message, msg): settings = utils.get_settings() + if six.PY3: + content = message.as_string().encode('ascii', 'surrogateescape') + else: + content = message.as_string() if settings['compress_original_message']: with NamedTemporaryFile(suffix=".eml.gz") as fp_tmp: with gzip.GzipFile(fileobj=fp_tmp, mode="w") as fp: - fp.write(message.as_string().encode('utf-8')) + fp.write(content) msg.eml.save( "%s.eml.gz" % (uuid.uuid4(), ), File(fp_tmp), @@ -399,7 +402,7 @@ def _process_save_original_message(self, message, msg): else: msg.eml.save( '%s.eml' % uuid.uuid4(), - ContentFile(message.as_string()), + ContentFile(content), save=False ) diff --git a/django_mailbox/tests/messages/message_with_utf8_surrogates.eml 
b/django_mailbox/tests/messages/message_with_utf8_surrogates.eml new file mode 100644 index 00000000..8f68cb39 --- /dev/null +++ b/django_mailbox/tests/messages/message_with_utf8_surrogates.eml @@ -0,0 +1,46 @@ +Return-path: +Envelope-to: sprawa-1418@porady.REDACTED +Delivery-date: Sat, 16 Dec 2017 16:22:42 +0100 +Received: from mx1.wp.pl ([212.77.101.6]) + by s50.hekko.net.pl with esmtps (TLSv1.2:ECDHE-RSA-AES256-GCM-SHA384:256) + (Exim 4.89) (envelope-from ) id 1eQEII-0005Fu-Cs + for sprawa-1418@porady.REDACTED; Sat, 16 Dec 2017 16:22:42 +0100 +Received: (wp-smtpd smtp.wp.pl 33592 invoked from network); + 16 Dec 2017 16:22:11 +0100 +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=wp.pl; s=1024a; + t=1513437731; bh=6Ox0FVna7vMxBu5CbF0s6HvpkNalENwSSgDxNJ/Rsdc=; + h=From:To:Subject; + b=QnrdNDBDNLuENN9QS0Pvn85/bDE1Fc6jJrvKUdnApFrykwzbHxXxCG4qX7g3sS3Qj + xxHGNf8UXZh3zyCln2EZpUD03LgkppMpTbv3tLKA4HSnaT7txr6AWHq2y8A/YQo7EY + 2806CYWtFCKYoVolzDN9lctM2nEoZpD5jOVZqYsM= +Received: from public-gprs394416.centertel.pl (HELO REDACTED) + (REDACTED@wp.pl@[37.47.171.241]) (envelope-sender ) + by smtp.wp.pl (WP-SMTPD) with SMTP + for ; 16 Dec 2017 16:22:11 +0100 +Message-ID: +From: +To: +Subject: =?windows-1250?Q?Do_czego_te=BF_s=B9_zdolni_Polscy_s=EAdziowie_..._?= +Date: Sat, 16 Dec 2017 16:21:04 +0100 +MIME-Version: 1.0 +Content-Type: multipart/mixed; + boundary="----=_NextPart_000_0018_01D37689.E058CEF0" +X-Spam-Status: No, message_size=7331926 larger than 200K + +To jest wielocz�ciowa wiadomo�� w formacie MIME. 
+ +------=_NextPart_000_0018_01D37689.E058CEF0 +Content-Type: multipart/alternative; + boundary="----=_NextPart_001_0019_01D37689.E058CEF0" + + +------=_NextPart_001_0019_01D37689.E058CEF0 +Content-Type: text/plain; + charset="windows-1250" +Content-Transfer-Encoding: quoted-printable + +REDACTED + +------=_NextPart_001_0019_01D37689.E058CEF0-- + +------=_NextPart_000_0018_01D37689.E058CEF0-- diff --git a/django_mailbox/tests/test_process_email.py b/django_mailbox/tests/test_process_email.py index a128e2c6..b71bc789 100644 --- a/django_mailbox/tests/test_process_email.py +++ b/django_mailbox/tests/test_process_email.py @@ -147,6 +147,40 @@ def test_message_with_utf8_attachment_header(self): u'odpowied\u017a Burmistrza.jpg' ) + def test_message_with_utf8_surrogates(self): + """Ensure that we properly handle UTF-8 surrogates + + The problem was observed in Python 3.5. This guards against a regression of #159. + """ + + email_object = self._get_email_object( + 'message_with_utf8_surrogates.eml', + ) + mailbox = Mailbox.objects.create() + default_settings = utils.get_settings() + with mock.patch('django_mailbox.utils.get_settings') as get_settings: + altered = copy.deepcopy(default_settings) + altered['store_original_message'] = True + + get_settings.return_value = altered + + # Before the fix for #159, this call raised UnicodeEncodeError. + msg = mailbox.process_incoming_message(email_object) + + self.assertEqual( + msg.subject, + u'Do czego te\u017c s\u0105 zdolni Polscy s\u0119dziowie ... 
' + ) + + self.assertEqual( + msg.attachments.count(), + 0 + ) + + with open(msg.eml.name, 'rb') as f: + self.assertEqual(f.read(), + self._get_email_as_text('message_with_utf8_surrogates.eml')) + def test_message_get_text_body(self): message = self._get_email_object('multipart_text.eml') @@ -468,4 +502,4 @@ def test_message_compressed(self): with gzip.open(msg.eml.name, 'rb') as f: self.assertEqual(f.read(), - self._get_email_as_text('generic_message.eml')) \ No newline at end of file + self._get_email_as_text('generic_message.eml'))