[IMP] mail: parsing emails with several html parts

If an email contains several text/html parts inside a multipart email, the previous code was only keeping the last content part.
The Content-Type: multipart/mixed allows several independent parts (RFC 1341, section 7.2.2), so an email with two html parts is technically valid.
With this patch, the two parts are concatenated. (opw 614755)

Modify the regex in append_content_to_html so that it strips only the html, body, head and DOCTYPE tags themselves and keeps the content between them instead of removing it.
e.g.: "123 <html> 456 </html> 789" used to be stripped to "123  789" while we expect "123 456 789"
This commit is contained in:
Martin Trigaux 2014-10-09 09:14:22 +02:00
parent 03df412faf
commit bd52298073
3 changed files with 64 additions and 2 deletions

View File

@ -801,9 +801,13 @@ class mail_thread(osv.AbstractModel):
body = tools.append_content_to_html(u'', body, preserve=True)
else:
alternative = False
mixed = False
html = u''
for part in message.walk():
if part.get_content_type() == 'multipart/alternative':
alternative = True
if part.get_content_type() == 'multipart/mixed':
mixed = True
if part.get_content_maintype() == 'multipart':
continue # skip container
# part.get_filename returns decoded value if able to decode, coded otherwise.
@ -830,8 +834,11 @@ class mail_thread(osv.AbstractModel):
encoding, errors='replace'), preserve=True)
# 3) text/html -> raw
elif part.get_content_type() == 'text/html':
# multipart/alternative has one text part and one html part, keep only the second
# multipart/mixed allows several html parts, append the html content
append_content = not alternative or (html and mixed)
html = tools.ustr(part.get_payload(decode=True), encoding, errors='replace')
if alternative:
if not append_content:
body = html
else:
body = tools.append_content_to_html(body, html, plaintext=False)

View File

@ -141,6 +141,53 @@ X-Attachment-Id: f_hkpb27k00
dGVzdAo=
--089e01536c4ed4d17204e49b8e96--"""
MAIL_MULTIPART_MIXED_TWO = """X-Original-To: raoul@grosbedon.fr
Delivered-To: raoul@grosbedon.fr
Received: by mail1.grosbedon.com (Postfix, from userid 10002)
id E8166BFACA; Fri, 23 Aug 2013 13:18:01 +0200 (CEST)
From: "Bruce Wayne" <bruce@wayneenterprises.com>
Content-Type: multipart/alternative;
boundary="Apple-Mail=_9331E12B-8BD2-4EC7-B53E-01F3FBEC9227"
Message-Id: <6BB1FAB2-2104-438E-9447-07AE2C8C4A92@sexample.com>
Mime-Version: 1.0 (Mac OS X Mail 7.3 \(1878.6\))
--Apple-Mail=_9331E12B-8BD2-4EC7-B53E-01F3FBEC9227
Content-Transfer-Encoding: 7bit
Content-Type: text/plain;
charset=us-ascii
First and second part
--Apple-Mail=_9331E12B-8BD2-4EC7-B53E-01F3FBEC9227
Content-Type: multipart/mixed;
boundary="Apple-Mail=_CA6C687E-6AA0-411E-B0FE-F0ABB4CFED1F"
--Apple-Mail=_CA6C687E-6AA0-411E-B0FE-F0ABB4CFED1F
Content-Transfer-Encoding: 7bit
Content-Type: text/html;
charset=us-ascii
<html><head></head><body>First part</body></html>
--Apple-Mail=_CA6C687E-6AA0-411E-B0FE-F0ABB4CFED1F
Content-Disposition: inline;
filename=thetruth.pdf
Content-Type: application/pdf;
name="thetruth.pdf"
Content-Transfer-Encoding: base64
SSBhbSB0aGUgQmF0TWFuCg==
--Apple-Mail=_CA6C687E-6AA0-411E-B0FE-F0ABB4CFED1F
Content-Transfer-Encoding: 7bit
Content-Type: text/html;
charset=us-ascii
<html><head></head><body>Second part</body></html>
--Apple-Mail=_CA6C687E-6AA0-411E-B0FE-F0ABB4CFED1F--
--Apple-Mail=_9331E12B-8BD2-4EC7-B53E-01F3FBEC9227--
"""
class TestMailgateway(TestMailBase):
@ -202,6 +249,14 @@ class TestMailgateway(TestMailBase):
self.assertIn('<div dir="ltr">Should create a multipart/mixed: from gmail, <b>bold</b>, with attachment.<br clear="all"><div><br></div>', res.get('body', ''),
'message_parse: html version should be in body after parsing multipart/mixed')
res = self.mail_thread.message_parse(cr, uid, MAIL_MULTIPART_MIXED_TWO)
self.assertNotIn('First and second part', res.get('body', ''),
'message_parse: text version should not be in body after parsing multipart/mixed')
self.assertIn('First part', res.get('body', ''),
'message_parse: first part of the html version should be in body after parsing multipart/mixed')
self.assertIn('Second part', res.get('body', ''),
'message_parse: second part of the html version should be in body after parsing multipart/mixed')
def test_10_message_process(self):
""" Testing incoming emails processing. """
cr, uid, user_raoul = self.cr, self.uid, self.user_raoul

View File

@ -282,7 +282,7 @@ def append_content_to_html(html, content, plaintext=True, preserve=False, contai
elif plaintext:
content = '\n%s\n' % plaintext2html(content, container_tag)
else:
content = re.sub(r'(?i)(</?html.*>|</?body.*>|<!\W*DOCTYPE.*>)', '', content)
content = re.sub(r'(?i)(</?(?:html|body|head|!\s*DOCTYPE)[^>]*>)', '', content)
content = u'\n%s\n' % ustr(content)
# Force all tags to lowercase
html = re.sub(r'(</?)\W*(\w+)([ >])',