[IMP] mail: parsing emails with several html parts
If an email contains several text/html parts inside a multipart message, the previous code kept only the last HTML part. Content-Type: multipart/mixed allows several independent parts (RFC 1341, section 7.2.2), so having two HTML parts is technically valid. With this patch, the HTML parts are concatenated. (opw 614755) Also modify the append_content_to_html regex so that it strips only the html tags and keeps the content between them instead of removing it, e.g. "123 <html> 456 </html> 789" used to be stripped to "123 789", while we expect "123 456 789".
This commit is contained in:
parent
03df412faf
commit
bd52298073
|
@ -801,9 +801,13 @@ class mail_thread(osv.AbstractModel):
|
|||
body = tools.append_content_to_html(u'', body, preserve=True)
|
||||
else:
|
||||
alternative = False
|
||||
mixed = False
|
||||
html = u''
|
||||
for part in message.walk():
|
||||
if part.get_content_type() == 'multipart/alternative':
|
||||
alternative = True
|
||||
if part.get_content_type() == 'multipart/mixed':
|
||||
mixed = True
|
||||
if part.get_content_maintype() == 'multipart':
|
||||
continue # skip container
|
||||
# part.get_filename returns decoded value if able to decode, coded otherwise.
|
||||
|
@ -830,8 +834,11 @@ class mail_thread(osv.AbstractModel):
|
|||
encoding, errors='replace'), preserve=True)
|
||||
# 3) text/html -> raw
|
||||
elif part.get_content_type() == 'text/html':
|
||||
# multipart/alternative has one text part and one html part; keep only the second
|
||||
# mixed allows several html parts, append html content
|
||||
append_content = not alternative or (html and mixed)
|
||||
html = tools.ustr(part.get_payload(decode=True), encoding, errors='replace')
|
||||
if alternative:
|
||||
if not append_content:
|
||||
body = html
|
||||
else:
|
||||
body = tools.append_content_to_html(body, html, plaintext=False)
|
||||
|
|
|
@ -141,6 +141,53 @@ X-Attachment-Id: f_hkpb27k00
|
|||
dGVzdAo=
|
||||
--089e01536c4ed4d17204e49b8e96--"""
|
||||
|
||||
MAIL_MULTIPART_MIXED_TWO = """X-Original-To: raoul@grosbedon.fr
|
||||
Delivered-To: raoul@grosbedon.fr
|
||||
Received: by mail1.grosbedon.com (Postfix, from userid 10002)
|
||||
id E8166BFACA; Fri, 23 Aug 2013 13:18:01 +0200 (CEST)
|
||||
From: "Bruce Wayne" <bruce@wayneenterprises.com>
|
||||
Content-Type: multipart/alternative;
|
||||
boundary="Apple-Mail=_9331E12B-8BD2-4EC7-B53E-01F3FBEC9227"
|
||||
Message-Id: <6BB1FAB2-2104-438E-9447-07AE2C8C4A92@sexample.com>
|
||||
Mime-Version: 1.0 (Mac OS X Mail 7.3 \(1878.6\))
|
||||
|
||||
--Apple-Mail=_9331E12B-8BD2-4EC7-B53E-01F3FBEC9227
|
||||
Content-Transfer-Encoding: 7bit
|
||||
Content-Type: text/plain;
|
||||
charset=us-ascii
|
||||
|
||||
First and second part
|
||||
|
||||
--Apple-Mail=_9331E12B-8BD2-4EC7-B53E-01F3FBEC9227
|
||||
Content-Type: multipart/mixed;
|
||||
boundary="Apple-Mail=_CA6C687E-6AA0-411E-B0FE-F0ABB4CFED1F"
|
||||
|
||||
--Apple-Mail=_CA6C687E-6AA0-411E-B0FE-F0ABB4CFED1F
|
||||
Content-Transfer-Encoding: 7bit
|
||||
Content-Type: text/html;
|
||||
charset=us-ascii
|
||||
|
||||
<html><head></head><body>First part</body></html>
|
||||
|
||||
--Apple-Mail=_CA6C687E-6AA0-411E-B0FE-F0ABB4CFED1F
|
||||
Content-Disposition: inline;
|
||||
filename=thetruth.pdf
|
||||
Content-Type: application/pdf;
|
||||
name="thetruth.pdf"
|
||||
Content-Transfer-Encoding: base64
|
||||
|
||||
SSBhbSB0aGUgQmF0TWFuCg==
|
||||
|
||||
--Apple-Mail=_CA6C687E-6AA0-411E-B0FE-F0ABB4CFED1F
|
||||
Content-Transfer-Encoding: 7bit
|
||||
Content-Type: text/html;
|
||||
charset=us-ascii
|
||||
|
||||
<html><head></head><body>Second part</body></html>
|
||||
--Apple-Mail=_CA6C687E-6AA0-411E-B0FE-F0ABB4CFED1F--
|
||||
|
||||
--Apple-Mail=_9331E12B-8BD2-4EC7-B53E-01F3FBEC9227--
|
||||
"""
|
||||
|
||||
class TestMailgateway(TestMailBase):
|
||||
|
||||
|
@ -202,6 +249,14 @@ class TestMailgateway(TestMailBase):
|
|||
self.assertIn('<div dir="ltr">Should create a multipart/mixed: from gmail, <b>bold</b>, with attachment.<br clear="all"><div><br></div>', res.get('body', ''),
|
||||
'message_parse: html version should be in body after parsing multipart/mixed')
|
||||
|
||||
res = self.mail_thread.message_parse(cr, uid, MAIL_MULTIPART_MIXED_TWO)
|
||||
self.assertNotIn('First and second part', res.get('body', ''),
|
||||
'message_parse: text version should not be in body after parsing multipart/mixed')
|
||||
self.assertIn('First part', res.get('body', ''),
|
||||
'message_parse: first part of the html version should be in body after parsing multipart/mixed')
|
||||
self.assertIn('Second part', res.get('body', ''),
|
||||
'message_parse: second part of the html version should be in body after parsing multipart/mixed')
|
||||
|
||||
def test_10_message_process(self):
|
||||
""" Testing incoming emails processing. """
|
||||
cr, uid, user_raoul = self.cr, self.uid, self.user_raoul
|
||||
|
|
|
@ -282,7 +282,7 @@ def append_content_to_html(html, content, plaintext=True, preserve=False, contai
|
|||
elif plaintext:
|
||||
content = '\n%s\n' % plaintext2html(content, container_tag)
|
||||
else:
|
||||
content = re.sub(r'(?i)(</?html.*>|</?body.*>|<!\W*DOCTYPE.*>)', '', content)
|
||||
content = re.sub(r'(?i)(</?(?:html|body|head|!\s*DOCTYPE)[^>]*>)', '', content)
|
||||
content = u'\n%s\n' % ustr(content)
|
||||
# Force all tags to lowercase
|
||||
html = re.sub(r'(</?)\W*(\w+)([ >])',
|
||||
|
|
Loading…
Reference in New Issue