[IMP] tools: mail: improved append_content_to_html, including plaintext2html. Updated tests.

bzr revid: tde@openerp.com-20121109123521-e04g4zrhtv947l1r
This commit is contained in:
Thibault Delavallée 2012-11-09 13:35:21 +01:00
parent 0d027a3f8b
commit 62c419e66f
4 changed files with 161 additions and 138 deletions

View File

@ -3,7 +3,6 @@
<data noupdate="1">
<record id="partner_demo" model="res.partner">
<field name="name">Demo User</field>
<field name="email">demo@example.com</field>
<field name="company_id" ref="main_company"/>
<field name="customer" eval="False"/>
<field name="email">demo@example.com</field>

View File

@ -44,8 +44,8 @@ import openerp
import openerp.tools as tools
from openerp.tools.translate import _
from openerp.tools import float_round, float_repr
from openerp.tools import html_sanitize
import simplejson
from openerp.tools.mail import html_sanitize
from openerp import SUPERUSER_ID
_logger = logging.getLogger(__name__)

View File

@ -23,9 +23,9 @@
##############################################################################
import unittest2
from openerp.tools.mail import html_sanitize, html_email_clean, append_content_to_html, text2html
from openerp.tools import html_sanitize, html_email_clean, append_content_to_html, plaintext2html
test_case = """
HTML_SOURCE = """
<font size="2" style="color: rgb(31, 31, 31); font-family: monospace; font-variant: normal; line-height: normal; ">test1</font>
<div style="color: rgb(31, 31, 31); font-family: monospace; font-variant: normal; line-height: normal; font-size: 12px; font-style: normal; ">
<b>test2</b></div><div style="color: rgb(31, 31, 31); font-family: monospace; font-variant: normal; line-height: normal; font-size: 12px; ">
@ -43,88 +43,65 @@ test12</font></div><div><font color="#1f1f1f" face="monospace" size="2"><br></fo
<a href="javascript:alert('malicious code')">test link</a>
"""
GMAIL_REPLY_SAN = """<div>R&#233;ponse via thunderbird, classique.<br><br>
On 11/05/2012 10:51 AM, Raoul Tartopoils wrote:<br></div>
<blockquote>
<div>Plop !</div>
<ul><li>Vive les lapins rapides !<br></li>
<li>Nouille</li>
<li>Frites</li>
</ul><div><br></div>
<div>Clairement, hein ?</div>
-- <br>
Raoul Tartopoils<br></blockquote>
<br><br><pre>--
Raoul Tartopoils
</pre>"""
TEXT_MAIL1 = """I contact you about our meeting for tomorrow. Here is the schedule I propose:
9 AM: brainstorming about our new amazing business app</span></li>
9.45 AM: summary
10 AM: meeting with Fabien to present our app
Is everything ok for you ?
--
Administrator"""
GMAIL_REPLY2_SAN = """<div>Je r&#233;ponds, hop, via thunderbird. Mais
je vais r&#233;podnre aussi au milieu du thread.<br><br>
On 11/05/2012 10:53 AM, Raoul Tartopoils wrote:<br></div>
<blockquote>Reply rapide de gmail.</blockquote>
<br>
Jamais.<br><br><blockquote>
<div><br><br><div>2012/11/5 Thibault Delavall&#233;e <span>&lt;<a href="mailto:tde@openerp.com">tde@openerp.com</a>&gt;</span><br><blockquote>
<div>
<div>R&#233;ponse via thunderbird, classique.
<div>
<div><br><br>
On 11/05/2012 10:51 AM, Raoul Tartopoils wrote:<br></div>
</div>
</div>
<div>
<div>
<blockquote>
<div>Plop !</div>
<ul><li>Vive les lapins rapides !<br></li>
<li>Nouille</li>
</ul></blockquote>
</div>
</div>
</div>
</blockquote>
</div>
HTML_MAIL1 = """<div>
<font><span>I contact you about our meeting for tomorrow. Here is the schedule I propose:</span></font>
</div>
<div><ul>
<li><span>9 AM: brainstorming about our new amazing business app</span></li>
<li><span>9.45 AM: summary</span></li>
<li><span>10 AM: meeting with Fabien to present our app</span></li>
</ul></div>
<div><font><span>Is everything ok for you ?</span></font></div>"""
GMAIL_REPLY1_SAN = """Hello,<div><br></div><div>Ok for me. I am replying directly in gmail, without signature.</div><div><br></div><div>Kind regards,</div><div><br></div><div>Demo.<br><br><div>On Thu, Nov 8, 2012 at 5:29 PM, <span>&lt;<a href="mailto:dummy@example.com">dummy@example.com</a>&gt;</span> wrote:<br><blockquote><div>I contact you about our meeting for tomorrow. Here is the schedule I propose:</div><div><ul><li>9 AM: brainstorming about our new amazing business app&lt;/span&gt;&lt;/li&gt;</li>
<li>9.45 AM: summary</li><li>10 AM: meeting with Fabien to present our app</li></ul></div><div>Is everything ok for you ?</div>
<div><p>--<br>Administrator</p></div>
<div><p>Log in our portal at: <a href="http://localhost:8069#action=login&amp;db=mail_1&amp;login=demo">http://localhost:8069#action=login&amp;db=mail_1&amp;login=demo</a></p></div>
</blockquote></div><br></div>"""
THUNDERBIRD_16_REPLY1_SAN = """ <div>On 11/08/2012 05:29 PM,
<a href="mailto:dummy@example.com">dummy@example.com</a> wrote:<br></div>
<blockquote>
<div>I contact you about our meeting for tomorrow. Here is the
schedule I propose:</div>
<div>
<ul><li>9 AM: brainstorming about our new amazing business
app&lt;/span&gt;&lt;/li&gt;</li>
<li>9.45 AM: summary</li>
<li>10 AM: meeting with Fabien to present our app</li>
</ul></div>
<div>Is everything ok for you ?</div>
<div>
<p>--<br>
Administrator</p>
</div>
<div>
<p>Log in our portal at:
<a href="http://localhost:8069#action=login&amp;db=mail_1&amp;token=rHdWcUART5PhEnJRaXjH">http://localhost:8069#action=login&amp;db=mail_1&amp;token=rHdWcUART5PhEnJRaXjH</a></p>
</div>
</blockquote>
je rajotuerais bien pommes de terre dans la liste.<br><blockquote>
<div>
<div>
<blockquote>
<div>
<div>
<div>
<blockquote>
<ul><li>Frites</li>
</ul><div><br></div>
<div>Clairement, hein ?</div>
-- <br>
Raoul Tartopoils<br></blockquote>
<br><br></div>
</div>
<span><font>
<pre>--
Raoul Tartopoils
</pre>
</font></span></div>
</blockquote>
</div>
<br><br><div><br></div>
-- <br>
Raoul Tartopoils<br></div>
</blockquote>
<br><br><pre>--
Raoul Tartopoils
Ok for me. I am replying directly below your mail, using
Thunderbird, with a signature.<br><br>
Did you receive my email about my new laptop, by the way ?<br><br>
Raoul.<br><pre>--
Raoul Grosbedonn&#233;e
</pre>"""
TEXT_TPL = """Salut Raoul!
Le 28 oct. 2012 à 00:02, Raoul Grosbedon a écrit :
> C'est sûr que je suis intéressé (quote)!
Trouloulou pouet pouet.
Je ne vais quand même pas écrire de vrais mails, non mais ho.
Trouloulou pouet pouet. Je ne vais quand même pas écrire de vrais mails, non mais ho.
> 2012/10/27 Bert Tartopoils :
>> Diantre, me disè-je en envoyant un message similaire à Martine, mais comment vas-tu (quote)?
@ -138,7 +115,6 @@ Je ne vais quand même pas écrire de vrais mails, non mais ho.
>>
>
>
>
> --
> Raoul Grosbedon
@ -147,21 +123,8 @@ bert.tartopoils@miam.miam
"""
class TestAppendContentToHtml(unittest2.TestCase):
    """ Checks append_content_to_html against known input/output pairs """

    def test_append_to_html(self):
        # Plaintext content appended to a document closed by an
        # oddly-cased </HtMl> tag.
        plaintext_sample = (
            '<!DOCTYPE...><HTML encoding="blah">some <b>content</b></HtMl>',
            '--\nYours truly',
            True,
            '<!DOCTYPE...><html encoding="blah">some <b>content</b>\n<pre>--\nYours truly</pre>\n</html>',
        )
        # Html content that carries its own doctype / html / body tags,
        # appended to a document that has a </body>.
        html_sample = (
            '<html><body>some <b>content</b></body></html>',
            '<!DOCTYPE...>\n<html><body>\n<p>--</p>\n<p>Yours truly</p>\n</body>\n</html>',
            False,
            '<html><body>some <b>content</b>\n\n\n<p>--</p>\n<p>Yours truly</p>\n\n\n</body></html>',
        )
        for document, extra, is_plaintext, wanted in (plaintext_sample, html_sample):
            produced = append_content_to_html(document, extra, is_plaintext)
            self.assertEqual(produced, wanted, 'append_content_to_html is broken')
class TestSanitizer(unittest2.TestCase):
""" Test the html sanitizer """
# TDE note: could be improved by actually checking the output
def test_simple(self):
@ -173,33 +136,67 @@ class TestSanitizer(unittest2.TestCase):
self.assertEqual(x, html_sanitize(x))
def test_no_exception(self):
html_sanitize(test_case)
html_sanitize(HTML_SOURCE)
def test_unicode(self):
html_sanitize("Merci à l'intérêt pour notre produit.nous vous contacterons bientôt. Merci")
class TestCleaner(unittest2.TestCase):
    """ Test the email cleaner function that filters the content of incoming emails """

    def test_gmail(self):
        # Test1: reply whose quoted mail sits in a blockquote.
        # assertNotIn takes (member, container): the searched text goes first,
        # otherwise the check only verifies that the whole html is not a
        # substring of the short literal, which is trivially true.
        new_html = html_email_clean(GMAIL_REPLY_SAN)
        self.assertNotIn('blockquote', new_html, 'html_email_cleaner did not remove a blockquote')
        self.assertNotIn('Vive les lapins rapides !', new_html, 'html_email_cleaner did not remove the quoted content')
        self.assertNotIn('Bert Tartopoils', new_html, 'html_email_cleaner left unexpected content')

    def test_html_email_clean(self):
        # Test1: reply through gmail: quote in blockquote, signature --\nAdministrator
        new_html = html_email_clean(GMAIL_REPLY1_SAN)
        self.assertNotIn('blockquote', new_html, 'html_email_cleaner did not remove a blockquote')
        self.assertNotIn('I contact you about our meeting', new_html, 'html_email_cleaner did not remove the quoted content')
        self.assertNotIn('Administrator', new_html, 'html_email_cleaner did not erase the signature')
        self.assertIn('Ok for me', new_html, 'html_email_cleaner erased too much content')

        # Test2: reply through Thunderbird 16.0.2
        new_html = html_email_clean(THUNDERBIRD_16_REPLY1_SAN)
        self.assertNotIn('blockquote', new_html, 'html_email_cleaner did not remove a blockquote')
        self.assertNotIn('I contact you about our meeting', new_html, 'html_email_cleaner did not remove the quoted content')
        self.assertNotIn('Administrator', new_html, 'html_email_cleaner did not erase the signature')
        self.assertNotIn('Grosbedonn', new_html, 'html_email_cleaner did not erase the signature')
        self.assertIn('Ok for me', new_html, 'html_email_cleaner erased too much content')

        # Test3: text email
        new_html = html_email_clean(TEXT_MAIL1)
        self.assertIn('I contact you about our meeting', new_html, 'html_email_cleaner wrongly removed the quoted content')
        self.assertNotIn('Administrator', new_html, 'html_email_cleaner did not erase the signature')

        # Test4: more complex text email
        new_html = html_email_clean(TEXT_TPL)
        self.assertNotIn('quote', new_html, 'html_email_cleaner did not remove correctly plaintext quotes')
class TestText2Html(unittest2.TestCase):
class TestAppendContentToHtml(unittest2.TestCase):
""" Test some of our generic utility functions about html """
def test_text2html(self):
def test_plaintext2html(self):
    """ Compare plaintext2html output with the expected html, with and
    without a container tag. """
    # case layout: (plaintext content, container_tag, expected html)
    cases = [
        ("First \nSecond \nThird\n \nParagraph\n\r--\nSignature paragraph", 'div',
         "<div><p>First <br/>Second <br/>Third</p><p>Paragraph</p><p>--<br/>Signature paragraph</p></div>"),
        ("First<p>It should be escaped</p>\nSignature", False,
         "<p>First&lt;p&gt;It should be escaped&lt;/p&gt;<br/>Signature</p>")
    ]
    for content, container_tag, expected in cases:
        # Only the renamed helper is exercised; the former text2html call
        # was a dead store overwritten on the very next line.
        html = plaintext2html(content, container_tag)
        self.assertEqual(html, expected, 'plaintext2html is broken')
def test_append_to_html(self):
    """ Run append_content_to_html on known samples and compare the output
    with the expected html. """
    doctype_document = '<!DOCTYPE...><HTML encoding="blah">some <b>content</b></HtMl>'
    signature = '--\nYours truly'
    # sample layout:
    # (html, content, plaintext, preserve, container_tag, expected)
    samples = (
        # plaintext kept as-is inside a <pre>
        (doctype_document, signature, True, True, False,
         '<!DOCTYPE...><html encoding="blah">some <b>content</b>\n<pre>--\nYours truly</pre>\n</html>'),
        # plaintext converted to html paragraphs
        (doctype_document, signature, True, False, False,
         '<!DOCTYPE...><html encoding="blah">some <b>content</b>\n<p>--<br/>Yours truly</p>\n</html>'),
        # html content: its own doctype / html / body tags are stripped
        ('<html><body>some <b>content</b></body></html>',
         '<!DOCTYPE...>\n<html><body>\n<p>--</p>\n<p>Yours truly</p>\n</body>\n</html>',
         False, False, False,
         '<html><body>some <b>content</b>\n\n\n<p>--</p>\n<p>Yours truly</p>\n\n\n</body></html>'),
    )
    for sample in samples:
        call_args, wanted = sample[:-1], sample[-1]
        produced = append_content_to_html(*call_args)
        self.assertEqual(produced, wanted, 'append_content_to_html is broken')
if __name__ == '__main__':
unittest2.main()

View File

@ -121,6 +121,7 @@ def html_email_clean(html):
be present in the html string. This method therefore takes as input
html code coming from a sanitized source, like fields.html.
"""
html = ustr(html)
modified_html = ''
# 1. <br[ /]> -> \n, because otherwise the tree is obfuscated
@ -131,6 +132,7 @@ def html_email_clean(html):
idx = item.end()
modified_html += html[idx:]
html = modified_html
# TDE note: seems to have lots of <div><br></div> in emails... needs to be checks, could be cleaned
# 2. form a tree, handle (currently ?) pure-text by enclosing them in a pre
root = lxml.html.fromstring(html)
@ -138,9 +140,28 @@ def html_email_clean(html):
html = '<div>%s</div>' % html
root = lxml.html.fromstring(html)
# 2.5 remove quoted text in nodes
quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
for node in root.getiterator():
if not node.text:
continue
idx = 0
text = ''
for item in re.finditer(quote_tags, node.text):
print item
text += node.text[idx:item.start()]
idx = item.end()
text += node.text[idx:]
node.text = text
# 3. remove blockquotes
quotes = [el for el in root.getiterator(tag='blockquote')]
for node in quotes:
# copy the node tail into parent text
if node.tail:
parent = node.getparent()
parent.text = parent.text or '' + node.tail
# remove the node
node.getparent().remove(node)
# 4. strip signatures
@ -187,9 +208,7 @@ def html2plaintext(html, body_id=None, encoding='utf-8'):
## download here: http://www.peterbe.com/plog/html2plaintext
html = ustr(html)
from lxml.etree import tostring, fromstring, HTMLParser
tree = fromstring(html, parser=HTMLParser())
tree = etree.fromstring(html, parser=etree.HTMLParser())
if body_id is not None:
source = tree.xpath('//*[@id=%s]' % (body_id,))
@ -208,7 +227,7 @@ def html2plaintext(html, body_id=None, encoding='utf-8'):
link.text = '%s [%s]' % (link.text, i)
url_index.append(url)
html = ustr(tostring(tree, encoding=encoding))
html = ustr(etree.tostring(tree, encoding=encoding))
html = html.replace('<strong>', '*').replace('</strong>', '*')
html = html.replace('<b>', '*').replace('</b>', '*')
@ -233,7 +252,7 @@ def html2plaintext(html, body_id=None, encoding='utf-8'):
return html
def text2html(text, container_tag='div'):
def plaintext2html(text, container_tag=False):
""" Convert plaintext into html. Content of the text is escaped to manage
html entities, using cgi.escape().
- all \n,\r are replaced by <br />
@ -243,7 +262,7 @@ def text2html(text, container_tag='div'):
:param string container_tag: container of the html; by default the
content is embedded into a <div>
"""
text = cgi.escape(text)
text = cgi.escape(ustr(text))
# 1. replace \n and \r
text = text.replace('\n', '<br/>')
@ -261,7 +280,45 @@ def text2html(text, container_tag='div'):
# 4. container
if container_tag:
final = '<%s>%s</%s>' % (container_tag, final, container_tag)
return final
return ustr(final)
def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=False):
    """ Append extra content at the end of an HTML snippet, trying
        to locate the end of the HTML document (</body>, </html>, or
        EOF), and converting the provided content in html unless ``plaintext``
        is False.

        Content conversion can be done in two ways:

         - wrapping it into a pre (preserve=True)
         - use plaintext2html (preserve=False, using container_tag to wrap the
           whole content)

        A side-effect of this method is to coerce all HTML tags to
        lowercase in ``html``, and strip enclosing <html> or <body> tags in
        content if ``plaintext`` is False.

        :param str html: html tagsoup (doesn't have to be XHTML)
        :param str content: extra content to append
        :param bool plaintext: whether content is plaintext and has to be
            converted (either wrapped in a <pre> or passed to plaintext2html);
            if False, content is considered as html and only its
            <html>, <body> and DOCTYPE tags are removed
        :param bool preserve: if content is plaintext, wrap it into a <pre>
            instead of converting it into html
        :param container_tag: only used when plaintext is True and preserve
            is False; forwarded to plaintext2html as container tag
        :return: ``html`` with the converted content inserted before
            </body>, or before </html>, or at the end when none is found
    """
    html = ustr(html)
    if plaintext and preserve:
        content = u'\n<pre>%s</pre>\n' % ustr(content)
    elif plaintext:
        content = u'\n%s\n' % plaintext2html(content, container_tag)
    else:
        # [^>]* stops at the end of the matched tag. A greedy .* would run
        # up to the last '>' of the line and delete real content sharing a
        # line with <html>/<body>/DOCTYPE, and could not match a tag whose
        # attributes span several lines.
        content = re.sub(r'(?i)(</?html[^>]*>|</?body[^>]*>|<!\W*DOCTYPE[^>]*>)', '', content)
        content = u'\n%s\n' % ustr(content)
    # Force all tags to lowercase
    html = re.sub(r'(</?)\W*(\w+)([ >])',
        lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
    # Insert before </body> if present, else before </html>, else at EOF
    insert_location = html.find('</body>')
    if insert_location == -1:
        insert_location = html.find('</html>')
    if insert_location == -1:
        return '%s%s' % (html, content)
    return '%s%s%s' % (html[:insert_location], content, html[insert_location:])
#----------------------------------------------------------
# Emails
@ -339,33 +396,3 @@ def email_split(text):
if not text:
return []
return re.findall(r'([^ ,<@]+@[^> ,]+)', text)
def append_content_to_html(html, content, plaintext=True):
    """Append extra content at the end of an HTML snippet, trying
       to locate the end of the HTML document (</body>, </html>, or
       EOF), and wrapping the provided content in a <pre/> block
       unless ``plaintext`` is False. A side-effect of this
       method is to coerce all HTML tags to lowercase in ``html``,
       and strip enclosing <html> or <body> tags in content if
       ``plaintext`` is False.

       :param str html: html tagsoup (doesn't have to be XHTML)
       :param str content: extra content to append
       :param bool plaintext: whether content is plaintext and should
           be wrapped in a <pre/> tag.
       :return: ``html`` with the content inserted before </body>, or
           before </html>, or at the end when none is found
    """
    html = ustr(html)
    if plaintext:
        content = u'\n<pre>%s</pre>\n' % ustr(content)
    else:
        # [^>]* stops at the end of the matched tag. A greedy .* would run
        # up to the last '>' of the line and delete real content sharing a
        # line with <html>/<body>/DOCTYPE, and could not match a tag whose
        # attributes span several lines.
        content = re.sub(r'(?i)(</?html[^>]*>|</?body[^>]*>|<!\W*DOCTYPE[^>]*>)', '', content)
        content = u'\n%s\n' % ustr(content)
    # Force all tags to lowercase
    html = re.sub(r'(</?)\W*(\w+)([ >])',
        lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
    # Insert before </body> if present, else before </html>, else at EOF
    insert_location = html.find('</body>')
    if insert_location == -1:
        insert_location = html.find('</html>')
    if insert_location == -1:
        return '%s%s' % (html, content)
    return '%s%s%s' % (html[:insert_location], content, html[insert_location:])