[IMP] tools: mail: improved append_content_to_html, including plaintext2html. Updated tests.

bzr revid: tde@openerp.com-20121109123521-e04g4zrhtv947l1r
2012-11-09 13:35:21 +01:00 · 2012-11-09 13:35:21 +01:00 · 62c419e66f
parent 0d027a3f8b
commit 62c419e66f
4 changed files with 161 additions and 138 deletions
--- a/openerp/addons/base/base_demo.xml
+++ b/openerp/addons/base/base_demo.xml
@ -3,7 +3,6 @@
    <data noupdate="1">
        <record id="partner_demo" model="res.partner">
            <field name="name">Demo User</field>
-            <field name="email">demo@example.com</field>
            <field name="company_id" ref="main_company"/>
            <field name="customer" eval="False"/>
            <field name="email">demo@example.com</field>
--- a/openerp/osv/fields.py
+++ b/openerp/osv/fields.py
@ -44,8 +44,8 @@ import openerp
 import openerp.tools as tools
 from openerp.tools.translate import _
 from openerp.tools import float_round, float_repr
+from openerp.tools import html_sanitize
 import simplejson
-from openerp.tools.mail import html_sanitize
 from openerp import SUPERUSER_ID

 _logger = logging.getLogger(__name__)
--- a/openerp/tests/test_mail.py
+++ b/openerp/tests/test_mail.py
@ -23,9 +23,9 @@
 ##############################################################################

 import unittest2
-from openerp.tools.mail import html_sanitize, html_email_clean, append_content_to_html, text2html
+from openerp.tools import html_sanitize, html_email_clean, append_content_to_html, plaintext2html

-test_case = """
+HTML_SOURCE = """
 <font size="2" style="color: rgb(31, 31, 31); font-family: monospace; font-variant: normal; line-height: normal; ">test1</font>
 <div style="color: rgb(31, 31, 31); font-family: monospace; font-variant: normal; line-height: normal; font-size: 12px; font-style: normal; ">
 <b>test2</b></div><div style="color: rgb(31, 31, 31); font-family: monospace; font-variant: normal; line-height: normal; font-size: 12px; ">
@ -43,88 +43,65 @@ test12</font></div><div><font color="#1f1f1f" face="monospace" size="2"><br></fo
 <a href="javascript:alert('malicious code')">test link</a>
 """

-GMAIL_REPLY_SAN = """<div>R&#233;ponse via thunderbird, classique.<br><br>
-      On 11/05/2012 10:51 AM, Raoul Tartopoils wrote:<br></div>
-    <blockquote>
-      <div>Plop !</div>
-      <ul><li>Vive les lapins rapides !<br></li>
-        <li>Nouille</li>
-        <li>Frites</li>
-      </ul><div><br></div>
-      <div>Clairement, hein ?</div>
-      -- <br>
-      Raoul Tartopoils<br></blockquote>
-    <br><br><pre>-- 
-Raoul Tartopoils
-</pre>"""
+TEXT_MAIL1 = """I contact you about our meeting for tomorrow. Here is the schedule I propose:
+9 AM: brainstorming about our new amazing business app</span></li>
+9.45 AM: summary
+10 AM: meeting with Fabien to present our app
+Is everything ok for you ?
+--
+Administrator"""

-GMAIL_REPLY2_SAN = """<div>Je r&#233;ponds, hop, via thunderbird. Mais
-      je vais r&#233;podnre aussi au milieu du thread.<br><br>
-      On 11/05/2012 10:53 AM, Raoul Tartopoils wrote:<br></div>
-    <blockquote>Reply rapide de gmail.</blockquote>
-    <br>
-    Jamais.<br><br><blockquote>
-      <div><br><br><div>2012/11/5 Thibault Delavall&#233;e <span>&lt;<a href="mailto:tde@openerp.com">tde@openerp.com</a>&gt;</span><br><blockquote>
-            <div>
-              <div>R&#233;ponse via thunderbird, classique.
-                <div>
-                  <div><br><br>
-                    On 11/05/2012 10:51 AM, Raoul Tartopoils wrote:<br></div>
-                </div>
-              </div>
-              <div>
-                <div>
-                  <blockquote>
-                    <div>Plop !</div>
-                    <ul><li>Vive les lapins rapides !<br></li>
-                      <li>Nouille</li>
-                    </ul></blockquote>
-                </div>
-              </div>
-            </div>
-          </blockquote>
-        </div>
+HTML_MAIL1 = """<div>
+<font><span>I contact you about our meeting for tomorrow. Here is the schedule I propose:</span></font>
+</div>
+<div><ul>
+<li><span>9 AM: brainstorming about our new amazing business app</span></li>
+<li><span>9.45 AM: summary</span></li>
+<li><span>10 AM: meeting with Fabien to present our app</span></li>
+</ul></div>
+<div><font><span>Is everything ok for you ?</span></font></div>"""
+
+GMAIL_REPLY1_SAN = """Hello,<div><br></div><div>Ok for me. I am replying directly in gmail, without signature.</div><div><br></div><div>Kind regards,</div><div><br></div><div>Demo.<br><br><div>On Thu, Nov 8, 2012 at 5:29 PM,  <span>&lt;<a href="mailto:dummy@example.com">dummy@example.com</a>&gt;</span> wrote:<br><blockquote><div>I contact you about our meeting for tomorrow. Here is the schedule I propose:</div><div><ul><li>9 AM: brainstorming about our new amazing business app&lt;/span&gt;&lt;/li&gt;</li>
+<li>9.45 AM: summary</li><li>10 AM: meeting with Fabien to present our app</li></ul></div><div>Is everything ok for you ?</div>
+<div><p>--<br>Administrator</p></div>
+
+<div><p>Log in our portal at: <a href="http://localhost:8069#action=login&amp;db=mail_1&amp;login=demo">http://localhost:8069#action=login&amp;db=mail_1&amp;login=demo</a></p></div>
+</blockquote></div><br></div>"""
+
+THUNDERBIRD_16_REPLY1_SAN = """    <div>On 11/08/2012 05:29 PM,
+      <a href="mailto:dummy@example.com">dummy@example.com</a> wrote:<br></div>
+    <blockquote>
+      <div>I contact you about our meeting for tomorrow. Here is the
+        schedule I propose:</div>
+      <div>
+        <ul><li>9 AM: brainstorming about our new amazing business
+            app&lt;/span&gt;&lt;/li&gt;</li>
+          <li>9.45 AM: summary</li>
+          <li>10 AM: meeting with Fabien to present our app</li>
+        </ul></div>
+      <div>Is everything ok for you ?</div>
+      <div>
+        <p>--<br>
+          Administrator</p>
+      </div>
+      <div>
+        <p>Log in our portal at:
+<a href="http://localhost:8069#action=login&amp;db=mail_1&amp;token=rHdWcUART5PhEnJRaXjH">http://localhost:8069#action=login&amp;db=mail_1&amp;token=rHdWcUART5PhEnJRaXjH</a></p>
      </div>
    </blockquote>
-    je rajotuerais bien pommes de terre dans la liste.<br><blockquote>
-      <div>
-        <div>
-          <blockquote>
-            <div>
-              <div>
-                <div>
-                  <blockquote>
-                    <ul><li>Frites</li>
-                    </ul><div><br></div>
-                    <div>Clairement, hein ?</div>
-                    -- <br>
-                    Raoul Tartopoils<br></blockquote>
-                  <br><br></div>
-              </div>
-              <span><font>
-                  <pre>-- 
-Raoul Tartopoils
-</pre>
-                </font></span></div>
-          </blockquote>
-        </div>
-        <br><br><div><br></div>
-        -- <br>
-        Raoul Tartopoils<br></div>
-    </blockquote>
-    <br><br><pre>-- 
-Raoul Tartopoils
+    Ok for me. I am replying directly below your mail, using
+    Thunderbird, with a signature.<br><br>
+    Did you receive my email about my new laptop, by the way ?<br><br>
+    Raoul.<br><pre>-- 
+Raoul Grosbedonn&#233;e
 </pre>"""

-
 TEXT_TPL = """Salut Raoul!
 Le 28 oct. 2012 à 00:02, Raoul Grosbedon a écrit :

 > C'est sûr que je suis intéressé (quote)!

-Trouloulou pouet pouet.
-
-Je ne vais quand même pas écrire de vrais mails, non mais ho.
+Trouloulou pouet pouet. Je ne vais quand même pas écrire de vrais mails, non mais ho.

 > 2012/10/27 Bert Tartopoils :
 >> Diantre, me disè-je en envoyant un message similaire à Martine, mais comment vas-tu (quote)?
@ -138,7 +115,6 @@ Je ne vais quand même pas écrire de vrais mails, non mais ho.
 >> 
 > 
 > 
-> 
 > -- 
 > Raoul Grosbedon

@ -147,21 +123,8 @@ bert.tartopoils@miam.miam
 """


-class TestAppendContentToHtml(unittest2.TestCase):
-    """ Test some of our generic utility functions """
-
-    def test_append_to_html(self):
-        test_samples = [
-            ('<!DOCTYPE...><HTML encoding="blah">some <b>content</b></HtMl>', '--\nYours truly', True,
-             '<!DOCTYPE...><html encoding="blah">some <b>content</b>\n<pre>--\nYours truly</pre>\n</html>'),
-            ('<html><body>some <b>content</b></body></html>', '<!DOCTYPE...>\n<html><body>\n<p>--</p>\n<p>Yours truly</p>\n</body>\n</html>', False,
-             '<html><body>some <b>content</b>\n\n\n<p>--</p>\n<p>Yours truly</p>\n\n\n</body></html>'),
-        ]
-        for html, content, flag, expected in test_samples:
-            self.assertEqual(append_content_to_html(html, content, flag), expected, 'append_content_to_html is broken')
-
-
 class TestSanitizer(unittest2.TestCase):
+    """ Test the html sanitizer """
    # TDE note: could be improved by actually checking the output

    def test_simple(self):
@ -173,33 +136,67 @@ class TestSanitizer(unittest2.TestCase):
        self.assertEqual(x, html_sanitize(x))

    def test_no_exception(self):
-        html_sanitize(test_case)
+        html_sanitize(HTML_SOURCE)

    def test_unicode(self):
        html_sanitize("Merci à l'intérêt pour notre produit.nous vous contacterons bientôt. Merci")


 class TestCleaner(unittest2.TestCase):
+    """ Test the email cleaner function that filters the content of incoming emails """

-    def test_gmail(self):
-        # Test1: blahblah
-        new_html = html_email_clean(GMAIL_REPLY_SAN)
-        self.assertNotIn(new_html, 'blockquote')
-        self.assertNotIn(new_html, 'Vive les lapins rapides !')
-        self.assertNotIn(new_html, 'Bert Tartopoils')
+    def test_html_email_clean(self):
+        # Test1: reply through gmail: quote in blockquote, signature --\nAdministrator
+        new_html = html_email_clean(GMAIL_REPLY1_SAN)
+        self.assertNotIn('blockquote', new_html, 'html_email_cleaner did not remove a blockquote')
+        self.assertNotIn('I contact you about our meeting', new_html, 'html_email_cleaner wrongly removed the quoted content')
+        self.assertNotIn('Administrator', new_html, 'html_email_cleaner did not erase the signature')
+        self.assertIn('Ok for me', new_html, 'html_email_cleaner erased too much content')
+
+        # Test2: reply through Thunderbird 16.0.2
+        new_html = html_email_clean(THUNDERBIRD_16_REPLY1_SAN)
+        self.assertNotIn('blockquote', new_html, 'html_email_cleaner did not remove a blockquote')
+        self.assertNotIn('I contact you about our meeting', new_html, 'html_email_cleaner wrongly removed the quoted content')
+        self.assertNotIn('Administrator', new_html, 'html_email_cleaner did not erase the signature')
+        self.assertNotIn('Grosbedonn', new_html, 'html_email_cleaner did not erase the signature')
+        self.assertIn('Ok for me', new_html, 'html_email_cleaner erased too much content')
+
+        # Test3: text email
+        new_html = html_email_clean(TEXT_MAIL1)
+        self.assertIn('I contact you about our meeting', new_html, 'html_email_cleaner wrongly removed the quoted content')
+        self.assertNotIn('Administrator', new_html, 'html_email_cleaner did not erase the signature')
+
+        # Test4: more complex text email
+        new_html = html_email_clean(TEXT_TPL)
+        self.assertNotIn('quote', new_html, 'html_email_cleaner did not remove correctly plaintext quotes')


-class TestText2Html(unittest2.TestCase):
+class TestAppendContentToHtml(unittest2.TestCase):
+    """ Test some of our generic utility functions about html """

-    def test_text2html(self):
+    def test_plaintext2html(self):
        cases = [
            ("First \nSecond \nThird\n \nParagraph\n\r--\nSignature paragraph", 'div',
             "<div><p>First <br/>Second <br/>Third</p><p>Paragraph</p><p>--<br/>Signature paragraph</p></div>"),
+            ("First<p>It should be escaped</p>\nSignature", False,
+             "<p>First&lt;p&gt;It should be escaped&lt;/p&gt;<br/>Signature</p>")
        ]
        for content, container_tag, expected in cases:
-            html = text2html(content, container_tag)
+            html = plaintext2html(content, container_tag)
            self.assertEqual(html, expected, 'text2html is broken')

+    def test_append_to_html(self):
+        test_samples = [
+            ('<!DOCTYPE...><HTML encoding="blah">some <b>content</b></HtMl>', '--\nYours truly', True, True, False,
+             '<!DOCTYPE...><html encoding="blah">some <b>content</b>\n<pre>--\nYours truly</pre>\n</html>'),
+            ('<!DOCTYPE...><HTML encoding="blah">some <b>content</b></HtMl>', '--\nYours truly', True, False, False,
+             '<!DOCTYPE...><html encoding="blah">some <b>content</b>\n<p>--<br/>Yours truly</p>\n</html>'),
+            ('<html><body>some <b>content</b></body></html>', '<!DOCTYPE...>\n<html><body>\n<p>--</p>\n<p>Yours truly</p>\n</body>\n</html>', False, False, False,
+             '<html><body>some <b>content</b>\n\n\n<p>--</p>\n<p>Yours truly</p>\n\n\n</body></html>'),
+        ]
+        for html, content, plaintext_flag, preserve_flag, container_tag, expected in test_samples:
+            self.assertEqual(append_content_to_html(html, content, plaintext_flag, preserve_flag, container_tag), expected, 'append_content_to_html is broken')
+

 if __name__ == '__main__':
    unittest2.main()
--- a/openerp/tools/mail.py
+++ b/openerp/tools/mail.py
@ -121,6 +121,7 @@ def html_email_clean(html):
            be present in the html string. This method therefore takes as input
            html code coming from a sanitized source, like fields.html.
    """
+    html = ustr(html)
    modified_html = ''

    # 1. <br[ /]> -> \n, because otherwise the tree is obfuscated
@ -131,6 +132,7 @@ def html_email_clean(html):
        idx = item.end()
    modified_html += html[idx:]
    html = modified_html
+    # TDE note: there seem to be lots of <div><br></div> in emails... needs to be checked, could be cleaned

    # 2. form a tree, handle (currently ?) pure-text by enclosing them in a pre
    root = lxml.html.fromstring(html)
@ -138,9 +140,28 @@ def html_email_clean(html):
        html = '<div>%s</div>' % html
        root = lxml.html.fromstring(html)

+    # 2.5 remove quoted text in nodes
+    quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
+    for node in root.getiterator():
+        if not node.text:
+            continue
+        idx = 0
+        text = ''
+        for item in re.finditer(quote_tags, node.text):
+            print item
+            text += node.text[idx:item.start()]
+            idx = item.end()
+        text += node.text[idx:]
+        node.text = text
+
    # 3. remove blockquotes
    quotes = [el for el in root.getiterator(tag='blockquote')]
    for node in quotes:
+        # copy the node tail into parent text
+        if node.tail:
+            parent = node.getparent()
+            parent.text = parent.text or '' + node.tail
+        # remove the node
        node.getparent().remove(node)

    # 4. strip signatures
@ -187,9 +208,7 @@ def html2plaintext(html, body_id=None, encoding='utf-8'):
    ## download here: http://www.peterbe.com/plog/html2plaintext

    html = ustr(html)
-
-    from lxml.etree import tostring, fromstring, HTMLParser
-    tree = fromstring(html, parser=HTMLParser())
+    tree = etree.fromstring(html, parser=etree.HTMLParser())

    if body_id is not None:
        source = tree.xpath('//*[@id=%s]' % (body_id,))
@ -208,7 +227,7 @@ def html2plaintext(html, body_id=None, encoding='utf-8'):
            link.text = '%s [%s]' % (link.text, i)
            url_index.append(url)

-    html = ustr(tostring(tree, encoding=encoding))
+    html = ustr(etree.tostring(tree, encoding=encoding))

    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
@ -233,7 +252,7 @@ def html2plaintext(html, body_id=None, encoding='utf-8'):

    return html

-def text2html(text, container_tag='div'):
+def plaintext2html(text, container_tag=False):
    """ Convert plaintext into html. Content of the text is escaped to manage
        html entities, using cgi.escape().
        - all \n,\r are replaced by <br />
@ -243,7 +262,7 @@ def text2html(text, container_tag='div'):
        :param string container_tag: container of the html; by default the
            content is embedded into a <div>
    """
-    text = cgi.escape(text)
+    text = cgi.escape(ustr(text))

    # 1. replace \n and \r
    text = text.replace('\n', '<br/>')
@ -261,7 +280,45 @@ def text2html(text, container_tag='div'):
    # 4. container
    if container_tag:
        final = '<%s>%s</%s>' % (container_tag, final, container_tag)
-    return final
+    return ustr(final)
+
+def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=False):
+    """ Append extra content at the end of an HTML snippet, trying
+        to locate the end of the HTML document (</body>, </html>, or
+        EOF), and converting the provided content in html unless ``plaintext``
+        is False.
+        Content conversion can be done in two ways:
+        - wrapping it into a pre (preserve=True)
+        - use plaintext2html (preserve=False, using container_tag to wrap the
+            whole content)
+        A side-effect of this method is to coerce all HTML tags to
+        lowercase in ``html``, and strip enclosing <html> or <body> tags in
+        content if ``plaintext`` is False.
+
+        :param str html: html tagsoup (doesn't have to be XHTML)
+        :param str content: extra content to append
+        :param bool plaintext: whether content is plaintext and should
+            be converted to html before being appended (see ``preserve``).
+        :param bool preserve: if content is plaintext, wrap it into a <pre>
+            instead of converting it into html
+    """
+    html = ustr(html)
+    if plaintext and preserve:
+        content = u'\n<pre>%s</pre>\n' % ustr(content)
+    elif plaintext:
+        content = '\n%s\n' % plaintext2html(content, container_tag)
+    else:
+        content = re.sub(r'(?i)(</?html.*>|</?body.*>|<!\W*DOCTYPE.*>)', '', content)
+        content = u'\n%s\n' % ustr(content)
+    # Force all tags to lowercase
+    html = re.sub(r'(</?)\W*(\w+)([ >])',
+        lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
+    insert_location = html.find('</body>')
+    if insert_location == -1:
+        insert_location = html.find('</html>')
+    if insert_location == -1:
+        return '%s%s' % (html, content)
+    return '%s%s%s' % (html[:insert_location], content, html[insert_location:])

 #----------------------------------------------------------
 # Emails
@ -339,33 +396,3 @@ def email_split(text):
    if not text:
        return []
    return re.findall(r'([^ ,<@]+@[^> ,]+)', text)
-
-def append_content_to_html(html, content, plaintext=True):
-    """Append extra content at the end of an HTML snippet, trying
-       to locate the end of the HTML document (</body>, </html>, or
-       EOF), and wrapping the provided content in a <pre/> block
-       unless ``plaintext`` is False. A side-effect of this
-       method is to coerce all HTML tags to lowercase in ``html``,
-       and strip enclosing <html> or <body> tags in content if
-       ``plaintext`` is False.
-
-       :param str html: html tagsoup (doesn't have to be XHTML)
-       :param str content: extra content to append
-       :param bool plaintext: whether content is plaintext and should
-           be wrapped in a <pre/> tag.
-    """
-    html = ustr(html)
-    if plaintext:
-        content = u'\n<pre>%s</pre>\n' % ustr(content)
-    else:
-        content = re.sub(r'(?i)(</?html.*>|</?body.*>|<!\W*DOCTYPE.*>)', '', content)
-        content = u'\n%s\n' % ustr(content)
-    # Force all tags to lowercase
-    html = re.sub(r'(</?)\W*(\w+)([ >])',
-        lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
-    insert_location = html.find('</body>')
-    if insert_location == -1:
-        insert_location = html.find('</html>')
-    if insert_location == -1:
-        return '%s%s' % (html, content)
-    return '%s%s%s' % (html[:insert_location], content, html[insert_location:])