[IMP] tools: mail: improved append_content_to_html, including plaintext2html. Updated tests.

bzr revid: tde@openerp.com-20121109123521-e04g4zrhtv947l1r
2012-11-09 13:35:21 +01:00 · 2012-11-09 13:35:21 +01:00 · 62c419e66f
parent 0d027a3f8b
commit 62c419e66f
4 changed files with 161 additions and 138 deletions
--- a/openerp/addons/base/base_demo.xml
+++ b/openerp/addons/base/base_demo.xml
@ -3,7 +3,6 @@
    <data noupdate="1">
        <record id="partner_demo" model="res.partner">
            <field name="name">Demo User</field>
            <field name="email">demo@example.com</field>
            <field name="company_id" ref="main_company"/>
            <field name="customer" eval="False"/>
            <field name="email">demo@example.com</field>
--- a/openerp/osv/fields.py
+++ b/openerp/osv/fields.py
@ -44,8 +44,8 @@ import openerp
 import openerp.tools as tools
 from openerp.tools.translate import _
 from openerp.tools import float_round, float_repr
 from openerp.tools import html_sanitize
 import simplejson
 from openerp.tools.mail import html_sanitize
 from openerp import SUPERUSER_ID
 _logger = logging.getLogger(__name__)
--- a/openerp/tests/test_mail.py
+++ b/openerp/tests/test_mail.py
@ -23,9 +23,9 @@
 ##############################################################################
 import unittest2
-from openerp.tools.mail import html_sanitize, html_email_clean, append_content_to_html, text2html
+from openerp.tools import html_sanitize, html_email_clean, append_content_to_html, plaintext2html
-test_case = """
+HTML_SOURCE = """
 <font size="2" style="color: rgb(31, 31, 31); font-family: monospace; font-variant: normal; line-height: normal; ">test1</font>
 <div style="color: rgb(31, 31, 31); font-family: monospace; font-variant: normal; line-height: normal; font-size: 12px; font-style: normal; ">
 <b>test2</b></div><div style="color: rgb(31, 31, 31); font-family: monospace; font-variant: normal; line-height: normal; font-size: 12px; ">
@ -43,88 +43,65 @@ test12</font></div><div><font color="#1f1f1f" face="monospace" size="2"><br></fo
 <a href="javascript:alert('malicious code')">test link</a>
 """
-GMAIL_REPLY_SAN = """<div>R&#233;ponse via thunderbird, classique.<br><br>
+TEXT_MAIL1 = """I contact you about our meeting for tomorrow. Here is the schedule I propose:
-      On 11/05/2012 10:51 AM, Raoul Tartopoils wrote:<br></div>
+9 AM: brainstorming about our new amazing business app</span></li>
-    <blockquote>
+9.45 AM: summary
-      <div>Plop !</div>
+10 AM: meeting with Fabien to present our app
-      <ul><li>Vive les lapins rapides !<br></li>
+Is everything ok for you ?
-        <li>Nouille</li>
+--
-        <li>Frites</li>
+Administrator"""
      </ul><div><br></div>
      <div>Clairement, hein ?</div>
      -- <br>
      Raoul Tartopoils<br></blockquote>
    <br><br><pre>-- 
 Raoul Tartopoils
 </pre>"""
-GMAIL_REPLY2_SAN = """<div>Je r&#233;ponds, hop, via thunderbird. Mais
+HTML_MAIL1 = """<div>
-      je vais r&#233;podnre aussi au milieu du thread.<br><br>
+<font><span>I contact you about our meeting for tomorrow. Here is the schedule I propose:</span></font>
-      On 11/05/2012 10:53 AM, Raoul Tartopoils wrote:<br></div>
+</div>
-    <blockquote>Reply rapide de gmail.</blockquote>
+<div><ul>
-    <br>
+<li><span>9 AM: brainstorming about our new amazing business app</span></li>
-    Jamais.<br><br><blockquote>
+<li><span>9.45 AM: summary</span></li>
-      <div><br><br><div>2012/11/5 Thibault Delavall&#233;e <span>&lt;<a href="mailto:tde@openerp.com">tde@openerp.com</a>&gt;</span><br><blockquote>
+<li><span>10 AM: meeting with Fabien to present our app</span></li>
-            <div>
+</ul></div>
-              <div>R&#233;ponse via thunderbird, classique.
+<div><font><span>Is everything ok for you ?</span></font></div>"""
-                <div>
+
-                  <div><br><br>
+GMAIL_REPLY1_SAN = """Hello,<div><br></div><div>Ok for me. I am replying directly in gmail, without signature.</div><div><br></div><div>Kind regards,</div><div><br></div><div>Demo.<br><br><div>On Thu, Nov 8, 2012 at 5:29 PM,  <span>&lt;<a href="mailto:dummy@example.com">dummy@example.com</a>&gt;</span> wrote:<br><blockquote><div>I contact you about our meeting for tomorrow. Here is the schedule I propose:</div><div><ul><li>9 AM: brainstorming about our new amazing business app&lt;/span&gt;&lt;/li&gt;</li>
-                    On 11/05/2012 10:51 AM, Raoul Tartopoils wrote:<br></div>
+<li>9.45 AM: summary</li><li>10 AM: meeting with Fabien to present our app</li></ul></div><div>Is everything ok for you ?</div>
-                </div>
+<div><p>--<br>Administrator</p></div>
-              </div>
+
-              <div>
+<div><p>Log in our portal at: <a href="http://localhost:8069#action=login&amp;db=mail_1&amp;login=demo">http://localhost:8069#action=login&amp;db=mail_1&amp;login=demo</a></p></div>
-                <div>
+</blockquote></div><br></div>"""
-                  <blockquote>
+
-                    <div>Plop !</div>
+THUNDERBIRD_16_REPLY1_SAN = """    <div>On 11/08/2012 05:29 PM,
-                    <ul><li>Vive les lapins rapides !<br></li>
+      <a href="mailto:dummy@example.com">dummy@example.com</a> wrote:<br></div>
-                      <li>Nouille</li>
+    <blockquote>
-                    </ul></blockquote>
+      <div>I contact you about our meeting for tomorrow. Here is the
-                </div>
+        schedule I propose:</div>
-              </div>
+      <div>
-            </div>
+        <ul><li>9 AM: brainstorming about our new amazing business
-          </blockquote>
+            app&lt;/span&gt;&lt;/li&gt;</li>
-        </div>
+          <li>9.45 AM: summary</li>
          <li>10 AM: meeting with Fabien to present our app</li>
        </ul></div>
      <div>Is everything ok for you ?</div>
      <div>
        <p>--<br>
          Administrator</p>
      </div>
      <div>
        <p>Log in our portal at:
 <a href="http://localhost:8069#action=login&amp;db=mail_1&amp;token=rHdWcUART5PhEnJRaXjH">http://localhost:8069#action=login&amp;db=mail_1&amp;token=rHdWcUART5PhEnJRaXjH</a></p>
      </div>
    </blockquote>
-    je rajotuerais bien pommes de terre dans la liste.<br><blockquote>
+    Ok for me. I am replying directly below your mail, using
-      <div>
+    Thunderbird, with a signature.<br><br>
-        <div>
+    Did you receive my email about my new laptop, by the way ?<br><br>
-          <blockquote>
+    Raoul.<br><pre>-- 
-            <div>
+Raoul Grosbedonn&#233;e
              <div>
                <div>
                  <blockquote>
                    <ul><li>Frites</li>
                    </ul><div><br></div>
                    <div>Clairement, hein ?</div>
                    -- <br>
                    Raoul Tartopoils<br></blockquote>
                  <br><br></div>
              </div>
              <span><font>
                  <pre>-- 
 Raoul Tartopoils
 </pre>
                </font></span></div>
          </blockquote>
        </div>
        <br><br><div><br></div>
        -- <br>
        Raoul Tartopoils<br></div>
    </blockquote>
    <br><br><pre>-- 
 Raoul Tartopoils
 </pre>"""
 TEXT_TPL = """Salut Raoul!
 Le 28 oct. 2012 à 00:02, Raoul Grosbedon a écrit :
 > C'est sûr que je suis intéressé (quote)!
-Trouloulou pouet pouet.
+Trouloulou pouet pouet. Je ne vais quand même pas écrire de vrais mails, non mais ho.
 Je ne vais quand même pas écrire de vrais mails, non mais ho.
 > 2012/10/27 Bert Tartopoils :
 >> Diantre, me disè-je en envoyant un message similaire à Martine, mais comment vas-tu (quote)?
@ -138,7 +115,6 @@ Je ne vais quand même pas écrire de vrais mails, non mais ho.
 >> 
 > 
 > 
 > 
 > -- 
 > Raoul Grosbedon
@ -147,21 +123,8 @@ bert.tartopoils@miam.miam
 """
 class TestAppendContentToHtml(unittest2.TestCase):
    """ Checks for the generic html helper append_content_to_html """

    def test_append_to_html(self):
        # Each case is (html, content to append, plaintext flag, expected result)
        plaintext_case = (
            '<!DOCTYPE...><HTML encoding="blah">some <b>content</b></HtMl>',
            '--\nYours truly',
            True,
            '<!DOCTYPE...><html encoding="blah">some <b>content</b>\n<pre>--\nYours truly</pre>\n</html>',
        )
        html_case = (
            '<html><body>some <b>content</b></body></html>',
            '<!DOCTYPE...>\n<html><body>\n<p>--</p>\n<p>Yours truly</p>\n</body>\n</html>',
            False,
            '<html><body>some <b>content</b>\n\n\n<p>--</p>\n<p>Yours truly</p>\n\n\n</body></html>',
        )
        for source, extra, is_plaintext, wanted in (plaintext_case, html_case):
            result = append_content_to_html(source, extra, is_plaintext)
            self.assertEqual(result, wanted, 'append_content_to_html is broken')
 class TestSanitizer(unittest2.TestCase):
    """ Test the html sanitizer """
    # TDE note: could be improved by actually checking the output
    def test_simple(self):
@ -173,33 +136,67 @@ class TestSanitizer(unittest2.TestCase):
        self.assertEqual(x, html_sanitize(x))
    def test_no_exception(self):
-        html_sanitize(test_case)
+        html_sanitize(HTML_SOURCE)
    def test_unicode(self):
        html_sanitize("Merci à l'intérêt pour notre produit.nous vous contacterons bientôt. Merci")
 class TestCleaner(unittest2.TestCase):
    """ Test the email cleaner function that filter the content of incoming emails """
-    def test_gmail(self):
+    def test_html_email_clean(self):
-        # Test1: blahblah
+        # Test1: reply through gmail: quote in blockquote, signature --\nAdministrator
-        new_html = html_email_clean(GMAIL_REPLY_SAN)
+        new_html = html_email_clean(GMAIL_REPLY1_SAN)
-        self.assertNotIn(new_html, 'blockquote')
+        self.assertNotIn('blockquote', new_html, 'html_email_cleaner did not remove a blockquote')
-        self.assertNotIn(new_html, 'Vive les lapins rapides !')
+        self.assertNotIn('I contact you about our meeting', new_html, 'html_email_cleaner wrongly removed the quoted content')
-        self.assertNotIn(new_html, 'Bert Tartopoils')
+        self.assertNotIn('Administrator', new_html, 'html_email_cleaner did not erase the signature')
        self.assertIn('Ok for me', new_html, 'html_email_cleaner erased too much content')
        # Test2: reply through Thunderbird 16.0.2
        new_html = html_email_clean(THUNDERBIRD_16_REPLY1_SAN)
        self.assertNotIn('blockquote', new_html, 'html_email_cleaner did not remove a blockquote')
        self.assertNotIn('I contact you about our meeting', new_html, 'html_email_cleaner wrongly removed the quoted content')
        self.assertNotIn('Administrator', new_html, 'html_email_cleaner did not erase the signature')
        self.assertNotIn('Grosbedonn', new_html, 'html_email_cleaner did not erase the signature')
        self.assertIn('Ok for me', new_html, 'html_email_cleaner erased too much content')
        # Test3: text email
        new_html = html_email_clean(TEXT_MAIL1)
        self.assertIn('I contact you about our meeting', new_html, 'html_email_cleaner wrongly removed the quoted content')
        self.assertNotIn('Administrator', new_html, 'html_email_cleaner did not erase the signature')
        # Test4: more complex text email
        new_html = html_email_clean(TEXT_TPL)
        self.assertNotIn('quote', new_html, 'html_email_cleaner did not remove correctly plaintext quotes')
-class TestText2Html(unittest2.TestCase):
+class TestAppendContentToHtml(unittest2.TestCase):
    """ Test some of our generic utility functions about html """
-    def test_text2html(self):
+    def test_plaintext2html(self):
        cases = [
            ("First \nSecond \nThird\n \nParagraph\n\r--\nSignature paragraph", 'div',
             "<div><p>First <br/>Second <br/>Third</p><p>Paragraph</p><p>--<br/>Signature paragraph</p></div>"),
            ("First<p>It should be escaped</p>\nSignature", False,
             "<p>First&lt;p&gt;It should be escaped&lt;/p&gt;<br/>Signature</p>")
        ]
        for content, container_tag, expected in cases:
-            html = text2html(content, container_tag)
+            html = plaintext2html(content, container_tag)
            self.assertEqual(html, expected, 'text2html is broken')
    def test_append_to_html(self):
        test_samples = [
            ('<!DOCTYPE...><HTML encoding="blah">some <b>content</b></HtMl>', '--\nYours truly', True, True, False,
             '<!DOCTYPE...><html encoding="blah">some <b>content</b>\n<pre>--\nYours truly</pre>\n</html>'),
            ('<!DOCTYPE...><HTML encoding="blah">some <b>content</b></HtMl>', '--\nYours truly', True, False, False,
             '<!DOCTYPE...><html encoding="blah">some <b>content</b>\n<p>--<br/>Yours truly</p>\n</html>'),
            ('<html><body>some <b>content</b></body></html>', '<!DOCTYPE...>\n<html><body>\n<p>--</p>\n<p>Yours truly</p>\n</body>\n</html>', False, False, False,
             '<html><body>some <b>content</b>\n\n\n<p>--</p>\n<p>Yours truly</p>\n\n\n</body></html>'),
        ]
        for html, content, plaintext_flag, preserve_flag, container_tag, expected in test_samples:
            self.assertEqual(append_content_to_html(html, content, plaintext_flag, preserve_flag, container_tag), expected, 'append_content_to_html is broken')
 if __name__ == '__main__':
    unittest2.main()
--- a/openerp/tools/mail.py
+++ b/openerp/tools/mail.py
@ -121,6 +121,7 @@ def html_email_clean(html):
            be present in the html string. This method therefore takes as input
            html code coming from a sanitized source, like fields.html.
    """
    html = ustr(html)
    modified_html = ''
    # 1. <br[ /]> -> \n, because otherwise the tree is obfuscated
@ -131,6 +132,7 @@ def html_email_clean(html):
        idx = item.end()
    modified_html += html[idx:]
    html = modified_html
    # TDE note: there seem to be lots of <div><br></div> in emails... needs to be checked, could be cleaned
    # 2. form a tree, handle (currently ?) pure-text by enclosing them in a pre
    root = lxml.html.fromstring(html)
@ -138,9 +140,28 @@ def html_email_clean(html):
        html = '<div>%s</div>' % html
        root = lxml.html.fromstring(html)
    # 2.5 remove quoted text in nodes
    quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
    for node in root.getiterator():
        if not node.text:
            continue
        idx = 0
        text = ''
        for item in re.finditer(quote_tags, node.text):
            print item
            text += node.text[idx:item.start()]
            idx = item.end()
        text += node.text[idx:]
        node.text = text
    # 3. remove blockquotes
    quotes = [el for el in root.getiterator(tag='blockquote')]
    for node in quotes:
        # copy the node tail into parent text
        if node.tail:
            parent = node.getparent()
            parent.text = parent.text or '' + node.tail
        # remove the node
        node.getparent().remove(node)
    # 4. strip signatures
@ -187,9 +208,7 @@ def html2plaintext(html, body_id=None, encoding='utf-8'):
    ## download here: http://www.peterbe.com/plog/html2plaintext
    html = ustr(html)
-
+    tree = etree.fromstring(html, parser=etree.HTMLParser())
    from lxml.etree import tostring, fromstring, HTMLParser
    tree = fromstring(html, parser=HTMLParser())
    if body_id is not None:
        source = tree.xpath('//*[@id=%s]' % (body_id,))
@ -208,7 +227,7 @@ def html2plaintext(html, body_id=None, encoding='utf-8'):
            link.text = '%s [%s]' % (link.text, i)
            url_index.append(url)
-    html = ustr(tostring(tree, encoding=encoding))
+    html = ustr(etree.tostring(tree, encoding=encoding))
    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
@ -233,7 +252,7 @@ def html2plaintext(html, body_id=None, encoding='utf-8'):
    return html
-def text2html(text, container_tag='div'):
+def plaintext2html(text, container_tag=False):
    """ Convert plaintext into html. Content of the text is escaped to manage
        html entities, using cgi.escape().
        - all \n,\r are replaced by <br />
@ -243,7 +262,7 @@ def text2html(text, container_tag='div'):
        :param string container_tag: tag used to wrap the generated html;
            when falsy, the content is returned without any container
    """
-    text = cgi.escape(text)
+    text = cgi.escape(ustr(text))
    # 1. replace \n and \r
    text = text.replace('\n', '<br/>')
@ -261,7 +280,45 @@ def text2html(text, container_tag='div'):
    # 4. container
    if container_tag:
        final = '<%s>%s</%s>' % (container_tag, final, container_tag)
-    return final
+    return ustr(final)
 def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=False):
    """ Append extra content at the end of an HTML snippet.

        The content is inserted right before the first ``</body>`` found in
        ``html``; if there is none, before the first ``</html>``; if there is
        none either, it is appended at the very end.

        How ``content`` is prepared before insertion:
        - plaintext=True, preserve=True: wrapped as-is into a <pre> (the
            content is not html-escaped in this mode)
        - plaintext=True, preserve=False: converted using plaintext2html,
            with container_tag wrapping the whole converted content
        - plaintext=False: considered as html; <html>, <body> and DOCTYPE
            tags are stripped from it, everything else is kept

        A side-effect of this method is to coerce all HTML tags to
        lowercase in ``html``.

        :param str html: html tagsoup (doesn't have to be XHTML)
        :param str content: extra content to append
        :param bool plaintext: whether content is plaintext (True) or
            already html (False)
        :param bool preserve: if content is plaintext, wrap it into a <pre>
            instead of converting it into html
        :param container_tag: passed to plaintext2html when content is
            plaintext and preserve is False; unused otherwise
        :return: ``html`` with the prepared content inserted, surrounded by
            newlines
    """
    html = ustr(html)
    if plaintext and preserve:
        content = u'\n<pre>%s</pre>\n' % ustr(content)
    elif plaintext:
        content = u'\n%s\n' % plaintext2html(content, container_tag)
    else:
        # Match each tag up to its own closing '>' only: a greedy '.*' would
        # swallow everything up to the last '>' of the line, deleting real
        # content when the whole document sits on a single line.
        content = re.sub(r'(?i)(</?html\b[^>]*>|</?body\b[^>]*>|<!\W*DOCTYPE\b[^>]*>)', '', ustr(content))
        content = u'\n%s\n' % content
    # Force all tags to lowercase, so that the lookups below find the closing
    # tags whatever their original case
    html = re.sub(r'(</?)\W*(\w+)([ >])',
        lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
    insert_location = html.find('</body>')
    if insert_location == -1:
        insert_location = html.find('</html>')
    if insert_location == -1:
        return '%s%s' % (html, content)
    return '%s%s%s' % (html[:insert_location], content, html[insert_location:])
 #----------------------------------------------------------
 # Emails
@ -339,33 +396,3 @@ def email_split(text):
    if not text:
        return []
    return re.findall(r'([^ ,<@]+@[^> ,]+)', text)
 def append_content_to_html(html, content, plaintext=True):
    """Append extra content at the end of an HTML snippet.

       The content is inserted right before the first ``</body>`` found in
       ``html``; if there is none, before the first ``</html>``; if there
       is none either, it is appended at the very end.

       When ``plaintext`` is True the content is wrapped as-is (without
       html-escaping) in a <pre/> block. Otherwise it is considered as
       html, and its <html>, <body> and DOCTYPE tags are stripped.

       A side-effect of this method is to coerce all HTML tags to
       lowercase in ``html``.

       :param str html: html tagsoup (doesn't have to be XHTML)
       :param str content: extra content to append
       :param bool plaintext: whether content is plaintext and should
           be wrapped in a <pre/> tag.
       :return: ``html`` with the prepared content inserted, surrounded
           by newlines
    """
    html = ustr(html)
    if plaintext:
        content = u'\n<pre>%s</pre>\n' % ustr(content)
    else:
        # Match each tag up to its own closing '>' only: a greedy '.*' would
        # swallow everything up to the last '>' of the line, deleting real
        # content when the whole document sits on a single line.
        content = re.sub(r'(?i)(</?html\b[^>]*>|</?body\b[^>]*>|<!\W*DOCTYPE\b[^>]*>)', '', ustr(content))
        content = u'\n%s\n' % content
    # Force all tags to lowercase, so that the lookups below find the closing
    # tags whatever their original case
    html = re.sub(r'(</?)\W*(\w+)([ >])',
        lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
    insert_location = html.find('</body>')
    if insert_location == -1:
        insert_location = html.find('</html>')
    if insert_location == -1:
        return '%s%s' % (html, content)
    return '%s%s%s' % (html[:insert_location], content, html[insert_location:])