From ccbb8e09a6ac38d149969cc7e78bfe0512920174 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thibault=20Delavall=C3=A9e?= <tde@openerp.com>
Date: Thu, 25 Apr 2013 12:43:01 +0200
Subject: [PATCH] [REF] html_email_clean: refactored the algorithm to clean it up,
 specifically regarding truncation. Now adds classes to nodes when not removing
 them, to be managed by the web client.

bzr revid: tde@openerp.com-20130425104301-e7mv9o4pcjx6k2cn
---
 openerp/tests/test_mail.py |  68 +++++++++++----
 openerp/tools/mail.py      | 165 +++++++++++++++++++++++--------------
 2 files changed, 156 insertions(+), 77 deletions(-)
diff --git a/openerp/tests/test_mail.py b/openerp/tests/test_mail.py
index 5ec7fe24436..3512014b1e7 100644
--- a/openerp/tests/test_mail.py
+++ b/openerp/tests/test_mail.py
@@ -113,62 +113,100 @@ class TestSanitizer(unittest2.TestCase):
 class TestCleaner(unittest2.TestCase):
     """ Test the email cleaner function that filters the content of incoming emails """
 
-    def test_00_html_email_clean_text(self):
+    def test_00_html_email_clean_signature(self):
+        """ html_email_clean(remove=True) on signatures: text before the dash marker must stay, listed signature parts must go """
+        test_data = [("""This is Sparta!\n--\nAdministrator\n+9988776655""",
+                        ['This is Sparta!'],
+                        ['Administrator', '9988776655']),
+                     ("""<p>--\nAdministrator</p>""",
+                        [],
+                        ['--', 'Administrator']),
+                     ("""<p>This is Sparta!\n---\nAdministrator</p>""",
+                        ['This is Sparta!'],
+                        ['---', 'Administrator']),
+                     ("""<p>--<br>Administrator</p>""",
+                        [],
+                        []),  # <br>-separated marker: nothing is asserted for this input
+                     ("""<p>This is Sparta!<br/>--<br>Administrator</p>""",
+                        ['This is Sparta!'],
+                        [])  # <br>-separated marker: only the kept text is asserted
+                    ]
+        for test, in_lst, out_lst in test_data:  # entry: (input, fragments that must remain, fragments that must be gone)
+            new_html = html_email_clean(test, remove=True)
+            for text in in_lst:
+                self.assertIn(text, new_html, 'html_email_cleaner wrongly removed content')
+            for text in out_lst:
+                self.assertNotIn(text, new_html, 'html_email_cleaner did not remove unwanted content')
+
+    def test_05_html_email_clean_signature(self):  # NOTE(review): name says 'signature' but the data below covers quotes
+        """ html_email_clean(remove=True) on text quotes: lines starting with '>' must go, the other lines must stay """
+        test_data = [("""This is Sparta!\n>Ah bon ?\nCertes\n> Chouette !\nClair""",
+                        ['This is Sparta!', 'Certes', 'Clair'],
+                        ['Ah bon', 'Chouette'])
+                    ]
+        for test, in_lst, out_lst in test_data:  # entry: (input, fragments that must remain, fragments that must be gone)
+            new_html = html_email_clean(test, remove=True)
+            for text in in_lst:
+                self.assertIn(text, new_html, 'html_email_cleaner wrongly removed content')
+            for text in out_lst:
+                self.assertNotIn(text, new_html, 'html_email_cleaner did not remove unwanted content')
+
+    def test_10_html_email_clean_text(self):
         """ html_email_clean test for text-based emails """
-        new_html = html_email_clean(test_mail_examples.TEXT_1, remove_unwanted=True)
+        new_html = html_email_clean(test_mail_examples.TEXT_1, remove=True)
         for ext in test_mail_examples.TEXT_1_IN:
             self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
         for ext in test_mail_examples.TEXT_1_OUT:
             self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
 
-        new_html = html_email_clean(test_mail_examples.TEXT_2, remove_unwanted=True)
+        new_html = html_email_clean(test_mail_examples.TEXT_2, remove=True)
         for ext in test_mail_examples.TEXT_2_IN:
             self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
         for ext in test_mail_examples.TEXT_2_OUT:
             self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
 
-    def test_10_html_email_clean_html(self):
-        new_html = html_email_clean(test_mail_examples.HTML_1, remove_unwanted=True)
+    def test_20_html_email_clean_html(self):
+        new_html = html_email_clean(test_mail_examples.HTML_1, remove=True)
         for ext in test_mail_examples.HTML_1_IN:
             self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
         for ext in test_mail_examples.HTML_1_OUT:
             self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
 
-        new_html = html_email_clean(test_mail_examples.HTML_2, remove_unwanted=False)
+        new_html = html_email_clean(test_mail_examples.HTML_2, remove=True)
         for ext in test_mail_examples.HTML_2_IN:
             self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
         for ext in test_mail_examples.HTML_2_OUT:
             self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
 
-        new_html = html_email_clean(test_mail_examples.HTML_3, remove_unwanted=False)
+        new_html = html_email_clean(test_mail_examples.HTML_3, remove=False)
         for ext in test_mail_examples.HTML_3_IN:
             self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
         # for ext in test_mail_examples.HTML_3_OUT:
         #     self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
 
-    def test_20_html_email_clean_msoffice(self):
-        new_html = html_email_clean(test_mail_examples.MSOFFICE_1, remove_unwanted=True)
+    def test_30_html_email_clean_msoffice(self):
+        new_html = html_email_clean(test_mail_examples.MSOFFICE_1, remove=True)
         for ext in test_mail_examples.MSOFFICE_1_IN:
             self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
         for ext in test_mail_examples.MSOFFICE_1_OUT:
             self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
 
-    def test_30_html_email_clean_hotmail(self):
-        new_html = html_email_clean(test_mail_examples.HOTMAIL_1, remove_unwanted=True)
+    def test_40_html_email_clean_hotmail(self):
+        new_html = html_email_clean(test_mail_examples.HOTMAIL_1, remove=True)
         for ext in test_mail_examples.HOTMAIL_1_IN:
             self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
         for ext in test_mail_examples.HOTMAIL_1_OUT:
             self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
 
-    def test_40_html_email_clean_gmail(self):
-        new_html = html_email_clean(test_mail_examples.GMAIL_1, remove_unwanted=True)
+    def test_50_html_email_clean_gmail(self):
+        new_html = html_email_clean(test_mail_examples.GMAIL_1, remove=True)
         for ext in test_mail_examples.GMAIL_1_IN:
             self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
         for ext in test_mail_examples.GMAIL_1_OUT:
             self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
 
-    def test_50_html_email_clean_thunderbird(self):
-        new_html = html_email_clean(test_mail_examples.THUNDERBIRD_1, remove_unwanted=True)
+    def test_60_html_email_clean_thunderbird(self):
+        new_html = html_email_clean(test_mail_examples.THUNDERBIRD_1, remove=True)
         for ext in test_mail_examples.THUNDERBIRD_1_IN:
             self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
         for ext in test_mail_examples.THUNDERBIRD_1_OUT:
diff --git a/openerp/tools/mail.py b/openerp/tools/mail.py
index 7b263603eca..da36aed3865 100644
--- a/openerp/tools/mail.py
+++ b/openerp/tools/mail.py
@@ -75,17 +75,23 @@ def html_sanitize(src):
 # HTML Cleaner
 #----------------------------------------------------------
 
-def html_email_clean(html, remove_unwanted=False, use_max_length=False, max_length=300):
-    """ html_email_clean: clean the html to display in the web client.
-        - strip email quotes (remove blockquote nodes)
-        - strip signatures (remove --\n{\n)Blahblah), by replacing <br> by
-            \n to avoid ignoring signatures converted into html
+def html_email_clean(html, remove=False, shorten=False, max_length=300):
+    """ html_email_clean: clean the html
+        - try to strip email quotes (remove blockquote nodes)
+        - try to strip signatures
+        - allows having a shortened version of the html (read more/read less behavior)
 
         :param string html: sanitized html; tags like html or head should not
             be present in the html string. This method therefore takes as input
             html code coming from a sanitized source, like fields.html.
+        :param boolean remove: remove the html code that is unwanted; otherwise
+            it is only flagged and tagged
+        :param boolean shorten: shorten the html
+        :param int max_length: if shortening, maximum number of characters before
+            shortening
     """
     def _replace_matching_regex(regex, source, replace=''):
+        """ Replace all matching expressions in source by replace """
         if not source:
             return source
         dest = ''
@@ -96,35 +102,38 @@ def html_email_clean(html, remove_unwanted=False, use_max_length=False, max_leng
         dest += source[idx:]
         return dest
 
-    def _tag_matching_regex_in_text(regex, node, new_node_tag='span', new_node_attrs=None):
-        # print '\t_tag_matching_regex_in_text'
+    def _create_node(tag, text, tail=None, attrs=None):
+        """ Build an etree element `tag` with the given text, tail and attributes (attrs: dict or None) """
+        new_node = etree.Element(tag)
+        new_node.text, new_node.tail = text, tail
+        for key, val in (attrs or {}).iteritems():  # None default instead of a shared mutable {} default
+            new_node.set(key, val)
+        return new_node
+
+    def _insert_new_node(node, index, new_node_tag, new_node_text, new_node_tail=None, new_node_attrs=None):  # attrs: dict or None
+        new_node = _create_node(new_node_tag, new_node_text, new_node_tail, new_node_attrs or {})  # None -> no attributes
+        node.insert(index, new_node)  # new_node becomes a child of node, inserted at position index
+        return new_node
+
+    def _tag_matching_regex_in_text(regex, node, new_node_tag='span', new_node_attrs={}):
         text = node.text or ''
-        node.text = ''
+        if not re.search(regex, text):
+            return
+
         cur_node = node
-        idx = 0
-        caca = 0
+        node.text = ''
+        idx, iteration = 0, 0
         for item in re.finditer(regex, text):
-            # print '\t\tfound', item.start(), item.end(), '-', text[item.start():item.end()], '-'
-            if caca == 0:
+            if iteration == 0:
                 cur_node.text = text[idx:item.start()]
             else:
-                cur_node.tail = text[idx:item.start()]
+                _insert_new_node(node, (iteration - 1) * 2 + 1, new_node_tag, text[idx:item.start()])
+            new_node = _insert_new_node(node, iteration * 2, new_node_tag, text[item.start():item.end()], None, new_node_attrs)
 
-            # create element
-            new_node = etree.Element(new_node_tag)
-            new_node.text = text[item.start():item.end()]
-            for key, val in new_node_attrs.iteritems():
-                new_node.set(key, val)
-
-            # insert element in DOM
-            node.insert(caca, new_node)
             cur_node = new_node
             idx = item.end()
-            caca += 1
-        if caca == 0:
-            cur_node.text = (cur_node.text or '') + text[idx:]
-        else:
-            cur_node.tail = text[idx:] + (cur_node.tail or '')
+            iteration += 1
+        new_node = _insert_new_node(node, -1, new_node_tag, text[idx:] + (cur_node.tail or ''), None, {})
 
     if not html or not isinstance(html, basestring):
         return html
@@ -132,30 +141,32 @@ def html_email_clean(html, remove_unwanted=False, use_max_length=False, max_leng
 
     # Pre processing
     # ------------------------------------------------------------
-
-    # --- MAIL ORIGINAL ---: '[\-]{4,}([^\-]*)[\-]{4,}'
+    # TDE TODO: --- MAIL ORIGINAL ---: '[\-]{4,}([^\-]*)[\-]{4,}'
 
     # html: remove encoding attribute inside tags
     doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
     html = doctype.sub(r"", html)
 
     # html: ClEditor seems to love using <div><br /><div> -> replace with <br />
-    br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)')
+    br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)', re.IGNORECASE)
     html = _replace_matching_regex(br_div_tags, html, '<br />')
 
-    # html: <br[ /]> -> \n, to de-obfuscate the tree
-    br_tags = re.compile(r'([<]\s*[bB][rR]\s*\/?[>])')
-    html = _replace_matching_regex(br_tags, html, '__BR_TAG__')
-
     # form a tree
     root = lxml.html.fromstring(html)
     if not len(root) and root.text is None and root.tail is None:
         html = '<div>%s</div>' % html
         root = lxml.html.fromstring(html)
 
+    # move every tail into a new sibling <span> element, so that only node.text has to be handled afterwards
+    for node in root.getiterator():
+        if node.tail:
+            tail_node = _create_node('span', node.tail)
+            node.tail = None
+            node.addnext(tail_node)
+
     # form node and tag text-based quotes and signature
     quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
-    signature = re.compile(r'([-]{2}[\s]?[\r\n]{1,2}[^\z]+)')
+    signature = re.compile(r'([-]{2,}[\s]?[\r\n]{1,2}[^.]+)')
     for node in root.getiterator():
         _tag_matching_regex_in_text(quote_tags, node, 'span', {'text_quote': '1'})
         _tag_matching_regex_in_text(signature, node, 'span', {'text_signature': '1'})
@@ -164,59 +175,89 @@ def html_email_clean(html, remove_unwanted=False, use_max_length=False, max_leng
     # ------------------------------------------------------------
 
     # tree: tag nodes
+    # signature_begin = False  # try dynamic signature recognition
     quote_begin = False
     overlength = False
     cur_char_nbr = 0
     for node in root.getiterator():
-        if node.get('class') in ['WordSection1', 'MsoNormal']:
+        # root: try to tag the client used to write the html
+        if 'WordSection1' in node.get('class', '') or 'MsoNormal' in node.get('class', ''):
             root.set('msoffice', '1')
-        if node.get('class') in ['SkyDrivePlaceholder'] or node.get('id') in ['SkyDrivePlaceholder']:
+        if 'SkyDrivePlaceholder' in node.get('class', '') or 'SkyDrivePlaceholder' in node.get('id', ''):
             root.set('hotmail', '1')
 
+        # state of the parsing
         if quote_begin:
-            node.set('quote', '1')
+            node.set('in_quote', '1')
+            node.set('tail_remove', '1')
         if overlength:
-            node.set('remove', '1')
+            node.set('in_overlength', '1')
             node.set('tail_remove', '1')
 
         if root.get('msoffice') and node.tag == 'div' and 'border-top:solid' in node.get('style', ''):
             quote_begin = True
+            node.set('in_quote', '1')
+            node.set('tail_remove', '1')
         if root.get('hotmail') and node.tag == 'hr' and ('stopSpelling' in node.get('class', '') or 'stopSpelling' in node.get('id', '')):
             quote_begin = True
+            node.set('in_quote', '1')
+            node.set('tail_remove', '1')
+
+        # shorten:
+        # 1/ truncate the text at the next available space
+        # 2/ create a 'read more' node, next to current node
+        # 3/ add the truncated text in a new node, next to 'read more' node
+        if shorten and not overlength and cur_char_nbr + len(node.text or '') > max_length:
+            overlength = True
+            # truncate text
+            innertext = node.text[0:(max_length - cur_char_nbr)]
+            outertext = node.text[(max_length - cur_char_nbr):]
+            stop_idx = outertext.find(' ')
+            if stop_idx == -1:
+                stop_idx = len(outertext)
+            node.text = innertext + outertext[0:stop_idx]
+            # create <span> ... <a href="#">read more</a></span> node
+            read_more_node = _create_node('span', ' ... ', None, {'class': 'oe_mail_expand'})
+            read_more_link_node = _create_node('a', 'read more', None, {'href': '#', 'class': 'oe_mail_expand'})
+            read_more_node.append(read_more_link_node)
+            # create outertext node
+            new_node = _create_node('span', outertext[stop_idx:])
+            # add newly created nodes in dom
+            node.addnext(new_node)
+            node.addnext(read_more_node)
+            # tag node
+            new_node.set('in_overlength', '1')
 
-        if use_max_length:
-            if not overlength and cur_char_nbr + len(node.text or '') > max_length:
-                overlength = True
-                node.text = node.text[0:(max_length - cur_char_nbr)] + ' <span class="oe_mail_expand"><a href="#">... read more</a></span>'
-                node.set('tail_remove', '1')
             cur_char_nbr += len(node.text or '')
 
         if node.tag == 'blockquote' or node.get('text_quote') or node.get('text_signature'):
-            node.set('remove', '1')
-        if quote_begin:
-            node.set('remove', '1')
-            node.set('tail_remove', '1')
+            node.set('in_quote', '1')
 
     # Post processing
     # ------------------------------------------------------------
 
-    if remove_unwanted:
-        to_delete = []
-        for node in root.getiterator():
-            if node.get('remove'):
-                # copy the node tail into parent text
-                if node.tail and not node.get('tail_remove'):
-                    parent = node.getparent()
-                    parent.tail = node.tail + (parent.tail or '')
-                to_delete.append(node)
-            if node.get('tail_remove'):
-                node.tail = ''
-        for node in to_delete:
+    to_remove = []
+    for node in root.getiterator():
+        if node.get('in_quote') or node.get('in_overlength'):
+            # copy the node tail into parent text
+            if node.tail and not node.get('tail_remove'):
+                parent = node.getparent()
+                parent.tail = node.tail + (parent.tail or '')
+            to_remove.append(node)
+        if node.get('tail_remove'):
+            node.tail = ''
+    for node in to_remove:
+        if remove:
             node.getparent().remove(node)
+        else:
+            if not 'oe_mail_expand' in node.get('class', ''):  # trick: read more link should be displayed even if it's in overlength
+                node_class = node.get('class', '') + ' ' + 'oe_mail_cleaned'
+                node.set('class', node_class)
 
-    # html: \n back to <br/>
-    html = etree.tostring(root, pretty_print=True)
-    html = html.replace('__BR_TAG__', '<br />')
+    # html: \n that were tail of elements have been encapsulated into <span> -> back to \n
+    html = etree.tostring(root, pretty_print=False)
+    linebreaks = re.compile(r'<span>([\s]*[\r\n]+[\s]*)<\/span>', re.IGNORECASE | re.DOTALL)
+    html = _replace_matching_regex(linebreaks, html, '\n')
 
     return html