[REF] html_email_clean: refactored the cleaning algorithm, specifically the truncation part. It now adds classes to nodes when not removing them, so that they can be managed by the web client.

bzr revid: tde@openerp.com-20130425104301-e7mv9o4pcjx6k2cn
2013-04-25 12:43:01 +02:00 · 2013-04-25 12:43:01 +02:00 · ccbb8e09a6
parent 78ac31e260
commit ccbb8e09a6
2 changed files with 156 additions and 77 deletions
--- a/openerp/tests/test_mail.py
+++ b/openerp/tests/test_mail.py
@ -113,62 +113,100 @@ class TestSanitizer(unittest2.TestCase):
 class TestCleaner(unittest2.TestCase):
    """ Test the email cleaner function that filters the content of incoming emails """
-    def test_00_html_email_clean_text(self):
+    def test_00_html_email_clean_signature(self):
        """ html_email_clean test for signatures """
        test_data = [("""This is Sparta!\n--\nAdministrator\n+9988776655""",
                        ['This is Sparta!'],
                        ['Administrator', '9988776655']),
                     ("""<p>--\nAdministrator</p>""",
                        [],
                        ['--', 'Administrator']),
                     ("""<p>This is Sparta!\n---\nAdministrator</p>""",
                        ['This is Sparta!'],
                        ['---', 'Administrator']),
                     ("""<p>--<br>Administrator</p>""",
                        [],
                        []),
                     ("""<p>This is Sparta!<br/>--<br>Administrator</p>""",
                        ['This is Sparta!'],
                        [])
                    ]
        for test, in_lst, out_lst in test_data:
            new_html = html_email_clean(test, remove=True)
            for text in in_lst:
                self.assertIn(text, new_html, 'html_email_cleaner wrongly removed content')
            for text in out_lst:
                self.assertNotIn(text, new_html, 'html_email_cleaner did not remove unwanted content')
    def test_05_html_email_clean_signature(self):
        """ html_email_clean test for quotes """
        test_data = [("""This is Sparta!\n>Ah bon ?\nCertes\n> Chouette !\nClair""",
                        ['This is Sparta!', 'Certes', 'Clair'],
                        ['Ah bon', 'Chouette'])
                    ]
        for test, in_lst, out_lst in test_data:
            new_html = html_email_clean(test, remove=True)
            for text in in_lst:
                self.assertIn(text, new_html, 'html_email_cleaner wrongly removed content')
            for text in out_lst:
                self.assertNotIn(text, new_html, 'html_email_cleaner did not remove unwanted content')
    def test_10_html_email_clean_text(self):
        """ html_email_clean test for text-based emails """
-        new_html = html_email_clean(test_mail_examples.TEXT_1, remove_unwanted=True)
+        new_html = html_email_clean(test_mail_examples.TEXT_1, remove=True)
        for ext in test_mail_examples.TEXT_1_IN:
            self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
        for ext in test_mail_examples.TEXT_1_OUT:
            self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
-        new_html = html_email_clean(test_mail_examples.TEXT_2, remove_unwanted=True)
+        new_html = html_email_clean(test_mail_examples.TEXT_2, remove=True)
        for ext in test_mail_examples.TEXT_2_IN:
            self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
        for ext in test_mail_examples.TEXT_2_OUT:
            self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
-    def test_10_html_email_clean_html(self):
+    def test_20_html_email_clean_html(self):
-        new_html = html_email_clean(test_mail_examples.HTML_1, remove_unwanted=True)
+        new_html = html_email_clean(test_mail_examples.HTML_1, remove=True)
        for ext in test_mail_examples.HTML_1_IN:
            self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
        for ext in test_mail_examples.HTML_1_OUT:
            self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
-        new_html = html_email_clean(test_mail_examples.HTML_2, remove_unwanted=False)
+        new_html = html_email_clean(test_mail_examples.HTML_2, remove=True)
        for ext in test_mail_examples.HTML_2_IN:
            self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
        for ext in test_mail_examples.HTML_2_OUT:
            self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
-        new_html = html_email_clean(test_mail_examples.HTML_3, remove_unwanted=False)
+        new_html = html_email_clean(test_mail_examples.HTML_3, remove=False)
        for ext in test_mail_examples.HTML_3_IN:
            self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
        # for ext in test_mail_examples.HTML_3_OUT:
        #     self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
-    def test_20_html_email_clean_msoffice(self):
+    def test_30_html_email_clean_msoffice(self):
-        new_html = html_email_clean(test_mail_examples.MSOFFICE_1, remove_unwanted=True)
+        new_html = html_email_clean(test_mail_examples.MSOFFICE_1, remove=True)
        for ext in test_mail_examples.MSOFFICE_1_IN:
            self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
        for ext in test_mail_examples.MSOFFICE_1_OUT:
            self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
-    def test_30_html_email_clean_hotmail(self):
+    def test_40_html_email_clean_hotmail(self):
-        new_html = html_email_clean(test_mail_examples.HOTMAIL_1, remove_unwanted=True)
+        new_html = html_email_clean(test_mail_examples.HOTMAIL_1, remove=True)
        for ext in test_mail_examples.HOTMAIL_1_IN:
            self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
        for ext in test_mail_examples.HOTMAIL_1_OUT:
            self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
-    def test_40_html_email_clean_gmail(self):
+    def test_50_html_email_clean_gmail(self):
-        new_html = html_email_clean(test_mail_examples.GMAIL_1, remove_unwanted=True)
+        new_html = html_email_clean(test_mail_examples.GMAIL_1, remove=True)
        for ext in test_mail_examples.GMAIL_1_IN:
            self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
        for ext in test_mail_examples.GMAIL_1_OUT:
            self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
-    def test_50_html_email_clean_thunderbird(self):
+    def test_60_html_email_clean_thunderbird(self):
-        new_html = html_email_clean(test_mail_examples.THUNDERBIRD_1, remove_unwanted=True)
+        new_html = html_email_clean(test_mail_examples.THUNDERBIRD_1, remove=True)
        for ext in test_mail_examples.THUNDERBIRD_1_IN:
            self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
        for ext in test_mail_examples.THUNDERBIRD_1_OUT:
--- a/openerp/tools/mail.py
+++ b/openerp/tools/mail.py
@ -75,17 +75,23 @@ def html_sanitize(src):
 # HTML Cleaner
 #----------------------------------------------------------
-def html_email_clean(html, remove_unwanted=False, use_max_length=False, max_length=300):
+def html_email_clean(html, remove=False, shorten=False, max_length=300):
-    """ html_email_clean: clean the html to display in the web client.
+    """ html_email_clean: clean the html
-        - strip email quotes (remove blockquote nodes)
+        - try to strip email quotes (remove blockquote nodes)
-        - strip signatures (remove --\n{\n)Blahblah), by replacing <br> by
+        - try to strip signatures
-            \n to avoid ignoring signatures converted into html
+        - allows having a shortened version of the html (read more/read less behavior)
        :param string html: sanitized html; tags like html or head should not
            be present in the html string. This method therefore takes as input
            html code coming from a sanitized source, like fields.html.
        :param boolean remove: remove the html code that is unwanted; otherwise
            it is only flagged and tagged
        :param boolean shorten: shorten the html
        :param int max_length: if shortening, maximum number of characters before
            shortening
    """
    def _replace_matching_regex(regex, source, replace=''):
        """ Replace all matching expressions in source by replace """
        if not source:
            return source
        dest = ''
@ -96,35 +102,38 @@ def html_email_clean(html, remove_unwanted=False, use_max_length=False, max_leng
        dest += source[idx:]
        return dest
-    def _tag_matching_regex_in_text(regex, node, new_node_tag='span', new_node_attrs=None):
+    def _create_node(tag, text, tail=None, attrs={}):
-        # print '\t_tag_matching_regex_in_text'
+        new_node = etree.Element(tag)
        new_node.text = text
        new_node.tail = tail
        for key, val in attrs.iteritems():
            new_node.set(key, val)
        return new_node
    def _insert_new_node(node, index, new_node_tag, new_node_text, new_node_tail=None, new_node_attrs={}):
        new_node = _create_node(new_node_tag, new_node_text, new_node_tail, new_node_attrs)
        node.insert(index, new_node)
        return new_node
    def _tag_matching_regex_in_text(regex, node, new_node_tag='span', new_node_attrs={}):
        text = node.text or ''
-        node.text = ''
+        if not re.search(regex, text):
            return
        cur_node = node
-        idx = 0
+        node.text = ''
-        caca = 0
+        idx, iteration = 0, 0
        for item in re.finditer(regex, text):
-            # print '\t\tfound', item.start(), item.end(), '-', text[item.start():item.end()], '-'
+            if iteration == 0:
            if caca == 0:
                cur_node.text = text[idx:item.start()]
            else:
-                cur_node.tail = text[idx:item.start()]
+                _insert_new_node(node, (iteration - 1) * 2 + 1, new_node_tag, text[idx:item.start()])
            new_node = _insert_new_node(node, iteration * 2, new_node_tag, text[item.start():item.end()], None, new_node_attrs)
            # create element
            new_node = etree.Element(new_node_tag)
            new_node.text = text[item.start():item.end()]
            for key, val in new_node_attrs.iteritems():
                new_node.set(key, val)
            # insert element in DOM
            node.insert(caca, new_node)
            cur_node = new_node
            idx = item.end()
-            caca += 1
+            iteration += 1
-        if caca == 0:
+        new_node = _insert_new_node(node, -1, new_node_tag, text[idx:] + (cur_node.tail or ''), None, {})
            cur_node.text = (cur_node.text or '') + text[idx:]
        else:
            cur_node.tail = text[idx:] + (cur_node.tail or '')
    if not html or not isinstance(html, basestring):
        return html
@ -132,30 +141,32 @@ def html_email_clean(html, remove_unwanted=False, use_max_length=False, max_leng
    # Pre processing
    # ------------------------------------------------------------
-
+    # TDE TODO: --- MAIL ORIGINAL ---: '[\-]{4,}([^\-]*)[\-]{4,}'
    # --- MAIL ORIGINAL ---: '[\-]{4,}([^\-]*)[\-]{4,}'
    # html: remove encoding attribute inside tags
    doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
    html = doctype.sub(r"", html)
    # html: ClEditor seems to love using <div><br /><div> -> replace with <br />
-    br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)')
+    br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)', re.IGNORECASE)
    html = _replace_matching_regex(br_div_tags, html, '<br />')
    # html: <br[ /]> -> \n, to de-obfuscate the tree
    br_tags = re.compile(r'([<]\s*[bB][rR]\s*\/?[>])')
    html = _replace_matching_regex(br_tags, html, '__BR_TAG__')
    # form a tree
    root = lxml.html.fromstring(html)
    if not len(root) and root.text is None and root.tail is None:
        html = '<div>%s</div>' % html
        root = lxml.html.fromstring(html)
    # remove all tails and replace them with span elements, because handling both text and tails is error-prone
    for node in root.getiterator():
        if node.tail:
            tail_node = _create_node('span', node.tail)
            node.tail = None
            node.addnext(tail_node)
    # form node and tag text-based quotes and signature
    quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
-    signature = re.compile(r'([-]{2}[\s]?[\r\n]{1,2}[^\z]+)')
+    signature = re.compile(r'([-]{2,}[\s]?[\r\n]{1,2}[^.]+)')
    for node in root.getiterator():
        _tag_matching_regex_in_text(quote_tags, node, 'span', {'text_quote': '1'})
        _tag_matching_regex_in_text(signature, node, 'span', {'text_signature': '1'})
@ -164,59 +175,89 @@ def html_email_clean(html, remove_unwanted=False, use_max_length=False, max_leng
    # ------------------------------------------------------------
    # tree: tag nodes
    # signature_begin = False  # try dynamic signature recognition
    quote_begin = False
    overlength = False
    cur_char_nbr = 0
    for node in root.getiterator():
-        if node.get('class') in ['WordSection1', 'MsoNormal']:
+        # root: try to tag the client used to write the html
        if 'WordSection1' in node.get('class', '') or 'MsoNormal' in node.get('class', ''):
            root.set('msoffice', '1')
-        if node.get('class') in ['SkyDrivePlaceholder'] or node.get('id') in ['SkyDrivePlaceholder']:
+        if 'SkyDrivePlaceholder' in node.get('class', '') or 'SkyDrivePlaceholder' in node.get('id', ''):
            root.set('hotmail', '1')
        # state of the parsing
        if quote_begin:
-            node.set('quote', '1')
+            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        if overlength:
-            node.set('remove', '1')
+            node.set('in_overlength', '1')
            node.set('tail_remove', '1')
        if root.get('msoffice') and node.tag == 'div' and 'border-top:solid' in node.get('style', ''):
            quote_begin = True
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        if root.get('hotmail') and node.tag == 'hr' and ('stopSpelling' in node.get('class', '') or 'stopSpelling' in node.get('id', '')):
            quote_begin = True
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        # shorten:
        # 1/ truncate the text at the next available space
        # 2/ create a 'read more' node, next to current node
        # 3/ add the truncated text in a new node, next to 'read more' node
        if shorten and not overlength and cur_char_nbr + len(node.text or '') > max_length:
            overlength = True
            # truncate text
            innertext = node.text[0:(max_length - cur_char_nbr)]
            outertext = node.text[(max_length - cur_char_nbr):]
            stop_idx = outertext.find(' ')
            if stop_idx == -1:
                stop_idx = len(outertext)
            node.text = innertext + outertext[0:stop_idx]
            # create <span> ... <a href="#">read more</a></span> node
            read_more_node = _create_node('span', ' ... ', None, {'class': 'oe_mail_expand'})
            read_more_link_node = _create_node('a', 'read more', None, {'href': '#', 'class': 'oe_mail_expand'})
            read_more_node.append(read_more_link_node)
            # create outertext node
            new_node = _create_node('span', outertext[stop_idx:])
            # add newly created nodes in dom
            node.addnext(new_node)
            node.addnext(read_more_node)
            # tag node
            new_node.set('in_overlength', '1')
        if use_max_length:
            if not overlength and cur_char_nbr + len(node.text or '') > max_length:
                overlength = True
                node.text = node.text[0:(max_length - cur_char_nbr)] + ' <span class="oe_mail_expand"><a href="#">... read more</a></span>'
                node.set('tail_remove', '1')
            cur_char_nbr += len(node.text or '')
        if node.tag == 'blockquote' or node.get('text_quote') or node.get('text_signature'):
-            node.set('remove', '1')
+            node.set('in_quote', '1')
        if quote_begin:
            node.set('remove', '1')
            node.set('tail_remove', '1')
    # Post processing
    # ------------------------------------------------------------
-    if remove_unwanted:
+    to_remove = []
-        to_delete = []
+    for node in root.getiterator():
-        for node in root.getiterator():
+        if node.get('in_quote') or node.get('in_overlength'):
-            if node.get('remove'):
+            # copy the node tail into parent text
-                # copy the node tail into parent text
+            if node.tail and not node.get('tail_remove'):
-                if node.tail and not node.get('tail_remove'):
+                parent = node.getparent()
-                    parent = node.getparent()
+                parent.tail = node.tail + (parent.tail or '')
-                    parent.tail = node.tail + (parent.tail or '')
+            to_remove.append(node)
-                to_delete.append(node)
+        if node.get('tail_remove'):
-            if node.get('tail_remove'):
+            node.tail = ''
-                node.tail = ''
+    for node in to_remove:
-        for node in to_delete:
+        if remove:
            node.getparent().remove(node)
        else:
            if not 'oe_mail_expand' in node.get('class', ''):  # trick: read more link should be displayed even if it's in overlength
                node_class = node.get('class', '') + ' ' + 'oe_mail_cleaned'
                node.set('class', node_class)
-    # html: \n back to <br/>
+    # html: \n that were tail of elements have been encapsulated into <span> -> back to \n
-    html = etree.tostring(root, pretty_print=True)
+    html = etree.tostring(root, pretty_print=False)
-    html = html.replace('__BR_TAG__', '<br />')
+    linebreaks = re.compile(r'<span>([\s]*[\r\n]+[\s]*)<\/span>', re.IGNORECASE | re.DOTALL)
    html = _replace_matching_regex(linebreaks, html, '\n')
    return html