[IMP] tools: mail: added html_email_clean, to clean HTML before displaying it in the chatter (removes quotes, signatures, ...).

bzr revid: tde@openerp.com-20121106121737-ej57bijc7ypksu8v
2012-11-06 13:17:37 +01:00 · 2012-11-06 13:17:37 +01:00 · 1d52a68dd9
parent 5f53f5253b
commit 1d52a68dd9
1 changed files with 58 additions and 0 deletions
--- a/openerp/tools/mail.py
+++ b/openerp/tools/mail.py
@ -19,6 +19,8 @@
 #
 ##############################################################################

+from lxml.html.soupparser import fromstring
+from lxml.etree import tostring
 import lxml.html
 import openerp.pooler as pooler
 import operator
@ -94,6 +96,62 @@ def append_to(elements, dest_node):
            dest_node.append(element)


+#----------------------------------------------------------
+# HTML Cleaner
+#----------------------------------------------------------
+
+def html_email_clean(html):
+    """ Clean the html of an email so it can be displayed in the web client.
+
+        - <br> tags are turned into newlines before parsing, so that a
+          signature spread over several <br>-separated lines ends up in a
+          single text node
+        - email quotes are stripped: every blockquote node is removed,
+          whatever its depth; the text following the quote is kept
+        - signatures are stripped: everything from a '--' delimiter line up
+          to the end of the text node it appears in
+        - remaining newlines are turned back into real <br/> nodes, except
+          inside pre, script, style and textarea
+
+        :param html: html source (or plain text) of the email body
+        :return: the cleaned tree, serialized with
+                 ``tostring(root, pretty_print=True)``
+    """
+    # 1. <br[ /]> -> newline, because otherwise the tree is obfuscated.
+    #    Tag names are case-insensitive in html, hence IGNORECASE.
+    br_tags = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)
+    modified_html = br_tags.sub('\n', html)
+
+    # 2. form a tree; input that yields an empty tree is enclosed in a div
+    #    and parsed again
+    root = fromstring(modified_html)
+    if not len(root) and root.text is None and root.tail is None:
+        root = fromstring('<div>%s</div>' % modified_html)
+
+    # 3. remove blockquotes, at any depth. The list is built first because
+    #    the tree is modified while the quotes are dropped.
+    for quote in list(root.iter('blockquote')):
+        _html_clean_drop_node(quote)
+
+    # 4. strip signatures: '--', an optional whitespace, a line break, then
+    #    everything up to the end of the text ([\s\S] also matches newlines)
+    signature = re.compile(r'--\s?[\r\n]{1,2}[\s\S]+')
+    for elem in root.iter():
+        if elem.text:
+            match = signature.search(elem.text)
+            if match:
+                elem.text = elem.text[:match.start()]
+        if elem.tail:
+            match = signature.search(elem.tail)
+            if match:
+                elem.tail = elem.tail[:match.start()]
+
+    # 5. newlines back to <br/>. Real nodes are inserted: writing the string
+    #    '<br />' into a text node would be escaped by the serializer. The
+    #    list is built first because nodes are inserted during the walk.
+    for elem in list(root.iter()):
+        _html_clean_newlines_to_br(elem)
+
+    return tostring(root, pretty_print=True)
+
+
+def _html_clean_drop_node(node):
+    """ Remove ``node`` from its parent while keeping its tail text.
+
+        ``remove()`` discards the tail together with the node, which would
+        delete the text written right after it; the tail is therefore moved
+        to the previous sibling, or to the parent text, beforehand.
+    """
+    parent = node.getparent()
+    if parent is None:
+        # the root of the tree cannot be removed
+        return
+    if node.tail:
+        previous = node.getprevious()
+        if previous is not None:
+            previous.tail = (previous.tail or '') + node.tail
+        else:
+            parent.text = (parent.text or '') + node.tail
+    parent.remove(node)
+
+
+def _html_clean_newlines_to_br(elem):
+    """ Replace the newlines of ``elem.text`` and ``elem.tail`` by br nodes.
+
+        Content of pre, script, style and textarea is left untouched, as
+        newlines are meaningful there.
+    """
+    raw_text_tags = ('pre', 'script', 'style', 'textarea')
+    for ancestor in elem.iterancestors():
+        if ancestor.tag in raw_text_tags:
+            # both text and tail of elem belong to raw-text content
+            return
+
+    # comments and processing instructions have a callable tag and cannot
+    # hold children, so only their tail is handled
+    is_element = not callable(elem.tag)
+    if is_element and elem.tag not in raw_text_tags \
+            and elem.text and '\n' in elem.text:
+        chunks = elem.text.split('\n')
+        elem.text = chunks[0]
+        for position, chunk in enumerate(chunks[1:]):
+            br = elem.makeelement('br')
+            br.tail = chunk
+            elem.insert(position, br)
+
+    # the tail lies outside of elem, hence it is converted even when elem
+    # itself is a raw-text tag
+    parent = elem.getparent()
+    if parent is not None and elem.tail and '\n' in elem.tail:
+        chunks = elem.tail.split('\n')
+        elem.tail = chunks[0]
+        position = parent.index(elem) + 1
+        for offset, chunk in enumerate(chunks[1:]):
+            br = parent.makeelement('br')
+            br.tail = chunk
+            parent.insert(position + offset, br)
+
+
 #----------------------------------------------------------
 # Emails
 #----------------------------------------------------------