[IMP] tools: mail: added html_cleaner, to clean html code to display in chatter (remove quotes, signatures, ...).

bzr revid: tde@openerp.com-20121106121737-ej57bijc7ypksu8v
This commit is contained in:
Thibault Delavallée 2012-11-06 13:17:37 +01:00
parent 5f53f5253b
commit 1d52a68dd9
1 changed files with 58 additions and 0 deletions

View File

@ -19,6 +19,8 @@
#
##############################################################################
from lxml.html.soupparser import fromstring
from lxml.etree import tostring
import lxml.html
import openerp.pooler as pooler
import operator
@ -94,6 +96,62 @@ def append_to(elements, dest_node):
dest_node.append(element)
#----------------------------------------------------------
# HTML Cleaner
#----------------------------------------------------------
def html_email_clean(html):
""" html_email_clean: clean the html to display in the web client.
- strip email quotes (remove blockquote nodes)
- strip signatures (remove --\n{\n)Blahblah), by replacing <br> by
\n to avoid ignoring signatures converted into html
"""
modified_html = ''
# 1. <br[ /]> -> \n, because otherwise the tree is obfuscated
br_tags = re.compile(r'([<]\s*br\s*\/?[>])')
idx = 0
for item in re.finditer(br_tags, html):
modified_html += html[idx:item.start()] + '\n'
idx = item.end()
modified_html += html[idx:]
# 2. form a tree, handle (currently ?) pure-text by enclosing them in a pre
root = fromstring(modified_html)
if not len(root) and root.text is None and root.tail is None:
modified_html = '<div>%s</div>' % modified_html
root = fromstring(modified_html)
# 3. remove blockquotes
quotes = [el for el in root.iterchildren(tag='blockquote')]
for node in quotes:
node.getparent().remove(node)
# 4. strip signatures
signature = re.compile(r'([-]{2}[\s]?[\r\n]{1,2}[^\z]+)')
for elem in root.getiterator():
if elem.text:
match = re.search(signature, elem.text)
if match:
elem.text = elem.text[:match.start()] + elem.text[match.end():]
if elem.tail:
match = re.search(signature, elem.tail)
if match:
elem.tail = elem.tail[:match.start()] + elem.tail[match.end():]
# 5. \n back to <br/>
for el in root.iterchildren():
if el.tag == 'pre':
continue
if el.text:
el.text = el.text.replace('\n', '<br />')
if el.tail:
el.tail = el.tail.replace('\n', '<br />')
new_html = tostring(root, pretty_print=True)
return new_html
#----------------------------------------------------------
# Emails
#----------------------------------------------------------