[IMP] tools: mail: improved basic html_email_clean.

bzr revid: tde@openerp.com-20121107085807-td2okmbr61pm4g1e
This commit is contained in:
Thibault Delavallée 2012-11-07 09:58:07 +01:00
parent ee6e9894a4
commit 3ba739cbf5
1 changed files with 24 additions and 18 deletions

View File

@ -19,11 +19,11 @@
#
##############################################################################
from lxml.etree import tostring, Element
try:
from lxml.html.soupparser import fromstring as parser_fromstring
except ImportError:
from lxml.html import fromstring as parser_fromstring
from lxml import etree
# try:
# from lxml.html.soupparser import fromstring as parser_fromstring
# except ImportError:
# from lxml.html import fromstring as parser_fromstring
import logging
import lxml.html
import openerp.pooler as pooler
@ -122,15 +122,16 @@ def html_email_clean(html):
br_tags = re.compile(r'([<]\s*br\s*\/?[>])')
idx = 0
for item in re.finditer(br_tags, html):
modified_html += html[idx:item.start()] + '\n'
modified_html += html[idx:item.start()] + '__BR_TAG__'
idx = item.end()
modified_html += html[idx:]
html = modified_html
# 2. form a tree, handle (currently ?) pure-text by enclosing them in a pre
root = parser_fromstring(modified_html)
root = lxml.html.fromstring(html)
if not len(root) and root.text is None and root.tail is None:
modified_html = '<div>%s</div>' % modified_html
root = parser_fromstring(modified_html)
html = '<div>%s</div>' % html
root = lxml.html.fromstring(html)
# 3. remove blockquotes
quotes = [el for el in root.iterchildren(tag='blockquote')]
@ -150,16 +151,21 @@ def html_email_clean(html):
elem.tail = elem.tail[:match.start()] + elem.tail[match.end():]
# 5. \n back to <br/>
for el in root.iterchildren():
if el.tag == 'pre':
continue
if el.text:
el.text = el.text.replace('\n', '<br />')
if el.tail:
el.tail = el.tail.replace('\n', '<br />')
html = etree.tostring(root, pretty_print=True)
html = html.replace('__BR_TAG__', '<br />')
new_html = tostring(root, pretty_print=True)
return new_html
# 6. Misc cleaning :
# - ClEditor seems to love using <div><br /><div> -> replace with <br />
modified_html = ''
br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)')
idx = 0
for item in re.finditer(br_div_tags, html):
modified_html += html[idx:item.start()] + '<br />'
idx = item.end()
modified_html += html[idx:]
html = modified_html
return html
#----------------------------------------------------------