[IMP] tools: html_email_clean: a bit more robust.

bzr revid: tde@openerp.com-20121114141452-n5jy3yzbmlbd3811
This commit is contained in:
Thibault Delavallée 2012-11-14 15:14:52 +01:00
parent 8e15ab0dff
commit 23743683ea
1 changed file with 1 addition and 2 deletions

View File

@@ -133,9 +133,8 @@ def html_email_clean(html):
html = ustr(html)
# 1. <br[ /]> -> \n, because otherwise the tree is obfuscated
br_tags = re.compile(r'([<]\s*br\s*\/?[>])')
br_tags = re.compile(r'([<]\s*[bB][rR]\s*\/?[>])')
html = _replace_matching_regex(br_tags, html, '__BR_TAG__')
# TDE note: emails seem to contain lots of <div><br></div>... this needs to be checked and could be cleaned up
# 2. form a tree, handle (currently ?) pure-text by enclosing them in a pre
root = lxml.html.fromstring(html)