[REF] html_email_clean: refactored the cleaning algorithm, specifically the truncation part. It now adds classes to nodes when not removing them, so that they can be managed by the web client.

bzr revid: tde@openerp.com-20130425104301-e7mv9o4pcjx6k2cn
This commit is contained in:
Thibault Delavallée 2013-04-25 12:43:01 +02:00
parent 78ac31e260
commit ccbb8e09a6
2 changed files with 156 additions and 77 deletions

View File

@ -113,62 +113,100 @@ class TestSanitizer(unittest2.TestCase):
class TestCleaner(unittest2.TestCase): class TestCleaner(unittest2.TestCase):
""" Test the email cleaner function that filters the content of incoming emails """ """ Test the email cleaner function that filters the content of incoming emails """
def test_00_html_email_clean_text(self): def test_00_html_email_clean_signature(self):
""" html_email_clean test for signatures """
test_data = [("""This is Sparta!\n--\nAdministrator\n+9988776655""",
['This is Sparta!'],
['Administrator', '9988776655']),
("""<p>--\nAdministrator</p>""",
[],
['--', 'Administrator']),
("""<p>This is Sparta!\n---\nAdministrator</p>""",
['This is Sparta!'],
['---', 'Administrator']),
("""<p>--<br>Administrator</p>""",
[],
[]),
("""<p>This is Sparta!<br/>--<br>Administrator</p>""",
['This is Sparta!'],
[])
]
for test, in_lst, out_lst in test_data:
new_html = html_email_clean(test, remove=True)
for text in in_lst:
self.assertIn(text, new_html, 'html_email_cleaner wrongly removed content')
for text in out_lst:
self.assertNotIn(text, new_html, 'html_email_cleaner did not remove unwanted content')
def test_05_html_email_clean_signature(self):
""" html_email_clean test for quotes """
test_data = [("""This is Sparta!\n>Ah bon ?\nCertes\n> Chouette !\nClair""",
['This is Sparta!', 'Certes', 'Clair'],
['Ah bon', 'Chouette'])
]
for test, in_lst, out_lst in test_data:
new_html = html_email_clean(test, remove=True)
for text in in_lst:
self.assertIn(text, new_html, 'html_email_cleaner wrongly removed content')
for text in out_lst:
self.assertNotIn(text, new_html, 'html_email_cleaner did not remove unwanted content')
def test_10_html_email_clean_text(self):
""" html_email_clean test for text-based emails """ """ html_email_clean test for text-based emails """
new_html = html_email_clean(test_mail_examples.TEXT_1, remove_unwanted=True) new_html = html_email_clean(test_mail_examples.TEXT_1, remove=True)
for ext in test_mail_examples.TEXT_1_IN: for ext in test_mail_examples.TEXT_1_IN:
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content') self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
for ext in test_mail_examples.TEXT_1_OUT: for ext in test_mail_examples.TEXT_1_OUT:
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content') self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
new_html = html_email_clean(test_mail_examples.TEXT_2, remove_unwanted=True) new_html = html_email_clean(test_mail_examples.TEXT_2, remove=True)
for ext in test_mail_examples.TEXT_2_IN: for ext in test_mail_examples.TEXT_2_IN:
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content') self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
for ext in test_mail_examples.TEXT_2_OUT: for ext in test_mail_examples.TEXT_2_OUT:
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content') self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
def test_10_html_email_clean_html(self): def test_20_html_email_clean_html(self):
new_html = html_email_clean(test_mail_examples.HTML_1, remove_unwanted=True) new_html = html_email_clean(test_mail_examples.HTML_1, remove=True)
for ext in test_mail_examples.HTML_1_IN: for ext in test_mail_examples.HTML_1_IN:
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content') self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
for ext in test_mail_examples.HTML_1_OUT: for ext in test_mail_examples.HTML_1_OUT:
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content') self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
new_html = html_email_clean(test_mail_examples.HTML_2, remove_unwanted=False) new_html = html_email_clean(test_mail_examples.HTML_2, remove=True)
for ext in test_mail_examples.HTML_2_IN: for ext in test_mail_examples.HTML_2_IN:
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content') self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
for ext in test_mail_examples.HTML_2_OUT: for ext in test_mail_examples.HTML_2_OUT:
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content') self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
new_html = html_email_clean(test_mail_examples.HTML_3, remove_unwanted=False) new_html = html_email_clean(test_mail_examples.HTML_3, remove=False)
for ext in test_mail_examples.HTML_3_IN: for ext in test_mail_examples.HTML_3_IN:
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content') self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
# for ext in test_mail_examples.HTML_3_OUT: # for ext in test_mail_examples.HTML_3_OUT:
# self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content') # self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
def test_20_html_email_clean_msoffice(self): def test_30_html_email_clean_msoffice(self):
new_html = html_email_clean(test_mail_examples.MSOFFICE_1, remove_unwanted=True) new_html = html_email_clean(test_mail_examples.MSOFFICE_1, remove=True)
for ext in test_mail_examples.MSOFFICE_1_IN: for ext in test_mail_examples.MSOFFICE_1_IN:
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content') self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
for ext in test_mail_examples.MSOFFICE_1_OUT: for ext in test_mail_examples.MSOFFICE_1_OUT:
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content') self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
def test_30_html_email_clean_hotmail(self): def test_40_html_email_clean_hotmail(self):
new_html = html_email_clean(test_mail_examples.HOTMAIL_1, remove_unwanted=True) new_html = html_email_clean(test_mail_examples.HOTMAIL_1, remove=True)
for ext in test_mail_examples.HOTMAIL_1_IN: for ext in test_mail_examples.HOTMAIL_1_IN:
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content') self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
for ext in test_mail_examples.HOTMAIL_1_OUT: for ext in test_mail_examples.HOTMAIL_1_OUT:
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content') self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
def test_40_html_email_clean_gmail(self): def test_50_html_email_clean_gmail(self):
new_html = html_email_clean(test_mail_examples.GMAIL_1, remove_unwanted=True) new_html = html_email_clean(test_mail_examples.GMAIL_1, remove=True)
for ext in test_mail_examples.GMAIL_1_IN: for ext in test_mail_examples.GMAIL_1_IN:
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content') self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
for ext in test_mail_examples.GMAIL_1_OUT: for ext in test_mail_examples.GMAIL_1_OUT:
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content') self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
def test_50_html_email_clean_thunderbird(self): def test_60_html_email_clean_thunderbird(self):
new_html = html_email_clean(test_mail_examples.THUNDERBIRD_1, remove_unwanted=True) new_html = html_email_clean(test_mail_examples.THUNDERBIRD_1, remove=True)
for ext in test_mail_examples.THUNDERBIRD_1_IN: for ext in test_mail_examples.THUNDERBIRD_1_IN:
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content') self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
for ext in test_mail_examples.THUNDERBIRD_1_OUT: for ext in test_mail_examples.THUNDERBIRD_1_OUT:

View File

@ -75,17 +75,23 @@ def html_sanitize(src):
# HTML Cleaner # HTML Cleaner
#---------------------------------------------------------- #----------------------------------------------------------
def html_email_clean(html, remove_unwanted=False, use_max_length=False, max_length=300): def html_email_clean(html, remove=False, shorten=False, max_length=300):
""" html_email_clean: clean the html to display in the web client. """ html_email_clean: clean the html
- strip email quotes (remove blockquote nodes) - try to strip email quotes (remove blockquote nodes)
- strip signatures (remove --\n{\n)Blahblah), by replacing <br> by - try to strip signatures
\n to avoid ignoring signatures converted into html - allows having a shortened version of the html (read more/read less behavior)
:param string html: sanitized html; tags like html or head should not :param string html: sanitized html; tags like html or head should not
be present in the html string. This method therefore takes as input be present in the html string. This method therefore takes as input
html code coming from a sanitized source, like fields.html. html code coming from a sanitized source, like fields.html.
:param boolean remove: remove the html code that is unwanted; otherwise
it is only flagged and tagged
:param boolean shorten: shorten the html
:param int max_length: if shortening, maximum number of characters before
shortening
""" """
def _replace_matching_regex(regex, source, replace=''): def _replace_matching_regex(regex, source, replace=''):
""" Replace all matching expressions in source by replace """
if not source: if not source:
return source return source
dest = '' dest = ''
@ -96,35 +102,38 @@ def html_email_clean(html, remove_unwanted=False, use_max_length=False, max_leng
dest += source[idx:] dest += source[idx:]
return dest return dest
def _tag_matching_regex_in_text(regex, node, new_node_tag='span', new_node_attrs=None): def _create_node(tag, text, tail=None, attrs={}):
# print '\t_tag_matching_regex_in_text' new_node = etree.Element(tag)
new_node.text = text
new_node.tail = tail
for key, val in attrs.iteritems():
new_node.set(key, val)
return new_node
def _insert_new_node(node, index, new_node_tag, new_node_text, new_node_tail=None, new_node_attrs=None):
    """ Create a new element and insert it among the children of node.

    :param node: element that receives the new child
    :param int index: position given to node.insert()
    :param string new_node_tag: tag of the element to create
    :param string new_node_text: text of the new element
    :param string new_node_tail: tail of the new element, if any
    :param dict new_node_attrs: attributes to set on the new element;
        None (default) means no attribute
    :return: the newly created element, once inserted
    """
    # None sentinel instead of a shared mutable {} default argument
    new_node = _create_node(new_node_tag, new_node_text, new_node_tail, new_node_attrs or {})
    node.insert(index, new_node)
    return new_node
def _tag_matching_regex_in_text(regex, node, new_node_tag='span', new_node_attrs={}):
text = node.text or '' text = node.text or ''
node.text = '' if not re.search(regex, text):
return
cur_node = node cur_node = node
idx = 0 node.text = ''
caca = 0 idx, iteration = 0, 0
for item in re.finditer(regex, text): for item in re.finditer(regex, text):
# print '\t\tfound', item.start(), item.end(), '-', text[item.start():item.end()], '-' if iteration == 0:
if caca == 0:
cur_node.text = text[idx:item.start()] cur_node.text = text[idx:item.start()]
else: else:
cur_node.tail = text[idx:item.start()] _insert_new_node(node, (iteration - 1) * 2 + 1, new_node_tag, text[idx:item.start()])
new_node = _insert_new_node(node, iteration * 2, new_node_tag, text[item.start():item.end()], None, new_node_attrs)
# create element
new_node = etree.Element(new_node_tag)
new_node.text = text[item.start():item.end()]
for key, val in new_node_attrs.iteritems():
new_node.set(key, val)
# insert element in DOM
node.insert(caca, new_node)
cur_node = new_node cur_node = new_node
idx = item.end() idx = item.end()
caca += 1 iteration += 1
if caca == 0: new_node = _insert_new_node(node, -1, new_node_tag, text[idx:] + (cur_node.tail or ''), None, {})
cur_node.text = (cur_node.text or '') + text[idx:]
else:
cur_node.tail = text[idx:] + (cur_node.tail or '')
if not html or not isinstance(html, basestring): if not html or not isinstance(html, basestring):
return html return html
@ -132,30 +141,32 @@ def html_email_clean(html, remove_unwanted=False, use_max_length=False, max_leng
# Pre processing # Pre processing
# ------------------------------------------------------------ # ------------------------------------------------------------
# TDE TODO: --- MAIL ORIGINAL ---: '[\-]{4,}([^\-]*)[\-]{4,}'
# --- MAIL ORIGINAL ---: '[\-]{4,}([^\-]*)[\-]{4,}'
# html: remove encoding attribute inside tags # html: remove encoding attribute inside tags
doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL) doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
html = doctype.sub(r"", html) html = doctype.sub(r"", html)
# html: ClEditor seems to love using <div><br /><div> -> replace with <br /> # html: ClEditor seems to love using <div><br /><div> -> replace with <br />
br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)') br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)', re.IGNORECASE)
html = _replace_matching_regex(br_div_tags, html, '<br />') html = _replace_matching_regex(br_div_tags, html, '<br />')
# html: <br[ /]> -> \n, to de-obfuscate the tree
br_tags = re.compile(r'([<]\s*[bB][rR]\s*\/?[>])')
html = _replace_matching_regex(br_tags, html, '__BR_TAG__')
# form a tree # form a tree
root = lxml.html.fromstring(html) root = lxml.html.fromstring(html)
if not len(root) and root.text is None and root.tail is None: if not len(root) and root.text is None and root.tail is None:
html = '<div>%s</div>' % html html = '<div>%s</div>' % html
root = lxml.html.fromstring(html) root = lxml.html.fromstring(html)
# remove all tails and replace them by a span element, because managing text and tails can be a pain in the ass
for node in root.getiterator():
if node.tail:
tail_node = _create_node('span', node.tail)
node.tail = None
node.addnext(tail_node)
# form node and tag text-based quotes and signature # form node and tag text-based quotes and signature
quote_tags = re.compile(r'(\n(>)+[^\n\r]*)') quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
signature = re.compile(r'([-]{2}[\s]?[\r\n]{1,2}[^\z]+)') signature = re.compile(r'([-]{2,}[\s]?[\r\n]{1,2}[^.]+)')
for node in root.getiterator(): for node in root.getiterator():
_tag_matching_regex_in_text(quote_tags, node, 'span', {'text_quote': '1'}) _tag_matching_regex_in_text(quote_tags, node, 'span', {'text_quote': '1'})
_tag_matching_regex_in_text(signature, node, 'span', {'text_signature': '1'}) _tag_matching_regex_in_text(signature, node, 'span', {'text_signature': '1'})
@ -164,59 +175,89 @@ def html_email_clean(html, remove_unwanted=False, use_max_length=False, max_leng
# ------------------------------------------------------------ # ------------------------------------------------------------
# tree: tag nodes # tree: tag nodes
# signature_begin = False # try dynamic signature recognition
quote_begin = False quote_begin = False
overlength = False overlength = False
cur_char_nbr = 0 cur_char_nbr = 0
for node in root.getiterator(): for node in root.getiterator():
if node.get('class') in ['WordSection1', 'MsoNormal']: # root: try to tag the client used to write the html
if 'WordSection1' in node.get('class', '') or 'MsoNormal' in node.get('class', ''):
root.set('msoffice', '1') root.set('msoffice', '1')
if node.get('class') in ['SkyDrivePlaceholder'] or node.get('id') in ['SkyDrivePlaceholder']: if 'SkyDrivePlaceholder' in node.get('class', '') or 'SkyDrivePlaceholder' in node.get('id', ''):
root.set('hotmail', '1') root.set('hotmail', '1')
# state of the parsing
if quote_begin: if quote_begin:
node.set('quote', '1') node.set('in_quote', '1')
node.set('tail_remove', '1')
if overlength: if overlength:
node.set('remove', '1') node.set('in_overlength', '1')
node.set('tail_remove', '1') node.set('tail_remove', '1')
if root.get('msoffice') and node.tag == 'div' and 'border-top:solid' in node.get('style', ''): if root.get('msoffice') and node.tag == 'div' and 'border-top:solid' in node.get('style', ''):
quote_begin = True quote_begin = True
node.set('in_quote', '1')
node.set('tail_remove', '1')
if root.get('hotmail') and node.tag == 'hr' and ('stopSpelling' in node.get('class', '') or 'stopSpelling' in node.get('id', '')): if root.get('hotmail') and node.tag == 'hr' and ('stopSpelling' in node.get('class', '') or 'stopSpelling' in node.get('id', '')):
quote_begin = True quote_begin = True
node.set('in_quote', '1')
node.set('tail_remove', '1')
# shorten:
# 1/ truncate the text at the next available space
# 2/ create a 'read more' node, next to current node
# 3/ add the truncated text in a new node, next to 'read more' node
if shorten and not overlength and cur_char_nbr + len(node.text or '') > max_length:
overlength = True
# truncate text
innertext = node.text[0:(max_length - cur_char_nbr)]
outertext = node.text[(max_length - cur_char_nbr):]
stop_idx = outertext.find(' ')
if stop_idx == -1:
stop_idx = len(outertext)
node.text = innertext + outertext[0:stop_idx]
# create <span> ... <a href="#">read more</a></span> node
read_more_node = _create_node('span', ' ... ', None, {'class': 'oe_mail_expand'})
read_more_link_node = _create_node('a', 'read more', None, {'href': '#', 'class': 'oe_mail_expand'})
read_more_node.append(read_more_link_node)
# create outertext node
new_node = _create_node('span', outertext[stop_idx:])
# add newly created nodes in dom
node.addnext(new_node)
node.addnext(read_more_node)
# tag node
new_node.set('in_overlength', '1')
if use_max_length:
if not overlength and cur_char_nbr + len(node.text or '') > max_length:
overlength = True
node.text = node.text[0:(max_length - cur_char_nbr)] + ' <span class="oe_mail_expand"><a href="#">... read more</a></span>'
node.set('tail_remove', '1')
cur_char_nbr += len(node.text or '') cur_char_nbr += len(node.text or '')
if node.tag == 'blockquote' or node.get('text_quote') or node.get('text_signature'): if node.tag == 'blockquote' or node.get('text_quote') or node.get('text_signature'):
node.set('remove', '1') node.set('in_quote', '1')
if quote_begin:
node.set('remove', '1')
node.set('tail_remove', '1')
# Post processing # Post processing
# ------------------------------------------------------------ # ------------------------------------------------------------
if remove_unwanted: to_remove = []
to_delete = [] for node in root.getiterator():
for node in root.getiterator(): if node.get('in_quote') or node.get('in_overlength'):
if node.get('remove'): # copy the node tail into parent text
# copy the node tail into parent text if node.tail and not node.get('tail_remove'):
if node.tail and not node.get('tail_remove'): parent = node.getparent()
parent = node.getparent() parent.tail = node.tail + (parent.tail or '')
parent.tail = node.tail + (parent.tail or '') to_remove.append(node)
to_delete.append(node) if node.get('tail_remove'):
if node.get('tail_remove'): node.tail = ''
node.tail = '' for node in to_remove:
for node in to_delete: if remove:
node.getparent().remove(node) node.getparent().remove(node)
else:
if not 'oe_mail_expand' in node.get('class', ''): # trick: read more link should be displayed even if it's in overlength
node_class = node.get('class', '') + ' ' + 'oe_mail_cleaned'
node.set('class', node_class)
# html: \n back to <br/> # html: \n that were tail of elements have been encapsulated into <span> -> back to \n
html = etree.tostring(root, pretty_print=True) html = etree.tostring(root, pretty_print=False)
html = html.replace('__BR_TAG__', '<br />') linebreaks = re.compile(r'<span>([\s]*[\r\n]+[\s]*)<\/span>', re.IGNORECASE | re.DOTALL)
html = _replace_matching_regex(linebreaks, html, '\n')
return html return html