[REF] html_email_clean: refactored the cleaning algorithm, specifically the truncation logic. It now adds classes to nodes when not removing them, so that they can be managed by the web client.
bzr revid: tde@openerp.com-20130425104301-e7mv9o4pcjx6k2cn
This commit is contained in:
parent
78ac31e260
commit
ccbb8e09a6
|
@ -113,62 +113,100 @@ class TestSanitizer(unittest2.TestCase):
|
||||||
class TestCleaner(unittest2.TestCase):
|
class TestCleaner(unittest2.TestCase):
|
||||||
""" Test the email cleaner function that filters the content of incoming emails """
|
""" Test the email cleaner function that filters the content of incoming emails """
|
||||||
|
|
||||||
def test_00_html_email_clean_text(self):
|
def test_00_html_email_clean_signature(self):
|
||||||
|
""" html_email_clean test for signatures """
|
||||||
|
test_data = [("""This is Sparta!\n--\nAdministrator\n+9988776655""",
|
||||||
|
['This is Sparta!'],
|
||||||
|
['Administrator', '9988776655']),
|
||||||
|
("""<p>--\nAdministrator</p>""",
|
||||||
|
[],
|
||||||
|
['--', 'Administrator']),
|
||||||
|
("""<p>This is Sparta!\n---\nAdministrator</p>""",
|
||||||
|
['This is Sparta!'],
|
||||||
|
['---', 'Administrator']),
|
||||||
|
("""<p>--<br>Administrator</p>""",
|
||||||
|
[],
|
||||||
|
[]),
|
||||||
|
("""<p>This is Sparta!<br/>--<br>Administrator</p>""",
|
||||||
|
['This is Sparta!'],
|
||||||
|
[])
|
||||||
|
]
|
||||||
|
for test, in_lst, out_lst in test_data:
|
||||||
|
new_html = html_email_clean(test, remove=True)
|
||||||
|
for text in in_lst:
|
||||||
|
self.assertIn(text, new_html, 'html_email_cleaner wrongly removed content')
|
||||||
|
for text in out_lst:
|
||||||
|
self.assertNotIn(text, new_html, 'html_email_cleaner did not remove unwanted content')
|
||||||
|
|
||||||
|
def test_05_html_email_clean_signature(self):
|
||||||
|
""" html_email_clean test for quotes """
|
||||||
|
test_data = [("""This is Sparta!\n>Ah bon ?\nCertes\n> Chouette !\nClair""",
|
||||||
|
['This is Sparta!', 'Certes', 'Clair'],
|
||||||
|
['Ah bon', 'Chouette'])
|
||||||
|
]
|
||||||
|
for test, in_lst, out_lst in test_data:
|
||||||
|
new_html = html_email_clean(test, remove=True)
|
||||||
|
for text in in_lst:
|
||||||
|
self.assertIn(text, new_html, 'html_email_cleaner wrongly removed content')
|
||||||
|
for text in out_lst:
|
||||||
|
self.assertNotIn(text, new_html, 'html_email_cleaner did not remove unwanted content')
|
||||||
|
|
||||||
|
def test_10_html_email_clean_text(self):
|
||||||
""" html_email_clean test for text-based emails """
|
""" html_email_clean test for text-based emails """
|
||||||
new_html = html_email_clean(test_mail_examples.TEXT_1, remove_unwanted=True)
|
new_html = html_email_clean(test_mail_examples.TEXT_1, remove=True)
|
||||||
for ext in test_mail_examples.TEXT_1_IN:
|
for ext in test_mail_examples.TEXT_1_IN:
|
||||||
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
|
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
|
||||||
for ext in test_mail_examples.TEXT_1_OUT:
|
for ext in test_mail_examples.TEXT_1_OUT:
|
||||||
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
|
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
|
||||||
|
|
||||||
new_html = html_email_clean(test_mail_examples.TEXT_2, remove_unwanted=True)
|
new_html = html_email_clean(test_mail_examples.TEXT_2, remove=True)
|
||||||
for ext in test_mail_examples.TEXT_2_IN:
|
for ext in test_mail_examples.TEXT_2_IN:
|
||||||
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
|
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
|
||||||
for ext in test_mail_examples.TEXT_2_OUT:
|
for ext in test_mail_examples.TEXT_2_OUT:
|
||||||
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
|
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
|
||||||
|
|
||||||
def test_10_html_email_clean_html(self):
|
def test_20_html_email_clean_html(self):
|
||||||
new_html = html_email_clean(test_mail_examples.HTML_1, remove_unwanted=True)
|
new_html = html_email_clean(test_mail_examples.HTML_1, remove=True)
|
||||||
for ext in test_mail_examples.HTML_1_IN:
|
for ext in test_mail_examples.HTML_1_IN:
|
||||||
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
|
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
|
||||||
for ext in test_mail_examples.HTML_1_OUT:
|
for ext in test_mail_examples.HTML_1_OUT:
|
||||||
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
|
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
|
||||||
|
|
||||||
new_html = html_email_clean(test_mail_examples.HTML_2, remove_unwanted=False)
|
new_html = html_email_clean(test_mail_examples.HTML_2, remove=True)
|
||||||
for ext in test_mail_examples.HTML_2_IN:
|
for ext in test_mail_examples.HTML_2_IN:
|
||||||
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
|
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
|
||||||
for ext in test_mail_examples.HTML_2_OUT:
|
for ext in test_mail_examples.HTML_2_OUT:
|
||||||
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
|
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
|
||||||
|
|
||||||
new_html = html_email_clean(test_mail_examples.HTML_3, remove_unwanted=False)
|
new_html = html_email_clean(test_mail_examples.HTML_3, remove=False)
|
||||||
for ext in test_mail_examples.HTML_3_IN:
|
for ext in test_mail_examples.HTML_3_IN:
|
||||||
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
|
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
|
||||||
# for ext in test_mail_examples.HTML_3_OUT:
|
# for ext in test_mail_examples.HTML_3_OUT:
|
||||||
# self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
|
# self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
|
||||||
|
|
||||||
def test_20_html_email_clean_msoffice(self):
|
def test_30_html_email_clean_msoffice(self):
|
||||||
new_html = html_email_clean(test_mail_examples.MSOFFICE_1, remove_unwanted=True)
|
new_html = html_email_clean(test_mail_examples.MSOFFICE_1, remove=True)
|
||||||
for ext in test_mail_examples.MSOFFICE_1_IN:
|
for ext in test_mail_examples.MSOFFICE_1_IN:
|
||||||
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
|
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
|
||||||
for ext in test_mail_examples.MSOFFICE_1_OUT:
|
for ext in test_mail_examples.MSOFFICE_1_OUT:
|
||||||
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
|
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
|
||||||
|
|
||||||
def test_30_html_email_clean_hotmail(self):
|
def test_40_html_email_clean_hotmail(self):
|
||||||
new_html = html_email_clean(test_mail_examples.HOTMAIL_1, remove_unwanted=True)
|
new_html = html_email_clean(test_mail_examples.HOTMAIL_1, remove=True)
|
||||||
for ext in test_mail_examples.HOTMAIL_1_IN:
|
for ext in test_mail_examples.HOTMAIL_1_IN:
|
||||||
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
|
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
|
||||||
for ext in test_mail_examples.HOTMAIL_1_OUT:
|
for ext in test_mail_examples.HOTMAIL_1_OUT:
|
||||||
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
|
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
|
||||||
|
|
||||||
def test_40_html_email_clean_gmail(self):
|
def test_50_html_email_clean_gmail(self):
|
||||||
new_html = html_email_clean(test_mail_examples.GMAIL_1, remove_unwanted=True)
|
new_html = html_email_clean(test_mail_examples.GMAIL_1, remove=True)
|
||||||
for ext in test_mail_examples.GMAIL_1_IN:
|
for ext in test_mail_examples.GMAIL_1_IN:
|
||||||
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
|
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
|
||||||
for ext in test_mail_examples.GMAIL_1_OUT:
|
for ext in test_mail_examples.GMAIL_1_OUT:
|
||||||
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
|
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
|
||||||
|
|
||||||
def test_50_html_email_clean_thunderbird(self):
|
def test_60_html_email_clean_thunderbird(self):
|
||||||
new_html = html_email_clean(test_mail_examples.THUNDERBIRD_1, remove_unwanted=True)
|
new_html = html_email_clean(test_mail_examples.THUNDERBIRD_1, remove=True)
|
||||||
for ext in test_mail_examples.THUNDERBIRD_1_IN:
|
for ext in test_mail_examples.THUNDERBIRD_1_IN:
|
||||||
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
|
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
|
||||||
for ext in test_mail_examples.THUNDERBIRD_1_OUT:
|
for ext in test_mail_examples.THUNDERBIRD_1_OUT:
|
||||||
|
|
|
@ -75,17 +75,23 @@ def html_sanitize(src):
|
||||||
# HTML Cleaner
|
# HTML Cleaner
|
||||||
#----------------------------------------------------------
|
#----------------------------------------------------------
|
||||||
|
|
||||||
def html_email_clean(html, remove_unwanted=False, use_max_length=False, max_length=300):
|
def html_email_clean(html, remove=False, shorten=False, max_length=300):
|
||||||
""" html_email_clean: clean the html to display in the web client.
|
""" html_email_clean: clean the html
|
||||||
- strip email quotes (remove blockquote nodes)
|
- try to strip email quotes (remove blockquote nodes)
|
||||||
- strip signatures (remove --\n{\n)Blahblah), by replacing <br> by
|
- try to strip signatures
|
||||||
\n to avoid ignoring signatures converted into html
|
- allows having a shortened version of the html (read more/read less behavior)
|
||||||
|
|
||||||
:param string html: sanitized html; tags like html or head should not
|
:param string html: sanitized html; tags like html or head should not
|
||||||
be present in the html string. This method therefore takes as input
|
be present in the html string. This method therefore takes as input
|
||||||
html code coming from a sanitized source, like fields.html.
|
html code coming from a sanitized source, like fields.html.
|
||||||
|
:param boolean remove: remove the html code that is unwanted; otherwise
|
||||||
|
it is only flagged and tagged
|
||||||
|
:param boolean shorten: shorten the html
|
||||||
|
:param int max_length: if shortening, maximum number of characters before
|
||||||
|
shortening
|
||||||
"""
|
"""
|
||||||
def _replace_matching_regex(regex, source, replace=''):
|
def _replace_matching_regex(regex, source, replace=''):
|
||||||
|
""" Replace all matching expressions in source by replace """
|
||||||
if not source:
|
if not source:
|
||||||
return source
|
return source
|
||||||
dest = ''
|
dest = ''
|
||||||
|
@ -96,35 +102,38 @@ def html_email_clean(html, remove_unwanted=False, use_max_length=False, max_leng
|
||||||
dest += source[idx:]
|
dest += source[idx:]
|
||||||
return dest
|
return dest
|
||||||
|
|
||||||
def _tag_matching_regex_in_text(regex, node, new_node_tag='span', new_node_attrs=None):
|
def _create_node(tag, text, tail=None, attrs={}):
|
||||||
# print '\t_tag_matching_regex_in_text'
|
new_node = etree.Element(tag)
|
||||||
|
new_node.text = text
|
||||||
|
new_node.tail = tail
|
||||||
|
for key, val in attrs.iteritems():
|
||||||
|
new_node.set(key, val)
|
||||||
|
return new_node
|
||||||
|
|
||||||
|
def _insert_new_node(node, index, new_node_tag, new_node_text, new_node_tail=None, new_node_attrs={}):
|
||||||
|
new_node = _create_node(new_node_tag, new_node_text, new_node_tail, new_node_attrs)
|
||||||
|
node.insert(index, new_node)
|
||||||
|
return new_node
|
||||||
|
|
||||||
|
def _tag_matching_regex_in_text(regex, node, new_node_tag='span', new_node_attrs={}):
|
||||||
text = node.text or ''
|
text = node.text or ''
|
||||||
node.text = ''
|
if not re.search(regex, text):
|
||||||
|
return
|
||||||
|
|
||||||
cur_node = node
|
cur_node = node
|
||||||
idx = 0
|
node.text = ''
|
||||||
caca = 0
|
idx, iteration = 0, 0
|
||||||
for item in re.finditer(regex, text):
|
for item in re.finditer(regex, text):
|
||||||
# print '\t\tfound', item.start(), item.end(), '-', text[item.start():item.end()], '-'
|
if iteration == 0:
|
||||||
if caca == 0:
|
|
||||||
cur_node.text = text[idx:item.start()]
|
cur_node.text = text[idx:item.start()]
|
||||||
else:
|
else:
|
||||||
cur_node.tail = text[idx:item.start()]
|
_insert_new_node(node, (iteration - 1) * 2 + 1, new_node_tag, text[idx:item.start()])
|
||||||
|
new_node = _insert_new_node(node, iteration * 2, new_node_tag, text[item.start():item.end()], None, new_node_attrs)
|
||||||
|
|
||||||
# create element
|
|
||||||
new_node = etree.Element(new_node_tag)
|
|
||||||
new_node.text = text[item.start():item.end()]
|
|
||||||
for key, val in new_node_attrs.iteritems():
|
|
||||||
new_node.set(key, val)
|
|
||||||
|
|
||||||
# insert element in DOM
|
|
||||||
node.insert(caca, new_node)
|
|
||||||
cur_node = new_node
|
cur_node = new_node
|
||||||
idx = item.end()
|
idx = item.end()
|
||||||
caca += 1
|
iteration += 1
|
||||||
if caca == 0:
|
new_node = _insert_new_node(node, -1, new_node_tag, text[idx:] + (cur_node.tail or ''), None, {})
|
||||||
cur_node.text = (cur_node.text or '') + text[idx:]
|
|
||||||
else:
|
|
||||||
cur_node.tail = text[idx:] + (cur_node.tail or '')
|
|
||||||
|
|
||||||
if not html or not isinstance(html, basestring):
|
if not html or not isinstance(html, basestring):
|
||||||
return html
|
return html
|
||||||
|
@ -132,30 +141,32 @@ def html_email_clean(html, remove_unwanted=False, use_max_length=False, max_leng
|
||||||
|
|
||||||
# Pre processing
|
# Pre processing
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
|
# TDE TODO: --- MAIL ORIGINAL ---: '[\-]{4,}([^\-]*)[\-]{4,}'
|
||||||
# --- MAIL ORIGINAL ---: '[\-]{4,}([^\-]*)[\-]{4,}'
|
|
||||||
|
|
||||||
# html: remove encoding attribute inside tags
|
# html: remove encoding attribute inside tags
|
||||||
doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
|
doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
|
||||||
html = doctype.sub(r"", html)
|
html = doctype.sub(r"", html)
|
||||||
|
|
||||||
# html: ClEditor seems to love using <div><br /><div> -> replace with <br />
|
# html: ClEditor seems to love using <div><br /><div> -> replace with <br />
|
||||||
br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)')
|
br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)', re.IGNORECASE)
|
||||||
html = _replace_matching_regex(br_div_tags, html, '<br />')
|
html = _replace_matching_regex(br_div_tags, html, '<br />')
|
||||||
|
|
||||||
# html: <br[ /]> -> \n, to de-obfuscate the tree
|
|
||||||
br_tags = re.compile(r'([<]\s*[bB][rR]\s*\/?[>])')
|
|
||||||
html = _replace_matching_regex(br_tags, html, '__BR_TAG__')
|
|
||||||
|
|
||||||
# form a tree
|
# form a tree
|
||||||
root = lxml.html.fromstring(html)
|
root = lxml.html.fromstring(html)
|
||||||
if not len(root) and root.text is None and root.tail is None:
|
if not len(root) and root.text is None and root.tail is None:
|
||||||
html = '<div>%s</div>' % html
|
html = '<div>%s</div>' % html
|
||||||
root = lxml.html.fromstring(html)
|
root = lxml.html.fromstring(html)
|
||||||
|
|
||||||
|
# remove all tails and replace them by span elements, because handling both text and tails is error-prone
|
||||||
|
for node in root.getiterator():
|
||||||
|
if node.tail:
|
||||||
|
tail_node = _create_node('span', node.tail)
|
||||||
|
node.tail = None
|
||||||
|
node.addnext(tail_node)
|
||||||
|
|
||||||
# form node and tag text-based quotes and signature
|
# form node and tag text-based quotes and signature
|
||||||
quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
|
quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
|
||||||
signature = re.compile(r'([-]{2}[\s]?[\r\n]{1,2}[^\z]+)')
|
signature = re.compile(r'([-]{2,}[\s]?[\r\n]{1,2}[^.]+)')
|
||||||
for node in root.getiterator():
|
for node in root.getiterator():
|
||||||
_tag_matching_regex_in_text(quote_tags, node, 'span', {'text_quote': '1'})
|
_tag_matching_regex_in_text(quote_tags, node, 'span', {'text_quote': '1'})
|
||||||
_tag_matching_regex_in_text(signature, node, 'span', {'text_signature': '1'})
|
_tag_matching_regex_in_text(signature, node, 'span', {'text_signature': '1'})
|
||||||
|
@ -164,59 +175,89 @@ def html_email_clean(html, remove_unwanted=False, use_max_length=False, max_leng
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
# tree: tag nodes
|
# tree: tag nodes
|
||||||
|
# signature_begin = False # try dynamic signature recognition
|
||||||
quote_begin = False
|
quote_begin = False
|
||||||
overlength = False
|
overlength = False
|
||||||
cur_char_nbr = 0
|
cur_char_nbr = 0
|
||||||
for node in root.getiterator():
|
for node in root.getiterator():
|
||||||
if node.get('class') in ['WordSection1', 'MsoNormal']:
|
# root: try to tag the client used to write the html
|
||||||
|
if 'WordSection1' in node.get('class', '') or 'MsoNormal' in node.get('class', ''):
|
||||||
root.set('msoffice', '1')
|
root.set('msoffice', '1')
|
||||||
if node.get('class') in ['SkyDrivePlaceholder'] or node.get('id') in ['SkyDrivePlaceholder']:
|
if 'SkyDrivePlaceholder' in node.get('class', '') or 'SkyDrivePlaceholder' in node.get('id', ''):
|
||||||
root.set('hotmail', '1')
|
root.set('hotmail', '1')
|
||||||
|
|
||||||
|
# state of the parsing
|
||||||
if quote_begin:
|
if quote_begin:
|
||||||
node.set('quote', '1')
|
node.set('in_quote', '1')
|
||||||
|
node.set('tail_remove', '1')
|
||||||
if overlength:
|
if overlength:
|
||||||
node.set('remove', '1')
|
node.set('in_overlength', '1')
|
||||||
node.set('tail_remove', '1')
|
node.set('tail_remove', '1')
|
||||||
|
|
||||||
if root.get('msoffice') and node.tag == 'div' and 'border-top:solid' in node.get('style', ''):
|
if root.get('msoffice') and node.tag == 'div' and 'border-top:solid' in node.get('style', ''):
|
||||||
quote_begin = True
|
quote_begin = True
|
||||||
|
node.set('in_quote', '1')
|
||||||
|
node.set('tail_remove', '1')
|
||||||
if root.get('hotmail') and node.tag == 'hr' and ('stopSpelling' in node.get('class', '') or 'stopSpelling' in node.get('id', '')):
|
if root.get('hotmail') and node.tag == 'hr' and ('stopSpelling' in node.get('class', '') or 'stopSpelling' in node.get('id', '')):
|
||||||
quote_begin = True
|
quote_begin = True
|
||||||
|
node.set('in_quote', '1')
|
||||||
|
node.set('tail_remove', '1')
|
||||||
|
|
||||||
|
# shorten:
|
||||||
|
# 1/ truncate the text at the next available space
|
||||||
|
# 2/ create a 'read more' node, next to current node
|
||||||
|
# 3/ add the truncated text in a new node, next to 'read more' node
|
||||||
|
if shorten and not overlength and cur_char_nbr + len(node.text or '') > max_length:
|
||||||
|
overlength = True
|
||||||
|
# truncate text
|
||||||
|
innertext = node.text[0:(max_length - cur_char_nbr)]
|
||||||
|
outertext = node.text[(max_length - cur_char_nbr):]
|
||||||
|
stop_idx = outertext.find(' ')
|
||||||
|
if stop_idx == -1:
|
||||||
|
stop_idx = len(outertext)
|
||||||
|
node.text = innertext + outertext[0:stop_idx]
|
||||||
|
# create <span> ... <a href="#">read more</a></span> node
|
||||||
|
read_more_node = _create_node('span', ' ... ', None, {'class': 'oe_mail_expand'})
|
||||||
|
read_more_link_node = _create_node('a', 'read more', None, {'href': '#', 'class': 'oe_mail_expand'})
|
||||||
|
read_more_node.append(read_more_link_node)
|
||||||
|
# create outertext node
|
||||||
|
new_node = _create_node('span', outertext[stop_idx:])
|
||||||
|
# add newly created nodes in dom
|
||||||
|
node.addnext(new_node)
|
||||||
|
node.addnext(read_more_node)
|
||||||
|
# tag node
|
||||||
|
new_node.set('in_overlength', '1')
|
||||||
|
|
||||||
if use_max_length:
|
|
||||||
if not overlength and cur_char_nbr + len(node.text or '') > max_length:
|
|
||||||
overlength = True
|
|
||||||
node.text = node.text[0:(max_length - cur_char_nbr)] + ' <span class="oe_mail_expand"><a href="#">... read more</a></span>'
|
|
||||||
node.set('tail_remove', '1')
|
|
||||||
cur_char_nbr += len(node.text or '')
|
cur_char_nbr += len(node.text or '')
|
||||||
|
|
||||||
if node.tag == 'blockquote' or node.get('text_quote') or node.get('text_signature'):
|
if node.tag == 'blockquote' or node.get('text_quote') or node.get('text_signature'):
|
||||||
node.set('remove', '1')
|
node.set('in_quote', '1')
|
||||||
if quote_begin:
|
|
||||||
node.set('remove', '1')
|
|
||||||
node.set('tail_remove', '1')
|
|
||||||
|
|
||||||
# Post processing
|
# Post processing
|
||||||
# ------------------------------------------------------------
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
if remove_unwanted:
|
to_remove = []
|
||||||
to_delete = []
|
for node in root.getiterator():
|
||||||
for node in root.getiterator():
|
if node.get('in_quote') or node.get('in_overlength'):
|
||||||
if node.get('remove'):
|
# copy the node tail into parent text
|
||||||
# copy the node tail into parent text
|
if node.tail and not node.get('tail_remove'):
|
||||||
if node.tail and not node.get('tail_remove'):
|
parent = node.getparent()
|
||||||
parent = node.getparent()
|
parent.tail = node.tail + (parent.tail or '')
|
||||||
parent.tail = node.tail + (parent.tail or '')
|
to_remove.append(node)
|
||||||
to_delete.append(node)
|
if node.get('tail_remove'):
|
||||||
if node.get('tail_remove'):
|
node.tail = ''
|
||||||
node.tail = ''
|
for node in to_remove:
|
||||||
for node in to_delete:
|
if remove:
|
||||||
node.getparent().remove(node)
|
node.getparent().remove(node)
|
||||||
|
else:
|
||||||
|
if not 'oe_mail_expand' in node.get('class', ''): # trick: read more link should be displayed even if it's in overlength
|
||||||
|
node_class = node.get('class', '') + ' ' + 'oe_mail_cleaned'
|
||||||
|
node.set('class', node_class)
|
||||||
|
|
||||||
# html: \n back to <br/>
|
# html: \n that were tails of elements have been encapsulated into <span> elements -> turn them back into \n
|
||||||
html = etree.tostring(root, pretty_print=True)
|
html = etree.tostring(root, pretty_print=False)
|
||||||
html = html.replace('__BR_TAG__', '<br />')
|
linebreaks = re.compile(r'<span>([\s]*[\r\n]+[\s]*)<\/span>', re.IGNORECASE | re.DOTALL)
|
||||||
|
html = _replace_matching_regex(linebreaks, html, '\n')
|
||||||
|
|
||||||
return html
|
return html
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue