[FIX] tools: html_email_clean: fixed regex for

signature that was buggy when having dots.
Also fixed read more link addition.

Added test case that triggered the error.

bzr revid: tde@openerp.com-20131016103516-w44j6r5oaljpwvmx
master
Thibault Delavallée 10 years ago
parent 54f740960e
commit 983d5eb9fa

@ -223,6 +223,13 @@ class TestCleaner(unittest2.TestCase):
for ext in test_mail_examples.THUNDERBIRD_1_OUT:
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
def test_70_read_more(self):
new_html = html_email_clean(test_mail_examples.BUG1, remove=True, shorten=True, max_length=100)
for ext in test_mail_examples.BUG_1_IN:
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed valid content')
for ext in test_mail_examples.BUG_1_OUT:
self.assertNotIn(ext, new_html, 'html_email_cleaner did not removed invalid content')
def test_90_misc(self):
# False boolean for text must return empty string
new_html = html_email_clean(False)

@ -637,3 +637,59 @@ MSOFFICE_3 = """<div>
MSOFFICE_3_IN = ['I saw your boss yesterday']
MSOFFICE_3_OUT = ['I noticed you recently downloaded OpenERP.', 'You indicated that you wish', 'Belgium: +32.81.81.37.00']
# ------------------------------------------------------------
# Test cases coming from bugs
# ------------------------------------------------------------
# bug: read more not apparent, strange message in read more span
BUG1 = """<pre>Hi Migration Team,
Paragraph 1, blah blah blah blah blah blah blah blah blah blah blah blah
blah blah blah blah blah blah blah blah blah blah blah blah blah blah
blah blah blah blah blah blah blah blah blah blah blah blah blah blah
blah blah blah blah blah blah blah blah blah blah blah blah blah blah
blah blah blah blah blah blah blah blah blah blah blah blah blah blah
blah blah blah blah blah blah blah blah blah blah blah blah blah blah
blah blah blah blah blah blah blah blah.
Paragraph 2, blah blah blah blah blah blah blah blah blah blah blah blah
blah blah blah blah blah blah blah blah blah blah blah blah blah blah
blah blah blah blah blah blah blah blah blah blah blah blah blah blah
blah blah blah blah blah blah blah blah blah blah blah blah blah blah
blah blah blah blah blah blah blah blah blah blah blah blah blah blah
blah blah blah blah blah blah blah blah blah blah blah blah blah blah
blah blah blah blah blah blah blah blah.
Paragraph 3, blah blah blah blah blah blah blah blah blah blah blah blah
blah blah blah blah blah blah blah blah blah blah blah blah blah blah
blah blah blah blah blah blah blah blah blah blah blah blah blah blah
blah blah blah blah blah blah blah blah blah blah blah blah blah blah
blah blah blah blah blah blah blah blah blah blah blah blah blah blah
blah blah blah blah blah blah blah blah blah blah blah blah blah blah
blah blah blah blah blah blah blah blah.
Thanks.
Regards,
--
Olivier Laurent
Migration Manager
OpenERP SA
Chaussée de Namur, 40
B-1367 Gérompont
Tel: +32.81.81.37.00
Web: http://www.openerp.com</pre>"""
BUG_1_IN = [
'Hi Migration Team',
'Paragraph 1'
]
BUG_1_OUT = [
'Olivier Laurent',
'Chaussée de Namur',
'81.81.37.00',
'openerp.com',
]

@ -206,7 +206,7 @@ def html_email_clean(html, remove=False, shorten=False, max_length=300):
# form node and tag text-based quotes and signature
quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
signature = re.compile(r'([-]{2,}[\s]?[\r\n]{1,2}[^.]+)')
signature = re.compile(r'([-]{2,}[\s]?[\r\n]{1,2}[\s\S]+)')
for node in root.getiterator():
_tag_matching_regex_in_text(quote_tags, node, 'span', {'text_quote': '1'})
_tag_matching_regex_in_text(signature, node, 'span', {'text_signature': '1'})
@ -263,8 +263,7 @@ def html_email_clean(html, remove=False, shorten=False, max_length=300):
# create outertext node
new_node = _create_node('span', outertext[stop_idx:])
# add newly created nodes in dom
node.addnext(new_node)
node.addnext(read_more_node)
node.append(read_more_node)
# tag node
new_node.set('in_overlength', '1')

Loading…
Cancel
Save