[REF] html_email_clean: refactoring of the algorithm, now better taking into account hotmail / msoffice emails. Added tests.

bzr revid: tde@openerp.com-20130423144903-he22jz9zs6nc6dsd
This commit is contained in:
Thibault Delavallée 2013-04-23 16:49:03 +02:00
parent 261dea6fec
commit f97bd8bd63
3 changed files with 516 additions and 207 deletions

View File

@ -23,146 +23,9 @@
##############################################################################
import unittest2
from . import test_mail_examples
from openerp.tools import html_sanitize, html_email_clean, append_content_to_html, plaintext2html
HTML_SOURCE = """
<font size="2" style="color: rgb(31, 31, 31); font-family: monospace; font-variant: normal; line-height: normal; ">test1</font>
<div style="color: rgb(31, 31, 31); font-family: monospace; font-variant: normal; line-height: normal; font-size: 12px; font-style: normal; ">
<b>test2</b></div><div style="color: rgb(31, 31, 31); font-family: monospace; font-variant: normal; line-height: normal; font-size: 12px; ">
<i>test3</i></div><div style="color: rgb(31, 31, 31); font-family: monospace; font-variant: normal; line-height: normal; font-size: 12px; ">
<u>test4</u></div><div style="color: rgb(31, 31, 31); font-family: monospace; font-variant: normal; line-height: normal; font-size: 12px; ">
<strike>test5</strike></div><div style="color: rgb(31, 31, 31); font-family: monospace; font-variant: normal; line-height: normal; ">
<font size="5">test6</font></div><div><ul><li><font color="#1f1f1f" face="monospace" size="2">test7</font></li><li>
<font color="#1f1f1f" face="monospace" size="2">test8</font></li></ul><div><ol><li><font color="#1f1f1f" face="monospace" size="2">test9</font>
</li><li><font color="#1f1f1f" face="monospace" size="2">test10</font></li></ol></div></div>
<blockquote style="margin: 0 0 0 40px; border: none; padding: 0px;"><div><div><div><font color="#1f1f1f" face="monospace" size="2">
test11</font></div></div></div></blockquote><blockquote style="margin: 0 0 0 40px; border: none; padding: 0px;">
<blockquote style="margin: 0 0 0 40px; border: none; padding: 0px;"><div><font color="#1f1f1f" face="monospace" size="2">
test12</font></div><div><font color="#1f1f1f" face="monospace" size="2"><br></font></div></blockquote></blockquote>
<font color="#1f1f1f" face="monospace" size="2"><a href="http://google.com">google</a></font>
<a href="javascript:alert('malicious code')">test link</a>
"""
EDI_LIKE_HTML_SOURCE = """<div style="font-family: 'Lucica Grande', Ubuntu, Arial, Verdana, sans-serif; font-size: 12px; color: rgb(34, 34, 34); background-color: #FFF; ">
<p>Hello ${object.partner_id.name},</p>
<p>A new invoice is available for you: </p>
<p style="border-left: 1px solid #8e0000; margin-left: 30px;">
&nbsp;&nbsp;<strong>REFERENCES</strong><br />
&nbsp;&nbsp;Invoice number: <strong>${object.number}</strong><br />
&nbsp;&nbsp;Invoice total: <strong>${object.amount_total} ${object.currency_id.name}</strong><br />
&nbsp;&nbsp;Invoice date: ${object.date_invoice}<br />
&nbsp;&nbsp;Order reference: ${object.origin}<br />
&nbsp;&nbsp;Your contact: <a href="mailto:${object.user_id.email or ''}?subject=Invoice%20${object.number}">${object.user_id.name}</a>
</p>
<br/>
<p>It is also possible to directly pay with Paypal:</p>
<a style="margin-left: 120px;" href="${object.paypal_url}">
<img class="oe_edi_paypal_button" src="https://www.paypal.com/en_US/i/btn/btn_paynowCC_LG.gif"/>
</a>
<br/>
<p>If you have any question, do not hesitate to contact us.</p>
<p>Thank you for choosing ${object.company_id.name or 'us'}!</p>
<br/>
<br/>
<div style="width: 375px; margin: 0px; padding: 0px; background-color: #8E0000; border-top-left-radius: 5px 5px; border-top-right-radius: 5px 5px; background-repeat: repeat no-repeat;">
<h3 style="margin: 0px; padding: 2px 14px; font-size: 12px; color: #DDD;">
<strong style="text-transform:uppercase;">${object.company_id.name}</strong></h3>
</div>
<div style="width: 347px; margin: 0px; padding: 5px 14px; line-height: 16px; background-color: #F2F2F2;">
<span style="color: #222; margin-bottom: 5px; display: block; ">
${object.company_id.street}<br/>
${object.company_id.street2}<br/>
${object.company_id.zip} ${object.company_id.city}<br/>
${object.company_id.state_id and ('%s, ' % object.company_id.state_id.name) or ''} ${object.company_id.country_id.name or ''}<br/>
</span>
<div style="margin-top: 0px; margin-right: 0px; margin-bottom: 0px; margin-left: 0px; padding-top: 0px; padding-right: 0px; padding-bottom: 0px; padding-left: 0px; ">
Phone:&nbsp; ${object.company_id.phone}
</div>
<div>
Web :&nbsp;<a href="${object.company_id.website}">${object.company_id.website}</a>
</div>
</div>
</div></body></html>"""
TEXT_MAIL1 = """I contact you about our meeting for tomorrow. Here is the schedule I propose:
9 AM: brainstorming about our new amazing business app</span></li>
9.45 AM: summary
10 AM: meeting with Fabien to present our app
Is everything ok for you ?
--
Administrator"""
HTML_MAIL1 = """<div>
<font><span>I contact you about our meeting for tomorrow. Here is the schedule I propose:</span></font>
</div>
<div><ul>
<li><span>9 AM: brainstorming about our new amazing business app</span></li>
<li><span>9.45 AM: summary</span></li>
<li><span>10 AM: meeting with Fabien to present our app</span></li>
</ul></div>
<div><font><span>Is everything ok for you ?</span></font></div>"""
GMAIL_REPLY1_SAN = """Hello,<div><br></div><div>Ok for me. I am replying directly in gmail, without signature.</div><div><br></div><div>Kind regards,</div><div><br></div><div>Demo.<br><br><div>On Thu, Nov 8, 2012 at 5:29 PM, <span>&lt;<a href="mailto:dummy@example.com">dummy@example.com</a>&gt;</span> wrote:<br><blockquote><div>I contact you about our meeting for tomorrow. Here is the schedule I propose:</div><div><ul><li>9 AM: brainstorming about our new amazing business app&lt;/span&gt;&lt;/li&gt;</li>
<li>9.45 AM: summary</li><li>10 AM: meeting with Fabien to present our app</li></ul></div><div>Is everything ok for you ?</div>
<div><p>--<br>Administrator</p></div>
<div><p>Log in our portal at: <a href="http://localhost:8069#action=login&amp;db=mail_1&amp;login=demo">http://localhost:8069#action=login&amp;db=mail_1&amp;login=demo</a></p></div>
</blockquote></div><br></div>"""
THUNDERBIRD_16_REPLY1_SAN = """ <div>On 11/08/2012 05:29 PM,
<a href="mailto:dummy@example.com">dummy@example.com</a> wrote:<br></div>
<blockquote>
<div>I contact you about our meeting for tomorrow. Here is the
schedule I propose:</div>
<div>
<ul><li>9 AM: brainstorming about our new amazing business
app&lt;/span&gt;&lt;/li&gt;</li>
<li>9.45 AM: summary</li>
<li>10 AM: meeting with Fabien to present our app</li>
</ul></div>
<div>Is everything ok for you ?</div>
<div>
<p>--<br>
Administrator</p>
</div>
<div>
<p>Log in our portal at:
<a href="http://localhost:8069#action=login&amp;db=mail_1&amp;token=rHdWcUART5PhEnJRaXjH">http://localhost:8069#action=login&amp;db=mail_1&amp;token=rHdWcUART5PhEnJRaXjH</a></p>
</div>
</blockquote>
Ok for me. I am replying directly below your mail, using
Thunderbird, with a signature.<br><br>
Did you receive my email about my new laptop, by the way ?<br><br>
Raoul.<br><pre>--
Raoul Grosbedonn&#233;e
</pre>"""
TEXT_TPL = """Salut Raoul!
Le 28 oct. 2012 à 00:02, Raoul Grosbedon a écrit :
> C'est sûr que je suis intéressé (quote)!
Trouloulou pouet pouet. Je ne vais quand même pas écrire de vrais mails, non mais ho.
> 2012/10/27 Bert Tartopoils :
>> Diantre, me disè-je en envoyant un message similaire à Martine, mais comment vas-tu (quote)?
>>
>> A la base le contenu était un vrai mail, mais je l'ai quand même réécrit pour ce test, histoire de dire que, quand même, on ne met pas n'importe quoi ici. (quote)
>>
>> Et sinon bon courage pour trouver tes clefs (quote).
>>
>> Bert TARTOPOILS
>> bert.tartopoils@miam.miam
>>
>
>
> --
> Raoul Grosbedon
Bert TARTOPOILS
bert.tartopoils@miam.miam
"""
class TestSanitizer(unittest2.TestCase):
""" Test the html sanitizer that filters html to remove unwanted attributes """
@ -223,22 +86,21 @@ class TestSanitizer(unittest2.TestCase):
self.assertTrue('ha.ckers.org' not in html or 'http://ha.ckers.org/xss.css' in html, 'html_sanitize did not remove a malicious code in %s (%s)' % (content, html))
def test_html(self):
sanitized_html = html_sanitize(HTML_SOURCE)
sanitized_html = html_sanitize(test_mail_examples.MISC_HTML_SOURCE)
for tag in ['<div', '<b', '<i', '<u', '<strike', '<li', '<blockquote', '<a href']:
self.assertIn(tag, sanitized_html, 'html_sanitize stripped too much of original html')
for attr in ['javascript']:
self.assertNotIn(attr, sanitized_html, 'html_sanitize did not remove enough unwanted attributes')
emails =[("Charles <charles.bidule@truc.fr>", "Charles &lt;charles.bidule@truc.fr&gt;"),
emails = [("Charles <charles.bidule@truc.fr>", "Charles &lt;charles.bidule@truc.fr&gt;"),
("Dupuis <'tr/-: ${dupuis#$'@truc.baz.fr>", "Dupuis &lt;'tr/-: ${dupuis#$'@truc.baz.fr&gt;"),
("Technical <service/technical+2@open.com>", "Technical &lt;service/technical+2@open.com&gt;"),
("Div nico <div-nico@open.com>", "Div nico &lt;div-nico@open.com&gt;")]
for email in emails:
self.assertIn(email[1], html_sanitize(email[0]), 'html_sanitize stripped emails of original html')
def test_edi_source(self):
html = html_sanitize(EDI_LIKE_HTML_SOURCE)
html = html_sanitize(test_mail_examples.EDI_LIKE_HTML_SOURCE)
self.assertIn('div style="font-family: \'Lucica Grande\', Ubuntu, Arial, Verdana, sans-serif; font-size: 12px; color: rgb(34, 34, 34); background-color: #FFF;', html,
'html_sanitize removed valid style attribute')
self.assertIn('<span style="color: #222; margin-bottom: 5px; display: block; ">', html,
@ -251,36 +113,73 @@ class TestSanitizer(unittest2.TestCase):
class TestCleaner(unittest2.TestCase):
""" Test the email cleaner function that filters the content of incoming emails """
def test_html_email_clean(self):
# Test1: reply through gmail: quote in blockquote, signature --\nAdministrator
new_html = html_email_clean(GMAIL_REPLY1_SAN)
self.assertNotIn('blockquote', new_html, 'html_email_cleaner did not remove a blockquote')
self.assertNotIn('I contact you about our meeting', new_html, 'html_email_cleaner wrongly removed the quoted content')
self.assertNotIn('Administrator', new_html, 'html_email_cleaner did not erase the signature')
self.assertIn('Ok for me', new_html, 'html_email_cleaner erased too much content')
def test_00_html_email_clean_text(self):
""" html_email_clean test for text-based emails """
new_html = html_email_clean(test_mail_examples.TEXT_1, remove_unwanted=True)
for ext in test_mail_examples.TEXT_1_IN:
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
for ext in test_mail_examples.TEXT_1_OUT:
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
# Test2: reply through Tunderbird 16.0.2
new_html = html_email_clean(THUNDERBIRD_16_REPLY1_SAN)
self.assertNotIn('blockquote', new_html, 'html_email_cleaner did not remove a blockquote')
self.assertNotIn('I contact you about our meeting', new_html, 'html_email_cleaner wrongly removed the quoted content')
self.assertNotIn('Administrator', new_html, 'html_email_cleaner did not erase the signature')
self.assertNotIn('Grosbedonn', new_html, 'html_email_cleaner did not erase the signature')
self.assertIn('Ok for me', new_html, 'html_email_cleaner erased too much content')
new_html = html_email_clean(test_mail_examples.TEXT_2, remove_unwanted=True)
for ext in test_mail_examples.TEXT_2_IN:
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
for ext in test_mail_examples.TEXT_2_OUT:
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
# Test3: text email
new_html = html_email_clean(TEXT_MAIL1)
self.assertIn('I contact you about our meeting', new_html, 'html_email_cleaner wrongly removed the quoted content')
self.assertNotIn('Administrator', new_html, 'html_email_cleaner did not erase the signature')
def test_10_html_email_clean_html(self):
new_html = html_email_clean(test_mail_examples.HTML_1, remove_unwanted=True)
for ext in test_mail_examples.HTML_1_IN:
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
for ext in test_mail_examples.HTML_1_OUT:
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
# Test4: more complex text email
new_html = html_email_clean(TEXT_TPL)
self.assertNotIn('quote', new_html, 'html_email_cleaner did not remove correctly plaintext quotes')
new_html = html_email_clean(test_mail_examples.HTML_2, remove_unwanted=False)
for ext in test_mail_examples.HTML_2_IN:
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
for ext in test_mail_examples.HTML_2_OUT:
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
# Test5: False boolean for text must return empty string
new_html = html_email_clean(test_mail_examples.HTML_3, remove_unwanted=False)
for ext in test_mail_examples.HTML_3_IN:
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
# for ext in test_mail_examples.HTML_3_OUT:
# self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
def test_20_html_email_clean_msoffice(self):
new_html = html_email_clean(test_mail_examples.MSOFFICE_1, remove_unwanted=True)
for ext in test_mail_examples.MSOFFICE_1_IN:
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
for ext in test_mail_examples.MSOFFICE_1_OUT:
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
def test_30_html_email_clean_hotmail(self):
new_html = html_email_clean(test_mail_examples.HOTMAIL_1, remove_unwanted=True)
for ext in test_mail_examples.HOTMAIL_1_IN:
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
for ext in test_mail_examples.HOTMAIL_1_OUT:
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
def test_40_html_email_clean_gmail(self):
new_html = html_email_clean(test_mail_examples.GMAIL_1, remove_unwanted=True)
for ext in test_mail_examples.GMAIL_1_IN:
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
for ext in test_mail_examples.GMAIL_1_OUT:
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
def test_50_html_email_clean_thunderbird(self):
new_html = html_email_clean(test_mail_examples.THUNDERBIRD_1, remove_unwanted=True)
for ext in test_mail_examples.THUNDERBIRD_1_IN:
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
for ext in test_mail_examples.THUNDERBIRD_1_OUT:
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
def test_90_html_email_clean_misc(self):
# False boolean for text must return empty string
new_html = html_email_clean(False)
self.assertEqual(new_html, False, 'html_email_cleaner did change a False in an other value.')
# Test6: Message with xml and doctype tags don't crash
# Message with xml and doctype tags don't crash
new_html = html_email_clean(u'<?xml version="1.0" encoding="iso-8859-1"?>\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"\n "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">\n <head>\n <title>404 - Not Found</title>\n </head>\n <body>\n <h1>404 - Not Found</h1>\n </body>\n</html>\n')
self.assertNotIn('encoding', new_html, 'html_email_cleaner did not remove correctly encoding attributes')

View File

@ -0,0 +1,354 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
MISC_HTML_SOURCE = """
<font size="2" style="color: rgb(31, 31, 31); font-family: monospace; font-variant: normal; line-height: normal; ">test1</font>
<div style="color: rgb(31, 31, 31); font-family: monospace; font-variant: normal; line-height: normal; font-size: 12px; font-style: normal; ">
<b>test2</b></div><div style="color: rgb(31, 31, 31); font-family: monospace; font-variant: normal; line-height: normal; font-size: 12px; ">
<i>test3</i></div><div style="color: rgb(31, 31, 31); font-family: monospace; font-variant: normal; line-height: normal; font-size: 12px; ">
<u>test4</u></div><div style="color: rgb(31, 31, 31); font-family: monospace; font-variant: normal; line-height: normal; font-size: 12px; ">
<strike>test5</strike></div><div style="color: rgb(31, 31, 31); font-family: monospace; font-variant: normal; line-height: normal; ">
<font size="5">test6</font></div><div><ul><li><font color="#1f1f1f" face="monospace" size="2">test7</font></li><li>
<font color="#1f1f1f" face="monospace" size="2">test8</font></li></ul><div><ol><li><font color="#1f1f1f" face="monospace" size="2">test9</font>
</li><li><font color="#1f1f1f" face="monospace" size="2">test10</font></li></ol></div></div>
<blockquote style="margin: 0 0 0 40px; border: none; padding: 0px;"><div><div><div><font color="#1f1f1f" face="monospace" size="2">
test11</font></div></div></div></blockquote><blockquote style="margin: 0 0 0 40px; border: none; padding: 0px;">
<blockquote style="margin: 0 0 0 40px; border: none; padding: 0px;"><div><font color="#1f1f1f" face="monospace" size="2">
test12</font></div><div><font color="#1f1f1f" face="monospace" size="2"><br></font></div></blockquote></blockquote>
<font color="#1f1f1f" face="monospace" size="2"><a href="http://google.com">google</a></font>
<a href="javascript:alert('malicious code')">test link</a>
"""
EDI_LIKE_HTML_SOURCE = """<div style="font-family: 'Lucica Grande', Ubuntu, Arial, Verdana, sans-serif; font-size: 12px; color: rgb(34, 34, 34); background-color: #FFF; ">
<p>Hello ${object.partner_id.name},</p>
<p>A new invoice is available for you: </p>
<p style="border-left: 1px solid #8e0000; margin-left: 30px;">
&nbsp;&nbsp;<strong>REFERENCES</strong><br />
&nbsp;&nbsp;Invoice number: <strong>${object.number}</strong><br />
&nbsp;&nbsp;Invoice total: <strong>${object.amount_total} ${object.currency_id.name}</strong><br />
&nbsp;&nbsp;Invoice date: ${object.date_invoice}<br />
&nbsp;&nbsp;Order reference: ${object.origin}<br />
&nbsp;&nbsp;Your contact: <a href="mailto:${object.user_id.email or ''}?subject=Invoice%20${object.number}">${object.user_id.name}</a>
</p>
<br/>
<p>It is also possible to directly pay with Paypal:</p>
<a style="margin-left: 120px;" href="${object.paypal_url}">
<img class="oe_edi_paypal_button" src="https://www.paypal.com/en_US/i/btn/btn_paynowCC_LG.gif"/>
</a>
<br/>
<p>If you have any question, do not hesitate to contact us.</p>
<p>Thank you for choosing ${object.company_id.name or 'us'}!</p>
<br/>
<br/>
<div style="width: 375px; margin: 0px; padding: 0px; background-color: #8E0000; border-top-left-radius: 5px 5px; border-top-right-radius: 5px 5px; background-repeat: repeat no-repeat;">
<h3 style="margin: 0px; padding: 2px 14px; font-size: 12px; color: #DDD;">
<strong style="text-transform:uppercase;">${object.company_id.name}</strong></h3>
</div>
<div style="width: 347px; margin: 0px; padding: 5px 14px; line-height: 16px; background-color: #F2F2F2;">
<span style="color: #222; margin-bottom: 5px; display: block; ">
${object.company_id.street}<br/>
${object.company_id.street2}<br/>
${object.company_id.zip} ${object.company_id.city}<br/>
${object.company_id.state_id and ('%s, ' % object.company_id.state_id.name) or ''} ${object.company_id.country_id.name or ''}<br/>
</span>
<div style="margin-top: 0px; margin-right: 0px; margin-bottom: 0px; margin-left: 0px; padding-top: 0px; padding-right: 0px; padding-bottom: 0px; padding-left: 0px; ">
Phone:&nbsp; ${object.company_id.phone}
</div>
<div>
Web :&nbsp;<a href="${object.company_id.website}">${object.company_id.website}</a>
</div>
</div>
</div></body></html>"""
TEXT_1 = """I contact you about our meeting tomorrow. Here is the schedule I propose:
9 AM: brainstorming about our new amazing business app
9.45 AM: summary
10 AM: meeting with Ignasse to present our app
Is everything ok for you ?
--
MySignature"""
TEXT_1_IN = ["""I contact you about our meeting tomorrow. Here is the schedule I propose:
9 AM: brainstorming about our new amazing business app
9.45 AM: summary
10 AM: meeting with Ignasse to present our app
Is everything ok for you ?"""]
TEXT_1_OUT = ["""--
MySignature"""]
TEXT_2 = """Salut Raoul!
Le 28 oct. 2012 à 00:02, Raoul Grosbedon a écrit :
> I contact you about our meeting tomorrow. Here is the schedule I propose: (quote)
Of course. This seems viable.
> 2012/10/27 Bert Tartopoils :
>> blahblahblah (quote)?
>>
>> blahblahblah (quote)
>>
>> Bert TARTOPOILS
>> bert.tartopoils@miam.miam
>>
>
>
> --
> RaoulSignature
Bert TARTOPOILS
bert.tartopoils@miam.miam
"""
TEXT_2_IN = ["Salut Raoul!", "Of course. This seems viable."]
TEXT_2_OUT = ["I contact you about our meeting tomorrow. Here is the schedule I propose: (quote)",
"""> 2012/10/27 Bert Tartopoils :
>> blahblahblah (quote)?
>>
>> blahblahblah (quote)
>>
>> Bert TARTOPOILS
>> bert.tartopoils@miam.miam
>>
>
>
> --
> RaoulSignature"""]
HTML_1 = """<p>I contact you about our meeting for tomorrow. Here is the schedule I propose: (keep)
9 AM: brainstorming about our new amazing business app
9.45 AM: summary
10 AM: meeting with Ignasse to present our app
Is everything ok for you ?
--
MySignature</p>"""
HTML_1_IN = ["""I contact you about our meeting for tomorrow. Here is the schedule I propose: (keep)
9 AM: brainstorming about our new amazing business app
9.45 AM: summary
10 AM: meeting with Ignasse to present our app
Is everything ok for you ?"""]
HTML_1_OUT = ["""--
MySignature"""]
HTML_2 = """<div>
<font><span>I contact you about our meeting for tomorrow. Here is the schedule I propose:</span></font>
</div>
<div>
<ul>
<li><span>9 AM: brainstorming about our new amazing business app</span></li>
<li><span>9.45 AM: summary</span></li>
<li><span>10 AM: meeting with Fabien to present our app</span></li>
</ul>
</div>
<div>
<font><span>Is everything ok for you ?</span></font>
</div>"""
HTML_2_IN = ["<font><span>I contact you about our meeting for tomorrow. Here is the schedule I propose:</span></font>",
"<li><span>9 AM: brainstorming about our new amazing business app</span></li>",
"<li><span>9.45 AM: summary</span></li>",
"<li><span>10 AM: meeting with Fabien to present our app</span></li>",
"<font><span>Is everything ok for you ?</span></font>"]
HTML_2_OUT = []
HTML_3 = """<div><pre>This is an answer.
Regards,
XXXXXX
----- Mail original -----</pre>
<pre>Hi,
My CRM-related question.
Regards,
XXXX</pre></div>"""
HTML_3_IN = ["""<div><pre>This is an answer.
Regards,
XXXXXX
----- Mail original -----</pre>"""]
HTML_3_OUT = ["Hi,", "My CRM-related question.",
"Regards,"]
GMAIL_1 = """Hello,<div><br></div><div>Ok for me. I am replying directly in gmail, without signature.</div><div><br></div><div>Kind regards,</div><div><br></div><div>Demo.<br><br><div>On Thu, Nov 8, 2012 at 5:29 PM, <span>&lt;<a href="mailto:dummy@example.com">dummy@example.com</a>&gt;</span> wrote:<br><blockquote><div>I contact you about our meeting for tomorrow. Here is the schedule I propose:</div><div><ul><li>9 AM: brainstorming about our new amazing business app&lt;/span&gt;&lt;/li&gt;</li>
<li>9.45 AM: summary</li><li>10 AM: meeting with Fabien to present our app</li></ul></div><div>Is everything ok for you ?</div>
<div><p>--<br>Administrator</p></div>
<div><p>Log in our portal at: <a href="http://localhost:8069#action=login&amp;db=mail_1&amp;login=demo">http://localhost:8069#action=login&amp;db=mail_1&amp;login=demo</a></p></div>
</blockquote></div><br></div>"""
GMAIL_1_IN = ['Ok for me. I am replying directly in gmail, without signature.']
GMAIL_1_OUT = ['Administrator', 'Log in our portal at:']
THUNDERBIRD_1 = """<div>On 11/08/2012 05:29 PM,
<a href="mailto:dummy@example.com">dummy@example.com</a> wrote:<br></div>
<blockquote>
<div>I contact you about our meeting for tomorrow. Here is the
schedule I propose:</div>
<div>
<ul><li>9 AM: brainstorming about our new amazing business
app&lt;/span&gt;&lt;/li&gt;</li>
<li>9.45 AM: summary</li>
<li>10 AM: meeting with Fabien to present our app</li>
</ul></div>
<div>Is everything ok for you ?</div>
<div>
<p>--<br>
Administrator</p>
</div>
<div>
<p>Log in our portal at:
<a href="http://localhost:8069#action=login&amp;db=mail_1&amp;token=rHdWcUART5PhEnJRaXjH">http://localhost:8069#action=login&amp;db=mail_1&amp;token=rHdWcUART5PhEnJRaXjH</a></p>
</div>
</blockquote>
Ok for me. I am replying directly below your mail, using Thunderbird, with a signature.<br><br>
Did you receive my email about my new laptop, by the way ?<br><br>
Raoul.<br><pre>--
Raoul Grosbedonn&#233;e
</pre>"""
THUNDERBIRD_1_IN = ['Ok for me. I am replying directly below your mail, using Thunderbird, with a signature.']
THUNDERBIRD_1_OUT = ['I contact you about our meeting for tomorrow.', 'Raoul Grosbedon']
HOTMAIL_1 = """<div>
<div dir="ltr"><br>&nbsp;
I have an amazing company, i'm learning OpenERP, it is a small company yet, but plannig to grow up quickly.
<br>&nbsp;<br>Kindest regards,<br>xxx<br>
<div>
<div id="SkyDrivePlaceholder">
</div>
<hr id="stopSpelling">
Subject: Re: your OpenERP.com registration<br>From: xxx@xxx.xxx<br>To: xxx@xxx.xxx<br>Date: Wed, 27 Mar 2013 17:12:12 +0000
<br><br>
Hello xxx,
<br>
I noticed you recently created an OpenERP.com account to access OpenERP Apps.
<br>
You indicated that you wish to use OpenERP in your own company.
We would like to know more about your your business needs and requirements, and see how
we can help you. When would you be available to discuss your project ?<br>
Best regards,<br>
<pre>
<a href="http://openerp.com" target="_blank">http://openerp.com</a>
Belgium: +32.81.81.37.00
U.S.: +1 (650) 307-6736
India: +91 (79) 40 500 100
</pre>
</div>
</div>
</div>"""
HOTMAIL_1_IN = ["I have an amazing company, i'm learning OpenERP, it is a small company yet, but plannig to grow up quickly."]
HOTMAIL_1_OUT = ["Subject: Re: your OpenERP.com registration", " I noticed you recently created an OpenERP.com account to access OpenERP Apps.",
"We would like to know more about your your business needs and requirements", "Belgium: +32.81.81.37.00"]
MSOFFICE_1 = """
<div>
<div class="WordSection1">
<p class="MsoNormal">
<span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">
Our requirements are simple. Just looking to replace some spreadsheets for tracking quotes and possibly using the timecard module.
We are a company of 25 engineers providing product design services to clients.
</span>
</p>
<p></p>
<p></p>
<p class="MsoNormal">
<span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">
Ill install on a windows server and run a very limited trial to see how it works.
If we adopt OpenERP we will probably move to Linux or look for a hosted SaaS option.
</span>
</p>
<p></p>
<p></p>
<p class="MsoNormal">
<span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">
<br>
I am also evaluating Adempiere and maybe others.
</span>
</p>
<p></p>
<p></p>
<p class="MsoNormal">
<span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">
</span>
</p>
<p>&nbsp;</p>
<p></p>
<p class="MsoNormal">
<span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">
I expect the trial will take 2-3 months as this is not a high priority for us.
</span>
</p>
<p></p>
<p></p>
<p class="MsoNormal">
<span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">
</span>
</p>
<p>&nbsp;</p>
<p></p>
<p class="MsoNormal">
<span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">
Alan
</span>
</p>
<p></p>
<p></p>
<p class="MsoNormal">
<span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">
</span>
</p>
<p>&nbsp;</p>
<p></p>
<div>
<div style="border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0in 0in 0in">
<p class="MsoNormal">
<b><span style="font-size:10.0pt;font-family:&quot;Tahoma&quot;,&quot;sans-serif&quot;">
From:
</span></b>
<span style="font-size:10.0pt;font-family:&quot;Tahoma&quot;,&quot;sans-serif&quot;">
OpenERP Enterprise [mailto:sales@openerp.com]
<br><b>Sent:</b> Monday, 11 March, 2013 14:47<br><b>To:</b> Alan Widmer<br><b>Subject:</b> Re: your OpenERP.com registration
</span>
</p>
<p></p>
<p></p>
</div>
</div>
<p class="MsoNormal"></p>
<p>&nbsp;</p>
<p>Hello Alan Widmer, </p>
<p></p>
<p>I noticed you recently downloaded OpenERP. </p>
<p></p>
<p>
Uou mentioned you wish to use OpenERP in your own company. Please let me more about your
business needs and requirements? When will you be available to discuss about your project?
</p>
<p></p>
<p>Thanks for your interest in OpenERP, </p>
<p></p>
<p>Feel free to contact me if you have any questions, </p>
<p></p>
<p>Looking forward to hear from you soon. </p>
<p></p>
<pre><p>&nbsp;</p></pre>
<pre>--<p></p></pre>
<pre>Nicolas<p></p></pre>
<pre><a href="http://openerp.com">http://openerp.com</a><p></p></pre>
<pre>Belgium: +32.81.81.37.00<p></p></pre>
<pre>U.S.: +1 (650) 307-6736<p></p></pre>
<pre>India: +91 (79) 40 500 100<p></p></pre>
<pre>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<p></p></pre>
</div>
</div>"""
MSOFFICE_1_IN = ['Our requirements are simple. Just looking to replace some spreadsheets for tracking quotes and possibly using the timecard module.']
MSOFFICE_1_OUT = ['I noticed you recently downloaded OpenERP.', 'Uou mentioned you wish to use OpenERP in your own company.']

View File

@ -52,7 +52,7 @@ def html_sanitize(src):
# html encode email tags
part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
src = part.sub(lambda m: cgi.escape(m.group(1)), src)
# some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
try:
cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False, forms=False, kill_tags=tags_to_kill, remove_tags=tags_to_remove)
@ -60,11 +60,14 @@ def html_sanitize(src):
except TypeError, e:
# lxml.clean version < 2.3.1 does not have a kill_tags attribute
# to remove in 2014
cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False, forms=False, remove_tags=tags_to_kill+tags_to_remove)
cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False, forms=False, remove_tags=tags_to_kill + tags_to_remove)
cleaned = cleaner.clean_html(src)
except:
_logger.warning('html_sanitize failed to parse %s' % (src))
cleaned = '<p>Impossible to parse</p>'
except etree.ParserError, e:
_logger.warning('html_sanitize: ParserError "%s" obtained when sanitizing "%s"' % (e, src))
cleaned = '<p>ParserError when sanitizing</p>'
except Exception, e:
_logger.warning('html_sanitize: unknown error "%s" obtained when sanitizing "%s"' % (e, src))
cleaned = '<p>Unknown error when sanitizing</p>'
return cleaned
@ -72,7 +75,7 @@ def html_sanitize(src):
# HTML Cleaner
#----------------------------------------------------------
def html_email_clean(html):
def html_email_clean(html, remove_unwanted=False, use_max_length=False, max_length=300):
""" html_email_clean: clean the html to display in the web client.
- strip email quotes (remove blockquote nodes)
- strip signatures (remove --\n{\n)Blahblah), by replacing <br> by
@ -83,6 +86,8 @@ def html_email_clean(html):
html code coming from a sanitized source, like fields.html.
"""
def _replace_matching_regex(regex, source, replace=''):
if not source:
return source
dest = ''
idx = 0
for item in re.finditer(regex, source):
@ -91,63 +96,114 @@ def html_email_clean(html):
dest += source[idx:]
return dest
def _tag_matching_regex_in_text(regex, node, new_node_tag='span', new_node_attrs=None):
# print '\t_tag_matching_regex_in_text'
text = node.text or ''
node.text = ''
cur_node = node
idx = 0
caca = 0
for item in re.finditer(regex, text):
# print '\t\tfound', item.start(), item.end(), '-', text[item.start():item.end()], '-'
if caca == 0:
cur_node.text = text[idx:item.start()]
else:
cur_node.tail = text[idx:item.start()]
# create element
new_node = etree.Element(new_node_tag)
new_node.text = text[item.start():item.end()]
for key, val in new_node_attrs.iteritems():
new_node.set(key, val)
# insert element in DOM
node.insert(caca, new_node)
cur_node = new_node
idx = item.end()
caca += 1
if caca == 0:
cur_node.text = (cur_node.text or '') + text[idx:]
else:
cur_node.tail = text[idx:] + (cur_node.tail or '')
if not html or not isinstance(html, basestring):
return html
html = ustr(html)
# 0. remove encoding attribute inside tags
# Pre processing
# ------------------------------------------------------------
# --- MAIL ORIGINAL ---: '[\-]{4,}([^\-]*)[\-]{4,}'
# html: remove encoding attribute inside tags
doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
html = doctype.sub(r"", html)
# 1. <br[ /]> -> \n, because otherwise the tree is obfuscated
# html: ClEditor seems to love using <div><br /><div> -> replace with <br />
br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)')
html = _replace_matching_regex(br_div_tags, html, '<br />')
# html: <br[ /]> -> \n, to de-obfuscate the tree
br_tags = re.compile(r'([<]\s*[bB][rR]\s*\/?[>])')
html = _replace_matching_regex(br_tags, html, '__BR_TAG__')
# 2. form a tree, handle (currently ?) pure-text by enclosing them in a pre
# form a tree
root = lxml.html.fromstring(html)
if not len(root) and root.text is None and root.tail is None:
html = '<div>%s</div>' % html
root = lxml.html.fromstring(html)
# 2.5 remove quoted text in nodes
# form node and tag text-based quotes and signature
quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
for node in root.getiterator():
if not node.text:
continue
node.text = _replace_matching_regex(quote_tags, node.text)
# 3. remove blockquotes
quotes = [el for el in root.getiterator(tag='blockquote')]
for node in quotes:
# copy the node tail into parent text
if node.tail:
parent = node.getparent()
parent.text = parent.text or '' + node.tail
# remove the node
node.getparent().remove(node)
# 4. strip signatures
signature = re.compile(r'([-]{2}[\s]?[\r\n]{1,2}[^\z]+)')
for elem in root.getiterator():
if elem.text:
match = re.search(signature, elem.text)
if match:
elem.text = elem.text[:match.start()] + elem.text[match.end():]
if elem.tail:
match = re.search(signature, elem.tail)
if match:
elem.tail = elem.tail[:match.start()] + elem.tail[match.end():]
for node in root.getiterator():
_tag_matching_regex_in_text(quote_tags, node, 'span', {'text_quote': '1'})
_tag_matching_regex_in_text(signature, node, 'span', {'text_signature': '1'})
# 5. \n back to <br/>
# Processing
# ------------------------------------------------------------
# tree: tag nodes
quote_begin = False
for node in root.getiterator():
if node.get('class') in ['WordSection1', 'MsoNormal']:
root.set('msoffice', '1')
if node.get('class') in ['SkyDrivePlaceholder'] or node.get('id') in ['SkyDrivePlaceholder']:
root.set('hotmail', '1')
if quote_begin:
node.set('quote', '1')
if root.get('msoffice') and node.tag == 'div' and 'border-top:solid' in node.get('style', ''):
quote_begin = True
if root.get('hotmail') and node.tag == 'hr' and ('stopSpelling' in node.get('class', '') or 'stopSpelling' in node.get('id', '')):
quote_begin = True
if node.tag == 'blockquote' or node.get('text_quote') or node.get('text_signature'):
node.set('remove', '1')
if quote_begin:
node.set('remove', '1')
node.set('tail_remove', '1')
# Post processing
# ------------------------------------------------------------
if remove_unwanted:
to_delete = []
for node in root.getiterator():
if node.get('remove'):
# copy the node tail into parent text
if node.tail and not node.get('tail_remove'):
parent = node.getparent()
parent.tail = node.tail + (parent.tail or '')
to_delete.append(node)
for node in to_delete:
node.getparent().remove(node)
# html: \n back to <br/>
html = etree.tostring(root, pretty_print=True)
html = html.replace('__BR_TAG__', '<br />')
# 6. Misc cleaning :
# - ClEditor seems to love using <div><br /><div> -> replace with <br />
br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)')
html = _replace_matching_regex(br_div_tags, html, '<br />')
return html