diff --git a/openerp/tests/test_mail.py b/openerp/tests/test_mail.py index ddb8fc4a195..5ec7fe24436 100644 --- a/openerp/tests/test_mail.py +++ b/openerp/tests/test_mail.py @@ -23,146 +23,9 @@ ############################################################################## import unittest2 +from . import test_mail_examples from openerp.tools import html_sanitize, html_email_clean, append_content_to_html, plaintext2html -HTML_SOURCE = """ -test1 -
-test2
-test3
-test4
-test5
-test6
  1. test9 -
  2. test10
-
-test11
-
-test12

-google -test link -""" - -EDI_LIKE_HTML_SOURCE = """
-

Hello ${object.partner_id.name},

-

A new invoice is available for you:

-

-   REFERENCES
-   Invoice number: ${object.number}
-   Invoice total: ${object.amount_total} ${object.currency_id.name}
-   Invoice date: ${object.date_invoice}
-   Order reference: ${object.origin}
-   Your contact: ${object.user_id.name} -

-
-

It is also possible to directly pay with Paypal:

- - - -
-

If you have any question, do not hesitate to contact us.

-

Thank you for choosing ${object.company_id.name or 'us'}!

-
-
-
-

- ${object.company_id.name}

-
-
- - ${object.company_id.street}
- ${object.company_id.street2}
- ${object.company_id.zip} ${object.company_id.city}
- ${object.company_id.state_id and ('%s, ' % object.company_id.state_id.name) or ''} ${object.company_id.country_id.name or ''}
-
-
- Phone:  ${object.company_id.phone} -
-
- Web : ${object.company_id.website} -
-
-
""" - -TEXT_MAIL1 = """I contact you about our meeting for tomorrow. Here is the schedule I propose: -9 AM: brainstorming about our new amazing business app -9.45 AM: summary -10 AM: meeting with Fabien to present our app -Is everything ok for you ? --- -Administrator""" - -HTML_MAIL1 = """
-I contact you about our meeting for tomorrow. Here is the schedule I propose: -
-
-
Is everything ok for you ?
""" - -GMAIL_REPLY1_SAN = """Hello,

Ok for me. I am replying directly in gmail, without signature.

Kind regards,

Demo.

On Thu, Nov 8, 2012 at 5:29 PM, <dummy@example.com> wrote:
I contact you about our meeting for tomorrow. Here is the schedule I propose:
  • 9 AM: brainstorming about our new amazing business app</span></li>
  • -
  • 9.45 AM: summary
  • 10 AM: meeting with Fabien to present our app
Is everything ok for you ?
-

--
Administrator

- -

Log in our portal at: http://localhost:8069#action=login&db=mail_1&login=demo

-

""" - -THUNDERBIRD_16_REPLY1_SAN = """
On 11/08/2012 05:29 PM, - dummy@example.com wrote:
-
-
I contact you about our meeting for tomorrow. Here is the - schedule I propose:
-
-
-
Is everything ok for you ?
-
-

--
- Administrator

-
-
-

Log in our portal at: -http://localhost:8069#action=login&db=mail_1&token=rHdWcUART5PhEnJRaXjH

-
-
- Ok for me. I am replying directly below your mail, using - Thunderbird, with a signature.

- Did you receive my email about my new laptop, by the way ?

- Raoul.
-- 
-Raoul Grosbedonnée
-
""" - -TEXT_TPL = """Salut Raoul! -Le 28 oct. 2012 à 00:02, Raoul Grosbedon a écrit : - -> C'est sûr que je suis intéressé (quote)! - -Trouloulou pouet pouet. Je ne vais quand même pas écrire de vrais mails, non mais ho. - -> 2012/10/27 Bert Tartopoils : ->> Diantre, me disè-je en envoyant un message similaire à Martine, mais comment vas-tu (quote)? ->> ->> A la base le contenu était un vrai mail, mais je l'ai quand même réécrit pour ce test, histoire de dire que, quand même, on ne met pas n'importe quoi ici. (quote) ->> ->> Et sinon bon courage pour trouver tes clefs (quote). ->> ->> Bert TARTOPOILS ->> bert.tartopoils@miam.miam ->> -> -> -> -- -> Raoul Grosbedon - -Bert TARTOPOILS -bert.tartopoils@miam.miam -""" - class TestSanitizer(unittest2.TestCase): """ Test the html sanitizer that filters html to remove unwanted attributes """ @@ -223,22 +86,21 @@ class TestSanitizer(unittest2.TestCase): self.assertTrue('ha.ckers.org' not in html or 'http://ha.ckers.org/xss.css' in html, 'html_sanitize did not remove a malicious code in %s (%s)' % (content, html)) def test_html(self): - sanitized_html = html_sanitize(HTML_SOURCE) + sanitized_html = html_sanitize(test_mail_examples.MISC_HTML_SOURCE) for tag in ['", "Charles <charles.bidule@truc.fr>"), + emails = [("Charles ", "Charles <charles.bidule@truc.fr>"), ("Dupuis <'tr/-: ${dupuis#$'@truc.baz.fr>", "Dupuis <'tr/-: ${dupuis#$'@truc.baz.fr>"), ("Technical ", "Technical <service/technical+2@open.com>"), ("Div nico ", "Div nico <div-nico@open.com>")] for email in emails: self.assertIn(email[1], html_sanitize(email[0]), 'html_sanitize stripped emails of original html') - def test_edi_source(self): - html = html_sanitize(EDI_LIKE_HTML_SOURCE) + html = html_sanitize(test_mail_examples.EDI_LIKE_HTML_SOURCE) self.assertIn('div style="font-family: \'Lucica Grande\', Ubuntu, Arial, Verdana, sans-serif; font-size: 12px; color: rgb(34, 34, 34); background-color: #FFF;', html, 'html_sanitize removed valid style attribute') self.assertIn('', html, @@ -251,36 +113,73 @@ class TestSanitizer(unittest2.TestCase): class TestCleaner(unittest2.TestCase): """ Test the email cleaner function that filters the content of incoming emails """ - def test_html_email_clean(self): - # Test1: reply through gmail: quote in blockquote, signature --\nAdministrator - new_html = html_email_clean(GMAIL_REPLY1_SAN) - self.assertNotIn('blockquote', new_html, 'html_email_cleaner did not remove a blockquote') - self.assertNotIn('I contact you about our meeting', new_html, 'html_email_cleaner wrongly removed the quoted content') - self.assertNotIn('Administrator', new_html, 'html_email_cleaner did not erase the signature') - self.assertIn('Ok for me', new_html, 'html_email_cleaner erased too much content') + def test_00_html_email_clean_text(self): + """ html_email_clean test for text-based emails """ + new_html = html_email_clean(test_mail_examples.TEXT_1, remove_unwanted=True) + for ext in test_mail_examples.TEXT_1_IN: + self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content') + for ext in test_mail_examples.TEXT_1_OUT: + self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content') - # Test2: reply through Tunderbird 16.0.2 - new_html = html_email_clean(THUNDERBIRD_16_REPLY1_SAN) - self.assertNotIn('blockquote', new_html, 'html_email_cleaner did not remove a blockquote') - self.assertNotIn('I contact you about our meeting', new_html, 'html_email_cleaner wrongly removed the quoted content') - self.assertNotIn('Administrator', new_html, 'html_email_cleaner did not erase the signature') - self.assertNotIn('Grosbedonn', new_html, 'html_email_cleaner did not erase the signature') - self.assertIn('Ok for me', new_html, 'html_email_cleaner erased too much content') + new_html = html_email_clean(test_mail_examples.TEXT_2, remove_unwanted=True) + for ext in test_mail_examples.TEXT_2_IN: + self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content') + for ext in test_mail_examples.TEXT_2_OUT: + self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content') - # Test3: text email - new_html = html_email_clean(TEXT_MAIL1) - self.assertIn('I contact you about our meeting', new_html, 'html_email_cleaner wrongly removed the quoted content') - self.assertNotIn('Administrator', new_html, 'html_email_cleaner did not erase the signature') + def test_10_html_email_clean_html(self): + new_html = html_email_clean(test_mail_examples.HTML_1, remove_unwanted=True) + for ext in test_mail_examples.HTML_1_IN: + self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content') + for ext in test_mail_examples.HTML_1_OUT: + self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content') - # Test4: more complex text email - new_html = html_email_clean(TEXT_TPL) - self.assertNotIn('quote', new_html, 'html_email_cleaner did not remove correctly plaintext quotes') + new_html = html_email_clean(test_mail_examples.HTML_2, remove_unwanted=False) + for ext in test_mail_examples.HTML_2_IN: + self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content') + for ext in test_mail_examples.HTML_2_OUT: + self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content') - # Test5: False boolean for text must return empty string + new_html = html_email_clean(test_mail_examples.HTML_3, remove_unwanted=False) + for ext in test_mail_examples.HTML_3_IN: + self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content') + # for ext in test_mail_examples.HTML_3_OUT: + # self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content') + + def test_20_html_email_clean_msoffice(self): + new_html = html_email_clean(test_mail_examples.MSOFFICE_1, remove_unwanted=True) + for ext in test_mail_examples.MSOFFICE_1_IN: + self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content') + for ext in test_mail_examples.MSOFFICE_1_OUT: + self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content') + + def test_30_html_email_clean_hotmail(self): + new_html = html_email_clean(test_mail_examples.HOTMAIL_1, remove_unwanted=True) + for ext in test_mail_examples.HOTMAIL_1_IN: + self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content') + for ext in test_mail_examples.HOTMAIL_1_OUT: + self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content') + + def test_40_html_email_clean_gmail(self): + new_html = html_email_clean(test_mail_examples.GMAIL_1, remove_unwanted=True) + for ext in test_mail_examples.GMAIL_1_IN: + self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content') + for ext in test_mail_examples.GMAIL_1_OUT: + self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content') + + def test_50_html_email_clean_thunderbird(self): + new_html = html_email_clean(test_mail_examples.THUNDERBIRD_1, remove_unwanted=True) + for ext in test_mail_examples.THUNDERBIRD_1_IN: + self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content') + for ext in test_mail_examples.THUNDERBIRD_1_OUT: + self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content') + + def test_90_html_email_clean_misc(self): + # False boolean for text must return empty string new_html = html_email_clean(False) self.assertEqual(new_html, False, 'html_email_cleaner did change a False in an other value.') - # Test6: Message with xml and doctype tags don't crash + # Message with xml and doctype tags don't crash new_html = html_email_clean(u'\n\n\n \n 404 - Not Found\n \n \n

404 - Not Found

\n \n\n') self.assertNotIn('encoding', new_html, 'html_email_cleaner did not remove correctly encoding attributes') diff --git a/openerp/tests/test_mail_examples.py b/openerp/tests/test_mail_examples.py new file mode 100644 index 00000000000..b0a0faf97ca --- /dev/null +++ b/openerp/tests/test_mail_examples.py @@ -0,0 +1,354 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +MISC_HTML_SOURCE = """ +test1 +
+test2
+test3
+test4
+test5
+test6
  • test7
  • +test8
  1. test9 +
  2. test10
+
+test11
+
+test12

+google +test link +""" + +EDI_LIKE_HTML_SOURCE = """
+

Hello ${object.partner_id.name},

+

A new invoice is available for you:

+

+   REFERENCES
+   Invoice number: ${object.number}
+   Invoice total: ${object.amount_total} ${object.currency_id.name}
+   Invoice date: ${object.date_invoice}
+   Order reference: ${object.origin}
+   Your contact: ${object.user_id.name} +

+
+

It is also possible to directly pay with Paypal:

+ + + +
+

If you have any question, do not hesitate to contact us.

+

Thank you for choosing ${object.company_id.name or 'us'}!

+
+
+
+

+ ${object.company_id.name}

+
+
+ + ${object.company_id.street}
+ ${object.company_id.street2}
+ ${object.company_id.zip} ${object.company_id.city}
+ ${object.company_id.state_id and ('%s, ' % object.company_id.state_id.name) or ''} ${object.company_id.country_id.name or ''}
+
+
+ Phone:  ${object.company_id.phone} +
+ +
+
""" + +TEXT_1 = """I contact you about our meeting tomorrow. Here is the schedule I propose: +9 AM: brainstorming about our new amazing business app +9.45 AM: summary +10 AM: meeting with Ignasse to present our app +Is everything ok for you ? +-- +MySignature""" + +TEXT_1_IN = ["""I contact you about our meeting tomorrow. Here is the schedule I propose: +9 AM: brainstorming about our new amazing business app +9.45 AM: summary +10 AM: meeting with Ignasse to present our app +Is everything ok for you ?"""] +TEXT_1_OUT = ["""-- +MySignature"""] + +TEXT_2 = """Salut Raoul! +Le 28 oct. 2012 à 00:02, Raoul Grosbedon a écrit : + +> I contact you about our meeting tomorrow. Here is the schedule I propose: (quote) + +Of course. This seems viable. + +> 2012/10/27 Bert Tartopoils : +>> blahblahblah (quote)? +>> +>> blahblahblah (quote) +>> +>> Bert TARTOPOILS +>> bert.tartopoils@miam.miam +>> +> +> +> -- +> RaoulSignature + +Bert TARTOPOILS +bert.tartopoils@miam.miam +""" + +TEXT_2_IN = ["Salut Raoul!", "Of course. This seems viable."] +TEXT_2_OUT = ["I contact you about our meeting tomorrow. Here is the schedule I propose: (quote)", + """> 2012/10/27 Bert Tartopoils : +>> blahblahblah (quote)? +>> +>> blahblahblah (quote) +>> +>> Bert TARTOPOILS +>> bert.tartopoils@miam.miam +>> +> +> +> -- +> RaoulSignature"""] + +HTML_1 = """

I contact you about our meeting for tomorrow. Here is the schedule I propose: (keep) +9 AM: brainstorming about our new amazing business app +9.45 AM: summary +10 AM: meeting with Ignasse to present our app +Is everything ok for you ? +-- +MySignature

""" + +HTML_1_IN = ["""I contact you about our meeting for tomorrow. Here is the schedule I propose: (keep) +9 AM: brainstorming about our new amazing business app +9.45 AM: summary +10 AM: meeting with Ignasse to present our app +Is everything ok for you ?"""] +HTML_1_OUT = ["""-- +MySignature"""] + +HTML_2 = """
+ I contact you about our meeting for tomorrow. Here is the schedule I propose: +
+
+
    +
  • 9 AM: brainstorming about our new amazing business app
  • +
  • 9.45 AM: summary
  • +
  • 10 AM: meeting with Fabien to present our app
  • +
+
+
+ Is everything ok for you ? +
""" + +HTML_2_IN = ["I contact you about our meeting for tomorrow. Here is the schedule I propose:", + "
  • 9 AM: brainstorming about our new amazing business app
  • ", + "
  • 9.45 AM: summary
  • ", + "
  • 10 AM: meeting with Fabien to present our app
  • ", + "Is everything ok for you ?"] +HTML_2_OUT = [] + +HTML_3 = """
    This is an answer.
    +
    +Regards,
    +XXXXXX
    +----- Mail original -----
    + + +
    Hi, 
    +
    +
    +My CRM-related question.
    +
    +Regards, 
    +
    +XXXX
    """ + +HTML_3_IN = ["""
    This is an answer.
    +
    +Regards,
    +XXXXXX
    +----- Mail original -----
    """] +HTML_3_OUT = ["Hi,", "My CRM-related question.", + "Regards,"] + +GMAIL_1 = """Hello,

    Ok for me. I am replying directly in gmail, without signature.

    Kind regards,

    Demo.

    On Thu, Nov 8, 2012 at 5:29 PM, <dummy@example.com> wrote:
    I contact you about our meeting for tomorrow. Here is the schedule I propose:
    • 9 AM: brainstorming about our new amazing business app</span></li>
    • +
    • 9.45 AM: summary
    • 10 AM: meeting with Fabien to present our app
    Is everything ok for you ?
    +

    --
    Administrator

    + + +

    """ + +GMAIL_1_IN = ['Ok for me. I am replying directly in gmail, without signature.'] +GMAIL_1_OUT = ['Administrator', 'Log in our portal at:'] + +THUNDERBIRD_1 = """
    On 11/08/2012 05:29 PM, + dummy@example.com wrote:
    +
    +
    I contact you about our meeting for tomorrow. Here is the + schedule I propose:
    +
    +
    • 9 AM: brainstorming about our new amazing business + app</span></li>
    • +
    • 9.45 AM: summary
    • +
    • 10 AM: meeting with Fabien to present our app
    • +
    +
    Is everything ok for you ?
    +
    +

    --
    + Administrator

    +
    + +
    + Ok for me. I am replying directly below your mail, using Thunderbird, with a signature.

    + Did you receive my email about my new laptop, by the way ?

    + Raoul.
    -- 
    +Raoul Grosbedonnée
    +
    """ + +THUNDERBIRD_1_IN = ['Ok for me. I am replying directly below your mail, using Thunderbird, with a signature.'] +THUNDERBIRD_1_OUT = ['I contact you about our meeting for tomorrow.', 'Raoul Grosbedon'] + +HOTMAIL_1 = """
    +

      + I have an amazing company, i'm learning OpenERP, it is a small company yet, but plannig to grow up quickly. +
     
    Kindest regards,
    xxx
    +
    +
    +
    +
    + Subject: Re: your OpenERP.com registration
    From: xxx@xxx.xxx
    To: xxx@xxx.xxx
    Date: Wed, 27 Mar 2013 17:12:12 +0000 +

    + Hello xxx, +
    + I noticed you recently created an OpenERP.com account to access OpenERP Apps. +
    + You indicated that you wish to use OpenERP in your own company. + We would like to know more about your your business needs and requirements, and see how + we can help you. When would you be available to discuss your project ?
    + Best regards,
    +
    +                http://openerp.com
    +                Belgium: +32.81.81.37.00
    +                U.S.: +1 (650) 307-6736
    +                India: +91 (79) 40 500 100
    +            
    +
    +
    +
    """ + +HOTMAIL_1_IN = ["I have an amazing company, i'm learning OpenERP, it is a small company yet, but plannig to grow up quickly."] +HOTMAIL_1_OUT = ["Subject: Re: your OpenERP.com registration", " I noticed you recently created an OpenERP.com account to access OpenERP Apps.", + "We would like to know more about your your business needs and requirements", "Belgium: +32.81.81.37.00"] + +MSOFFICE_1 = """ +
    +
    +

    + + Our requirements are simple. Just looking to replace some spreadsheets for tracking quotes and possibly using the timecard module. + We are a company of 25 engineers providing product design services to clients. + +

    +

    +

    +

    + + I’ll install on a windows server and run a very limited trial to see how it works. + If we adopt OpenERP we will probably move to Linux or look for a hosted SaaS option. + +

    +

    +

    +

    + +
    + I am also evaluating Adempiere and maybe others. +
    +

    +

    +

    +

    + + +

    +

     

    +

    +

    + + I expect the trial will take 2-3 months as this is not a high priority for us. + +

    +

    +

    +

    + + +

    +

     

    +

    +

    + + Alan + +

    +

    +

    +

    + + +

    +

     

    +

    +
    +
    +

    + + From: + + + OpenERP Enterprise [mailto:sales@openerp.com] +
    Sent: Monday, 11 March, 2013 14:47
    To: Alan Widmer
    Subject: Re: your OpenERP.com registration +
    +

    +

    +

    +
    +
    +

    +

     

    +

    Hello Alan Widmer,

    +

    +

    I noticed you recently downloaded OpenERP.

    +

    +

    + Uou mentioned you wish to use OpenERP in your own company. Please let me more about your + business needs and requirements? When will you be available to discuss about your project? +

    +

    +

    Thanks for your interest in OpenERP,

    +

    +

    Feel free to contact me if you have any questions,

    +

    +

    Looking forward to hear from you soon.

    +

    +

     

    +
    --

    +
    Nicolas

    +
    http://openerp.com

    +
    Belgium: +32.81.81.37.00

    +
    U.S.: +1 (650) 307-6736

    +
    India: +91 (79) 40 500 100

    +
                           

    +
    +
    """ + +MSOFFICE_1_IN = ['Our requirements are simple. Just looking to replace some spreadsheets for tracking quotes and possibly using the timecard module.'] +MSOFFICE_1_OUT = ['I noticed you recently downloaded OpenERP.', 'Uou mentioned you wish to use OpenERP in your own company.'] diff --git a/openerp/tools/mail.py b/openerp/tools/mail.py index 5970ce47040..1b1081c813b 100644 --- a/openerp/tools/mail.py +++ b/openerp/tools/mail.py @@ -52,7 +52,7 @@ def html_sanitize(src): # html encode email tags part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL) src = part.sub(lambda m: cgi.escape(m.group(1)), src) - + # some corner cases make the parser crash (such as in test_mail) try: cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False, forms=False, kill_tags=tags_to_kill, remove_tags=tags_to_remove) @@ -60,11 +60,14 @@ def html_sanitize(src): except TypeError, e: # lxml.clean version < 2.3.1 does not have a kill_tags attribute # to remove in 2014 - cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False, forms=False, remove_tags=tags_to_kill+tags_to_remove) + cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False, forms=False, remove_tags=tags_to_kill + tags_to_remove) cleaned = cleaner.clean_html(src) - except: - _logger.warning('html_sanitize failed to parse %s' % (src)) - cleaned = '

    Impossible to parse

    ' + except etree.ParserError, e: + _logger.warning('html_sanitize: ParserError "%s" obtained when sanitizing "%s"' % (e, src)) + cleaned = '

    ParserError when sanitizing

    ' + except Exception, e: + _logger.warning('html_sanitize: unknown error "%s" obtained when sanitizing "%s"' % (e, src)) + cleaned = '

    Unknown error when sanitizing

    ' return cleaned @@ -72,7 +75,7 @@ def html_sanitize(src): # HTML Cleaner #---------------------------------------------------------- -def html_email_clean(html): +def html_email_clean(html, remove_unwanted=False, use_max_length=False, max_length=300): """ html_email_clean: clean the html to display in the web client. - strip email quotes (remove blockquote nodes) - strip signatures (remove --\n{\n)Blahblah), by replacing
    by @@ -83,6 +86,8 @@ def html_email_clean(html): html code coming from a sanitized source, like fields.html. """ def _replace_matching_regex(regex, source, replace=''): + if not source: + return source dest = '' idx = 0 for item in re.finditer(regex, source): @@ -91,63 +96,114 @@ def html_email_clean(html): dest += source[idx:] return dest + def _tag_matching_regex_in_text(regex, node, new_node_tag='span', new_node_attrs=None): + # print '\t_tag_matching_regex_in_text' + text = node.text or '' + node.text = '' + cur_node = node + idx = 0 + caca = 0 + for item in re.finditer(regex, text): + # print '\t\tfound', item.start(), item.end(), '-', text[item.start():item.end()], '-' + if caca == 0: + cur_node.text = text[idx:item.start()] + else: + cur_node.tail = text[idx:item.start()] + + # create element + new_node = etree.Element(new_node_tag) + new_node.text = text[item.start():item.end()] + for key, val in new_node_attrs.iteritems(): + new_node.set(key, val) + + # insert element in DOM + node.insert(caca, new_node) + cur_node = new_node + idx = item.end() + caca += 1 + if caca == 0: + cur_node.text = (cur_node.text or '') + text[idx:] + else: + cur_node.tail = text[idx:] + (cur_node.tail or '') + if not html or not isinstance(html, basestring): return html - html = ustr(html) - # 0. remove encoding attribute inside tags + # Pre processing + # ------------------------------------------------------------ + + # --- MAIL ORIGINAL ---: '[\-]{4,}([^\-]*)[\-]{4,}' + + # html: remove encoding attribute inside tags doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL) html = doctype.sub(r"", html) - # 1. -> \n, because otherwise the tree is obfuscated + # html: ClEditor seems to love using

    -> replace with
    + br_div_tags = re.compile(r'(
    \s*\s*<\/div>)') + html = _replace_matching_regex(br_div_tags, html, '
    ') + + # html: -> \n, to de-obfuscate the tree br_tags = re.compile(r'([<]\s*[bB][rR]\s*\/?[>])') html = _replace_matching_regex(br_tags, html, '__BR_TAG__') - # 2. form a tree, handle (currently ?) pure-text by enclosing them in a pre + # form a tree root = lxml.html.fromstring(html) if not len(root) and root.text is None and root.tail is None: html = '
    %s
    ' % html root = lxml.html.fromstring(html) - # 2.5 remove quoted text in nodes + # form node and tag text-based quotes and signature quote_tags = re.compile(r'(\n(>)+[^\n\r]*)') - for node in root.getiterator(): - if not node.text: - continue - node.text = _replace_matching_regex(quote_tags, node.text) - - # 3. remove blockquotes - quotes = [el for el in root.getiterator(tag='blockquote')] - for node in quotes: - # copy the node tail into parent text - if node.tail: - parent = node.getparent() - parent.text = parent.text or '' + node.tail - # remove the node - node.getparent().remove(node) - - # 4. strip signatures signature = re.compile(r'([-]{2}[\s]?[\r\n]{1,2}[^\z]+)') - for elem in root.getiterator(): - if elem.text: - match = re.search(signature, elem.text) - if match: - elem.text = elem.text[:match.start()] + elem.text[match.end():] - if elem.tail: - match = re.search(signature, elem.tail) - if match: - elem.tail = elem.tail[:match.start()] + elem.tail[match.end():] + for node in root.getiterator(): + _tag_matching_regex_in_text(quote_tags, node, 'span', {'text_quote': '1'}) + _tag_matching_regex_in_text(signature, node, 'span', {'text_signature': '1'}) - # 5. \n back to
    + # Processing + # ------------------------------------------------------------ + + # tree: tag nodes + quote_begin = False + for node in root.getiterator(): + if node.get('class') in ['WordSection1', 'MsoNormal']: + root.set('msoffice', '1') + if node.get('class') in ['SkyDrivePlaceholder'] or node.get('id') in ['SkyDrivePlaceholder']: + root.set('hotmail', '1') + + if quote_begin: + node.set('quote', '1') + + if root.get('msoffice') and node.tag == 'div' and 'border-top:solid' in node.get('style', ''): + quote_begin = True + if root.get('hotmail') and node.tag == 'hr' and ('stopSpelling' in node.get('class', '') or 'stopSpelling' in node.get('id', '')): + quote_begin = True + + if node.tag == 'blockquote' or node.get('text_quote') or node.get('text_signature'): + node.set('remove', '1') + if quote_begin: + node.set('remove', '1') + node.set('tail_remove', '1') + + # Post processing + # ------------------------------------------------------------ + + if remove_unwanted: + to_delete = [] + for node in root.getiterator(): + if node.get('remove'): + # copy the node tail into parent text + if node.tail and not node.get('tail_remove'): + parent = node.getparent() + parent.tail = node.tail + (parent.tail or '') + to_delete.append(node) + for node in to_delete: + node.getparent().remove(node) + + # html: \n back to
    html = etree.tostring(root, pretty_print=True) html = html.replace('__BR_TAG__', '
    ') - # 6. Misc cleaning : - # - ClEditor seems to love using

    -> replace with
    - br_div_tags = re.compile(r'(
    \s*\s*<\/div>)') - html = _replace_matching_regex(br_div_tags, html, '
    ') - return html