[IMP] tools: improved html_email_clean.

It can now take sections into account and protect them. This allows displaying a "read more" link after a whole section. This will be used in the website, for example to display the first section of blog posts. Also added some options on the read more link, making it possible to tune its tag and content, as well as the link itself. Also improved html_sanitize by taking improvements from the trunk branch of server, while waiting for the whole trunk branch to be merged. bzr revid: tde@openerp.com-20131001142151-rt1g6zpxozd1eau2
2013-10-01 16:21:51 +02:00 · 2013-10-01 16:21:51 +02:00 · 6069810f03
parent bfd080beb3
commit 6069810f03
3 changed files with 319 additions and 50 deletions
--- a/openerp/tests/test_mail.py
+++ b/openerp/tests/test_mail.py
@ -223,6 +223,40 @@ class TestCleaner(unittest2.TestCase):
        for ext in test_mail_examples.THUNDERBIRD_1_OUT:
            self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')

+    def test_70_read_more_and_shorten(self):
+        expand_options = {
+            'oe_expand_container_class': 'span_class',
+            'oe_expand_container_content': 'Herbert Einstein',
+            'oe_expand_separator_node': 'br_lapin',
+            'oe_expand_a_class': 'a_class',
+            'oe_expand_a_content': 'read mee',
+        }
+        new_html = html_email_clean(test_mail_examples.OERP_WEBSITE_HTML_1, remove=True, shorten=True, max_length=100, expand_options=expand_options)
+        for ext in test_mail_examples.OERP_WEBSITE_HTML_1_IN:
+            self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
+        for ext in test_mail_examples.OERP_WEBSITE_HTML_1_OUT:
+            self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase overlimit content')
+        for ext in ['<span class="span_class">Herbert Einstein<br_lapin></br_lapin><a href="#" class="a_class">read mee</a></span>']:
+            self.assertIn(ext, new_html, 'html_email_cleaner wrongly take into account specific expand options')
+
+        new_html = html_email_clean(test_mail_examples.OERP_WEBSITE_HTML_2, remove=True, shorten=True, max_length=200, expand_options=expand_options, protect_sections=False)
+        for ext in test_mail_examples.OERP_WEBSITE_HTML_2_IN:
+            self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
+        for ext in test_mail_examples.OERP_WEBSITE_HTML_2_OUT:
+            self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase overlimit content')
+        for ext in ['<span class="span_class">Herbert Einstein<br_lapin></br_lapin><a href="#" class="a_class">read mee</a></span>']:
+            self.assertIn(ext, new_html, 'html_email_cleaner wrongly take into account specific expand options')
+
+        new_html = html_email_clean(test_mail_examples.OERP_WEBSITE_HTML_2, remove=True, shorten=True, max_length=200, expand_options=expand_options, protect_sections=True)
+        for ext in test_mail_examples.OERP_WEBSITE_HTML_2_IN:
+            self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
+        for ext in test_mail_examples.OERP_WEBSITE_HTML_2_OUT:
+            self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase overlimit content')
+        for ext in [
+                '<span class="span_class">Herbert Einstein<br_lapin></br_lapin><a href="#" class="a_class">read mee</a></span>',
+                'tasks using the gantt chart and control deadlines']:
+            self.assertIn(ext, new_html, 'html_email_cleaner wrongly take into account specific expand options')
+
    def test_90_misc(self):
        # False boolean for text must return empty string
        new_html = html_email_clean(False)
--- a/openerp/tests/test_mail_examples.py
+++ b/openerp/tests/test_mail_examples.py
@ -60,6 +60,154 @@ EDI_LIKE_HTML_SOURCE = """<div style="font-family: 'Lucica Grande', Ubuntu, Aria
    </div>
 </div></body></html>"""

+OERP_WEBSITE_HTML_1 = """
+<div>
+    <div class="container">
+        <div class="row">
+            <div class="col-md-12 text-center mt16 mb16" data-snippet-id="colmd">
+                <h2>OpenERP HR Features</h2>
+                <h3 class="text-muted">Manage your company most important asset: People</h3>
+            </div>
+            <div class="col-md-4" data-snippet-id="colmd">
+                <img class="img-rounded img-responsive" src="/website/static/src/img/china_thumb.jpg">
+                <h4 class="mt16">Streamline Recruitments</h4>
+                <p>Post job offers and keep track of each application received. Follow applicants in your recruitment process with the smart kanban view.</p>
+                <p>Save time by automating some communications with email templates. Resumes are indexed automatically, allowing you to easily find for specific profiles.</p>
+            </div>
+            <div class="col-md-4" data-snippet-id="colmd">
+                <img class="img-rounded img-responsive" src="/website/static/src/img/desert_thumb.jpg">
+                <h4 class="mt16">Enterprise Social Network</h4>
+                <p>Break down information silos. Share knowledge and best practices amongst all employees. Follow specific people or documents and join groups of interests to share expertise and documents.</p>
+                <p>Interact with your collegues in real time with live chat.</p>
+            </div>
+            <div class="col-md-4" data-snippet-id="colmd">
+                <img class="img-rounded img-responsive" src="/website/static/src/img/deers_thumb.jpg">
+                <h4 class="mt16">Leaves Management</h4>
+                <p>Keep track of the vacation days accrued by each employee. Employees enter their requests (paid holidays, sick leave, etc), for managers to approve and validate. It's all done in just a few clicks. The agenda of each employee is updated accordingly.</p>
+            </div>
+        </div>
+    </div>
+</div>"""
+
+OERP_WEBSITE_HTML_1_IN = [
+    'Manage your company most important asset: People',
+    'img class="img-rounded img-responsive" src="/website/static/src/img/china_thumb.jpg"',
+]
+OERP_WEBSITE_HTML_1_OUT = [
+    'Break down information silos.',
+    'Keep track of the vacation days accrued by each employee',
+    'img class="img-rounded img-responsive" src="/website/static/src/img/deers_thumb.jpg',
+]
+
+OERP_WEBSITE_HTML_2 = """
+<div class="mt16 cke_widget_editable cke_widget_element oe_editable oe_dirty" data-oe-model="blog.post" data-oe-id="6" data-oe-field="content" data-oe-type="html" data-oe-translate="0" data-oe-expression="blog_post.content" data-cke-widget-data="{}" data-cke-widget-keep-attr="0" data-widget="oeref" contenteditable="true" data-cke-widget-editable="text">
+    <section class="mt16 mb16" data-snippet-id="text-block">
+        <div class="container">
+            <div class="row">
+                <div class="col-md-12 text-center mt16 mb32" data-snippet-id="colmd">
+                    <h2>
+                        OpenERP Project Management
+                    </h2>
+                    <h3 class="text-muted">Infinitely flexible. Incredibly easy to use.</h3>
+                </div>
+                <div class="col-md-12 mb16 mt16" data-snippet-id="colmd">
+                    <p>
+                        OpenERP's <b>collaborative and realtime</b> project
+                        management helps your team get work done. Keep
+                        track of everything, from the big picture to the
+                        minute details, from the customer contract to the
+                        billing.
+                    </p><p>
+                        Organize projects around <b>your own processes</b>. Work
+                        on tasks and issues using the kanban view, schedule
+                        tasks using the gantt chart and control deadlines
+                        in the calendar view. Every project may have it's
+                        own stages allowing teams to optimize their job.
+                    </p>
+                </div>
+            </div>
+        </div>
+    </section>
+    <section class="" data-snippet-id="image-text">
+        <div class="container">
+            <div class="row">
+                <div class="col-md-6 mt16 mb16" data-snippet-id="colmd">
+                    <img class="img-responsive shadow" src="/website/static/src/img/image_text.jpg">
+                </div>
+                <div class="col-md-6 mt32" data-snippet-id="colmd">
+                    <h3>Manage Your Shops</h3>
+                    <p>
+                        OpenERP's Point of Sale introduces a super clean
+                        interface with no installation required that runs
+                        online and offline on modern hardwares.
+                    </p><p>
+                        It's full integration with the company inventory
+                        and accounting, gives you real time statistics and
+                        consolidations amongst all shops without the hassle
+                        of integrating several applications.
+                    </p>
+                </div>
+            </div>
+        </div>
+    </section>
+    <section class="" data-snippet-id="text-image">
+        <div class="container">
+            <div class="row">
+                <div class="col-md-6 mt32" data-snippet-id="colmd">
+                    <h3>Enterprise Social Network</h3>
+                    <p>
+                        Make every employee feel more connected and engaged
+                        with twitter-like features for your own company. Follow
+                        people, share best practices, 'like' top ideas, etc.
+                    </p><p>
+                        Connect with experts, follow what interests you, share
+                        documents and promote best practices with OpenERP
+                        Social application. Get work done with effective
+                        collaboration across departments, geographies
+                        and business applications.
+                    </p>
+                </div>
+                <div class="col-md-6 mt16 mb16" data-snippet-id="colmd">
+                    <img class="img-responsive shadow" src="/website/static/src/img/text_image.png">
+                </div>
+            </div>
+        </div>
+    </section><section class="" data-snippet-id="portfolio">
+        <div class="container">
+            <div class="row">
+                <div class="col-md-12 text-center mt16 mb32" data-snippet-id="colmd">
+                    <h2>Our Porfolio</h2>
+                    <h4 class="text-muted">More than 500 successful projects</h4>
+                </div>
+                <div class="col-md-4" data-snippet-id="colmd">
+                    <img class="img-thumbnail img-responsive" src="/website/static/src/img/deers.jpg">
+                    <img class="img-thumbnail img-responsive" src="/website/static/src/img/desert.jpg">
+                    <img class="img-thumbnail img-responsive" src="/website/static/src/img/china.jpg">
+                </div>
+                <div class="col-md-4" data-snippet-id="colmd">
+                    <img class="img-thumbnail img-responsive" src="/website/static/src/img/desert.jpg">
+                    <img class="img-thumbnail img-responsive" src="/website/static/src/img/china.jpg">
+                    <img class="img-thumbnail img-responsive" src="/website/static/src/img/deers.jpg">
+                </div>
+                <div class="col-md-4" data-snippet-id="colmd">
+                    <img class="img-thumbnail img-responsive" src="/website/static/src/img/landscape.jpg">
+                    <img class="img-thumbnail img-responsive" src="/website/static/src/img/china.jpg">
+                    <img class="img-thumbnail img-responsive" src="/website/static/src/img/desert.jpg">
+                </div>
+            </div>
+        </div>
+    </section>
+</div>
+"""
+
+OERP_WEBSITE_HTML_2_IN = [
+    'management helps your team get work done',
+]
+OERP_WEBSITE_HTML_2_OUT = [
+    'Make every employee feel more connected',
+    'img class="img-responsive shadow" src="/website/static/src/img/text_image.png',
+]
+
 TEXT_1 = """I contact you about our meeting tomorrow. Here is the schedule I propose:
 9 AM: brainstorming about our new amazing business app
 9.45 AM: summary
--- a/openerp/tools/mail.py
+++ b/openerp/tools/mail.py
@ -43,6 +43,10 @@ _logger = logging.getLogger(__name__)
 tags_to_kill = ["script", "head", "meta", "title", "link", "style", "frame", "iframe", "base", "object", "embed"]
 tags_to_remove = ['html', 'body', 'font']

+# allow new semantic HTML5 tags
+allowed_tags = clean.defs.tags | frozenset('article section header footer hgroup nav aside figure'.split())
+safe_attrs = clean.defs.safe_attrs | frozenset(['style'])
+

 def html_sanitize(src, silent=True):
    if not src:
@ -59,6 +63,8 @@ def html_sanitize(src, silent=True):
        'page_structure': True,
        'style': False,             # do not remove style attributes
        'forms': True,              # remove form tags
+        'remove_unknown_tags': False,
+        'allow_tags': allowed_tags,
    }
    if etree.LXML_VERSION >= (2, 3, 1):
        # kill_tags attribute has been added in version 2.3.1
@ -72,7 +78,7 @@ def html_sanitize(src, silent=True):
    if etree.LXML_VERSION >= (3, 1, 0):
        kwargs.update({
            'safe_attrs_only': True,
-            'safe_attrs': clean.defs.safe_attrs | set(['style']),
+            'safe_attrs': safe_attrs,
        })
    else:
        # lxml < 3.1.0 does not allow to specify safe_attrs. We keep all attributes in order to keep "style"
@ -99,7 +105,8 @@ def html_sanitize(src, silent=True):
 # HTML Cleaner
 #----------------------------------------------------------

-def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_options=None, br=False):
+def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_options=None,
+                     protect_sections=False):
    """ html_email_clean: clean the html by doing the following steps:

     - try to strip email quotes, by removing blockquotes or having some client-
@ -124,6 +131,32 @@ def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_o
                            be flagged as to remove
    :param int max_length: if shortening, maximum number of characters before
                           shortening
+    :param dict expand_options: options for the read more link when shortening
+                                the content. The keys used are the following:
+
+                                 - oe_expand_container_tag: tag of the container
+                                   of the whole read more link (default: span)
+                                 - oe_expand_container_class: class applied to the
+                                   link container (default: oe_mail_expand)
+                                 - oe_expand_container_content: content of the
+                                   container (default: ...)
+                                 - oe_expand_separator_node: optional separator, like
+                                   adding ... <br /><br /> <a ...>read more</a> (default: void)
+                                 - oe_expand_a_href: href of the read more link itself
+                                   (default: #)
+                                 - oe_expand_a_class: class applied to the <a> containing
+                                   the link itself (default: oe_mail_expand)
+                                 - oe_expand_a_content: content of the <a> (default: read more)
+
+                                The formatted read more link is the following:
+                                <cont_tag class="oe_expand_container_class">
+                                    oe_expand_container_content
+                                    if expand_options.get('oe_expand_separator_node'):
+                                        <oe_expand_separator_node/>
+                                    <a href="oe_expand_a_href" class="oe_expand_a_class">
+                                        oe_expand_a_content
+                                    </a>
+                                </cont_tag>
    """
    def _replace_matching_regex(regex, source, replace=''):
        """ Replace all matching expressions in source by replace """
@ -170,6 +203,50 @@ def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_o
            iteration += 1
        new_node = _insert_new_node(node, -1, new_node_tag, text[idx:] + (cur_node.tail or ''), None, {})

+    def _truncate_node(node, position, find_first_blank=True):
+        # truncate text
+        innertext = node.text[0:position]
+        outertext = node.text[position:]
+        if find_first_blank:
+            stop_idx = outertext.find(' ')
+            if stop_idx == -1:
+                stop_idx = len(outertext)
+        else:
+            stop_idx = 0
+        node.text = innertext + outertext[0:stop_idx]
+        # create <span> ... <a href="#">read more</a></span> node
+        read_more_node = _create_node(
+            expand_options.get('oe_expand_container_tag', 'span'),
+            expand_options.get('oe_expand_container_content', ' ... '),
+            None,
+            {'class': expand_options.get('oe_expand_container_class', 'oe_mail_expand')}
+        )
+        if expand_options.get('oe_expand_separator_node'):
+            read_more_separator_node = _create_node(
+                expand_options.get('oe_expand_separator_node'),
+                '',
+                None,
+                {}
+            )
+            read_more_node.append(read_more_separator_node)
+        read_more_link_node = _create_node(
+            'a',
+            expand_options.get('oe_expand_a_content', 'read more'),
+            None,
+            {
+                'href': expand_options.get('oe_expand_a_href', '#'),
+                'class': expand_options.get('oe_expand_a_class', 'oe_mail_expand'),
+            }
+        )
+        read_more_node.append(read_more_link_node)
+        # create outertext node
+        overtext_node = _create_node('span', outertext[stop_idx:])
+        # tag node
+        overtext_node.set('in_overlength', '1')
+        # add newly created nodes in dom
+        node.append(read_more_node)
+        node.append(overtext_node)
+
    if expand_options is None:
        expand_options = {}

@ -216,22 +293,39 @@ def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_o
    # signature_begin = False  # try dynamic signature recognition
    quote_begin = False
    overlength = False
+    overlength_section_id = None
+    overlength_section_count = 0
    cur_char_nbr = 0
-    for node in root.getiterator():
+    # for node in root.getiterator():
+    for node in root.iter():
+        # normalize: replace a missing (None) node text by an empty string, so it can be sliced and measured later
+        if node.text is None:
+            node.text = ''
+
        # root: try to tag the client used to write the html
        if 'WordSection1' in node.get('class', '') or 'MsoNormal' in node.get('class', ''):
            root.set('msoffice', '1')
        if 'SkyDrivePlaceholder' in node.get('class', '') or 'SkyDrivePlaceholder' in node.get('id', ''):
            root.set('hotmail', '1')

-        # state of the parsing
+        # protect sections by tagging section limits and blocks contained inside sections, using an increasing id to re-find them later
+        if node.tag == 'section':
+            overlength_section_count += 1
+            node.set('section_closure', str(overlength_section_count))
+        if node.getparent() is not None and (node.getparent().get('section_closure') or node.getparent().get('section_inner')):
+            node.set('section_inner', str(overlength_section_count))
+
+        # state of the parsing: flag quotes and tails to remove
        if quote_begin:
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
+        # state of the parsing: flag when being in over-length content, depending on section content if defined (only when having protect_sections)
        if overlength:
-            node.set('in_overlength', '1')
-            node.set('tail_remove', '1')
+            if not overlength_section_id or int(node.get('section_inner', overlength_section_count + 1)) > overlength_section_count:
+                node.set('in_overlength', '1')
+                node.set('tail_remove', '1')

+        # find quote in msoffice / hotmail / blockquote / text quote and signatures
        if root.get('msoffice') and node.tag == 'div' and 'border-top:solid' in node.get('style', ''):
            quote_begin = True
            node.set('in_quote', '1')
@ -240,56 +334,49 @@ def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_o
            quote_begin = True
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
-
-        # shorten:
-        # 1/ truncate the text at the next available space
-        # 2/ create a 'read more' node, next to current node
-        # 3/ add the truncated text in a new node, next to 'read more' node
-        if shorten and not overlength and cur_char_nbr + len(node.text or '') > max_length:
-            overlength = True
-            # truncate text
-            innertext = node.text[0:(max_length - cur_char_nbr)]
-            outertext = node.text[(max_length - cur_char_nbr):]
-            stop_idx = outertext.find(' ')
-            if stop_idx == -1:
-                stop_idx = len(outertext)
-            node.text = innertext + outertext[0:stop_idx]
-            # create <span> ... <a href="#">read more</a></span> node
-            read_more_node = _create_node(
-                'span',
-                ' ... ',
-                None,
-                {'class': expand_options.get('oe_expand_span_class', 'oe_mail_expand')}
-            )
-            if br:
-                read_more_node.append(_create_node('br','',None, {}))
-            read_more_link_node = _create_node(
-                'a',
-                'read more',
-                None,
-                {
-                    'href': expand_options.get('oe_expand_href', '#'),
-                    'class': expand_options.get('oe_expand_a_class', 'oe_mail_expand'),
-                }
-            )
-            read_more_node.append(read_more_link_node)
-            # create outertext node
-            new_node = _create_node('span', outertext[stop_idx:])
-            # add newly created nodes in dom
-            node.append(read_more_node)
-            # tag node
-            new_node.set('in_overlength', '1')
-
-            cur_char_nbr += len(node.text or '')
-
        if node.tag == 'blockquote' or node.get('text_quote') or node.get('text_signature'):
            node.set('in_quote', '1')

+        # shorten:
+        # if protect section:
+        #   1/ find the first parent not being inside a section
+        #   2/ add the read more link
+        # else:
+        #   1/ truncate the text at the next available space
+        #   2/ create a 'read more' node, next to current node
+        #   3/ add the truncated text in a new node, next to 'read more' node
+        node_text = (node.text or '').strip().strip('\n').strip()
+        if shorten and not overlength and cur_char_nbr + len(node_text) > max_length:
+            overlength = True
+            if protect_sections:
+                node_to_truncate = node
+                while node_to_truncate.getparent() is not None and \
+                        (node_to_truncate.getparent().get('section_inner') or node_to_truncate.getparent().get('section_closure')):
+                    node_to_truncate = node_to_truncate.getparent()
+                overlength_section_id = node_to_truncate.get('section_closure')
+                position = len(node_to_truncate.text)
+                find_first_blank = False
+            else:
+                node_to_truncate = node
+                position = max_length - cur_char_nbr
+                find_first_blank = True
+            node_to_truncate.set('truncate', '1')
+            node_to_truncate.set('truncate_position', str(position))
+            node_to_truncate.set('truncate_blank', str(find_first_blank))
+        cur_char_nbr += len(node_text)
+
+    # Tree modification
+    # ------------------------------------------------------------
+
+    for node in root.iter():
+        if node.get('truncate'):
+            _truncate_node(node, int(node.get('truncate_position', '0')), bool(node.get('truncate_blank', 'True')))
+
    # Post processing
    # ------------------------------------------------------------

    to_remove = []
-    for node in root.getiterator():
+    for node in root.iter():
        if node.get('in_quote') or node.get('in_overlength'):
            # copy the node tail into parent text
            if node.tail and not node.get('tail_remove'):
@ -302,7 +389,7 @@ def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_o
        if remove:
            node.getparent().remove(node)
        else:
-            if not 'oe_mail_expand' in node.get('class', ''):  # trick: read more link should be displayed even if it's in overlength
+            if not expand_options.get('oe_expand_a_class', 'oe_mail_expand') in node.get('class', ''):  # trick: read more link should be displayed even if it's in overlength
                node_class = node.get('class', '') + ' ' + 'oe_mail_cleaned'
                node.set('class', node_class)