diff --git a/openerp/tests/test_mail.py b/openerp/tests/test_mail.py index add41b1c031..e7b7ec012df 100755 --- a/openerp/tests/test_mail.py +++ b/openerp/tests/test_mail.py @@ -223,6 +223,40 @@ class TestCleaner(unittest2.TestCase): for ext in test_mail_examples.THUNDERBIRD_1_OUT: self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content') + def test_70_read_more_and_shorten(self): + expand_options = { + 'oe_expand_container_class': 'span_class', + 'oe_expand_container_content': 'Herbert Einstein', + 'oe_expand_separator_node': 'br_lapin', + 'oe_expand_a_class': 'a_class', + 'oe_expand_a_content': 'read mee', + } + new_html = html_email_clean(test_mail_examples.OERP_WEBSITE_HTML_1, remove=True, shorten=True, max_length=100, expand_options=expand_options) + for ext in test_mail_examples.OERP_WEBSITE_HTML_1_IN: + self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content') + for ext in test_mail_examples.OERP_WEBSITE_HTML_1_OUT: + self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase overlimit content') + for ext in ['Herbert Einsteinread mee']: + self.assertIn(ext, new_html, 'html_email_cleaner wrongly take into account specific expand options') + + new_html = html_email_clean(test_mail_examples.OERP_WEBSITE_HTML_2, remove=True, shorten=True, max_length=200, expand_options=expand_options, protect_sections=False) + for ext in test_mail_examples.OERP_WEBSITE_HTML_2_IN: + self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content') + for ext in test_mail_examples.OERP_WEBSITE_HTML_2_OUT: + self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase overlimit content') + for ext in ['Herbert Einsteinread mee']: + self.assertIn(ext, new_html, 'html_email_cleaner wrongly take into account specific expand options') + + new_html = html_email_clean(test_mail_examples.OERP_WEBSITE_HTML_2, remove=True, shorten=True, max_length=200, expand_options=expand_options, 
protect_sections=True) + for ext in test_mail_examples.OERP_WEBSITE_HTML_2_IN: + self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content') + for ext in test_mail_examples.OERP_WEBSITE_HTML_2_OUT: + self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase overlimit content') + for ext in [ + 'Herbert Einsteinread mee', + 'tasks using the gantt chart and control deadlines']: + self.assertIn(ext, new_html, 'html_email_cleaner wrongly take into account specific expand options') + def test_90_misc(self): # False boolean for text must return empty string new_html = html_email_clean(False) diff --git a/openerp/tests/test_mail_examples.py b/openerp/tests/test_mail_examples.py index 7631a28e662..cafb0b9a22b 100644 --- a/openerp/tests/test_mail_examples.py +++ b/openerp/tests/test_mail_examples.py @@ -60,6 +60,154 @@ EDI_LIKE_HTML_SOURCE = """
+
+
+
+

OpenERP HR Features

+

Manage your company most important asset: People

+
+
+ +

Streamline Recruitments

+

Post job offers and keep track of each application received. Follow applicants in your recruitment process with the smart kanban view.

+

Save time by automating some communications with email templates. Resumes are indexed automatically, allowing you to easily find for specific profiles.

+
+
+ +

Enterprise Social Network

+

Break down information silos. Share knowledge and best practices amongst all employees. Follow specific people or documents and join groups of interests to share expertise and documents.

+

Interact with your collegues in real time with live chat.

+
+
+ +

Leaves Management

+

Keep track of the vacation days accrued by each employee. Employees enter their requests (paid holidays, sick leave, etc), for managers to approve and validate. It's all done in just a few clicks. The agenda of each employee is updated accordingly.

+
+
+
+
""" + +OERP_WEBSITE_HTML_1_IN = [ + 'Manage your company most important asset: People', + 'img class="img-rounded img-responsive" src="/website/static/src/img/china_thumb.jpg"', +] +OERP_WEBSITE_HTML_1_OUT = [ + 'Break down information silos.', + 'Keep track of the vacation days accrued by each employee', + 'img class="img-rounded img-responsive" src="/website/static/src/img/deers_thumb.jpg', +] + +OERP_WEBSITE_HTML_2 = """ +
+
+
+
+
+

+ OpenERP Project Management +

+

Infinitely flexible. Incredibly easy to use.

+
+
+

+ OpenERP's collaborative and realtime project + management helps your team get work done. Keep + track of everything, from the big picture to the + minute details, from the customer contract to the + billing. +

+ Organize projects around your own processes. Work + on tasks and issues using the kanban view, schedule + tasks using the gantt chart and control deadlines + in the calendar view. Every project may have it's + own stages allowing teams to optimize their job. +

+
+
+
+
+
+
+
+
+ +
+
+

Manage Your Shops

+

+ OpenERP's Point of Sale introduces a super clean + interface with no installation required that runs + online and offline on modern hardwares. +

+ It's full integration with the company inventory + and accounting, gives you real time statistics and + consolidations amongst all shops without the hassle + of integrating several applications. +

+
+
+
+
+
+
+
+
+

Enterprise Social Network

+

+ Make every employee feel more connected and engaged + with twitter-like features for your own company. Follow + people, share best practices, 'like' top ideas, etc. +

+ Connect with experts, follow what interests you, share + documents and promote best practices with OpenERP + Social application. Get work done with effective + collaboration across departments, geographies + and business applications. +

+
+
+ +
+
+
+
+
+
+
+

Our Porfolio

+

More than 500 successful projects

+
+
+ + + +
+
+ + + +
+
+ + + +
+
+
+
+
+""" + +OERP_WEBSITE_HTML_2_IN = [ + 'management helps your team get work done', +] +OERP_WEBSITE_HTML_2_OUT = [ + 'Make every employee feel more connected', + 'img class="img-responsive shadow" src="/website/static/src/img/text_image.png', +] + TEXT_1 = """I contact you about our meeting tomorrow. Here is the schedule I propose: 9 AM: brainstorming about our new amazing business app 9.45 AM: summary diff --git a/openerp/tools/mail.py b/openerp/tools/mail.py index 847b8fc542b..91cafe97e45 100644 --- a/openerp/tools/mail.py +++ b/openerp/tools/mail.py @@ -43,6 +43,10 @@ _logger = logging.getLogger(__name__) tags_to_kill = ["script", "head", "meta", "title", "link", "style", "frame", "iframe", "base", "object", "embed"] tags_to_remove = ['html', 'body', 'font'] +# allow new semantic HTML5 tags +allowed_tags = clean.defs.tags | frozenset('article section header footer hgroup nav aside figure'.split()) +safe_attrs = clean.defs.safe_attrs | frozenset(['style']) + def html_sanitize(src, silent=True): if not src: @@ -59,6 +63,8 @@ def html_sanitize(src, silent=True): 'page_structure': True, 'style': False, # do not remove style attributes 'forms': True, # remove form tags + 'remove_unknown_tags': False, + 'allow_tags': allowed_tags, } if etree.LXML_VERSION >= (2, 3, 1): # kill_tags attribute has been added in version 2.3.1 @@ -72,7 +78,7 @@ def html_sanitize(src, silent=True): if etree.LXML_VERSION >= (3, 1, 0): kwargs.update({ 'safe_attrs_only': True, - 'safe_attrs': clean.defs.safe_attrs | set(['style']), + 'safe_attrs': safe_attrs, }) else: # lxml < 3.1.0 does not allow to specify safe_attrs. 
We keep all attributes in order to keep "style" @@ -99,7 +105,8 @@ def html_sanitize(src, silent=True): # HTML Cleaner #---------------------------------------------------------- -def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_options=None, br=False): +def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_options=None, + protect_sections=False): """ html_email_clean: clean the html by doing the following steps: - try to strip email quotes, by removing blockquotes or having some client- @@ -124,6 +131,32 @@ def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_o be flagged as to remove :param int max_length: if shortening, maximum number of characters before shortening + :param dict expand_options: options for the read more link when shortening + the content. The used keys are the following: + + - oe_expand_container_tag: tag of the + container of the whole read more link (default: span) + - oe_expand_container_class: class applied to the + link container (default: oe_mail_expand) + - oe_expand_container_content: content of the + container (default: ...) + - oe_expand_separator_node: tag of an optional separator node, e.g. + 'br' to get ...<br/>read more (default: void) + - oe_expand_a_href: href of the read more link itself + (default: #) + - oe_expand_a_class: class applied to the <a> containing + the link itself (default: oe_mail_expand) + - oe_expand_a_content: content of the <a> (default: read more) + + The formatted read more link is the following: + <oe_expand_container_tag class="oe_expand_container_class"> + oe_expand_container_content + if expand_options.get('oe_expand_separator_node'): + <oe_expand_separator_node/> + <a href="oe_expand_a_href" class="oe_expand_a_class"> + oe_expand_a_content + </a> + </oe_expand_container_tag> + """ def _replace_matching_regex(regex, source, replace=''): """ Replace all matching expressions in source by replace """ @@ -170,6 +203,50 @@ def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_o iteration += 1 new_node = _insert_new_node(node, -1, new_node_tag, text[idx:] + (cur_node.tail or ''), None, {}) + def _truncate_node(node, position, find_first_blank=True): + # truncate text + innertext = node.text[0:position] + outertext = node.text[position:] + if find_first_blank: + stop_idx = outertext.find(' ') + if stop_idx == -1: + stop_idx = len(outertext) + else: + stop_idx = 0 + node.text = innertext + outertext[0:stop_idx] + # create ... read more node + read_more_node = _create_node( + expand_options.get('oe_expand_container_tag', 'span'), + expand_options.get('oe_expand_container_content', ' ... 
'), + None, + {'class': expand_options.get('oe_expand_container_class', 'oe_mail_expand')} + ) + if expand_options.get('oe_expand_separator_node'): + read_more_separator_node = _create_node( + expand_options.get('oe_expand_separator_node'), + '', + None, + {} + ) + read_more_node.append(read_more_separator_node) + read_more_link_node = _create_node( + 'a', + expand_options.get('oe_expand_a_content', 'read more'), + None, + { + 'href': expand_options.get('oe_expand_a_href', '#'), + 'class': expand_options.get('oe_expand_a_class', 'oe_mail_expand'), + } + ) + read_more_node.append(read_more_link_node) + # create outertext node + overtext_node = _create_node('span', outertext[stop_idx:]) + # tag node + overtext_node.set('in_overlength', '1') + # add newly created nodes in dom + node.append(read_more_node) + node.append(overtext_node) + if expand_options is None: expand_options = {} @@ -216,22 +293,39 @@ def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_o # signature_begin = False # try dynamic signature recognition quote_begin = False overlength = False + overlength_section_id = None + overlength_section_count = 0 cur_char_nbr = 0 - for node in root.getiterator(): + # for node in root.getiterator(): + for node in root.iter(): + # update: add a text argument + if node.text is None: + node.text = '' + # root: try to tag the client used to write the html if 'WordSection1' in node.get('class', '') or 'MsoNormal' in node.get('class', ''): root.set('msoffice', '1') if 'SkyDrivePlaceholder' in node.get('class', '') or 'SkyDrivePlaceholder' in node.get('id', ''): root.set('hotmail', '1') - # state of the parsing + # protect sections by tagging section limits and blocks contained inside sections, using an increasing id to re-find them later + if node.tag == 'section': + overlength_section_count += 1 + node.set('section_closure', str(overlength_section_count)) + if node.getparent() is not None and (node.getparent().get('section_closure') or 
node.getparent().get('section_inner')): + node.set('section_inner', str(overlength_section_count)) + + # state of the parsing: flag quotes and tails to remove if quote_begin: node.set('in_quote', '1') node.set('tail_remove', '1') + # state of the parsing: flag when being in over-length content, depending on section content if defined (only when having protect_sections) if overlength: - node.set('in_overlength', '1') - node.set('tail_remove', '1') + if not overlength_section_id or int(node.get('section_inner', overlength_section_count + 1)) > overlength_section_count: + node.set('in_overlength', '1') + node.set('tail_remove', '1') + # find quote in msoffice / hotmail / blockquote / text quote and signatures if root.get('msoffice') and node.tag == 'div' and 'border-top:solid' in node.get('style', ''): quote_begin = True node.set('in_quote', '1') @@ -240,56 +334,49 @@ def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_o quote_begin = True node.set('in_quote', '1') node.set('tail_remove', '1') - - # shorten: - # 1/ truncate the text at the next available space - # 2/ create a 'read more' node, next to current node - # 3/ add the truncated text in a new node, next to 'read more' node - if shorten and not overlength and cur_char_nbr + len(node.text or '') > max_length: - overlength = True - # truncate text - innertext = node.text[0:(max_length - cur_char_nbr)] - outertext = node.text[(max_length - cur_char_nbr):] - stop_idx = outertext.find(' ') - if stop_idx == -1: - stop_idx = len(outertext) - node.text = innertext + outertext[0:stop_idx] - # create ... read more node - read_more_node = _create_node( - 'span', - ' ... 
', - None, - {'class': expand_options.get('oe_expand_span_class', 'oe_mail_expand')} - ) - if br: - read_more_node.append(_create_node('br','',None, {})) - read_more_link_node = _create_node( - 'a', - 'read more', - None, - { - 'href': expand_options.get('oe_expand_href', '#'), - 'class': expand_options.get('oe_expand_a_class', 'oe_mail_expand'), - } - ) - read_more_node.append(read_more_link_node) - # create outertext node - new_node = _create_node('span', outertext[stop_idx:]) - # add newly created nodes in dom - node.append(read_more_node) - # tag node - new_node.set('in_overlength', '1') - - cur_char_nbr += len(node.text or '') - if node.tag == 'blockquote' or node.get('text_quote') or node.get('text_signature'): node.set('in_quote', '1') + # shorten: + # if protect section: + # 1/ find the first parent not being inside a section + # 2/ add the read more link + # else: + # 1/ truncate the text at the next available space + # 2/ create a 'read more' node, next to current node + # 3/ add the truncated text in a new node, next to 'read more' node + node_text = (node.text or '').strip().strip('\n').strip() + if shorten and not overlength and cur_char_nbr + len(node_text) > max_length: + overlength = True + if protect_sections: + node_to_truncate = node + while node_to_truncate.getparent() is not None and \ + (node_to_truncate.getparent().get('section_inner') or node_to_truncate.getparent().get('section_closure')): + node_to_truncate = node_to_truncate.getparent() + overlength_section_id = node_to_truncate.get('section_closure') + position = len(node_to_truncate.text) + find_first_blank = False + else: + node_to_truncate = node + position = max_length - cur_char_nbr + find_first_blank = True + node_to_truncate.set('truncate', '1') + node_to_truncate.set('truncate_position', str(position)) + node_to_truncate.set('truncate_blank', str(find_first_blank)) + cur_char_nbr += len(node_text) + + # Tree modification + # 
------------------------------------------------------------ + + for node in root.iter(): + if node.get('truncate'): + _truncate_node(node, int(node.get('truncate_position', '0')), node.get('truncate_blank', 'True') == 'True') # attribute holds str(bool); bool('False') would be True + # Post processing # ------------------------------------------------------------ to_remove = [] - for node in root.getiterator(): + for node in root.iter(): if node.get('in_quote') or node.get('in_overlength'): # copy the node tail into parent text if node.tail and not node.get('tail_remove'): @@ -302,7 +389,7 @@ def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_o if remove: node.getparent().remove(node) else: - if not 'oe_mail_expand' in node.get('class', ''): # trick: read more link should be displayed even if it's in overlength + if expand_options.get('oe_expand_a_class', 'oe_mail_expand') not in node.get('class', ''): # trick: read more link should be displayed even if it's in overlength node_class = node.get('class', '') + ' ' + 'oe_mail_cleaned' node.set('class', node_class)