[IMP] tools: improved html_email_clean.

It can now take into account sections, and protect them. This allows to display a read more link
after a whole section. This will be used in the website, to display the first section of blogs for
example.

Also added some options on the read more link, allowing to tune its tag, content, as well as the link.

Also improved html_sanitize, taking improvements from the trunk branch of server, waiting for the whole
trunk branch to be merged.

bzr revid: tde@openerp.com-20131001142151-rt1g6zpxozd1eau2
This commit is contained in:
Thibault Delavallée 2013-10-01 16:21:51 +02:00
parent bfd080beb3
commit 6069810f03
3 changed files with 319 additions and 50 deletions

View File

@ -223,6 +223,40 @@ class TestCleaner(unittest2.TestCase):
for ext in test_mail_examples.THUNDERBIRD_1_OUT:
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase signature / quoted content')
def test_70_read_more_and_shorten(self):
expand_options = {
'oe_expand_container_class': 'span_class',
'oe_expand_container_content': 'Herbert Einstein',
'oe_expand_separator_node': 'br_lapin',
'oe_expand_a_class': 'a_class',
'oe_expand_a_content': 'read mee',
}
new_html = html_email_clean(test_mail_examples.OERP_WEBSITE_HTML_1, remove=True, shorten=True, max_length=100, expand_options=expand_options)
for ext in test_mail_examples.OERP_WEBSITE_HTML_1_IN:
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
for ext in test_mail_examples.OERP_WEBSITE_HTML_1_OUT:
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase overlimit content')
for ext in ['<span class="span_class">Herbert Einstein<br_lapin></br_lapin><a href="#" class="a_class">read mee</a></span>']:
self.assertIn(ext, new_html, 'html_email_cleaner wrongly take into account specific expand options')
new_html = html_email_clean(test_mail_examples.OERP_WEBSITE_HTML_2, remove=True, shorten=True, max_length=200, expand_options=expand_options, protect_sections=False)
for ext in test_mail_examples.OERP_WEBSITE_HTML_2_IN:
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
for ext in test_mail_examples.OERP_WEBSITE_HTML_2_OUT:
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase overlimit content')
for ext in ['<span class="span_class">Herbert Einstein<br_lapin></br_lapin><a href="#" class="a_class">read mee</a></span>']:
self.assertIn(ext, new_html, 'html_email_cleaner wrongly take into account specific expand options')
new_html = html_email_clean(test_mail_examples.OERP_WEBSITE_HTML_2, remove=True, shorten=True, max_length=200, expand_options=expand_options, protect_sections=True)
for ext in test_mail_examples.OERP_WEBSITE_HTML_2_IN:
self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed not quoted content')
for ext in test_mail_examples.OERP_WEBSITE_HTML_2_OUT:
self.assertNotIn(ext, new_html, 'html_email_cleaner did not erase overlimit content')
for ext in [
'<span class="span_class">Herbert Einstein<br_lapin></br_lapin><a href="#" class="a_class">read mee</a></span>',
'tasks using the gantt chart and control deadlines']:
self.assertIn(ext, new_html, 'html_email_cleaner wrongly take into account specific expand options')
def test_90_misc(self):
# False boolean for text must return empty string
new_html = html_email_clean(False)

View File

@ -60,6 +60,154 @@ EDI_LIKE_HTML_SOURCE = """<div style="font-family: 'Lucica Grande', Ubuntu, Aria
</div>
</div></body></html>"""
OERP_WEBSITE_HTML_1 = """
<div>
<div class="container">
<div class="row">
<div class="col-md-12 text-center mt16 mb16" data-snippet-id="colmd">
<h2>OpenERP HR Features</h2>
<h3 class="text-muted">Manage your company most important asset: People</h3>
</div>
<div class="col-md-4" data-snippet-id="colmd">
<img class="img-rounded img-responsive" src="/website/static/src/img/china_thumb.jpg">
<h4 class="mt16">Streamline Recruitments</h4>
<p>Post job offers and keep track of each application received. Follow applicants in your recruitment process with the smart kanban view.</p>
<p>Save time by automating some communications with email templates. Resumes are indexed automatically, allowing you to easily find for specific profiles.</p>
</div>
<div class="col-md-4" data-snippet-id="colmd">
<img class="img-rounded img-responsive" src="/website/static/src/img/desert_thumb.jpg">
<h4 class="mt16">Enterprise Social Network</h4>
<p>Break down information silos. Share knowledge and best practices amongst all employees. Follow specific people or documents and join groups of interests to share expertise and documents.</p>
<p>Interact with your collegues in real time with live chat.</p>
</div>
<div class="col-md-4" data-snippet-id="colmd">
<img class="img-rounded img-responsive" src="/website/static/src/img/deers_thumb.jpg">
<h4 class="mt16">Leaves Management</h4>
<p>Keep track of the vacation days accrued by each employee. Employees enter their requests (paid holidays, sick leave, etc), for managers to approve and validate. It's all done in just a few clicks. The agenda of each employee is updated accordingly.</p>
</div>
</div>
</div>
</div>"""
OERP_WEBSITE_HTML_1_IN = [
'Manage your company most important asset: People',
'img class="img-rounded img-responsive" src="/website/static/src/img/china_thumb.jpg"',
]
OERP_WEBSITE_HTML_1_OUT = [
'Break down information silos.',
'Keep track of the vacation days accrued by each employee',
'img class="img-rounded img-responsive" src="/website/static/src/img/deers_thumb.jpg',
]
OERP_WEBSITE_HTML_2 = """
<div class="mt16 cke_widget_editable cke_widget_element oe_editable oe_dirty" data-oe-model="blog.post" data-oe-id="6" data-oe-field="content" data-oe-type="html" data-oe-translate="0" data-oe-expression="blog_post.content" data-cke-widget-data="{}" data-cke-widget-keep-attr="0" data-widget="oeref" contenteditable="true" data-cke-widget-editable="text">
<section class="mt16 mb16" data-snippet-id="text-block">
<div class="container">
<div class="row">
<div class="col-md-12 text-center mt16 mb32" data-snippet-id="colmd">
<h2>
OpenERP Project Management
</h2>
<h3 class="text-muted">Infinitely flexible. Incredibly easy to use.</h3>
</div>
<div class="col-md-12 mb16 mt16" data-snippet-id="colmd">
<p>
OpenERP's <b>collaborative and realtime</b> project
management helps your team get work done. Keep
track of everything, from the big picture to the
minute details, from the customer contract to the
billing.
</p><p>
Organize projects around <b>your own processes</b>. Work
on tasks and issues using the kanban view, schedule
tasks using the gantt chart and control deadlines
in the calendar view. Every project may have it's
own stages allowing teams to optimize their job.
</p>
</div>
</div>
</div>
</section>
<section class="" data-snippet-id="image-text">
<div class="container">
<div class="row">
<div class="col-md-6 mt16 mb16" data-snippet-id="colmd">
<img class="img-responsive shadow" src="/website/static/src/img/image_text.jpg">
</div>
<div class="col-md-6 mt32" data-snippet-id="colmd">
<h3>Manage Your Shops</h3>
<p>
OpenERP's Point of Sale introduces a super clean
interface with no installation required that runs
online and offline on modern hardwares.
</p><p>
It's full integration with the company inventory
and accounting, gives you real time statistics and
consolidations amongst all shops without the hassle
of integrating several applications.
</p>
</div>
</div>
</div>
</section>
<section class="" data-snippet-id="text-image">
<div class="container">
<div class="row">
<div class="col-md-6 mt32" data-snippet-id="colmd">
<h3>Enterprise Social Network</h3>
<p>
Make every employee feel more connected and engaged
with twitter-like features for your own company. Follow
people, share best practices, 'like' top ideas, etc.
</p><p>
Connect with experts, follow what interests you, share
documents and promote best practices with OpenERP
Social application. Get work done with effective
collaboration across departments, geographies
and business applications.
</p>
</div>
<div class="col-md-6 mt16 mb16" data-snippet-id="colmd">
<img class="img-responsive shadow" src="/website/static/src/img/text_image.png">
</div>
</div>
</div>
</section><section class="" data-snippet-id="portfolio">
<div class="container">
<div class="row">
<div class="col-md-12 text-center mt16 mb32" data-snippet-id="colmd">
<h2>Our Porfolio</h2>
<h4 class="text-muted">More than 500 successful projects</h4>
</div>
<div class="col-md-4" data-snippet-id="colmd">
<img class="img-thumbnail img-responsive" src="/website/static/src/img/deers.jpg">
<img class="img-thumbnail img-responsive" src="/website/static/src/img/desert.jpg">
<img class="img-thumbnail img-responsive" src="/website/static/src/img/china.jpg">
</div>
<div class="col-md-4" data-snippet-id="colmd">
<img class="img-thumbnail img-responsive" src="/website/static/src/img/desert.jpg">
<img class="img-thumbnail img-responsive" src="/website/static/src/img/china.jpg">
<img class="img-thumbnail img-responsive" src="/website/static/src/img/deers.jpg">
</div>
<div class="col-md-4" data-snippet-id="colmd">
<img class="img-thumbnail img-responsive" src="/website/static/src/img/landscape.jpg">
<img class="img-thumbnail img-responsive" src="/website/static/src/img/china.jpg">
<img class="img-thumbnail img-responsive" src="/website/static/src/img/desert.jpg">
</div>
</div>
</div>
</section>
</div>
"""
OERP_WEBSITE_HTML_2_IN = [
'management helps your team get work done',
]
OERP_WEBSITE_HTML_2_OUT = [
'Make every employee feel more connected',
'img class="img-responsive shadow" src="/website/static/src/img/text_image.png',
]
TEXT_1 = """I contact you about our meeting tomorrow. Here is the schedule I propose:
9 AM: brainstorming about our new amazing business app
9.45 AM: summary

View File

@ -43,6 +43,10 @@ _logger = logging.getLogger(__name__)
tags_to_kill = ["script", "head", "meta", "title", "link", "style", "frame", "iframe", "base", "object", "embed"]
tags_to_remove = ['html', 'body', 'font']
# allow new semantic HTML5 tags
allowed_tags = clean.defs.tags | frozenset('article section header footer hgroup nav aside figure'.split())
safe_attrs = clean.defs.safe_attrs | frozenset(['style'])
def html_sanitize(src, silent=True):
if not src:
@ -59,6 +63,8 @@ def html_sanitize(src, silent=True):
'page_structure': True,
'style': False, # do not remove style attributes
'forms': True, # remove form tags
'remove_unknown_tags': False,
'allow_tags': allowed_tags,
}
if etree.LXML_VERSION >= (2, 3, 1):
# kill_tags attribute has been added in version 2.3.1
@ -72,7 +78,7 @@ def html_sanitize(src, silent=True):
if etree.LXML_VERSION >= (3, 1, 0):
kwargs.update({
'safe_attrs_only': True,
'safe_attrs': clean.defs.safe_attrs | set(['style']),
'safe_attrs': safe_attrs,
})
else:
# lxml < 3.1.0 does not allow to specify safe_attrs. We keep all attributes in order to keep "style"
@ -99,7 +105,8 @@ def html_sanitize(src, silent=True):
# HTML Cleaner
#----------------------------------------------------------
def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_options=None, br=False):
def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_options=None,
protect_sections=False):
""" html_email_clean: clean the html by doing the following steps:
- try to strip email quotes, by removing blockquotes or having some client-
@ -124,6 +131,32 @@ def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_o
be flagged as to remove
:param int max_length: if shortening, maximum number of characters before
shortening
:param dict expand_options: options for the read more link when shortening
the content.The used keys are the following:
- oe_expand_container_tag: class applied to the
container of the whole read more link
- oe_expand_container_class: class applied to the
link container (default: oe_mail_expand)
- oe_expand_container_content: content of the
container (default: ...)
- oe_expand_separator_node: optional separator, like
adding ... <br /><br /> <a ...>read more</a> (default: void)
- oe_expand_a_href: href of the read more link itself
(default: #)
- oe_expand_a_class: class applied to the <a> containing
the link itself (default: oe_mail_expand)
- oe_expand_a_content: content of the <a> (default: read more)
The formatted read more link is the following:
<cont_tag class="oe_expand_container_class">
oe_expand_container_content
if expand_options.get('oe_expand_separator_node'):
<oe_expand_separator_node/>
<a href="oe_expand_a_href" class="oe_expand_a_class">
oe_expand_a_content
</a>
</span>
"""
def _replace_matching_regex(regex, source, replace=''):
""" Replace all matching expressions in source by replace """
@ -170,6 +203,50 @@ def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_o
iteration += 1
new_node = _insert_new_node(node, -1, new_node_tag, text[idx:] + (cur_node.tail or ''), None, {})
def _truncate_node(node, position, find_first_blank=True):
# truncate text
innertext = node.text[0:position]
outertext = node.text[position:]
if find_first_blank:
stop_idx = outertext.find(' ')
if stop_idx == -1:
stop_idx = len(outertext)
else:
stop_idx = 0
node.text = innertext + outertext[0:stop_idx]
# create <span> ... <a href="#">read more</a></span> node
read_more_node = _create_node(
expand_options.get('oe_expand_container_tag', 'span'),
expand_options.get('oe_expand_container_content', ' ... '),
None,
{'class': expand_options.get('oe_expand_container_class', 'oe_mail_expand')}
)
if expand_options.get('oe_expand_separator_node'):
read_more_separator_node = _create_node(
expand_options.get('oe_expand_separator_node'),
'',
None,
{}
)
read_more_node.append(read_more_separator_node)
read_more_link_node = _create_node(
'a',
expand_options.get('oe_expand_a_content', 'read more'),
None,
{
'href': expand_options.get('oe_expand_a_href', '#'),
'class': expand_options.get('oe_expand_a_class', 'oe_mail_expand'),
}
)
read_more_node.append(read_more_link_node)
# create outertext node
overtext_node = _create_node('span', outertext[stop_idx:])
# tag node
overtext_node.set('in_overlength', '1')
# add newly created nodes in dom
node.append(read_more_node)
node.append(overtext_node)
if expand_options is None:
expand_options = {}
@ -216,22 +293,39 @@ def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_o
# signature_begin = False # try dynamic signature recognition
quote_begin = False
overlength = False
overlength_section_id = None
overlength_section_count = 0
cur_char_nbr = 0
for node in root.getiterator():
# for node in root.getiterator():
for node in root.iter():
# update: add a text argument
if node.text is None:
node.text = ''
# root: try to tag the client used to write the html
if 'WordSection1' in node.get('class', '') or 'MsoNormal' in node.get('class', ''):
root.set('msoffice', '1')
if 'SkyDrivePlaceholder' in node.get('class', '') or 'SkyDrivePlaceholder' in node.get('id', ''):
root.set('hotmail', '1')
# state of the parsing
# protect sections by tagging section limits and blocks contained inside sections, using an increasing id to re-find them later
if node.tag == 'section':
overlength_section_count += 1
node.set('section_closure', str(overlength_section_count))
if node.getparent() is not None and (node.getparent().get('section_closure') or node.getparent().get('section_inner')):
node.set('section_inner', str(overlength_section_count))
# state of the parsing: flag quotes and tails to remove
if quote_begin:
node.set('in_quote', '1')
node.set('tail_remove', '1')
# state of the parsing: flag when being in over-length content, depending on section content if defined (only when having protect_sections)
if overlength:
node.set('in_overlength', '1')
node.set('tail_remove', '1')
if not overlength_section_id or int(node.get('section_inner', overlength_section_count + 1)) > overlength_section_count:
node.set('in_overlength', '1')
node.set('tail_remove', '1')
# find quote in msoffice / hotmail / blockquote / text quote and signatures
if root.get('msoffice') and node.tag == 'div' and 'border-top:solid' in node.get('style', ''):
quote_begin = True
node.set('in_quote', '1')
@ -240,56 +334,49 @@ def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_o
quote_begin = True
node.set('in_quote', '1')
node.set('tail_remove', '1')
# shorten:
# 1/ truncate the text at the next available space
# 2/ create a 'read more' node, next to current node
# 3/ add the truncated text in a new node, next to 'read more' node
if shorten and not overlength and cur_char_nbr + len(node.text or '') > max_length:
overlength = True
# truncate text
innertext = node.text[0:(max_length - cur_char_nbr)]
outertext = node.text[(max_length - cur_char_nbr):]
stop_idx = outertext.find(' ')
if stop_idx == -1:
stop_idx = len(outertext)
node.text = innertext + outertext[0:stop_idx]
# create <span> ... <a href="#">read more</a></span> node
read_more_node = _create_node(
'span',
' ... ',
None,
{'class': expand_options.get('oe_expand_span_class', 'oe_mail_expand')}
)
if br:
read_more_node.append(_create_node('br','',None, {}))
read_more_link_node = _create_node(
'a',
'read more',
None,
{
'href': expand_options.get('oe_expand_href', '#'),
'class': expand_options.get('oe_expand_a_class', 'oe_mail_expand'),
}
)
read_more_node.append(read_more_link_node)
# create outertext node
new_node = _create_node('span', outertext[stop_idx:])
# add newly created nodes in dom
node.append(read_more_node)
# tag node
new_node.set('in_overlength', '1')
cur_char_nbr += len(node.text or '')
if node.tag == 'blockquote' or node.get('text_quote') or node.get('text_signature'):
node.set('in_quote', '1')
# shorten:
# if protect section:
# 1/ find the first parent not being inside a section
# 2/ add the read more link
# else:
# 1/ truncate the text at the next available space
# 2/ create a 'read more' node, next to current node
# 3/ add the truncated text in a new node, next to 'read more' node
node_text = (node.text or '').strip().strip('\n').strip()
if shorten and not overlength and cur_char_nbr + len(node_text) > max_length:
overlength = True
if protect_sections:
node_to_truncate = node
while node_to_truncate.getparent() is not None and \
(node_to_truncate.getparent().get('section_inner') or node_to_truncate.getparent().get('section_closure')):
node_to_truncate = node_to_truncate.getparent()
overlength_section_id = node_to_truncate.get('section_closure')
position = len(node_to_truncate.text)
find_first_blank = False
else:
node_to_truncate = node
position = max_length - cur_char_nbr
find_first_blank = True
node_to_truncate.set('truncate', '1')
node_to_truncate.set('truncate_position', str(position))
node_to_truncate.set('truncate_blank', str(find_first_blank))
cur_char_nbr += len(node_text)
# Tree modification
# ------------------------------------------------------------
for node in root.iter():
if node.get('truncate'):
_truncate_node(node, int(node.get('truncate_position', '0')), bool(node.get('truncate_blank', 'True')))
# Post processing
# ------------------------------------------------------------
to_remove = []
for node in root.getiterator():
for node in root.iter():
if node.get('in_quote') or node.get('in_overlength'):
# copy the node tail into parent text
if node.tail and not node.get('tail_remove'):
@ -302,7 +389,7 @@ def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_o
if remove:
node.getparent().remove(node)
else:
if not 'oe_mail_expand' in node.get('class', ''): # trick: read more link should be displayed even if it's in overlength
if not expand_options.get('oe_expand_a_class', 'oe_mail_expand') in node.get('class', ''): # trick: read more link should be displayed even if it's in overlength
node_class = node.get('class', '') + ' ' + 'oe_mail_cleaned'
node.set('class', node_class)