2012-09-05 22:57:46 +00:00
# -*- coding: utf-8 -*-
##############################################################################
#
# OpenERP, Open Source Business Applications
2014-01-15 14:13:19 +00:00
# Copyright (C) 2012-TODAY OpenERP S.A. (<http://openerp.com>).
2012-09-05 22:57:46 +00:00
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
##############################################################################
2012-08-13 12:53:07 +00:00
2012-11-07 08:58:07 +00:00
from lxml import etree
2012-11-07 15:41:29 +00:00
import cgi
2012-11-06 12:41:09 +00:00
import logging
2012-09-05 15:32:12 +00:00
import lxml . html
2012-12-26 16:25:05 +00:00
import lxml . html . clean as clean
2012-11-06 11:58:51 +00:00
import random
2012-08-13 15:52:05 +00:00
import re
2012-11-06 12:41:09 +00:00
import socket
import threading
import time
2014-01-14 15:44:26 +00:00
from email . utils import getaddresses
2012-08-13 12:53:07 +00:00
2013-03-27 11:10:14 +00:00
import openerp
2012-09-05 22:57:46 +00:00
from openerp . loglevels import ustr
2012-11-06 12:41:09 +00:00
_logger = logging.getLogger(__name__)

#----------------------------------------------------------
# HTML Sanitizer
#----------------------------------------------------------

# Tags whose whole subtree (tag AND content) is dropped by the sanitizer.
tags_to_kill = ["script", "head", "meta", "title", "link", "style", "frame", "iframe", "base", "object", "embed"]
# Tags that are removed but whose content is kept in place.
tags_to_remove = ['html', 'body', 'font']

# allow new semantic HTML5 tags
# etree.Comment is included so HTML comments survive the 'allow_tags' filter.
allowed_tags = clean.defs.tags | frozenset('article section header footer hgroup nav aside figure main'.split() + [etree.Comment])

# Attributes kept in strict sanitization mode, on top of lxml's default
# safe list: inline 'style' plus OpenERP editor/snippet data-* attributes.
safe_attrs = clean.defs.safe_attrs | frozenset(
    ['style',
     'data-oe-model', 'data-oe-id', 'data-oe-field', 'data-oe-type', 'data-oe-expression', 'data-oe-translate', 'data-oe-nodeid',
     'data-snippet-id', 'data-publish', 'data-id', 'data-res_id', 'data-member_id', 'data-view-id'
     ])
2013-10-01 14:21:51 +00:00
2012-12-26 16:25:05 +00:00
2014-01-15 14:13:19 +00:00
def html_sanitize(src, silent=True, strict=False):
    """Sanitize an HTML string using lxml's Cleaner.

    Kills dangerous tags (script, iframe, ...), optionally restricts
    attributes, and protects mako expressions and bare email addresses
    (``<foo@bar>``) from being eaten by the HTML parser.

    :param src: HTML content to sanitize; falsy values are returned as-is
    :param bool silent: when True, parser errors are swallowed and a
                        placeholder paragraph is returned instead of raising
    :param bool strict: when True, only ``safe_attrs`` attributes are kept
                        (effective with lxml >= 3.1.0 only)
    :return: sanitized unicode string
    """
    if not src:
        return src
    src = ustr(src, errors='replace')

    logger = logging.getLogger(__name__ + '.html_sanitize')

    # html encode email tags: <foo@bar> would otherwise be parsed as a tag
    part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
    src = part.sub(lambda m: cgi.escape(m.group(1)), src)

    # html encode mako tags <% ... %> to decode them later and keep them alive, otherwise they are stripped by the cleaner
    src = src.replace('<%', cgi.escape('<%'))
    src = src.replace('%>', cgi.escape('%>'))

    kwargs = {
        'page_structure': True,
        'style': False,             # do not remove style attributes
        'forms': True,              # remove form tags
        'remove_unknown_tags': False,
        'allow_tags': allowed_tags,
        'comments': False,
        'processing_instructions': False
    }
    if etree.LXML_VERSION >= (2, 3, 1):
        # kill_tags attribute has been added in version 2.3.1
        kwargs.update({
            'kill_tags': tags_to_kill,
            'remove_tags': tags_to_remove,
        })
    else:
        kwargs['remove_tags'] = tags_to_kill + tags_to_remove

    if strict:
        if etree.LXML_VERSION >= (3, 1, 0):
            # lxml < 3.1.0 does not allow to specify safe_attrs. We keep all attributes in order to keep "style"
            kwargs.update({
                'safe_attrs_only': True,
                'safe_attrs': safe_attrs,
            })
    else:
        kwargs['safe_attrs_only'] = False  # keep oe-data attributes + style
        # BUG FIX: this was ``False,`` — a trailing comma building the truthy
        # 1-tuple (False,), which made the Cleaner strip frames despite the
        # intent stated below. Plain False actually keeps them.
        kwargs['frames'] = False  # do not remove frames (embbed video in CMS blogs)

    try:
        # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
        cleaner = clean.Cleaner(**kwargs)
        cleaned = cleaner.clean_html(src)

        # MAKO compatibility: $, { and } inside quotes are escaped, preventing correct mako execution
        cleaned = cleaned.replace('%24', '$')
        cleaned = cleaned.replace('%7B', '{')
        cleaned = cleaned.replace('%7D', '}')
        cleaned = cleaned.replace('%20', ' ')
        cleaned = cleaned.replace('%5B', '[')
        cleaned = cleaned.replace('%5D', ']')
        # decode the mako tags protected before cleaning
        cleaned = cleaned.replace('&lt;%', '<%')
        cleaned = cleaned.replace('%&gt;', '%>')
    except etree.ParserError as e:
        if 'empty' in str(e):
            return ""
        if not silent:
            raise
        logger.warning('ParserError obtained when sanitizing %r', src, exc_info=True)
        cleaned = '<p>ParserError when sanitizing</p>'
    except Exception:
        if not silent:
            raise
        logger.warning('unknown error obtained when sanitizing %r', src, exc_info=True)
        cleaned = '<p>Unknown error when sanitizing</p>'

    # this is ugly, but lxml/etree tostring want to put everything in a 'div' that breaks the editor -> remove that
    if cleaned.startswith('<div>') and cleaned.endswith('</div>'):
        cleaned = cleaned[5:-6]

    return cleaned
2012-11-06 11:57:44 +00:00
2012-11-06 12:17:37 +00:00
#----------------------------------------------------------
# HTML Cleaner
#----------------------------------------------------------
2013-10-01 14:21:51 +00:00
def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_options=None,
                     protect_sections=False):
    """html_email_clean: clean the html by doing the following steps:

     - try to strip email quotes, by removing blockquotes or having some
       client-specific heuristics
     - try to strip signatures
     - shorten the html to a maximum number of characters if requested

    Some specific use cases:

     - MsOffice: ``div.style = border-top:solid;`` delimitates the beginning of
       a quote; detecting by finding WordSection1 of MsoNormal
     - Hotmail: ``hr.stopSpelling`` delimitates the beginning of a quote; detect
       Hotmail by funding ``SkyDrivePlaceholder``

    :param string html: sanitized html; tags like html or head should not
                        be present in the html string. This method therefore
                        takes as input html code coming from a sanitized source,
                        like fields.html.
    :param boolean remove: remove the html code that is unwanted; otherwise it
                           is only flagged and tagged (class ``oe_mail_cleaned``)
    :param boolean shorten: shorten the html; every excessing content will
                            be flagged as to remove
    :param int max_length: if shortening, maximum number of characters before
                           shortening
    :param dict expand_options: options for the read more link when shortening
        the content. The used keys are the following:

         - oe_expand_container_tag: tag of the container of the whole read more link
         - oe_expand_container_class: class applied to the link container
           (default: oe_mail_expand)
         - oe_expand_container_content: content of the container (default: ...)
         - oe_expand_separator_node: optional separator, like adding
           ... <br /><br /> <a ...>read more</a> (default: void)
         - oe_expand_a_href: href of the read more link itself (default: #)
         - oe_expand_a_class: class applied to the <a> containing the link
           itself (default: oe_mail_expand)
         - oe_expand_a_content: content of the <a> (default: read more)

        The formatted read more link is the following::

            <cont_tag class="oe_expand_container_class">
                oe_expand_container_content
                if expand_options.get('oe_expand_separator_node'):
                    <oe_expand_separator_node/>
                <a href="oe_expand_a_href" class="oe_expand_a_class">
                    oe_expand_a_content
                </a>
            </span>
    :param bool protect_sections: when shortening, truncate at <section>
        boundaries instead of mid-section
    """
    def _replace_matching_regex(regex, source, replace=''):
        """ Replace all matching expressions in source by replace """
        if not source:
            return source
        dest = ''
        idx = 0
        for item in re.finditer(regex, source):
            dest += source[idx:item.start()] + replace
            idx = item.end()
        dest += source[idx:]
        return dest

    def _create_node(tag, text, tail=None, attrs={}):
        # NOTE(review): mutable default 'attrs' is read-only here, so the
        # shared-default pitfall is harmless in practice.
        new_node = etree.Element(tag)
        new_node.text = text
        new_node.tail = tail
        for key, val in attrs.iteritems():
            new_node.set(key, val)
        return new_node

    def _insert_new_node(node, index, new_node_tag, new_node_text, new_node_tail=None, new_node_attrs={}):
        # Build a node and insert it as the index-th child of 'node'.
        new_node = _create_node(new_node_tag, new_node_text, new_node_tail, new_node_attrs)
        node.insert(index, new_node)
        return new_node

    def _tag_matching_regex_in_text(regex, node, new_node_tag='span', new_node_attrs={}):
        """Split node.text around every regex match, wrapping each matched
        chunk in a new child element carrying new_node_attrs (used to tag
        text-based quotes and signatures)."""
        text = node.text or ''
        if not re.search(regex, text):
            return

        cur_node = node
        node.text = ''
        idx, iteration = 0, 0
        for item in re.finditer(regex, text):
            if iteration == 0:
                # text before the first match stays in the original node
                cur_node.text = text[idx:item.start()]
            else:
                # text between two matches goes into an intermediate span
                _insert_new_node(node, (iteration - 1) * 2 + 1, new_node_tag, text[idx:item.start()])
            new_node = _insert_new_node(node, iteration * 2, new_node_tag, text[item.start():item.end()], None, new_node_attrs)

            cur_node = new_node
            idx = item.end()
            iteration += 1
        # trailing text (plus any tail of the last created node)
        new_node = _insert_new_node(node, -1, new_node_tag, text[idx:] + (cur_node.tail or ''), None, {})

    def _truncate_node(node, position, simplify_whitespaces=True):
        """ Truncate a node text at a given position. This algorithm will shorten
        at the end of the word whose ending character exceeds position.

        :param bool simplify_whitespaces: whether to try to count all successive
                                          whitespaces as one character. This
                                          option should not be True when trying
                                          to keep 'pre' consistency.
        """
        if node.text is None:
            node.text = ''

        truncate_idx = -1
        if simplify_whitespaces:
            # walk words until the cumulated (whitespace-free) length reaches
            # 'position', then cut after that word
            cur_char_nbr = 0
            word = None
            node_words = node.text.strip('\t\r\n').split()
            for word in node_words:
                cur_char_nbr += len(word)
                if cur_char_nbr >= position:
                    break
            if word:
                truncate_idx = node.text.find(word) + len(word)
        else:
            truncate_idx = position
        if truncate_idx == -1 or truncate_idx > len(node.text):
            truncate_idx = len(node.text)

        # compose new text bits
        innertext = node.text[0:truncate_idx]
        outertext = node.text[truncate_idx:]
        node.text = innertext

        # create <span> ... <a href="#">read more</a></span> node
        read_more_node = _create_node(
            expand_options.get('oe_expand_container_tag', 'span'),
            expand_options.get('oe_expand_container_content', ' ... '),
            None,
            {'class': expand_options.get('oe_expand_container_class', 'oe_mail_expand')}
        )
        if expand_options.get('oe_expand_separator_node'):
            read_more_separator_node = _create_node(
                expand_options.get('oe_expand_separator_node'),
                '',
                None,
                {}
            )
            read_more_node.append(read_more_separator_node)
        read_more_link_node = _create_node(
            'a',
            expand_options.get('oe_expand_a_content', 'read more'),
            None,
            {
                'href': expand_options.get('oe_expand_a_href', '#'),
                'class': expand_options.get('oe_expand_a_class', 'oe_mail_expand'),
            }
        )
        read_more_node.append(read_more_link_node)
        # create outertext node
        overtext_node = _create_node('span', outertext)
        # tag node
        overtext_node.set('in_overlength', '1')
        # add newly created nodes in dom
        node.append(read_more_node)
        node.append(overtext_node)

    if expand_options is None:
        expand_options = {}

    if not html or not isinstance(html, basestring):
        return html
    html = ustr(html)

    # Pre processing
    # ------------------------------------------------------------
    # TDE TODO: --- MAIL ORIGINAL ---: '[\-]{4,}([^\-]*)[\-]{4,}'

    # html: remove encoding attribute inside tags
    doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
    html = doctype.sub(r"", html)

    # html: ClEditor seems to love using <div><br /><div> -> replace with <br />
    br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)', re.IGNORECASE)
    html = _replace_matching_regex(br_div_tags, html, '<br />')

    # form a tree
    root = lxml.html.fromstring(html)
    if not len(root) and root.text is None and root.tail is None:
        # bare text: wrap it so we get a real element tree
        html = '<div>%s</div>' % html
        root = lxml.html.fromstring(html)

    quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
    signature = re.compile(r'([-]{2,}[\s]?[\r\n]{1,2}[\s\S]+)')
    for node in root.iter():
        # remove all tails and replace them by a span element, because managing text and tails can be a pain in the ass
        if node.tail:
            tail_node = _create_node('span', node.tail)
            node.tail = None
            node.addnext(tail_node)

        # form node and tag text-based quotes and signature
        _tag_matching_regex_in_text(quote_tags, node, 'span', {'text_quote': '1'})
        _tag_matching_regex_in_text(signature, node, 'span', {'text_signature': '1'})

    # Processing
    # ------------------------------------------------------------

    # tree: tag nodes
    # signature_begin = False  # try dynamic signature recognition
    quote_begin = False
    overlength = False
    overlength_section_id = None
    overlength_section_count = 0
    cur_char_nbr = 0
    for node in root.iter():
        # comments do not need processing
        # note: bug in node.get(value, default) for HtmlComments, default never returned
        if node.tag == etree.Comment:
            continue
        # do not take into account multiple spaces that are displayed as max 1 space in html
        node_text = ' '.join((node.text and node.text.strip('\t\r\n') or '').split())

        # root: try to tag the client used to write the html
        if 'WordSection1' in node.get('class', '') or 'MsoNormal' in node.get('class', ''):
            root.set('msoffice', '1')
        if 'SkyDrivePlaceholder' in node.get('class', '') or 'SkyDrivePlaceholder' in node.get('id', ''):
            root.set('hotmail', '1')

        # protect sections by tagging section limits and blocks contained inside sections, using an increasing id to re-find them later
        if node.tag == 'section':
            overlength_section_count += 1
            node.set('section_closure', str(overlength_section_count))
        if node.getparent() is not None and (node.getparent().get('section_closure') or node.getparent().get('section_inner')):
            node.set('section_inner', str(overlength_section_count))

        # state of the parsing: flag quotes and tails to remove
        if quote_begin:
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        # state of the parsing: flag when being in over-length content, depending on section content if defined (only when having protect_sections)
        if overlength:
            if not overlength_section_id or int(node.get('section_inner', overlength_section_count + 1)) > overlength_section_count:
                node.set('in_overlength', '1')
                node.set('tail_remove', '1')

        # find quote in msoffice / hotmail / blockquote / text quote and signatures
        if root.get('msoffice') and node.tag == 'div' and 'border-top:solid' in node.get('style', ''):
            quote_begin = True
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        if root.get('hotmail') and node.tag == 'hr' and ('stopSpelling' in node.get('class', '') or 'stopSpelling' in node.get('id', '')):
            quote_begin = True
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        if node.tag == 'blockquote' or node.get('text_quote') or node.get('text_signature'):
            # here no quote_begin because we want to be able to remove some quoted
            # text without removing all the remaining context
            node.set('in_quote', '1')
        if node.getparent() is not None and node.getparent().get('in_quote'):
            # inside a block of removed text but not in quote_begin (see above)
            node.set('in_quote', '1')

        # shorten:
        # if protect section:
        #   1/ find the first parent not being inside a section
        #   2/ add the read more link
        # else:
        #   1/ truncate the text at the next available space
        #   2/ create a 'read more' node, next to current node
        #   3/ add the truncated text in a new node, next to 'read more' node
        node_text = (node.text or '').strip().strip('\n').strip()
        if shorten and not overlength and cur_char_nbr + len(node_text) > max_length:
            node_to_truncate = node
            while node_to_truncate.getparent() is not None:
                if node_to_truncate.get('in_quote'):
                    node_to_truncate = node_to_truncate.getparent()
                elif protect_sections and (node_to_truncate.getparent().get('section_inner') or node_to_truncate.getparent().get('section_closure')):
                    node_to_truncate = node_to_truncate.getparent()
                    overlength_section_id = node_to_truncate.get('section_closure')
                else:
                    break

            overlength = True
            node_to_truncate.set('truncate', '1')
            if node_to_truncate == node:
                node_to_truncate.set('truncate_position', str(max_length - cur_char_nbr))
            else:
                node_to_truncate.set('truncate_position', str(len(node.text or '')))
        cur_char_nbr += len(node_text)

    # Tree modification
    # ------------------------------------------------------------

    for node in root.iter():
        if node.get('truncate'):
            # keep exact whitespace inside <pre> blocks
            _truncate_node(node, int(node.get('truncate_position', '0')), node.tag != 'pre')

    # Post processing
    # ------------------------------------------------------------

    to_remove = []
    for node in root.iter():
        if node.get('in_quote') or node.get('in_overlength'):
            # copy the node tail into parent text
            if node.tail and not node.get('tail_remove'):
                parent = node.getparent()
                parent.tail = node.tail + (parent.tail or '')
            to_remove.append(node)
        if node.get('tail_remove'):
            node.tail = ''
        # clean node: strip all the temporary marker attributes set above
        for attribute_name in ['in_quote', 'tail_remove', 'in_overlength', 'msoffice', 'hotmail', 'truncate', 'truncate_position']:
            node.attrib.pop(attribute_name, None)

    for node in to_remove:
        if remove:
            node.getparent().remove(node)
        else:
            if not expand_options.get('oe_expand_a_class', 'oe_mail_expand') in node.get('class', ''):  # trick: read more link should be displayed even if it's in overlength
                node_class = node.get('class', '') + ' oe_mail_cleaned'
                node.set('class', node_class)

    # html: \n that were tail of elements have been encapsulated into <span> -> back to \n
    html = etree.tostring(root, pretty_print=False)
    linebreaks = re.compile(r'<span[^>]*>([\s]*[\r\n]+[\s]*)<\/span>', re.IGNORECASE | re.DOTALL)
    html = _replace_matching_regex(linebreaks, html, '\n')

    return html
2012-11-06 12:17:37 +00:00
2012-11-06 11:57:44 +00:00
#----------------------------------------------------------
2012-11-07 15:41:29 +00:00
# HTML/Text management
2012-11-06 11:57:44 +00:00
#----------------------------------------------------------
def html2plaintext(html, body_id=None, encoding='utf-8'):
    """ From an HTML text, convert the HTML to plain text.
    If @param body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.
    """
    ## (c) Fry-IT, www.fry-it.com, 2007
    ## <peter@fry-it.com>
    ## download here: http://www.peterbe.com/plog/html2plaintext
    html = ustr(html)
    tree = etree.fromstring(html, parser=etree.HTMLParser())

    if body_id is not None:
        source = tree.xpath('//*[@id=%s]' % (body_id,))
    else:
        source = tree.xpath('//body')
    if len(source):
        tree = source[0]

    # collect link hrefs and replace <a> by numbered footnote markers
    url_index = []
    i = 0
    for link in tree.findall('.//a'):
        url = link.get('href')
        if url:
            i += 1
            link.tag = 'span'
            link.text = '%s [%s]' % (link.text, i)
            url_index.append(url)

    html = ustr(etree.tostring(tree, encoding=encoding))
    # \r char is converted into &#13;, must remove it
    html = html.replace('&#13;', '')

    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')
    html = html.replace('<tr>', '\n')
    html = html.replace('</p>', '\n')
    html = re.sub('<br\s*/?>', '\n', html)
    # drop every remaining tag, leaving a space so words do not glue together
    html = re.sub('<.*?>', ' ', html)
    html = html.replace(' ' * 2, ' ')
    # unescape entities; '&amp;' must be done LAST to avoid double-unescaping
    html = html.replace('&gt;', '>')
    html = html.replace('&lt;', '<')
    html = html.replace('&amp;', '&')

    # strip all lines
    html = '\n'.join([x.strip() for x in html.splitlines()])
    html = html.replace('\n' * 2, '\n')

    # append the collected link URLs as footnotes
    for i, url in enumerate(url_index):
        if i == 0:
            html += '\n\n'
        html += ustr('[%s] %s\n') % (i + 1, url)

    return html
2012-11-09 12:35:21 +00:00
def plaintext2html(text, container_tag=False):
    """ Convert plaintext into html. Content of the text is escaped to manage
    html entities, using cgi.escape().

     - all \n, \r are replaced by <br />
     - enclose content into <p>
     - 2 or more consecutive <br /> are considered as paragraph breaks

    :param string container_tag: container of the html; by default the
                                 content is embedded into a <div>
    """
    escaped = cgi.escape(ustr(text))
    # 1. turn every linebreak into an explicit <br/>
    escaped = escaped.replace('\n', '<br/>').replace('\r', '<br/>')
    # 2-3: runs of 2+ <br/> mark paragraph boundaries
    br_tags = re.compile(r'(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})')
    pieces = ['<p>']
    cursor = 0
    for match in br_tags.finditer(escaped):
        pieces.append(escaped[cursor:match.start()])
        pieces.append('</p><p>')
        cursor = match.end()
    pieces.append(escaped[cursor:])
    pieces.append('</p>')
    result = ''.join(pieces)
    # 4. optional container
    if container_tag:
        result = '<%s>%s</%s>' % (container_tag, result, container_tag)
    return ustr(result)
def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=False):
    """ Append extra content at the end of an HTML snippet, trying
    to locate the end of the HTML document (</body>, </html>, or
    EOF), and converting the provided content in html unless ``plaintext``
    is False.

    Content conversion can be done in two ways:

     - wrapping it into a pre (preserve=True)
     - use plaintext2html (preserve=False, using container_tag to wrap the
       whole content)

    A side-effect of this method is to coerce all HTML tags to
    lowercase in ``html``, and strip enclosing <html> or <body> tags in
    content if ``plaintext`` is False.

    :param str html: html tagsoup (doesn't have to be XHTML)
    :param str content: extra content to append
    :param bool plaintext: whether content is plaintext and should
                           be wrapped in a <pre/> tag.
    :param bool preserve: if content is plaintext, wrap it into a <pre>
                          instead of converting it into html
    """
    html = ustr(html)
    # normalize the content to append
    if plaintext:
        if preserve:
            content = u'\n<pre>%s</pre>\n' % ustr(content)
        else:
            content = '\n%s\n' % plaintext2html(content, container_tag)
    else:
        # already html: strip enclosing document-level tags
        content = re.sub(r'(?i)(</?(?:html|body|head|!\s*DOCTYPE)[^>]*>)', '', content)
        content = u'\n%s\n' % ustr(content)
    # Force all tags to lowercase
    html = re.sub(r'(</?)\W*(\w+)([ >])',
                  lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
    # insert before </body> if present, else before </html>, else at EOF
    insert_location = html.find('</body>')
    if insert_location == -1:
        insert_location = html.find('</html>')
    if insert_location == -1:
        return '%s%s' % (html, content)
    return '%s%s%s' % (html[:insert_location], content, html[insert_location:])
2012-11-07 15:41:29 +00:00
#----------------------------------------------------------
# Emails
#----------------------------------------------------------
2012-12-20 16:05:42 +00:00
# matches any email in a body of text
# NOTE(review): TLD is limited to 2-6 characters here, so longer modern TLDs
# (e.g. '.technology') would not match — confirm before widening.
email_re = re.compile(r"""([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6})""", re.VERBOSE)

# matches a string containing only one email
single_email_re = re.compile(r"""^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$""", re.VERBOSE)

# matches a '[<digits>]' record-id marker, e.g. in subjects
res_re = re.compile(r"\[([0-9]+)\]", re.UNICODE)
# matches mail command lines of the form 'Set-<field>: <value>'
command_re = re.compile("^Set-([a-z]+) *: *(.+)$", re.I + re.UNICODE)

# Updated in 7.0 to match the model name as well
# Typical form of references is <timestamp-openerp-record_id-model_name@domain>
# group(1) = the record ID ; group(2) = the model (if any) ; group(3) = the domain
reference_re = re.compile("<.*-open(?:object|erp)-(\\d+)(?:-([\w.]+))?[^>]*@([^>]*)>", re.UNICODE)
2012-11-07 15:41:29 +00:00
2013-08-06 15:10:18 +00:00
2012-11-06 11:57:44 +00:00
def generate_tracking_message_id(res_id):
    """Returns a string that can be used in the Message-ID RFC822 header field

       Used to track the replies related to a given object thanks to the "In-Reply-To"
       or "References" fields that Mail User Agents will set.
    """
    # prefer the OS entropy source; fall back when the platform lacks one
    try:
        rand_value = random.SystemRandom().random()
    except NotImplementedError:
        rand_value = random.random()
    # keep only the 15 decimals, dropping the leading "0."
    random_part = ("%.15f" % rand_value)[2:]
    return "<%.15f.%s-openerp-%s@%s>" % (
        time.time(), random_part, res_id, socket.gethostname())
def email_send(email_from, email_to, subject, body, email_cc=None, email_bcc=None, reply_to=False,
               attachments=None, message_id=None, references=None, openobject_id=False, debug=False, subtype='plain', headers=None,
               smtp_server=None, smtp_port=None, ssl=False, smtp_user=None, smtp_password=None, cr=None, uid=None):
    """Low-level function for sending an email (deprecated).

    :deprecate: since OpenERP 6.1, please use ir.mail_server.send_email() instead.
    :param email_from: A string used to fill the `From` header, if falsy,
                       config['email_from'] is used instead.  Also used for
                       the `Reply-To` header if `reply_to` is not provided
    :param email_to: a sequence of addresses to send the mail to.
    :return: the value returned by ir.mail_server.send_email(), or False on failure
    """
    # If not cr, get cr from current thread database
    local_cr = None
    if not cr:
        db_name = getattr(threading.currentThread(), 'dbname', None)
        if db_name:
            # remember we opened this cursor so we can close it in 'finally'
            local_cr = cr = openerp.registry(db_name).cursor()
        else:
            raise Exception("No database cursor found, please pass one explicitly")

    # Send Email
    try:
        mail_server_pool = openerp.registry(cr.dbname)['ir.mail_server']
        res = False
        # Pack Message into MIME Object
        email_msg = mail_server_pool.build_email(email_from, email_to, subject, body, email_cc, email_bcc, reply_to,
                                                 attachments, message_id, references, openobject_id, subtype, headers=headers)

        res = mail_server_pool.send_email(cr, uid or 1, email_msg, mail_server_id=None,
                                          smtp_server=smtp_server, smtp_port=smtp_port, smtp_user=smtp_user, smtp_password=smtp_password,
                                          smtp_encryption=('ssl' if ssl else None), smtp_debug=debug)
    except Exception:
        # best-effort API: log and report failure instead of raising
        _logger.exception("tools.email_send failed to deliver email")
        return False
    finally:
        # only close the cursor if it was opened by this function
        if local_cr:
            cr.close()
    return res
def email_split(text):
    """ Return a list of the email addresses found in ``text`` """
    if not text:
        return []
    addresses = []
    for _name, email in getaddresses([text]):
        # getaddresses() returns '' when email parsing fails, and
        # sometimes returns emails without at least '@'. The '@'
        # is strictly required in RFC2822's `addr-spec`.
        if email and '@' in email:
            addresses.append(email)
    return addresses