[IMP] html_sanitize:

- add option to re-raise caught exceptions - only keep safe attributes (style is considered safe) (depending on lxml version) - remove form tags bzr revid: chs@openerp.com-20130821094758-1ae0d1ml5obufzxv
2013-08-21 11:47:58 +02:00 · 2013-08-21 11:47:58 +02:00 · 8179683143
parent 788eb29237
commit 8179683143
1 changed file with 32 additions and 12 deletions
--- a/openerp/tools/mail.py
+++ b/openerp/tools/mail.py
@ -2,7 +2,7 @@
 ##############################################################################
 #
 #    OpenERP, Open Source Business Applications
-#    Copyright (C) 2012 OpenERP S.A. (<http://openerp.com>).
+#    Copyright (C) 2012-2013 OpenERP S.A. (<http://openerp.com>).
 #
 #    This program is free software: you can redistribute it and/or modify
 #    it under the terms of the GNU Affero General Public License as
@ -44,7 +44,7 @@ tags_to_kill = ["script", "head", "meta", "title", "link", "style", "frame", "if
 tags_to_remove = ['html', 'body', 'font']
-def html_sanitize(src):
+def html_sanitize(src, silent=True):
    if not src:
        return src
    src = ustr(src, errors='replace')
@ -52,18 +52,38 @@ def html_sanitize(src):
    # html encode email tags
    part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
    src = part.sub(lambda m: cgi.escape(m.group(1)), src)
-    
+
-    # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
+    kwargs = {
        'page_structure': True,
        'style': False,             # do not remove style attributes
        'forms': True,              # remove form tags
    }
    if etree.LXML_VERSION >= (2, 3, 1):
        # kill_tags attribute has been added in version 2.3.1
        kwargs.update({
            'kill_tags': tags_to_kill,
            'remove_tags': tags_to_remove,
        })
    else:
        kwargs['remove_tags'] = tags_to_kill + tags_to_remove
    if etree.LXML_VERSION >= (3, 1, 0):
        kwargs.update({
            'safe_attrs_only': True,
            'safe_attrs': clean.defs.safe_attrs | set(['style']),
        })
    else:
        # lxml < 3.1.0 does not allow specifying safe_attrs. We keep all attributes in order to keep "style"
        kwargs['safe_attrs_only'] = False
    try:
-        cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False, forms=False, kill_tags=tags_to_kill, remove_tags=tags_to_remove)
+        # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
        cleaner = clean.Cleaner(**kwargs)
        cleaned = cleaner.clean_html(src)
-    except TypeError, e:
+    except Exception:
-        # lxml.clean version < 2.3.1 does not have a kill_tags attribute
+        if not silent:
-        # to remove in 2014
+            raise
-        cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False, forms=False, remove_tags=tags_to_kill+tags_to_remove)
+        _logger.warning('html_sanitize failed to parse %r', src, exc_info=True)
        cleaned = cleaner.clean_html(src)
    except:
        _logger.warning('html_sanitize failed to parse %s' % (src))
        cleaned = '<p>Impossible to parse</p>'
    return cleaned