From 3f8c2d0badcea3dd65c7fa9bdfabf0bc20da0355 Mon Sep 17 00:00:00 2001 From: Xavier Morel Date: Fri, 10 Jan 2014 16:20:24 +0100 Subject: [PATCH] [FIX] conversion from html to values for text field see html_to_text comment bzr revid: xmo@openerp.com-20140110152024-yp2out0jg1gx1uhs --- addons/website/models/ir_qweb.py | 104 ++++++++++++++++++++++- addons/website/tests/test_converter.py | 112 ++++++++++++++++++++++++- 2 files changed, 210 insertions(+), 6 deletions(-) diff --git a/addons/website/models/ir_qweb.py b/addons/website/models/ir_qweb.py index aa7200c79b3..a5f4013a7df 100644 --- a/addons/website/models/ir_qweb.py +++ b/addons/website/models/ir_qweb.py @@ -183,7 +183,7 @@ class Text(orm.AbstractModel): _inherit = ['website.qweb.field', 'ir.qweb.field.text'] def from_html(self, cr, uid, model, column, element, context=None): - return element.text_content() + return html_to_text(element) class Selection(orm.AbstractModel): _name = 'website.qweb.field.selection' @@ -211,7 +211,8 @@ class ManyToOne(orm.AbstractModel): M2O = self.pool[column._obj] field = element.get('data-oe-field') id = int(element.get('data-oe-id')) - value = element.text_content().strip() + # FIXME: weird things are going to happen for char-type _rec_name + value = html_to_text(element) # if anything blows up, just ignore it and bail try: @@ -419,3 +420,102 @@ class Contact(orm.AbstractModel): }, engine='website.qweb', context=context) return ir_qweb.HTMLSafe(html) + +def html_to_text(element): + """ Converts HTML content with HTML-specified line breaks (br, p, div, ...) + in roughly equivalent textual content. + + Used to replace and fixup the roundtripping of text and m2o: when using + libxml 2.8.0 (but not 2.9.1) and parsing HTML with lxml.html.fromstring + whitespace text nodes (text nodes composed *solely* of whitespace) are + stripped out with no recourse, and fundamentally relying on newlines + being in the text (e.g. inserted during user edition) is probably poor form + anyway. 
+ + -> this utility function collapses whitespace sequences and replaces + nodes by roughly corresponding linebreaks + * p are pre-and post-fixed by 2 newlines + * br are replaced by a single newline + * block-level elements not already mentioned are pre- and post-fixed by + a single newline + + ought be somewhat similar (but much less high-tech) to aaronsw's html2text. + the latter produces full-blown markdown, our text -> html converter only + replaces newlines by <br>
elements at this point so we're reverting that, + and a few more newline-ish elements in case the user tried to add + newlines/paragraphs into the text field + + :param element: lxml.html content + :returns: corresponding pure-text output + """ + + # output is a list of str | int. Integers are padding requests (in minimum + # number of newlines). When multiple padding requests, fold them into the + # biggest one + output = [] + _wrap(element, output) + + # remove any leading or tailing whitespace, replace sequences of + # (whitespace)\n(whitespace) by a single newline, where (whitespace) is a + # non-newline whitespace in this case + return re.sub( + r'[ \t\r\f]*\n[ \t\r\f]*', + '\n', + ''.join(_realize_padding(output)).strip()) + +_PADDED_BLOCK = set('p h1 h2 h3 h4 h5 h6'.split()) +# https://developer.mozilla.org/en-US/docs/HTML/Block-level_elements minus p +_MISC_BLOCK = set(( + 'address article aside audio blockquote canvas dd dl div figcaption figure' + ' footer form header hgroup hr ol output pre section tfoot ul video' +).split()) + +def _collapse_whitespace(text): + """ Collapses sequences of whitespace characters in ``text`` to a single + space + """ + return re.sub('\s+', ' ', text) +def _realize_padding(it): + """ Fold and convert padding requests: integers in the output sequence are + requests for at least n newlines of padding. Runs thereof can be collapsed + into the largest requests and converted to newlines. + """ + padding = None + for item in it: + if isinstance(item, int): + padding = max(padding, item) + continue + + if padding: + yield '\n' * padding + padding = None + + yield item + # leftover padding irrelevant as the output will be stripped + +def _wrap(element, output, wrapper=u''): + """ Recursively extracts text from ``element`` (via _element_to_text), and + wraps it all in ``wrapper``. 
Extracted text is added to ``output`` + + :type wrapper: basestring | int + """ + output.append(wrapper) + if element.text: + output.append(_collapse_whitespace(element.text)) + for child in element: + _element_to_text(child, output) + output.append(wrapper) + +def _element_to_text(e, output): + if e.tag == 'br': + output.append(u'\n') + elif e.tag in _PADDED_BLOCK: + _wrap(e, output, 2) + elif e.tag in _MISC_BLOCK: + _wrap(e, output, 1) + else: + # inline + _wrap(e, output) + + if e.tail: + output.append(_collapse_whitespace(e.tail)) diff --git a/addons/website/tests/test_converter.py b/addons/website/tests/test_converter.py index 4b24e8f0b9f..d31df8e6956 100644 --- a/addons/website/tests/test_converter.py +++ b/addons/website/tests/test_converter.py @@ -1,15 +1,120 @@ # -*- coding: utf-8 -*- -from functools import partial +import textwrap +import unittest2 from xml.dom.minidom import getDOMImplementation from lxml import html +from lxml.builder import E from openerp.tests import common from openerp.addons.base.ir import ir_qweb +from openerp.addons.website.models.ir_qweb import html_to_text impl = getDOMImplementation() document = impl.createDocument(None, None, None) +class TestHTMLToText(unittest2.TestCase): + def test_rawstring(self): + self.assertEqual( + "foobar", + html_to_text(E.div("foobar"))) + + def test_br(self): + self.assertEqual( + "foo\nbar", + html_to_text(E.div("foo", E.br(), "bar"))) + + self.assertEqual( + "foo\n\nbar\nbaz", + html_to_text(E.div( + "foo", E.br(), E.br(), + "bar", E.br(), + "baz"))) + + def test_p(self): + self.assertEqual( + "foo\n\nbar\n\nbaz", + html_to_text(E.div( + "foo", + E.p("bar"), + "baz"))) + + self.assertEqual( + "foo", + html_to_text(E.div(E.p("foo")))) + + self.assertEqual( + "foo\n\nbar", + html_to_text(E.div("foo", E.p("bar")))) + self.assertEqual( + "foo\n\nbar", + html_to_text(E.div(E.p("foo"), "bar"))) + + self.assertEqual( + "foo\n\nbar\n\nbaz", + html_to_text(E.div( + E.p("foo"), + E.p("bar"), + E.p("baz"), 
+ ))) + + def test_div(self): + self.assertEqual( + "foo\nbar\nbaz", + html_to_text(E.div( + "foo", + E.div("bar"), + "baz" + ))) + + self.assertEqual( + "foo", + html_to_text(E.div(E.div("foo")))) + + self.assertEqual( + "foo\nbar", + html_to_text(E.div("foo", E.div("bar")))) + self.assertEqual( + "foo\nbar", + html_to_text(E.div(E.div("foo"), "bar"))) + + self.assertEqual( + "foo\nbar\nbaz", + html_to_text(E.div( + "foo", + E.div("bar"), + E.div("baz") + ))) + + def test_other_block(self): + self.assertEqual( + "foo\nbar\nbaz", + html_to_text(E.div( + "foo", + E.section("bar"), + "baz" + ))) + + def test_inline(self): + self.assertEqual( + "foobarbaz", + html_to_text(E.div("foo", E.span("bar"), "baz"))) + + def test_whitespace(self): + self.assertEqual( + "foo bar\nbaz", + html_to_text(E.div( + "foo\nbar", + E.br(), + "baz") + )) + + self.assertEqual( + "foo bar\nbaz", + html_to_text(E.div( + E.div(E.span("foo"), " bar"), + "baz"))) + class TestConvertBack(common.TransactionCase): def setUp(self): super(TestConvertBack, self).setUp() @@ -70,7 +175,7 @@ class TestConvertBack(common.TransactionCase): self.field_roundtrip('selection_str', 'B') def test_text(self): - self.field_roundtrip('text', """ + self.field_roundtrip('text', textwrap.dedent("""\ You must obey the dance commander Givin' out the order for fun You must obey the dance commander @@ -88,8 +193,7 @@ class TestConvertBack(common.TransactionCase): Let's start the show Because you never know You never know - You never know until you go - """) + You never know until you go""")) def test_m2o(self): """ the M2O field conversion (from html) is markedly different from