[FIX] conversion from html to values for text field

see html_to_text comment bzr revid: xmo@openerp.com-20140110152024-yp2out0jg1gx1uhs
2014-01-10 16:20:24 +01:00 · 2014-01-10 16:20:24 +01:00 · 3f8c2d0bad
parent 24e1e2cb6c
commit 3f8c2d0bad
2 changed files with 210 additions and 6 deletions
--- a/addons/website/models/ir_qweb.py
+++ b/addons/website/models/ir_qweb.py
@ -183,7 +183,7 @@ class Text(orm.AbstractModel):
    _inherit = ['website.qweb.field', 'ir.qweb.field.text']
    def from_html(self, cr, uid, model, column, element, context=None):
-        return element.text_content()
+        return html_to_text(element)
 class Selection(orm.AbstractModel):
    _name = 'website.qweb.field.selection'
@ -211,7 +211,8 @@ class ManyToOne(orm.AbstractModel):
        M2O = self.pool[column._obj]
        field = element.get('data-oe-field')
        id = int(element.get('data-oe-id'))
-        value = element.text_content().strip()
+        # FIXME: weird things are going to happen for char-type _rec_name
        value = html_to_text(element)
        # if anything blows up, just ignore it and bail
        try:
@ -419,3 +420,102 @@ class Contact(orm.AbstractModel):
        }, engine='website.qweb', context=context)
        return ir_qweb.HTMLSafe(html)
 def html_to_text(element):
    """ Converts HTML content with HTML-specified line breaks (br, p, div, ...)
    in roughly equivalent textual content.
    Used to replace and fixup the roundtripping of text and m2o: when using
    libxml 2.8.0 (but not 2.9.1) and parsing HTML with lxml.html.fromstring
    whitespace text nodes (text nodes composed *solely* of whitespace) are
    stripped out with no recourse, and fundamentally relying on newlines
    being in the text (e.g. inserted during user edition) is probably poor form
    anyway.
    -> this utility function collapses whitespace sequences and replaces
       nodes by roughly corresponding linebreaks
       * p are pre-and post-fixed by 2 newlines
       * br are replaced by a single newline
       * block-level elements not already mentioned are pre- and post-fixed by
         a single newline
    ought be somewhat similar (but much less high-tech) to aaronsw's html2text.
    the latter produces full-blown markdown, our text -> html converter only
    replaces newlines by <br> elements at this point so we're reverting that,
    and a few more newline-ish elements in case the user tried to add
    newlines/paragraphs into the text field
    :param element: lxml.html content
    :returns: corresponding pure-text output
    """
    # output is a list of str | int. Integers are padding requests (in minimum
    # number of newlines). When multiple padding requests, fold them into the
    # biggest one
    output = []
    _wrap(element, output)
    # remove any leading or tailing whitespace, replace sequences of
    # (whitespace)\n(whitespace) by a single newline, where (whitespace) is a
    # non-newline whitespace in this case
    return re.sub(
        r'[ \t\r\f]*\n[ \t\r\f]*',
        '\n',
        ''.join(_realize_padding(output)).strip())
 _PADDED_BLOCK = set('p h1 h2 h3 h4 h5 h6'.split())
 # https://developer.mozilla.org/en-US/docs/HTML/Block-level_elements minus p
 _MISC_BLOCK = set((
    'address article aside audio blockquote canvas dd dl div figcaption figure'
    ' footer form header hgroup hr ol output pre section tfoot ul video'
 ).split())
 def _collapse_whitespace(text):
    """ Collapses sequences of whitespace characters in ``text`` to a single
    space
    """
    return re.sub('\s+', ' ', text)
 def _realize_padding(it):
    """ Fold and convert padding requests: integers in the output sequence are
    requests for at least n newlines of padding. Runs thereof can be collapsed
    into the largest requests and converted to newlines.
    """
    padding = None
    for item in it:
        if isinstance(item, int):
            padding = max(padding, item)
            continue
        if padding:
            yield '\n' * padding
            padding = None
        yield item
    # leftover padding irrelevant as the output will be stripped
 def _wrap(element, output, wrapper=u''):
    """ Recursively extracts text from ``element`` (via _element_to_text), and
    wraps it all in ``wrapper``. Extracted text is added to ``output``
    :type wrapper: basestring | int
    """
    output.append(wrapper)
    if element.text:
        output.append(_collapse_whitespace(element.text))
    for child in element:
        _element_to_text(child, output)
    output.append(wrapper)
 def _element_to_text(e, output):
    if e.tag == 'br':
        output.append(u'\n')
    elif e.tag in _PADDED_BLOCK:
        _wrap(e, output, 2)
    elif e.tag in _MISC_BLOCK:
        _wrap(e, output, 1)
    else:
        # inline
        _wrap(e, output)
    if e.tail:
        output.append(_collapse_whitespace(e.tail))
--- a/addons/website/tests/test_converter.py
+++ b/addons/website/tests/test_converter.py
@ -1,15 +1,120 @@
 # -*- coding: utf-8 -*-
-from functools import partial
+import textwrap
 import unittest2
 from xml.dom.minidom import getDOMImplementation
 from lxml import html
 from lxml.builder import E
 from openerp.tests import common
 from openerp.addons.base.ir import ir_qweb
 from openerp.addons.website.models.ir_qweb import html_to_text
 impl = getDOMImplementation()
 document = impl.createDocument(None, None, None)
 class TestHTMLToText(unittest2.TestCase):
    def test_rawstring(self):
        self.assertEqual(
            "foobar",
            html_to_text(E.div("foobar")))
    def test_br(self):
        self.assertEqual(
            "foo\nbar",
            html_to_text(E.div("foo", E.br(), "bar")))
        self.assertEqual(
            "foo\n\nbar\nbaz",
            html_to_text(E.div(
                "foo", E.br(), E.br(),
                "bar", E.br(),
                "baz")))
    def test_p(self):
        self.assertEqual(
            "foo\n\nbar\n\nbaz",
            html_to_text(E.div(
                "foo",
                E.p("bar"),
                "baz")))
        self.assertEqual(
            "foo",
            html_to_text(E.div(E.p("foo"))))
        self.assertEqual(
            "foo\n\nbar",
            html_to_text(E.div("foo", E.p("bar"))))
        self.assertEqual(
            "foo\n\nbar",
            html_to_text(E.div(E.p("foo"), "bar")))
        self.assertEqual(
            "foo\n\nbar\n\nbaz",
            html_to_text(E.div(
                E.p("foo"),
                E.p("bar"),
                E.p("baz"),
            )))
    def test_div(self):
        self.assertEqual(
            "foo\nbar\nbaz",
            html_to_text(E.div(
                "foo",
                E.div("bar"),
                "baz"
            )))
        self.assertEqual(
            "foo",
            html_to_text(E.div(E.div("foo"))))
        self.assertEqual(
            "foo\nbar",
            html_to_text(E.div("foo", E.div("bar"))))
        self.assertEqual(
            "foo\nbar",
            html_to_text(E.div(E.div("foo"), "bar")))
        self.assertEqual(
            "foo\nbar\nbaz",
            html_to_text(E.div(
                "foo",
                E.div("bar"),
                E.div("baz")
            )))
    def test_other_block(self):
        self.assertEqual(
            "foo\nbar\nbaz",
            html_to_text(E.div(
                "foo",
                E.section("bar"),
                "baz"
            )))
    def test_inline(self):
        self.assertEqual(
            "foobarbaz",
            html_to_text(E.div("foo", E.span("bar"), "baz")))
    def test_whitespace(self):
        self.assertEqual(
            "foo bar\nbaz",
            html_to_text(E.div(
                "foo\nbar",
                E.br(),
                "baz")
            ))
        self.assertEqual(
            "foo bar\nbaz",
            html_to_text(E.div(
                E.div(E.span("foo"), " bar"),
                "baz")))
 class TestConvertBack(common.TransactionCase):
    def setUp(self):
        super(TestConvertBack, self).setUp()
@ -70,7 +175,7 @@ class TestConvertBack(common.TransactionCase):
        self.field_roundtrip('selection_str', 'B')
    def test_text(self):
-        self.field_roundtrip('text', """
+        self.field_roundtrip('text', textwrap.dedent("""\
            You must obey the dance commander
            Givin' out the order for fun
            You must obey the dance commander
@ -88,8 +193,7 @@ class TestConvertBack(common.TransactionCase):
            Let's start the show
            Because you never know
            You never know
-            You never know until you go
+            You never know until you go"""))
        """)
    def test_m2o(self):
        """ the M2O field conversion (from html) is markedly different from