[FIX] conversion from html to values for text field

See the html_to_text docstring for details. bzr revid: xmo@openerp.com-20140110152024-yp2out0jg1gx1uhs
2014-01-10 16:20:24 +01:00 · 2014-01-10 16:20:24 +01:00 · 3f8c2d0bad
parent 24e1e2cb6c
commit 3f8c2d0bad
2 changed files with 210 additions and 6 deletions
--- a/addons/website/models/ir_qweb.py
+++ b/addons/website/models/ir_qweb.py
@ -183,7 +183,7 @@ class Text(orm.AbstractModel):
    _inherit = ['website.qweb.field', 'ir.qweb.field.text']

    def from_html(self, cr, uid, model, column, element, context=None):
-        return element.text_content()
+        return html_to_text(element)

 class Selection(orm.AbstractModel):
    _name = 'website.qweb.field.selection'
@ -211,7 +211,8 @@ class ManyToOne(orm.AbstractModel):
        M2O = self.pool[column._obj]
        field = element.get('data-oe-field')
        id = int(element.get('data-oe-id'))
-        value = element.text_content().strip()
+        # FIXME: weird things are going to happen for char-type _rec_name
+        value = html_to_text(element)

        # if anything blows up, just ignore it and bail
        try:
@ -419,3 +420,102 @@ class Contact(orm.AbstractModel):
        }, engine='website.qweb', context=context)

        return ir_qweb.HTMLSafe(html)
+
+def html_to_text(element):
+    """ Renders HTML content as plain text, turning the line breaks expressed
+    through HTML (br, p, div, ...) into roughly equivalent newlines.
+
+    Exists to fix the round-tripping of text and m2o fields: when using
+    libxml 2.8.0 (but not 2.9.1) and parsing HTML with lxml.html.fromstring,
+    whitespace text nodes (text nodes composed *solely* of whitespace) are
+    stripped out with no recourse. Relying on newlines being present in the
+    text (e.g. inserted while the user was editing) is probably poor form
+    anyway.
+
+    -> whitespace sequences are collapsed and nodes are replaced by roughly
+       corresponding line breaks
+       * p and h1-h6 are preceded and followed by 2 newlines
+       * br are replaced by a single newline
+       * the other known block-level elements are preceded and followed by a
+         single newline
+
+    This ought to be somewhat similar to (but much less high-tech than)
+    aaronsw's html2text. The latter produces full-blown markdown, whereas our
+    text -> html converter only replaces newlines by <br> elements at this
+    point, so this reverts that conversion plus a few more newline-ish
+    elements in case the user tried to add newlines/paragraphs to the text
+    field.
+
+    :param element: lxml.html content
+    :returns: corresponding pure-text output
+    """
+
+    # chunks holds text (str) and padding requests (int, a minimum number of
+    # newlines). Consecutive padding requests get folded into the biggest one
+    # when the padding is realized
+    chunks = []
+    _wrap(element, chunks)
+    text = ''.join(_realize_padding(chunks))
+
+    # drop leading and trailing whitespace, then reduce each
+    # (whitespace)\n(whitespace) sequence to a single newline, (whitespace)
+    # being non-newline whitespace here
+    return re.sub(r'[ \t\r\f]*\n[ \t\r\f]*', '\n', text.strip())
+
+
+# tags whose content is padded with 2 newlines on each side
+_PADDED_BLOCK = set(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
+# remaining block-level tags, padded with a single newline on each side. From
+# https://developer.mozilla.org/en-US/docs/HTML/Block-level_elements minus p
+# and the headings, which live in _PADDED_BLOCK
+# NOTE(review): the MDN list appears to also include fieldset, noscript and
+# table, which are absent here -- confirm whether that is intentional
+_MISC_BLOCK = set([
+    'address', 'article', 'aside', 'audio', 'blockquote', 'canvas', 'dd',
+    'dl', 'div', 'figcaption', 'figure', 'footer', 'form', 'header', 'hgroup',
+    'hr', 'ol', 'output', 'pre', 'section', 'tfoot', 'ul', 'video',
+])
+
+def _collapse_whitespace(text):
+    """ Collapses sequences of whitespace characters in ``text`` to a single
+    space
+
+    :param text: string to normalize
+    :returns: ``text`` with each whitespace sequence replaced by one space
+    """
+    # raw string: ``\s`` is a regex escape, not a string escape
+    return re.sub(r'\s+', ' ', text)
+
+
+def _realize_padding(it):
+    """ Fold and convert padding requests: integers in the output sequence are
+    requests for at least n newlines of padding. Runs thereof are collapsed
+    into the largest request and converted to newlines.
+
+    Empty strings carry no text, they are dropped so that they can not split
+    a run of padding requests in two.
+
+    :param it: iterable of strings (text) and integers (padding requests)
+    :returns: generator of strings, each run of padding requests which
+              precedes a non-empty text being replaced by a single string of
+              newlines
+    """
+    # 0 means "no pending padding"; an integer start value avoids comparing
+    # None with an int, which is only tolerated by Python 2
+    padding = 0
+    for item in it:
+        if isinstance(item, int):
+            padding = max(padding, item)
+            continue
+
+        if not item:
+            # e.g. the empty wrapper _wrap puts around inline elements
+            continue
+
+        if padding:
+            yield '\n' * padding
+            padding = 0
+
+        yield item
+    # leftover padding irrelevant as the output will be stripped
+
+def _wrap(element, output, wrapper=u''):
+    """ Adds the textual content of ``element`` to ``output``, enclosed in
+    ``wrapper``: the element's own leading text comes first, then every child
+    node is converted in document order through _element_to_text.
+
+    :param element: node whose content should be extracted
+    :param list output: accumulator the extracted items are appended to
+    :param wrapper: item appended both before and after the content, either
+                    a string or a padding request (number of newlines)
+    :type wrapper: basestring | int
+    """
+    output.append(wrapper)
+
+    leading_text = element.text
+    if leading_text:
+        output.append(_collapse_whitespace(leading_text))
+
+    for node in element:
+        _element_to_text(node, output)
+
+    output.append(wrapper)
+
+
+def _element_to_text(e, output):
+    """ Appends the textual rendition of the node ``e`` to ``output``,
+    followed by the node's tail text if it has any.
+
+    * comments and processing instructions contribute no content
+    * br becomes a newline
+    * tags of _PADDED_BLOCK are wrapped in a padding request of 2 newlines
+    * tags of _MISC_BLOCK are wrapped in a padding request of 1 newline
+    * anything else is considered inline and added without padding
+
+    :param e: child node to convert
+    :param list output: accumulator the extracted items are appended to
+    """
+    tag = e.tag
+    if callable(tag):
+        # lxml gives comments and processing instructions a factory function
+        # as tag instead of a name, and stores their content in ``text``:
+        # that content is not part of the displayed text and must not leak
+        # into the output. The tail is regular text and is still handled below
+        pass
+    elif tag == 'br':
+        output.append(u'\n')
+    elif tag in _PADDED_BLOCK:
+        _wrap(e, output, 2)
+    elif tag in _MISC_BLOCK:
+        _wrap(e, output, 1)
+    else:
+        # inline
+        _wrap(e, output)
+
+    if e.tail:
+        output.append(_collapse_whitespace(e.tail))
--- a/addons/website/tests/test_converter.py
+++ b/addons/website/tests/test_converter.py
@ -1,15 +1,120 @@
 # -*- coding: utf-8 -*-
-from functools import partial
+import textwrap
+import unittest2
 from xml.dom.minidom import getDOMImplementation

 from lxml import html
+from lxml.builder import E

 from openerp.tests import common
 from openerp.addons.base.ir import ir_qweb
+from openerp.addons.website.models.ir_qweb import html_to_text

 impl = getDOMImplementation()
 document = impl.createDocument(None, None, None)

+class TestHTMLToText(unittest2.TestCase):
+    """ Checks html_to_text against small trees built with lxml's E factory
+    """
+
+    def test_rawstring(self):
+        # text without any markup goes through untouched
+        tree = E.div("foobar")
+        self.assertEqual("foobar", html_to_text(tree))
+
+    def test_br(self):
+        # each br yields exactly one newline
+        single = E.div("foo", E.br(), "bar")
+        self.assertEqual("foo\nbar", html_to_text(single))
+
+        several = E.div("foo", E.br(), E.br(), "bar", E.br(), "baz")
+        self.assertEqual("foo\n\nbar\nbaz", html_to_text(several))
+
+    def test_p(self):
+        # paragraphs are set apart by a blank line, without any leading or
+        # trailing padding in the result
+        cases = [
+            ("foo\n\nbar\n\nbaz", E.div("foo", E.p("bar"), "baz")),
+            ("foo", E.div(E.p("foo"))),
+            ("foo\n\nbar", E.div("foo", E.p("bar"))),
+            ("foo\n\nbar", E.div(E.p("foo"), "bar")),
+            ("foo\n\nbar\n\nbaz", E.div(E.p("foo"), E.p("bar"), E.p("baz"))),
+        ]
+        for expected, tree in cases:
+            self.assertEqual(expected, html_to_text(tree))
+
+    def test_div(self):
+        # nested divs are set apart by a single newline
+        cases = [
+            ("foo\nbar\nbaz", E.div("foo", E.div("bar"), "baz")),
+            ("foo", E.div(E.div("foo"))),
+            ("foo\nbar", E.div("foo", E.div("bar"))),
+            ("foo\nbar", E.div(E.div("foo"), "bar")),
+            ("foo\nbar\nbaz", E.div("foo", E.div("bar"), E.div("baz"))),
+        ]
+        for expected, tree in cases:
+            self.assertEqual(expected, html_to_text(tree))
+
+    def test_other_block(self):
+        # other block-level elements behave like div
+        tree = E.div("foo", E.section("bar"), "baz")
+        self.assertEqual("foo\nbar\nbaz", html_to_text(tree))
+
+    def test_inline(self):
+        # inline elements do not introduce any separator
+        tree = E.div("foo", E.span("bar"), "baz")
+        self.assertEqual("foobarbaz", html_to_text(tree))
+
+    def test_whitespace(self):
+        # a newline within a text node is plain whitespace, only the br
+        # counts as a line break
+        newline_in_text = E.div("foo\nbar", E.br(), "baz")
+        self.assertEqual("foo bar\nbaz", html_to_text(newline_in_text))
+
+        # whitespace at the start of a tail is kept as a word separator
+        spaced_tail = E.div(E.div(E.span("foo"), " bar"), "baz")
+        self.assertEqual("foo bar\nbaz", html_to_text(spaced_tail))
+
+
 class TestConvertBack(common.TransactionCase):
    def setUp(self):
        super(TestConvertBack, self).setUp()
@ -70,7 +175,7 @@ class TestConvertBack(common.TransactionCase):
        self.field_roundtrip('selection_str', 'B')

    def test_text(self):
-        self.field_roundtrip('text', """
+        self.field_roundtrip('text', textwrap.dedent("""\
            You must obey the dance commander
            Givin' out the order for fun
            You must obey the dance commander
@ -88,8 +193,7 @@ class TestConvertBack(common.TransactionCase):
            Let's start the show
            Because you never know
            You never know
-            You never know until you go
-        """)
+            You never know until you go"""))

    def test_m2o(self):
        """ the M2O field conversion (from html) is markedly different from