[FIX] conversion from html to values for text field

see html_to_text comment

bzr revid: xmo@openerp.com-20140110152024-yp2out0jg1gx1uhs
This commit is contained in:
Xavier Morel 2014-01-10 16:20:24 +01:00
parent 24e1e2cb6c
commit 3f8c2d0bad
2 changed files with 210 additions and 6 deletions

View File

@ -183,7 +183,7 @@ class Text(orm.AbstractModel):
_inherit = ['website.qweb.field', 'ir.qweb.field.text']
def from_html(self, cr, uid, model, column, element, context=None):
return element.text_content()
return html_to_text(element)
class Selection(orm.AbstractModel):
_name = 'website.qweb.field.selection'
@ -211,7 +211,8 @@ class ManyToOne(orm.AbstractModel):
M2O = self.pool[column._obj]
field = element.get('data-oe-field')
id = int(element.get('data-oe-id'))
value = element.text_content().strip()
# FIXME: weird things are going to happen for char-type _rec_name
value = html_to_text(element)
# if anything blows up, just ignore it and bail
try:
@ -419,3 +420,102 @@ class Contact(orm.AbstractModel):
}, engine='website.qweb', context=context)
return ir_qweb.HTMLSafe(html)
def html_to_text(element):
    """ Converts HTML content whose line breaks are expressed through markup
    (br, p, div, ...) into roughly equivalent plain text.

    Replaces and fixes up the round-tripping of text and m2o fields: with
    libxml 2.8.0 (but not 2.9.1), parsing HTML through lxml.html.fromstring
    drops whitespace text nodes (text nodes made *solely* of whitespace)
    with no recourse, and relying on literal newlines being present in the
    text (e.g. inserted during user edition) is probably poor form anyway.

    So whitespace sequences get collapsed, and elements are replaced by
    roughly corresponding line breaks:

    * tags in _PADDED_BLOCK (p, h1-h6) are padded with 2 newlines on each
      side
    * br becomes a single newline
    * tags in _MISC_BLOCK (other block-level elements) are padded with a
      single newline on each side

    Somewhat similar to (but much less high-tech than) aaronsw's html2text:
    the latter produces full-blown markdown, whereas our text -> html
    converter only replaces newlines by <br> elements at this point, so this
    reverts that plus a few more newline-ish elements in case the user tried
    to add newlines/paragraphs into the text field.

    :param element: lxml.html content
    :returns: corresponding pure-text output
    """
    # chunks is a list of str | int. Integers are padding requests (minimum
    # number of newlines); consecutive requests are folded into the biggest
    # one when the padding is realized
    chunks = []
    _wrap(element, chunks)

    # drop any leading or trailing whitespace first
    text = ''.join(_realize_padding(chunks)).strip()

    # then replace each (whitespace)newline(whitespace) sequence by a single
    # newline, where (whitespace) is non-newline whitespace
    return re.sub(r'[ \t\r\f]*\n[ \t\r\f]*', '\n', text)
_PADDED_BLOCK = set('p h1 h2 h3 h4 h5 h6'.split())
# https://developer.mozilla.org/en-US/docs/HTML/Block-level_elements minus p
_MISC_BLOCK = set((
'address article aside audio blockquote canvas dd dl div figcaption figure'
' footer form header hgroup hr ol output pre section tfoot ul video'
).split())
def _collapse_whitespace(text):
""" Collapses sequences of whitespace characters in ``text`` to a single
space
"""
return re.sub('\s+', ' ', text)
def _realize_padding(it):
""" Fold and convert padding requests: integers in the output sequence are
requests for at least n newlines of padding. Runs thereof can be collapsed
into the largest requests and converted to newlines.
"""
padding = None
for item in it:
if isinstance(item, int):
padding = max(padding, item)
continue
if padding:
yield '\n' * padding
padding = None
yield item
# leftover padding irrelevant as the output will be stripped
def _wrap(element, output, wrapper=u''):
""" Recursively extracts text from ``element`` (via _element_to_text), and
wraps it all in ``wrapper``. Extracted text is added to ``output``
:type wrapper: basestring | int
"""
output.append(wrapper)
if element.text:
output.append(_collapse_whitespace(element.text))
for child in element:
_element_to_text(child, output)
output.append(wrapper)
def _element_to_text(e, output):
if e.tag == 'br':
output.append(u'\n')
elif e.tag in _PADDED_BLOCK:
_wrap(e, output, 2)
elif e.tag in _MISC_BLOCK:
_wrap(e, output, 1)
else:
# inline
_wrap(e, output)
if e.tail:
output.append(_collapse_whitespace(e.tail))

View File

@ -1,15 +1,120 @@
# -*- coding: utf-8 -*-
from functools import partial
import textwrap
import unittest2
from xml.dom.minidom import getDOMImplementation
from lxml import html
from lxml.builder import E
from openerp.tests import common
from openerp.addons.base.ir import ir_qweb
from openerp.addons.website.models.ir_qweb import html_to_text
impl = getDOMImplementation()
document = impl.createDocument(None, None, None)
class TestHTMLToText(unittest2.TestCase):
    """ Exercises html_to_text on small trees built with lxml's E factory,
    one test method per kind of markup.
    """

    def _check(self, expected, tree):
        """ Asserts that converting ``tree`` yields exactly ``expected``. """
        self.assertEqual(expected, html_to_text(tree))

    def test_rawstring(self):
        self._check("foobar", E.div("foobar"))

    def test_br(self):
        # each br is one newline, consecutive ones add up
        self._check("foo\nbar", E.div("foo", E.br(), "bar"))
        self._check(
            "foo\n\nbar\nbaz",
            E.div("foo", E.br(), E.br(), "bar", E.br(), "baz"))

    def test_p(self):
        # paragraphs are separated from their surroundings by 2 newlines,
        # leading and trailing padding gets stripped
        self._check(
            "foo\n\nbar\n\nbaz",
            E.div("foo", E.p("bar"), "baz"))
        self._check("foo", E.div(E.p("foo")))
        self._check("foo\n\nbar", E.div("foo", E.p("bar")))
        self._check("foo\n\nbar", E.div(E.p("foo"), "bar"))
        self._check(
            "foo\n\nbar\n\nbaz",
            E.div(E.p("foo"), E.p("bar"), E.p("baz")))

    def test_div(self):
        # divs are separated from their surroundings by a single newline
        self._check(
            "foo\nbar\nbaz",
            E.div("foo", E.div("bar"), "baz"))
        self._check("foo", E.div(E.div("foo")))
        self._check("foo\nbar", E.div("foo", E.div("bar")))
        self._check("foo\nbar", E.div(E.div("foo"), "bar"))
        self._check(
            "foo\nbar\nbaz",
            E.div("foo", E.div("bar"), E.div("baz")))

    def test_other_block(self):
        self._check(
            "foo\nbar\nbaz",
            E.div("foo", E.section("bar"), "baz"))

    def test_inline(self):
        # inline elements do not introduce any separation
        self._check(
            "foobarbaz",
            E.div("foo", E.span("bar"), "baz"))

    def test_whitespace(self):
        # newlines inside text nodes are plain whitespace, not line breaks
        self._check(
            "foo bar\nbaz",
            E.div("foo\nbar", E.br(), "baz"))
        self._check(
            "foo bar\nbaz",
            E.div(E.div(E.span("foo"), " bar"), "baz"))
class TestConvertBack(common.TransactionCase):
def setUp(self):
super(TestConvertBack, self).setUp()
@ -70,7 +175,7 @@ class TestConvertBack(common.TransactionCase):
self.field_roundtrip('selection_str', 'B')
def test_text(self):
self.field_roundtrip('text', """
self.field_roundtrip('text', textwrap.dedent("""\
You must obey the dance commander
Givin' out the order for fun
You must obey the dance commander
@ -88,8 +193,7 @@ class TestConvertBack(common.TransactionCase):
Let's start the show
Because you never know
You never know
You never know until you go
""")
You never know until you go"""))
def test_m2o(self):
""" the M2O field conversion (from html) is markedly different from