[FIX] conversion from html to values for text field
see html_to_text comment bzr revid: xmo@openerp.com-20140110152024-yp2out0jg1gx1uhs
This commit is contained in:
parent
24e1e2cb6c
commit
3f8c2d0bad
|
@ -183,7 +183,7 @@ class Text(orm.AbstractModel):
|
|||
_inherit = ['website.qweb.field', 'ir.qweb.field.text']
|
||||
|
||||
def from_html(self, cr, uid, model, column, element, context=None):
|
||||
return element.text_content()
|
||||
return html_to_text(element)
|
||||
|
||||
class Selection(orm.AbstractModel):
|
||||
_name = 'website.qweb.field.selection'
|
||||
|
@ -211,7 +211,8 @@ class ManyToOne(orm.AbstractModel):
|
|||
M2O = self.pool[column._obj]
|
||||
field = element.get('data-oe-field')
|
||||
id = int(element.get('data-oe-id'))
|
||||
value = element.text_content().strip()
|
||||
# FIXME: weird things are going to happen for char-type _rec_name
|
||||
value = html_to_text(element)
|
||||
|
||||
# if anything blows up, just ignore it and bail
|
||||
try:
|
||||
|
@ -419,3 +420,102 @@ class Contact(orm.AbstractModel):
|
|||
}, engine='website.qweb', context=context)
|
||||
|
||||
return ir_qweb.HTMLSafe(html)
|
||||
|
||||
def html_to_text(element):
|
||||
""" Converts HTML content with HTML-specified line breaks (br, p, div, ...)
|
||||
in roughly equivalent textual content.
|
||||
|
||||
Used to replace and fixup the roundtripping of text and m2o: when using
|
||||
libxml 2.8.0 (but not 2.9.1) and parsing HTML with lxml.html.fromstring
|
||||
whitespace text nodes (text nodes composed *solely* of whitespace) are
|
||||
stripped out with no recourse, and fundamentally relying on newlines
|
||||
being in the text (e.g. inserted during user edition) is probably poor form
|
||||
anyway.
|
||||
|
||||
-> this utility function collapses whitespace sequences and replaces
|
||||
nodes by roughly corresponding linebreaks
|
||||
* p are pre-and post-fixed by 2 newlines
|
||||
* br are replaced by a single newline
|
||||
* block-level elements not already mentioned are pre- and post-fixed by
|
||||
a single newline
|
||||
|
||||
ought be somewhat similar (but much less high-tech) to aaronsw's html2text.
|
||||
the latter produces full-blown markdown, our text -> html converter only
|
||||
replaces newlines by <br> elements at this point so we're reverting that,
|
||||
and a few more newline-ish elements in case the user tried to add
|
||||
newlines/paragraphs into the text field
|
||||
|
||||
:param element: lxml.html content
|
||||
:returns: corresponding pure-text output
|
||||
"""
|
||||
|
||||
# output is a list of str | int. Integers are padding requests (in minimum
|
||||
# number of newlines). When multiple padding requests, fold them into the
|
||||
# biggest one
|
||||
output = []
|
||||
_wrap(element, output)
|
||||
|
||||
# remove any leading or tailing whitespace, replace sequences of
|
||||
# (whitespace)\n(whitespace) by a single newline, where (whitespace) is a
|
||||
# non-newline whitespace in this case
|
||||
return re.sub(
|
||||
r'[ \t\r\f]*\n[ \t\r\f]*',
|
||||
'\n',
|
||||
''.join(_realize_padding(output)).strip())
|
||||
|
||||
_PADDED_BLOCK = set('p h1 h2 h3 h4 h5 h6'.split())
|
||||
# https://developer.mozilla.org/en-US/docs/HTML/Block-level_elements minus p
|
||||
_MISC_BLOCK = set((
|
||||
'address article aside audio blockquote canvas dd dl div figcaption figure'
|
||||
' footer form header hgroup hr ol output pre section tfoot ul video'
|
||||
).split())
|
||||
|
||||
def _collapse_whitespace(text):
|
||||
""" Collapses sequences of whitespace characters in ``text`` to a single
|
||||
space
|
||||
"""
|
||||
return re.sub('\s+', ' ', text)
|
||||
def _realize_padding(it):
|
||||
""" Fold and convert padding requests: integers in the output sequence are
|
||||
requests for at least n newlines of padding. Runs thereof can be collapsed
|
||||
into the largest requests and converted to newlines.
|
||||
"""
|
||||
padding = None
|
||||
for item in it:
|
||||
if isinstance(item, int):
|
||||
padding = max(padding, item)
|
||||
continue
|
||||
|
||||
if padding:
|
||||
yield '\n' * padding
|
||||
padding = None
|
||||
|
||||
yield item
|
||||
# leftover padding irrelevant as the output will be stripped
|
||||
|
||||
def _wrap(element, output, wrapper=u''):
|
||||
""" Recursively extracts text from ``element`` (via _element_to_text), and
|
||||
wraps it all in ``wrapper``. Extracted text is added to ``output``
|
||||
|
||||
:type wrapper: basestring | int
|
||||
"""
|
||||
output.append(wrapper)
|
||||
if element.text:
|
||||
output.append(_collapse_whitespace(element.text))
|
||||
for child in element:
|
||||
_element_to_text(child, output)
|
||||
output.append(wrapper)
|
||||
|
||||
def _element_to_text(e, output):
|
||||
if e.tag == 'br':
|
||||
output.append(u'\n')
|
||||
elif e.tag in _PADDED_BLOCK:
|
||||
_wrap(e, output, 2)
|
||||
elif e.tag in _MISC_BLOCK:
|
||||
_wrap(e, output, 1)
|
||||
else:
|
||||
# inline
|
||||
_wrap(e, output)
|
||||
|
||||
if e.tail:
|
||||
output.append(_collapse_whitespace(e.tail))
|
||||
|
|
|
@ -1,15 +1,120 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
from functools import partial
|
||||
import textwrap
|
||||
import unittest2
|
||||
from xml.dom.minidom import getDOMImplementation
|
||||
|
||||
from lxml import html
|
||||
from lxml.builder import E
|
||||
|
||||
from openerp.tests import common
|
||||
from openerp.addons.base.ir import ir_qweb
|
||||
from openerp.addons.website.models.ir_qweb import html_to_text
|
||||
|
||||
impl = getDOMImplementation()
|
||||
document = impl.createDocument(None, None, None)
|
||||
|
||||
class TestHTMLToText(unittest2.TestCase):
|
||||
def test_rawstring(self):
|
||||
self.assertEqual(
|
||||
"foobar",
|
||||
html_to_text(E.div("foobar")))
|
||||
|
||||
def test_br(self):
|
||||
self.assertEqual(
|
||||
"foo\nbar",
|
||||
html_to_text(E.div("foo", E.br(), "bar")))
|
||||
|
||||
self.assertEqual(
|
||||
"foo\n\nbar\nbaz",
|
||||
html_to_text(E.div(
|
||||
"foo", E.br(), E.br(),
|
||||
"bar", E.br(),
|
||||
"baz")))
|
||||
|
||||
def test_p(self):
|
||||
self.assertEqual(
|
||||
"foo\n\nbar\n\nbaz",
|
||||
html_to_text(E.div(
|
||||
"foo",
|
||||
E.p("bar"),
|
||||
"baz")))
|
||||
|
||||
self.assertEqual(
|
||||
"foo",
|
||||
html_to_text(E.div(E.p("foo"))))
|
||||
|
||||
self.assertEqual(
|
||||
"foo\n\nbar",
|
||||
html_to_text(E.div("foo", E.p("bar"))))
|
||||
self.assertEqual(
|
||||
"foo\n\nbar",
|
||||
html_to_text(E.div(E.p("foo"), "bar")))
|
||||
|
||||
self.assertEqual(
|
||||
"foo\n\nbar\n\nbaz",
|
||||
html_to_text(E.div(
|
||||
E.p("foo"),
|
||||
E.p("bar"),
|
||||
E.p("baz"),
|
||||
)))
|
||||
|
||||
def test_div(self):
|
||||
self.assertEqual(
|
||||
"foo\nbar\nbaz",
|
||||
html_to_text(E.div(
|
||||
"foo",
|
||||
E.div("bar"),
|
||||
"baz"
|
||||
)))
|
||||
|
||||
self.assertEqual(
|
||||
"foo",
|
||||
html_to_text(E.div(E.div("foo"))))
|
||||
|
||||
self.assertEqual(
|
||||
"foo\nbar",
|
||||
html_to_text(E.div("foo", E.div("bar"))))
|
||||
self.assertEqual(
|
||||
"foo\nbar",
|
||||
html_to_text(E.div(E.div("foo"), "bar")))
|
||||
|
||||
self.assertEqual(
|
||||
"foo\nbar\nbaz",
|
||||
html_to_text(E.div(
|
||||
"foo",
|
||||
E.div("bar"),
|
||||
E.div("baz")
|
||||
)))
|
||||
|
||||
def test_other_block(self):
|
||||
self.assertEqual(
|
||||
"foo\nbar\nbaz",
|
||||
html_to_text(E.div(
|
||||
"foo",
|
||||
E.section("bar"),
|
||||
"baz"
|
||||
)))
|
||||
|
||||
def test_inline(self):
|
||||
self.assertEqual(
|
||||
"foobarbaz",
|
||||
html_to_text(E.div("foo", E.span("bar"), "baz")))
|
||||
|
||||
def test_whitespace(self):
|
||||
self.assertEqual(
|
||||
"foo bar\nbaz",
|
||||
html_to_text(E.div(
|
||||
"foo\nbar",
|
||||
E.br(),
|
||||
"baz")
|
||||
))
|
||||
|
||||
self.assertEqual(
|
||||
"foo bar\nbaz",
|
||||
html_to_text(E.div(
|
||||
E.div(E.span("foo"), " bar"),
|
||||
"baz")))
|
||||
|
||||
class TestConvertBack(common.TransactionCase):
|
||||
def setUp(self):
|
||||
super(TestConvertBack, self).setUp()
|
||||
|
@ -70,7 +175,7 @@ class TestConvertBack(common.TransactionCase):
|
|||
self.field_roundtrip('selection_str', 'B')
|
||||
|
||||
def test_text(self):
|
||||
self.field_roundtrip('text', """
|
||||
self.field_roundtrip('text', textwrap.dedent("""\
|
||||
You must obey the dance commander
|
||||
Givin' out the order for fun
|
||||
You must obey the dance commander
|
||||
|
@ -88,8 +193,7 @@ class TestConvertBack(common.TransactionCase):
|
|||
Let's start the show
|
||||
Because you never know
|
||||
You never know
|
||||
You never know until you go
|
||||
""")
|
||||
You never know until you go"""))
|
||||
|
||||
def test_m2o(self):
|
||||
""" the M2O field conversion (from html) is markedly different from
|
||||
|
|
Loading…
Reference in New Issue