[FIX] conversion from html to values for text field
see html_to_text comment bzr revid: xmo@openerp.com-20140110152024-yp2out0jg1gx1uhs
This commit is contained in:
parent
24e1e2cb6c
commit
3f8c2d0bad
|
@ -183,7 +183,7 @@ class Text(orm.AbstractModel):
|
||||||
_inherit = ['website.qweb.field', 'ir.qweb.field.text']
|
_inherit = ['website.qweb.field', 'ir.qweb.field.text']
|
||||||
|
|
||||||
def from_html(self, cr, uid, model, column, element, context=None):
|
def from_html(self, cr, uid, model, column, element, context=None):
|
||||||
return element.text_content()
|
return html_to_text(element)
|
||||||
|
|
||||||
class Selection(orm.AbstractModel):
|
class Selection(orm.AbstractModel):
|
||||||
_name = 'website.qweb.field.selection'
|
_name = 'website.qweb.field.selection'
|
||||||
|
@ -211,7 +211,8 @@ class ManyToOne(orm.AbstractModel):
|
||||||
M2O = self.pool[column._obj]
|
M2O = self.pool[column._obj]
|
||||||
field = element.get('data-oe-field')
|
field = element.get('data-oe-field')
|
||||||
id = int(element.get('data-oe-id'))
|
id = int(element.get('data-oe-id'))
|
||||||
value = element.text_content().strip()
|
# FIXME: weird things are going to happen for char-type _rec_name
|
||||||
|
value = html_to_text(element)
|
||||||
|
|
||||||
# if anything blows up, just ignore it and bail
|
# if anything blows up, just ignore it and bail
|
||||||
try:
|
try:
|
||||||
|
@ -419,3 +420,102 @@ class Contact(orm.AbstractModel):
|
||||||
}, engine='website.qweb', context=context)
|
}, engine='website.qweb', context=context)
|
||||||
|
|
||||||
return ir_qweb.HTMLSafe(html)
|
return ir_qweb.HTMLSafe(html)
|
||||||
|
|
||||||
|
def html_to_text(element):
|
||||||
|
""" Converts HTML content with HTML-specified line breaks (br, p, div, ...)
|
||||||
|
in roughly equivalent textual content.
|
||||||
|
|
||||||
|
Used to replace and fixup the roundtripping of text and m2o: when using
|
||||||
|
libxml 2.8.0 (but not 2.9.1) and parsing HTML with lxml.html.fromstring
|
||||||
|
whitespace text nodes (text nodes composed *solely* of whitespace) are
|
||||||
|
stripped out with no recourse, and fundamentally relying on newlines
|
||||||
|
being in the text (e.g. inserted during user edition) is probably poor form
|
||||||
|
anyway.
|
||||||
|
|
||||||
|
-> this utility function collapses whitespace sequences and replaces
|
||||||
|
nodes by roughly corresponding linebreaks
|
||||||
|
* p are pre-and post-fixed by 2 newlines
|
||||||
|
* br are replaced by a single newline
|
||||||
|
* block-level elements not already mentioned are pre- and post-fixed by
|
||||||
|
a single newline
|
||||||
|
|
||||||
|
ought be somewhat similar (but much less high-tech) to aaronsw's html2text.
|
||||||
|
the latter produces full-blown markdown, our text -> html converter only
|
||||||
|
replaces newlines by <br> elements at this point so we're reverting that,
|
||||||
|
and a few more newline-ish elements in case the user tried to add
|
||||||
|
newlines/paragraphs into the text field
|
||||||
|
|
||||||
|
:param element: lxml.html content
|
||||||
|
:returns: corresponding pure-text output
|
||||||
|
"""
|
||||||
|
|
||||||
|
# output is a list of str | int. Integers are padding requests (in minimum
|
||||||
|
# number of newlines). When multiple padding requests, fold them into the
|
||||||
|
# biggest one
|
||||||
|
output = []
|
||||||
|
_wrap(element, output)
|
||||||
|
|
||||||
|
# remove any leading or tailing whitespace, replace sequences of
|
||||||
|
# (whitespace)\n(whitespace) by a single newline, where (whitespace) is a
|
||||||
|
# non-newline whitespace in this case
|
||||||
|
return re.sub(
|
||||||
|
r'[ \t\r\f]*\n[ \t\r\f]*',
|
||||||
|
'\n',
|
||||||
|
''.join(_realize_padding(output)).strip())
|
||||||
|
|
||||||
|
_PADDED_BLOCK = set('p h1 h2 h3 h4 h5 h6'.split())
|
||||||
|
# https://developer.mozilla.org/en-US/docs/HTML/Block-level_elements minus p
|
||||||
|
_MISC_BLOCK = set((
|
||||||
|
'address article aside audio blockquote canvas dd dl div figcaption figure'
|
||||||
|
' footer form header hgroup hr ol output pre section tfoot ul video'
|
||||||
|
).split())
|
||||||
|
|
||||||
|
def _collapse_whitespace(text):
|
||||||
|
""" Collapses sequences of whitespace characters in ``text`` to a single
|
||||||
|
space
|
||||||
|
"""
|
||||||
|
return re.sub('\s+', ' ', text)
|
||||||
|
def _realize_padding(it):
|
||||||
|
""" Fold and convert padding requests: integers in the output sequence are
|
||||||
|
requests for at least n newlines of padding. Runs thereof can be collapsed
|
||||||
|
into the largest requests and converted to newlines.
|
||||||
|
"""
|
||||||
|
padding = None
|
||||||
|
for item in it:
|
||||||
|
if isinstance(item, int):
|
||||||
|
padding = max(padding, item)
|
||||||
|
continue
|
||||||
|
|
||||||
|
if padding:
|
||||||
|
yield '\n' * padding
|
||||||
|
padding = None
|
||||||
|
|
||||||
|
yield item
|
||||||
|
# leftover padding irrelevant as the output will be stripped
|
||||||
|
|
||||||
|
def _wrap(element, output, wrapper=u''):
|
||||||
|
""" Recursively extracts text from ``element`` (via _element_to_text), and
|
||||||
|
wraps it all in ``wrapper``. Extracted text is added to ``output``
|
||||||
|
|
||||||
|
:type wrapper: basestring | int
|
||||||
|
"""
|
||||||
|
output.append(wrapper)
|
||||||
|
if element.text:
|
||||||
|
output.append(_collapse_whitespace(element.text))
|
||||||
|
for child in element:
|
||||||
|
_element_to_text(child, output)
|
||||||
|
output.append(wrapper)
|
||||||
|
|
||||||
|
def _element_to_text(e, output):
|
||||||
|
if e.tag == 'br':
|
||||||
|
output.append(u'\n')
|
||||||
|
elif e.tag in _PADDED_BLOCK:
|
||||||
|
_wrap(e, output, 2)
|
||||||
|
elif e.tag in _MISC_BLOCK:
|
||||||
|
_wrap(e, output, 1)
|
||||||
|
else:
|
||||||
|
# inline
|
||||||
|
_wrap(e, output)
|
||||||
|
|
||||||
|
if e.tail:
|
||||||
|
output.append(_collapse_whitespace(e.tail))
|
||||||
|
|
|
@ -1,15 +1,120 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
from functools import partial
|
import textwrap
|
||||||
|
import unittest2
|
||||||
from xml.dom.minidom import getDOMImplementation
|
from xml.dom.minidom import getDOMImplementation
|
||||||
|
|
||||||
from lxml import html
|
from lxml import html
|
||||||
|
from lxml.builder import E
|
||||||
|
|
||||||
from openerp.tests import common
|
from openerp.tests import common
|
||||||
from openerp.addons.base.ir import ir_qweb
|
from openerp.addons.base.ir import ir_qweb
|
||||||
|
from openerp.addons.website.models.ir_qweb import html_to_text
|
||||||
|
|
||||||
impl = getDOMImplementation()
|
impl = getDOMImplementation()
|
||||||
document = impl.createDocument(None, None, None)
|
document = impl.createDocument(None, None, None)
|
||||||
|
|
||||||
|
class TestHTMLToText(unittest2.TestCase):
|
||||||
|
def test_rawstring(self):
|
||||||
|
self.assertEqual(
|
||||||
|
"foobar",
|
||||||
|
html_to_text(E.div("foobar")))
|
||||||
|
|
||||||
|
def test_br(self):
|
||||||
|
self.assertEqual(
|
||||||
|
"foo\nbar",
|
||||||
|
html_to_text(E.div("foo", E.br(), "bar")))
|
||||||
|
|
||||||
|
self.assertEqual(
|
||||||
|
"foo\n\nbar\nbaz",
|
||||||
|
html_to_text(E.div(
|
||||||
|
"foo", E.br(), E.br(),
|
||||||
|
"bar", E.br(),
|
||||||
|
"baz")))
|
||||||
|
|
||||||
|
def test_p(self):
|
||||||
|
self.assertEqual(
|
||||||
|
"foo\n\nbar\n\nbaz",
|
||||||
|
html_to_text(E.div(
|
||||||
|
"foo",
|
||||||
|
E.p("bar"),
|
||||||
|
"baz")))
|
||||||
|
|
||||||
|
self.assertEqual(
|
||||||
|
"foo",
|
||||||
|
html_to_text(E.div(E.p("foo"))))
|
||||||
|
|
||||||
|
self.assertEqual(
|
||||||
|
"foo\n\nbar",
|
||||||
|
html_to_text(E.div("foo", E.p("bar"))))
|
||||||
|
self.assertEqual(
|
||||||
|
"foo\n\nbar",
|
||||||
|
html_to_text(E.div(E.p("foo"), "bar")))
|
||||||
|
|
||||||
|
self.assertEqual(
|
||||||
|
"foo\n\nbar\n\nbaz",
|
||||||
|
html_to_text(E.div(
|
||||||
|
E.p("foo"),
|
||||||
|
E.p("bar"),
|
||||||
|
E.p("baz"),
|
||||||
|
)))
|
||||||
|
|
||||||
|
def test_div(self):
|
||||||
|
self.assertEqual(
|
||||||
|
"foo\nbar\nbaz",
|
||||||
|
html_to_text(E.div(
|
||||||
|
"foo",
|
||||||
|
E.div("bar"),
|
||||||
|
"baz"
|
||||||
|
)))
|
||||||
|
|
||||||
|
self.assertEqual(
|
||||||
|
"foo",
|
||||||
|
html_to_text(E.div(E.div("foo"))))
|
||||||
|
|
||||||
|
self.assertEqual(
|
||||||
|
"foo\nbar",
|
||||||
|
html_to_text(E.div("foo", E.div("bar"))))
|
||||||
|
self.assertEqual(
|
||||||
|
"foo\nbar",
|
||||||
|
html_to_text(E.div(E.div("foo"), "bar")))
|
||||||
|
|
||||||
|
self.assertEqual(
|
||||||
|
"foo\nbar\nbaz",
|
||||||
|
html_to_text(E.div(
|
||||||
|
"foo",
|
||||||
|
E.div("bar"),
|
||||||
|
E.div("baz")
|
||||||
|
)))
|
||||||
|
|
||||||
|
def test_other_block(self):
|
||||||
|
self.assertEqual(
|
||||||
|
"foo\nbar\nbaz",
|
||||||
|
html_to_text(E.div(
|
||||||
|
"foo",
|
||||||
|
E.section("bar"),
|
||||||
|
"baz"
|
||||||
|
)))
|
||||||
|
|
||||||
|
def test_inline(self):
|
||||||
|
self.assertEqual(
|
||||||
|
"foobarbaz",
|
||||||
|
html_to_text(E.div("foo", E.span("bar"), "baz")))
|
||||||
|
|
||||||
|
def test_whitespace(self):
|
||||||
|
self.assertEqual(
|
||||||
|
"foo bar\nbaz",
|
||||||
|
html_to_text(E.div(
|
||||||
|
"foo\nbar",
|
||||||
|
E.br(),
|
||||||
|
"baz")
|
||||||
|
))
|
||||||
|
|
||||||
|
self.assertEqual(
|
||||||
|
"foo bar\nbaz",
|
||||||
|
html_to_text(E.div(
|
||||||
|
E.div(E.span("foo"), " bar"),
|
||||||
|
"baz")))
|
||||||
|
|
||||||
class TestConvertBack(common.TransactionCase):
|
class TestConvertBack(common.TransactionCase):
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
super(TestConvertBack, self).setUp()
|
super(TestConvertBack, self).setUp()
|
||||||
|
@ -70,7 +175,7 @@ class TestConvertBack(common.TransactionCase):
|
||||||
self.field_roundtrip('selection_str', 'B')
|
self.field_roundtrip('selection_str', 'B')
|
||||||
|
|
||||||
def test_text(self):
|
def test_text(self):
|
||||||
self.field_roundtrip('text', """
|
self.field_roundtrip('text', textwrap.dedent("""\
|
||||||
You must obey the dance commander
|
You must obey the dance commander
|
||||||
Givin' out the order for fun
|
Givin' out the order for fun
|
||||||
You must obey the dance commander
|
You must obey the dance commander
|
||||||
|
@ -88,8 +193,7 @@ class TestConvertBack(common.TransactionCase):
|
||||||
Let's start the show
|
Let's start the show
|
||||||
Because you never know
|
Because you never know
|
||||||
You never know
|
You never know
|
||||||
You never know until you go
|
You never know until you go"""))
|
||||||
""")
|
|
||||||
|
|
||||||
def test_m2o(self):
|
def test_m2o(self):
|
||||||
""" the M2O field conversion (from html) is markedly different from
|
""" the M2O field conversion (from html) is markedly different from
|
||||||
|
|
Loading…
Reference in New Issue