[FIX] conversion from html to values for text field

see html_to_text comment

bzr revid: xmo@openerp.com-20140110152024-yp2out0jg1gx1uhs
This commit is contained in:
Xavier Morel 2014-01-10 16:20:24 +01:00
parent 24e1e2cb6c
commit 3f8c2d0bad
2 changed files with 210 additions and 6 deletions

View File

@ -183,7 +183,7 @@ class Text(orm.AbstractModel):
_inherit = ['website.qweb.field', 'ir.qweb.field.text'] _inherit = ['website.qweb.field', 'ir.qweb.field.text']
def from_html(self, cr, uid, model, column, element, context=None): def from_html(self, cr, uid, model, column, element, context=None):
return element.text_content() return html_to_text(element)
class Selection(orm.AbstractModel): class Selection(orm.AbstractModel):
_name = 'website.qweb.field.selection' _name = 'website.qweb.field.selection'
@ -211,7 +211,8 @@ class ManyToOne(orm.AbstractModel):
M2O = self.pool[column._obj] M2O = self.pool[column._obj]
field = element.get('data-oe-field') field = element.get('data-oe-field')
id = int(element.get('data-oe-id')) id = int(element.get('data-oe-id'))
value = element.text_content().strip() # FIXME: weird things are going to happen for char-type _rec_name
value = html_to_text(element)
# if anything blows up, just ignore it and bail # if anything blows up, just ignore it and bail
try: try:
@ -419,3 +420,102 @@ class Contact(orm.AbstractModel):
}, engine='website.qweb', context=context) }, engine='website.qweb', context=context)
return ir_qweb.HTMLSafe(html) return ir_qweb.HTMLSafe(html)
def html_to_text(element):
    """ Converts HTML content whose line breaks are expressed through markup
    (br, p, div, ...) into roughly equivalent plain text.

    Used to replace and fix up the roundtripping of text and m2o fields: when
    using libxml 2.8.0 (but not 2.9.1) and parsing HTML with
    lxml.html.fromstring, whitespace text nodes (text nodes composed *solely*
    of whitespace) are stripped out with no recourse, and fundamentally
    relying on newlines being in the text (e.g. inserted during user editing)
    is probably poor form anyway.

    -> this utility function collapses whitespace sequences and replaces
       nodes by roughly corresponding linebreaks

       * p are pre- and post-fixed by 2 newlines
       * br are replaced by a single newline
       * block-level elements not already mentioned are pre- and post-fixed
         by a single newline

    This ought to be somewhat similar to (but much less high-tech than)
    aaronsw's html2text. The latter produces full-blown markdown, whereas our
    text -> html converter only replaces newlines by <br> elements at this
    point, so we're reverting that, plus a few more newline-ish elements in
    case the user tried to add newlines/paragraphs into the text field.

    :param element: lxml.html content
    :returns: corresponding pure-text output
    """
    # ``fragments`` is a list of str | int. Integers are padding requests
    # (minimum number of newlines); adjacent requests get folded into the
    # biggest one when the padding is realized
    fragments = []
    _wrap(element, fragments)

    # drop leading and trailing whitespace from the assembled text
    text = ''.join(_realize_padding(fragments)).strip()

    # replace every (whitespace)\n(whitespace) sequence by a bare newline,
    # where (whitespace) stands for non-newline whitespace
    return re.sub(r'[ \t\r\f]*\n[ \t\r\f]*', '\n', text)
_PADDED_BLOCK = set('p h1 h2 h3 h4 h5 h6'.split())
# https://developer.mozilla.org/en-US/docs/HTML/Block-level_elements minus p
_MISC_BLOCK = set((
'address article aside audio blockquote canvas dd dl div figcaption figure'
' footer form header hgroup hr ol output pre section tfoot ul video'
).split())
def _collapse_whitespace(text):
""" Collapses sequences of whitespace characters in ``text`` to a single
space
"""
return re.sub('\s+', ' ', text)
def _realize_padding(it):
""" Fold and convert padding requests: integers in the output sequence are
requests for at least n newlines of padding. Runs thereof can be collapsed
into the largest requests and converted to newlines.
"""
padding = None
for item in it:
if isinstance(item, int):
padding = max(padding, item)
continue
if padding:
yield '\n' * padding
padding = None
yield item
# leftover padding irrelevant as the output will be stripped
def _wrap(element, output, wrapper=u''):
""" Recursively extracts text from ``element`` (via _element_to_text), and
wraps it all in ``wrapper``. Extracted text is added to ``output``
:type wrapper: basestring | int
"""
output.append(wrapper)
if element.text:
output.append(_collapse_whitespace(element.text))
for child in element:
_element_to_text(child, output)
output.append(wrapper)
def _element_to_text(e, output):
if e.tag == 'br':
output.append(u'\n')
elif e.tag in _PADDED_BLOCK:
_wrap(e, output, 2)
elif e.tag in _MISC_BLOCK:
_wrap(e, output, 1)
else:
# inline
_wrap(e, output)
if e.tail:
output.append(_collapse_whitespace(e.tail))

View File

@ -1,15 +1,120 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from functools import partial import textwrap
import unittest2
from xml.dom.minidom import getDOMImplementation from xml.dom.minidom import getDOMImplementation
from lxml import html from lxml import html
from lxml.builder import E
from openerp.tests import common from openerp.tests import common
from openerp.addons.base.ir import ir_qweb from openerp.addons.base.ir import ir_qweb
from openerp.addons.website.models.ir_qweb import html_to_text
impl = getDOMImplementation() impl = getDOMImplementation()
document = impl.createDocument(None, None, None) document = impl.createDocument(None, None, None)
class TestHTMLToText(unittest2.TestCase):
    """ Checks how html_to_text converts markup-level line breaks (br, p,
    div and other block elements) and whitespace into plain text.
    """

    def test_rawstring(self):
        """ Text without any markup comes out unchanged """
        tree = E.div("foobar")
        self.assertEqual("foobar", html_to_text(tree))

    def test_br(self):
        """ Each br yields exactly one newline """
        single = E.div("foo", E.br(), "bar")
        self.assertEqual("foo\nbar", html_to_text(single))

        several = E.div(
            "foo", E.br(), E.br(),
            "bar", E.br(),
            "baz")
        self.assertEqual("foo\n\nbar\nbaz", html_to_text(several))

    def test_p(self):
        """ Paragraphs are separated from their surroundings by two
        newlines, without padding at the edges of the result
        """
        surrounded = E.div(
            "foo",
            E.p("bar"),
            "baz")
        self.assertEqual("foo\n\nbar\n\nbaz", html_to_text(surrounded))

        alone = E.div(E.p("foo"))
        self.assertEqual("foo", html_to_text(alone))

        trailing = E.div("foo", E.p("bar"))
        self.assertEqual("foo\n\nbar", html_to_text(trailing))

        leading = E.div(E.p("foo"), "bar")
        self.assertEqual("foo\n\nbar", html_to_text(leading))

        siblings = E.div(
            E.p("foo"),
            E.p("bar"),
            E.p("baz"),
        )
        self.assertEqual("foo\n\nbar\n\nbaz", html_to_text(siblings))

    def test_div(self):
        """ Divs are separated from their surroundings by one newline,
        without padding at the edges of the result
        """
        surrounded = E.div(
            "foo",
            E.div("bar"),
            "baz"
        )
        self.assertEqual("foo\nbar\nbaz", html_to_text(surrounded))

        alone = E.div(E.div("foo"))
        self.assertEqual("foo", html_to_text(alone))

        trailing = E.div("foo", E.div("bar"))
        self.assertEqual("foo\nbar", html_to_text(trailing))

        leading = E.div(E.div("foo"), "bar")
        self.assertEqual("foo\nbar", html_to_text(leading))

        siblings = E.div(
            "foo",
            E.div("bar"),
            E.div("baz")
        )
        self.assertEqual("foo\nbar\nbaz", html_to_text(siblings))

    def test_other_block(self):
        """ Other block-level elements behave like div """
        tree = E.div(
            "foo",
            E.section("bar"),
            "baz"
        )
        self.assertEqual("foo\nbar\nbaz", html_to_text(tree))

    def test_inline(self):
        """ Inline elements introduce no separator at all """
        tree = E.div("foo", E.span("bar"), "baz")
        self.assertEqual("foobarbaz", html_to_text(tree))

    def test_whitespace(self):
        """ Whitespace inside text (newlines included) collapses to single
        spaces, only markup generates line breaks
        """
        newline_in_text = E.div(
            "foo\nbar",
            E.br(),
            "baz")
        self.assertEqual("foo bar\nbaz", html_to_text(newline_in_text))

        space_in_tail = E.div(
            E.div(E.span("foo"), " bar"),
            "baz")
        self.assertEqual("foo bar\nbaz", html_to_text(space_in_tail))
class TestConvertBack(common.TransactionCase): class TestConvertBack(common.TransactionCase):
def setUp(self): def setUp(self):
super(TestConvertBack, self).setUp() super(TestConvertBack, self).setUp()
@ -70,7 +175,7 @@ class TestConvertBack(common.TransactionCase):
self.field_roundtrip('selection_str', 'B') self.field_roundtrip('selection_str', 'B')
def test_text(self): def test_text(self):
self.field_roundtrip('text', """ self.field_roundtrip('text', textwrap.dedent("""\
You must obey the dance commander You must obey the dance commander
Givin' out the order for fun Givin' out the order for fun
You must obey the dance commander You must obey the dance commander
@ -88,8 +193,7 @@ class TestConvertBack(common.TransactionCase):
Let's start the show Let's start the show
Because you never know Because you never know
You never know You never know
You never know until you go You never know until you go"""))
""")
def test_m2o(self): def test_m2o(self):
""" the M2O field conversion (from html) is markedly different from """ the M2O field conversion (from html) is markedly different from