[IMP] tools.ustr: added support for encoding hints for conversion
bzr revid: odo@openerp.com-20100702133922-qkfjodoh5z0i5fij
This commit is contained in:
parent
2949ed38ab
commit
42b7282e4a
|
@ -349,7 +349,6 @@ def html2plaintext(html, body_id=None, encoding='utf-8'):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
html = ustr(html)
|
html = ustr(html)
|
||||||
urls = []
|
|
||||||
|
|
||||||
from lxml.etree import Element, tostring
|
from lxml.etree import Element, tostring
|
||||||
try:
|
try:
|
||||||
|
@ -376,7 +375,6 @@ def html2plaintext(html, body_id=None, encoding='utf-8'):
|
||||||
url = link.get('href')
|
url = link.get('href')
|
||||||
if url:
|
if url:
|
||||||
i += 1
|
i += 1
|
||||||
urls.append(dict(url=ustr(url), tag=ustr(link), title=ustr(title)))
|
|
||||||
link.tag = 'span'
|
link.tag = 'span'
|
||||||
link.text = '%s [%s]' % (link.text, i)
|
link.text = '%s [%s]' % (link.text, i)
|
||||||
url_index.append(url)
|
url_index.append(url)
|
||||||
|
@ -822,27 +820,39 @@ class cache(object):
|
||||||
def to_xml(s):
|
def to_xml(s):
|
||||||
return s.replace('&','&amp;').replace('<','&lt;').replace('>','&gt;')
|
return s.replace('&','&amp;').replace('<','&lt;').replace('>','&gt;')
|
||||||
|
|
||||||
def get_encodings():
|
def get_encodings(hint_encoding='utf-8'):
|
||||||
yield 'utf8'
|
fallbacks = {
|
||||||
|
'latin1': 'latin9',
|
||||||
|
'iso-8859-1': 'iso8859-15',
|
||||||
|
'cp1252': '1252',
|
||||||
|
}
|
||||||
|
if hint_encoding:
|
||||||
|
yield hint_encoding
|
||||||
|
if hint_encoding.lower() in fallbacks:
|
||||||
|
yield fallbacks[hint_encoding.lower()]
|
||||||
|
|
||||||
|
# some defaults (also taking care of pure ASCII)
|
||||||
|
for charset in ['utf8','latin1']:
|
||||||
|
if not (hint_encoding) or (charset.lower() != hint_encoding.lower()):
|
||||||
|
yield charset
|
||||||
|
|
||||||
from locale import getpreferredencoding
|
from locale import getpreferredencoding
|
||||||
prefenc = getpreferredencoding()
|
prefenc = getpreferredencoding()
|
||||||
if prefenc:
|
if prefenc and prefenc.lower() != 'utf-8':
|
||||||
yield prefenc
|
yield prefenc
|
||||||
|
prefenc = fallbacks.get(prefenc.lower())
|
||||||
prefenc = {
|
|
||||||
'latin1': 'latin9',
|
|
||||||
'iso-8859-1': 'iso8859-15',
|
|
||||||
'cp1252': '1252',
|
|
||||||
}.get(prefenc.lower())
|
|
||||||
if prefenc:
|
if prefenc:
|
||||||
yield prefenc
|
yield prefenc
|
||||||
|
|
||||||
|
|
||||||
def ustr(value):
|
def ustr(value, hint_encoding='utf-8'):
|
||||||
"""This method is similar to the builtin `str` method, except
|
"""This method is similar to the builtin `str` method, except
|
||||||
it will return Unicode string.
|
it will return unicode() string.
|
||||||
|
|
||||||
@param value: the value to convert
|
@param value: the value to convert
|
||||||
|
@param hint_encoding: an optional encoding that was detected
|
||||||
|
upstream and should be tried first to
|
||||||
|
decode ``value``.
|
||||||
|
|
||||||
@rtype: unicode
|
@rtype: unicode
|
||||||
@return: unicode string
|
@return: unicode string
|
||||||
|
@ -854,12 +864,7 @@ def ustr(value):
|
||||||
if isinstance(value, unicode):
|
if isinstance(value, unicode):
|
||||||
return value
|
return value
|
||||||
|
|
||||||
try:
|
for ln in get_encodings(hint_encoding):
|
||||||
return unicode(value)
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
for ln in get_encodings():
|
|
||||||
try:
|
try:
|
||||||
return unicode(value, ln)
|
return unicode(value, ln)
|
||||||
except Exception:
|
except Exception:
|
||||||
|
|
Loading…
Reference in New Issue