[FIX][365069]DMS : handling unicode in file contents

bzr revid: hmo@tinyerp.com-20090422120839-qm2vyh4h1c83o6xo
This commit is contained in:
Harry (Open ERP) 2009-04-22 17:38:39 +05:30
parent 7092fe3ef9
commit be7fd21899
1 changed files with 15 additions and 2 deletions

View File

@ -28,6 +28,19 @@ import tempfile
#
# This should be the indexer
#
def _to_unicode(s):
try:
return s.decode('utf-8')
except UnicodeError:
try:
return s.decode('latin')
except UnicodeError:
try:
return s.encode('ascii')
except UnicodeError:
return s
def content_index(content, filename=None, content_type=None):
fname,ext = os.path.splitext(filename)
result = ''
@ -35,7 +48,7 @@ def content_index(content, filename=None, content_type=None):
(stdin,stdout) = os.popen2('antiword -', 'b')
stdin.write(content)
stdin.close()
result = stdout.read().decode('latin1','replace').encode('utf-8','replace')
result = _to_unicode(stdout.read())
elif ext == '.pdf':
file_descriptor, file_name = tempfile.mkstemp(suffix=ext)
os.write(file_descriptor, content)
@ -46,7 +59,7 @@ def content_index(content, filename=None, content_type=None):
elif ext in ('.xls','.ods','.odt','.odp'):
s = StringIO.StringIO(content)
o = odt2txt.OpenDocumentTextFile(s)
result = o.toString().encode('ascii','replace')
result = _to_unicode(o.toString())
s.close()
elif ext in ('.txt','.py','.patch','.html','.csv','.xml'):
result = content