[FIX][365069]DMS : handling unicode in file contents

bzr revid: hmo@tinyerp.com-20090422120839-qm2vyh4h1c83o6xo
2009-04-22 17:38:39 +05:30 · 2009-04-22 17:38:39 +05:30 · be7fd21899
parent 7092fe3ef9
commit be7fd21899
1 changed file with 15 additions and 2 deletions
--- a/addons/document/content_index.py
+++ b/addons/document/content_index.py
@@ -28,6 +28,19 @@ import tempfile
 #
 # This should be the indexer
 #
+def _to_unicode(s):
+    try:
+        return s.decode('utf-8')
+    except UnicodeError:
+        try:
+            return s.decode('latin')
+        except UnicodeError:
+            try:
+                return s.encode('ascii')
+            except UnicodeError:
+                return s
+
+
 def content_index(content, filename=None, content_type=None):
    fname,ext = os.path.splitext(filename)
    result = ''
@@ -35,7 +48,7 @@ def content_index(content, filename=None, content_type=None):
        (stdin,stdout) = os.popen2('antiword -', 'b')
        stdin.write(content)
        stdin.close()
-        result = stdout.read().decode('latin1','replace').encode('utf-8','replace')
+        result = _to_unicode(stdout.read())
    elif ext == '.pdf':
        file_descriptor, file_name = tempfile.mkstemp(suffix=ext)
        os.write(file_descriptor, content)
@@ -46,7 +59,7 @@ def content_index(content, filename=None, content_type=None):
    elif ext in ('.xls','.ods','.odt','.odp'):
        s = StringIO.StringIO(content)
        o = odt2txt.OpenDocumentTextFile(s)
-        result = o.toString().encode('ascii','replace')
+        result = _to_unicode(o.toString())
        s.close()
    elif ext in ('.txt','.py','.patch','.html','.csv','.xml'):
        result = content