2008-11-03 19:18:56 +00:00
|
|
|
# -*- encoding: utf-8 -*-
##############################################################################
#
# OpenERP, Open Source Management Solution
# Copyright (C) 2004-2009 Tiny SPRL (<http://tiny.be>). All Rights Reserved
# $Id$
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
##############################################################################
|
2008-09-26 14:23:55 +00:00
|
|
|
import os
import StringIO
import subprocess
import tempfile
import time

import odt2txt
|
2008-09-26 14:23:55 +00:00
|
|
|
|
|
|
|
#
# Content indexer: helpers that turn document contents into indexable text.
#
|
2009-04-22 12:08:39 +00:00
|
|
|
def _to_unicode(s):
|
|
|
|
try:
|
|
|
|
return s.decode('utf-8')
|
|
|
|
except UnicodeError:
|
|
|
|
try:
|
|
|
|
return s.decode('latin')
|
|
|
|
except UnicodeError:
|
|
|
|
try:
|
|
|
|
return s.encode('ascii')
|
|
|
|
except UnicodeError:
|
|
|
|
return s
|
|
|
|
|
|
|
|
|
2008-09-26 14:23:55 +00:00
|
|
|
def _run_converter(argv, data=None):
    """Run an external text converter and return what it wrote to stdout.

    :param argv: the command as an argument list (no shell is involved)
    :param data: optional byte string fed to the command's standard input
    :return: the command's standard output, or ``''`` when the command
             could not be run
    """
    try:
        proc = subprocess.Popen(argv, stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE)
        # communicate() interleaves writing and reading, so a large document
        # cannot dead-lock on a full pipe the way write-then-read does.
        return proc.communicate(data)[0]
    except OSError:
        # Converter not installed or not runnable: indexing is best effort,
        # so the document is simply left without indexable text.
        return ''


def content_index(content, filename=None, content_type=None):
    """Extract the indexable text of a document.

    The converter is selected from the extension of ``filename`` (compared
    case-insensitively):

    * ``.doc``: ``content`` is piped through ``antiword``;
    * ``.pdf``: ``content`` is written to a temporary file that is
      converted with ``pdftotext`` and then removed;
    * ``.xls``, ``.ods``, ``.odt``, ``.odp``: handled by
      ``odt2txt.OpenDocumentTextFile``;
    * ``.txt``, ``.py``, ``.patch``, ``.html``, ``.csv``, ``.xml``:
      ``content`` is returned as is.

    :param content: raw content of the document
    :param filename: name of the document; only its extension is used
    :param content_type: currently unused
    :return: the extracted text, or ``''`` when there is no file name or the
             extension is not one of the above
    """
    result = ''
    if not filename:
        # Without a name there is no extension to choose a converter from.
        return result
    ext = os.path.splitext(filename)[1].lower()

    # `ext in ('.doc')` was a substring test against the *string* '.doc'
    # (true for '', '.d', 'doc', ...), so compare for equality instead.
    if ext == '.doc':  # or content_type ?
        result = _to_unicode(_run_converter(['antiword', '-'], content))
    elif ext == '.pdf':
        file_descriptor, file_name = tempfile.mkstemp(suffix=ext)
        try:
            try:
                os.write(file_descriptor, content)
            finally:
                os.close(file_descriptor)
            result = _run_converter(
                ['pdftotext', '-enc', 'UTF-8', '-nopgbrk', file_name, '-'])
        finally:
            # mkstemp() never removes the file it creates.
            os.unlink(file_name)
    elif ext in ('.xls', '.ods', '.odt', '.odp'):
        # NOTE(review): '.xls' is routed to the OpenDocument reader like the
        # other three - confirm that odt2txt really supports it.
        s = StringIO.StringIO(content)
        try:
            o = odt2txt.OpenDocumentTextFile(s)
            result = _to_unicode(o.toString())
        finally:
            s.close()
    elif ext in ('.txt', '.py', '.patch', '.html', '.csv', '.xml'):
        result = content
    # Any other extension is deliberately left unindexed (result stays '').
    return result
|
|
|
|
|
|
|
|
# vim:expandtab:smartindent:tabstop=4:softtabstop=4:shiftwidth=4:
|