odoo/addons/document/content_index.py

# -*- encoding: utf-8 -*-
##############################################################################
#
#    OpenERP, Open Source Management Solution	
#    Copyright (C) 2004-2009 Tiny SPRL (<http://tiny.be>). All Rights Reserved
#    $Id$
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
##############################################################################
import time
import os
import StringIO
import odt2txt

#
# This should be the indexer
#
def _run_converter(argv, data=None):
	"""Run an external converter program and return what it wrote to stdout.

	:param argv: the program and its arguments, as a list (no shell involved)
	:param data: optional data fed to the program's standard input
	:return: the program's standard output, or '' when the program cannot
	         be started (for instance because it is not installed)
	"""
	# Imported locally so that the module-level imports need not change.
	import subprocess

	stdin = None
	if data is not None:
		stdin = subprocess.PIPE
	try:
		proc = subprocess.Popen(argv, stdin=stdin, stdout=subprocess.PIPE)
	except OSError:
		# Indexing is best effort: a missing converter yields no text
		# instead of making the caller fail.
		return ''
	# communicate() feeds stdin and drains stdout concurrently, so a large
	# document cannot block on a full pipe.
	return proc.communicate(data)[0]


def content_index(content, filename=None, content_type=None):
	"""Extract the text to index from the raw content of a document.

	The converter is chosen from the extension of ``filename`` (the match
	is case-sensitive):

	* ``.doc``: piped through the external ``antiword`` program, whose
	  output is decoded as latin1 and re-encoded as UTF-8;
	* ``.pdf``: written to a temporary file and converted by the external
	  ``pdftotext`` program (UTF-8 output, no page breaks);
	* ``.xls``, ``.ods``, ``.odt``, ``.odp``: parsed with
	  ``odt2txt.OpenDocumentTextFile`` and encoded as ASCII;
	* anything else, including a missing filename or a missing extension:
	  ``content`` is returned unchanged.

	:param content: raw content of the document
	:param filename: name of the document; only its extension is used
	:param content_type: currently unused
	:return: the extracted text ('' when an external converter cannot be
	         started)
	"""
	# Imported locally so that the module-level imports need not change.
	import tempfile

	# filename is optional; without it there is no extension to dispatch on.
	ext = os.path.splitext(filename or '')[1]

	# Compare with ==: ('.doc') is a plain string, not a tuple, so an "in"
	# test would be a substring match that also accepts '' and '.d'.
	if ext == '.doc':
		output = _run_converter(['antiword', '-'], content)
		return output.decode('latin1', 'replace').encode('utf-8', 'replace')

	if ext == '.pdf':
		# The content goes through a temporary file handed to pdftotext.
		# mkstemp creates it atomically, and it is always removed afterwards.
		fd, path = tempfile.mkstemp(suffix='.pdf')
		try:
			fp = os.fdopen(fd, 'wb')
			try:
				fp.write(content)
			finally:
				fp.close()
			return _run_converter(
				['pdftotext', '-enc', 'UTF-8', '-nopgbrk', path, '-'])
		finally:
			os.remove(path)

	if ext in ('.xls', '.ods', '.odt', '.odp'):
		# NOTE(review): '.xls' is handled by the OpenDocument reader and
		# the result is reduced to ASCII (other characters become '?');
		# both are kept from the previous behaviour -- confirm intended.
		s = StringIO.StringIO(content)
		try:
			o = odt2txt.OpenDocumentTextFile(s)
			return o.toString().encode('ascii', 'replace')
		finally:
			s.close()

	# Plain-text formats and unknown extensions are indexed as they are.
	return content
passing modules in GPL-3 bzr revid: christophe@tinyerp.com-20081103191856-jhcivvwb16fvz2os 2008-11-03 19:18:56 +00:00			`# -- encoding: utf-8 --`
			`##############################################################################`
			`#`
			`# OpenERP, Open Source Management Solution`
[IMP] Update the copyright to 2009 bzr revid: stephane@tinyerp.com-20090104221250-55q32ayj2t8kzb2k 2009-01-04 22:12:50 +00:00			`# Copyright (C) 2004-2009 Tiny SPRL (<http://tiny.be>). All Rights Reserved`
passing modules in GPL-3 bzr revid: christophe@tinyerp.com-20081103191856-jhcivvwb16fvz2os 2008-11-03 19:18:56 +00:00			`# $Id$`
			`#`
			`# This program is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# This program is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with this program. If not, see <http://www.gnu.org/licenses/>.`
			`#`
			`##############################################################################`
Document Management System bzr revid: hmo@tinyerp.com-20080926142355-5nxy223jb4rad5lh 2008-09-26 14:23:55 +00:00			`import time`
			`import os`
			`import StringIO`
			`import odt2txt`

			`#`
			`# This should be the indexer`
			`#`
			`def content_index(content, filename=None, content_type=None):`
			`fname,ext = os.path.splitext(filename)`
			`result = ''`
			`if ext in ('.doc'): #or content_type ?`
			`(stdin,stdout) = os.popen2('antiword -', 'b')`
			`stdin.write(content)`
			`stdin.close()`
			`result = stdout.read().decode('latin1','replace').encode('utf-8','replace')`
			`elif ext == '.pdf':`
			`fname = os.tempnam(filename)+'.pdf'`
			`fp = file(fname,'wb')`
			`fp.write(content)`
			`fp.close()`
			`fp = os.popen('pdftotext -enc UTF-8 -nopgbrk '+fname+' -', 'r')`
			`result = fp.read()`
			`fp.close()`
			`elif ext in ('.xls','.ods','.odt','.odp'):`
			`s = StringIO.StringIO(content)`
			`o = odt2txt.OpenDocumentTextFile(s)`
			`result = o.toString().encode('ascii','replace')`
			`s.close()`
			`elif ext in ('.txt','.py','.patch','.html','.csv','.xml'):`
			`result = content`
			`else:`
			`result = content`
			`return result`