2009-12-02 05:36:57 +00:00
# -*- coding: utf-8 -*-
##############################################################################
#
# OpenERP, Open Source Management Solution
2010-01-12 09:18:39 +00:00
# Copyright (C) 2004-2010 Tiny SPRL (<http://tiny.be>).
2009-12-02 05:36:57 +00:00
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
##############################################################################
2010-06-24 09:48:27 +00:00
from content_index import indexer , cntIndex
from subprocess import Popen , PIPE
2009-12-02 05:36:57 +00:00
import StringIO
import odt2txt
2010-06-24 09:48:27 +00:00
import sys , zipfile , xml . dom . minidom
2011-12-19 15:10:01 +00:00
import logging
2012-06-22 06:48:39 +00:00
# Module-level logger, named after this module; used by the indexers below
# to report failures of the external conversion tools.
_logger = logging . getLogger ( __name__ )
2009-12-02 05:36:57 +00:00
def _to_unicode ( s ) :
try :
return s . decode ( ' utf-8 ' )
except UnicodeError :
try :
return s . decode ( ' latin ' )
except UnicodeError :
try :
return s . encode ( ' ascii ' )
except UnicodeError :
return s
2012-03-05 18:40:03 +00:00
def textToString(element):
    """Return the concatenated text found below a DOM element.

    The children of ``element`` are visited in document order: the value
    of each text node is collected and each child element is processed
    recursively.  Every other node type is ignored.

    :param element: DOM node exposing ``childNodes``
    :return: the collected text (empty string when there is none)
    """
    # Collect the fragments and join once instead of growing a string
    # with ``+=`` at every level of the recursion.
    parts = []
    for node in element.childNodes:
        if node.nodeType == xml.dom.Node.TEXT_NODE:
            parts.append(node.nodeValue)
        elif node.nodeType == xml.dom.Node.ELEMENT_NODE:
            parts.append(textToString(node))
    return u"".join(parts)
2009-12-02 05:36:57 +00:00
class TxtIndex(indexer):
    """Indexer for textual content: the content itself is the index text."""

    def _getMimeTypes(self):
        """Return the MIME types handled by this indexer."""
        return ['text/plain', 'text/html', 'text/diff', 'text/xml', 'text/*',
                'application/xml']

    def _getExtensions(self):
        """Return the file extensions handled by this indexer."""
        return ['.txt', '.py']

    def _doIndexContent(self, content):
        """Return ``content`` unchanged; no conversion is performed."""
        return content


cntIndex.register(TxtIndex())
2010-03-19 08:41:30 +00:00
class PptxIndex(indexer):
    """Indexer extracting the slide text of PowerPoint (.pptx) files."""

    def _getMimeTypes(self):
        """Return the MIME types handled by this indexer."""
        return ['application/vnd.openxmlformats-officedocument.presentationml.presentation']

    def _getExtensions(self):
        """Return the file extensions handled by this indexer."""
        return ['.pptx']

    def _doIndexFile(self, fname):
        """Return the text of every slide of the presentation ``fname``.

        The ``ppt/slides/slide<N>.xml`` members of the archive are read in
        numeric order.  For each slide the content of every ``a:t`` element
        is emitted on its own line, and the slides are separated by a
        newline.  Non-ASCII characters are preserved.

        :param fname: path of the .pptx file
        :return: unicode text of the presentation
        """
        prefix, suffix = 'ppt/slides/slide', '.xml'
        archive = zipfile.ZipFile(fname)
        try:
            # Use the member names actually present instead of assuming
            # that slides are numbered 1..N without gaps.
            members = []
            for name in archive.namelist():
                if name.startswith(prefix) and name.endswith(suffix):
                    number = name[len(prefix):-len(suffix)]
                    if number.isdigit():
                        members.append((int(number), name))
            members.sort()

            slides = []
            for _number, name in members:
                dom = xml.dom.minidom.parseString(archive.read(name))
                runs = [textToString(run) + u"\n"
                        for run in dom.getElementsByTagName("a:t")]
                slides.append(u"".join(runs))
        finally:
            archive.close()
        return u"\n".join(slides)


cntIndex.register(PptxIndex())
2010-03-11 08:56:00 +00:00
2009-12-02 05:36:57 +00:00
class DocIndex(indexer):
    """Indexer for MS Word (.doc) files, relying on the ``antiword`` tool."""

    def _getMimeTypes(self):
        """Return the MIME types handled by this indexer."""
        return ['application/ms-word']

    def _getExtensions(self):
        """Return the file extensions handled by this indexer."""
        return ['.doc']

    def _doIndexFile(self, fname):
        """Return the text that ``antiword`` extracts from ``fname``.

        :param fname: path of the .doc file
        :return: unicode text of the document, or ``False`` when the
                 ``antiword`` program could not be executed
        """
        try:
            pop = Popen(['antiword', fname], shell=False, stdout=PIPE)
            (data, _) = pop.communicate()
        except OSError:
            # Typically raised when antiword is not installed: report it
            # and let the caller go on without index content.
            _logger.warning("Failed attempt to execute antiword (MS Word reader). Antiword is necessary to index the file %s of MIME type %s. Detailed error available at DEBUG level.", fname, self._getMimeTypes()[0])
            _logger.debug("Trace of the failed file indexing attempt.", exc_info=True)
            return False
        return _to_unicode(data)


cntIndex.register(DocIndex())
2010-03-19 08:41:30 +00:00
class DocxIndex(indexer):
    """Indexer extracting the text of Word (.docx) files."""

    def _getMimeTypes(self):
        """Return the MIME types handled by this indexer."""
        return ['application/vnd.openxmlformats-officedocument.wordprocessingml.document']

    def _getExtensions(self):
        """Return the file extensions handled by this indexer."""
        return ['.docx']

    def _doIndexFile(self, fname):
        """Return the text of the document ``fname``.

        ``word/document.xml`` is parsed and the text of every ``w:p``,
        ``w:h`` and ``text:list`` element is emitted on its own line.
        Non-ASCII characters are preserved.

        :param fname: path of the .docx file
        :return: unicode text of the document
        """
        archive = zipfile.ZipFile(fname)
        try:
            raw = archive.read("word/document.xml")
        finally:
            # The archive is only needed to fetch the main document part.
            archive.close()

        document = xml.dom.minidom.parseString(raw)
        lines = []
        for tag in ("w:p", "w:h", "text:list"):
            for paragraph in document.getElementsByTagName(tag):
                lines.append(textToString(paragraph) + u"\n")
        return u"".join(lines)


cntIndex.register(DocxIndex())
2010-06-24 09:48:27 +00:00
class XlsxIndex(indexer):
    """Indexer extracting the strings of Excel (.xlsx) workbooks."""

    def _getMimeTypes(self):
        """Return the MIME types handled by this indexer."""
        return ['application/vnd.openxmlformats-officedocument.spreadsheetml.sheet']

    def _getExtensions(self):
        """Return the file extensions handled by this indexer."""
        return ['.xlsx']

    def _doIndexFile(self, fname):
        """Return the strings stored in the workbook ``fname``.

        ``xl/sharedStrings.xml`` is parsed and the text of every ``t``
        element is emitted on its own line.  Non-ASCII characters are
        preserved.  A workbook without that member yields an empty string.

        :param fname: path of the .xlsx file
        :return: unicode text of the workbook
        """
        archive = zipfile.ZipFile(fname)
        try:
            try:
                raw = archive.read("xl/sharedStrings.xml")
            except KeyError:
                # ZipFile.read raises KeyError for a missing member: there
                # is no shared string table, hence nothing to index.
                return u""
        finally:
            archive.close()

        strings = xml.dom.minidom.parseString(raw)
        lines = [textToString(item) + u"\n"
                 for item in strings.getElementsByTagName("t")]
        return u"".join(lines)


cntIndex.register(XlsxIndex())
2009-12-02 05:36:57 +00:00
class PdfIndex(indexer):
    """Indexer for PDF files, relying on the ``pdftotext`` tool."""

    def _getMimeTypes(self):
        """Return the MIME types handled by this indexer."""
        return ['application/pdf']

    def _getExtensions(self):
        """Return the file extensions handled by this indexer."""
        return ['.pdf']

    def _doIndexFile(self, fname):
        """Return the text that ``pdftotext`` extracts from ``fname``.

        The tool is asked for UTF-8 output without page-break characters,
        written to its standard output.

        :param fname: path of the .pdf file
        :return: unicode text of the document, or ``False`` when the
                 ``pdftotext`` program could not be executed
        """
        try:
            pop = Popen(['pdftotext', '-enc', 'UTF-8', '-nopgbrk', fname, '-'], shell=False, stdout=PIPE)
            (data, _) = pop.communicate()
        except OSError:
            # Same policy as the MS Word indexer: a missing external tool
            # is reported in the log rather than propagated.
            _logger.warning("Failed attempt to execute pdftotext. This program is necessary to index the file %s of MIME type %s. Detailed error available at DEBUG level.", fname, self._getMimeTypes()[0])
            _logger.debug("Trace of the failed file indexing attempt.", exc_info=True)
            return False
        return _to_unicode(data)


cntIndex.register(PdfIndex())
class ImageNoIndex(indexer):
    """Indexer for images: no text is extracted, a fixed marker is returned."""

    def _getMimeTypes(self):
        """Return the MIME types handled by this indexer."""
        return ['image/*']

    def _getExtensions(self):
        """Return no extension at all.

        Better return no extension, and let 'file' do its magic, than
        list '.png', '.jpg', '.gif', '.jpeg', '.bmp', '.tiff' here.
        """
        return []

    def _doIndexContent(self, content):
        """Return the constant string ``'image'``, whatever the content."""
        return 'image'


cntIndex.register(ImageNoIndex())
2010-06-24 15:17:35 +00:00
# other opendocument formats:
2010-06-24 15:27:10 +00:00
# chart-template chart database
# formula-template formula graphics-template graphics
# image
# presentation-template presentation spreadsheet-template spreadsheet
2010-06-24 15:17:35 +00:00
class OpenDoc(indexer):
    """ Index OpenDocument files.

        Q: is it really worth it to index spreadsheets, or do we only get a
        meaningless list of numbers (cell contents) ?
    """

    def _getMimeTypes(self):
        """Return the OpenDocument text MIME types handled by this indexer."""
        otypes = ['text', 'text-web', 'text-template', 'text-master']
        # A list comprehension yields a real list on every Python version.
        return ['application/vnd.oasis.opendocument.' + otype for otype in otypes]

    def _getExtensions(self):
        """Return the file extensions handled by this indexer."""
        return ['.odt', '.ott']  # '.ods'

    def _doIndexContent(self, content):
        """Return the text of the OpenDocument file held in ``content``.

        :param content: raw content of the document
        :return: unicode text produced by ``odt2txt``
        """
        stream = StringIO.StringIO(content)
        try:
            document = odt2txt.OpenDocumentTextFile(stream)
            return _to_unicode(document.toString())
        finally:
            # Release the in-memory buffer even when the conversion fails.
            stream.close()


cntIndex.register(OpenDoc())
2009-12-02 05:36:57 +00:00
#eof
2011-11-22 08:51:38 +00:00
# vim:expandtab:smartindent:tabstop=4:softtabstop=4:shiftwidth=4: