210 lines
6.4 KiB
Python
210 lines
6.4 KiB
Python
# -*- coding: utf-8 -*-
|
|
##############################################################################
|
|
#
|
|
# OpenERP, Open Source Management Solution
|
|
# Copyright (C) 2004-2010 Tiny SPRL (<http://tiny.be>).
|
|
#
|
|
# This program is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU Affero General Public License as
|
|
# published by the Free Software Foundation, either version 3 of the
|
|
# License, or (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU Affero General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU Affero General Public License
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
#
|
|
##############################################################################
|
|
|
|
from content_index import indexer, cntIndex
|
|
from subprocess import Popen, PIPE
|
|
import StringIO
|
|
import odt2txt
|
|
import sys, zipfile, xml.dom.minidom
|
|
|
|
def _to_unicode(s):
    """Return ``s`` as a unicode text string, decoding it when needed.

    Byte strings are decoded as UTF-8 first.  When that fails they are
    decoded as Latin-1, which maps every byte value to a character and
    therefore cannot fail.  Text that is already unicode is returned
    unchanged.

    :param s: byte string or unicode string to convert
    :return: the unicode representation of ``s``
    """
    if isinstance(s, type(u'')):
        # Already decoded.  Calling .decode() on text would go through an
        # implicit ASCII encode on Python 2 (raising for any non-ASCII
        # character) and does not exist at all on Python 3.
        return s
    try:
        return s.decode('utf-8')
    except UnicodeError:
        # Latin-1 accepts any byte sequence, so no further fallback is
        # needed after this one.
        return s.decode('latin-1')
|
|
|
|
def textToString(element):
    """Collect the text found below ``element`` into one unicode string.

    Text nodes contribute their value, child elements are walked
    recursively in document order, and every other node type is skipped.

    :param element: DOM node whose ``childNodes`` are inspected
    :return: the concatenated text as a unicode string
    """
    pieces = []
    for child in element.childNodes:
        kind = child.nodeType
        if kind == xml.dom.Node.TEXT_NODE:
            pieces.append(child.nodeValue)
        elif kind == xml.dom.Node.ELEMENT_NODE:
            pieces.append(textToString(child))
    return u"".join(pieces)
|
|
|
|
class TxtIndex(indexer):
    """Indexer for textual documents: the content itself is the index text."""

    def _getMimeTypes(self):
        """Return the MIME types this indexer declares."""
        text_types = ['text/plain', 'text/html', 'text/diff', 'text/xml', 'text/*']
        xml_types = ['application/xml']
        return text_types + xml_types

    def _getExtensions(self):
        """Return the file extensions this indexer declares."""
        extensions = ['.txt', '.py']
        return extensions

    def _doIndexContent(self, content):
        """Return ``content`` unchanged."""
        return content


# Make the indexer available through the shared registry.
cntIndex.register(TxtIndex())
|
|
|
|
class PptxIndex(indexer):
    """Indexer extracting the slide text of PowerPoint (.pptx) files."""

    def _getMimeTypes(self):
        """Return the MIME types this indexer declares."""
        return ['application/vnd.openxmlformats-officedocument.presentationml.presentation']

    def _getExtensions(self):
        """Return the file extensions this indexer declares."""
        return ['.pptx']

    def _doIndexFile(self, fname):
        """Return the text of every slide stored in the archive ``fname``.

        Slides are the archive members named ``ppt/slides/slide<N>.xml``.
        They are processed in numeric order of ``N``.  Within a slide the
        text of each ``a:t`` element is emitted followed by a newline, and
        the per-slide texts are joined with a newline.

        :param fname: path (or file object) accepted by ``zipfile.ZipFile``
        :return: the extracted text as a unicode string
        """
        prefix = 'ppt/slides/slide'
        suffix = '.xml'
        slides = []
        archive = zipfile.ZipFile(fname)
        try:
            # Use the member names actually present instead of assuming
            # slide1..slideN exist: numbering may have gaps.
            numbered = []
            for name in archive.namelist():
                if not (name.startswith(prefix) and name.endswith(suffix)):
                    continue
                number = name[len(prefix):-len(suffix)]
                if number.isdigit():
                    numbered.append((int(number), name))
            numbered.sort()

            for _number, name in numbered:
                document = xml.dom.minidom.parseString(archive.read(name))
                runs = [textToString(run) + u"\n"
                        for run in document.getElementsByTagName("a:t")]
                slides.append(u"".join(runs))
        finally:
            # Always release the file handle held by the archive.
            archive.close()

        # The text is kept as unicode; squeezing it through ASCII would
        # replace every non-ASCII character with '?'.
        return u"\n".join(slides)


# Make the indexer available through the shared registry.
cntIndex.register(PptxIndex())
|
|
|
|
class DocIndex(indexer):
    """Indexer for MS Word (.doc) files, converted by the ``antiword`` tool."""

    def _getMimeTypes(self):
        """Return the MIME types this indexer declares."""
        # NOTE(review): the IANA-registered type for .doc files is
        # 'application/msword' -- confirm whether it should be listed too.
        return ['application/ms-word']

    def _getExtensions(self):
        """Return the file extensions this indexer declares."""
        return ['.doc']

    def _doIndexFile(self, fname):
        """Run ``antiword`` on ``fname`` and return its output as unicode.

        :param fname: path of the document handed to ``antiword``
        :return: the standard output of the command, decoded to unicode
        """
        process = Popen(['antiword', fname], shell=False, stdout=PIPE)
        # communicate() drains stdout, closes the pipe and waits for the
        # child, so no zombie process or open descriptor is left behind.
        output = process.communicate()[0]
        return _to_unicode(output)


# Make the indexer available through the shared registry.
cntIndex.register(DocIndex())
|
|
|
|
class DocxIndex(indexer):
    """Indexer extracting the text of MS Word Open XML (.docx) files."""

    def _getMimeTypes(self):
        """Return the MIME types this indexer declares."""
        return ['application/vnd.openxmlformats-officedocument.wordprocessingml.document']

    def _getExtensions(self):
        """Return the file extensions this indexer declares."""
        return ['.docx']

    def _doIndexFile(self, fname):
        """Return the text found in ``word/document.xml`` of ``fname``.

        The text of every ``w:p`` element is emitted first, then every
        ``w:h`` element, then every ``text:list`` element; each one is
        followed by a newline.

        :param fname: path (or file object) accepted by ``zipfile.ZipFile``
        :return: the extracted text as a unicode string
        """
        archive = zipfile.ZipFile(fname)
        try:
            raw_xml = archive.read("word/document.xml")
        finally:
            # Always release the file handle held by the archive.
            archive.close()

        document = xml.dom.minidom.parseString(raw_xml)
        chunks = []
        for tag in ("w:p", "w:h", "text:list"):
            for paragraph in document.getElementsByTagName(tag):
                chunks.append(textToString(paragraph) + u"\n")

        # The text is kept as unicode; squeezing it through ASCII would
        # replace every non-ASCII character with '?'.
        return u"".join(chunks)


# Make the indexer available through the shared registry.
cntIndex.register(DocxIndex())
|
|
|
|
|
|
class XlsxIndex(indexer):
    """Indexer extracting the shared strings of Excel Open XML (.xlsx) files."""

    def _getMimeTypes(self):
        """Return the MIME types this indexer declares."""
        return ['application/vnd.openxmlformats-officedocument.spreadsheetml.sheet']

    def _getExtensions(self):
        """Return the file extensions this indexer declares."""
        return ['.xlsx']

    def _doIndexFile(self, fname):
        """Return the text found in ``xl/sharedStrings.xml`` of ``fname``.

        The text of every ``t`` element is emitted followed by a newline.
        An archive that has no ``xl/sharedStrings.xml`` member yields an
        empty string.

        :param fname: path (or file object) accepted by ``zipfile.ZipFile``
        :return: the extracted text as a unicode string
        """
        member = "xl/sharedStrings.xml"
        archive = zipfile.ZipFile(fname)
        try:
            if member not in archive.namelist():
                # Nothing to index rather than a KeyError from read().
                return u""
            raw_xml = archive.read(member)
        finally:
            # Always release the file handle held by the archive.
            archive.close()

        document = xml.dom.minidom.parseString(raw_xml)
        chunks = [textToString(cell) + u"\n"
                  for cell in document.getElementsByTagName("t")]

        # The text is kept as unicode; squeezing it through ASCII would
        # replace every non-ASCII character with '?'.
        return u"".join(chunks)


# Make the indexer available through the shared registry.
cntIndex.register(XlsxIndex())
|
|
|
|
class PdfIndex(indexer):
    """Indexer for PDF files, converted by the ``pdftotext`` tool."""

    def _getMimeTypes(self):
        """Return the MIME types this indexer declares."""
        return ['application/pdf']

    def _getExtensions(self):
        """Return the file extensions this indexer declares."""
        return ['.pdf']

    def _doIndexFile(self, fname):
        """Run ``pdftotext`` on ``fname`` and return its output as unicode.

        The command is invoked with ``-enc UTF-8 -nopgbrk`` and ``-`` as
        the output argument, and its standard output is captured.

        :param fname: path of the document handed to ``pdftotext``
        :return: the standard output of the command, decoded to unicode
        """
        command = ['pdftotext', '-enc', 'UTF-8', '-nopgbrk', fname, '-']
        process = Popen(command, shell=False, stdout=PIPE)
        # communicate() drains stdout, closes the pipe and waits for the
        # child, so no zombie process or open descriptor is left behind.
        output = process.communicate()[0]
        return _to_unicode(output)


# Make the indexer available through the shared registry.
cntIndex.register(PdfIndex())
|
|
|
|
class ImageNoIndex(indexer):
    """Indexer for images: every image is indexed as the fixed word 'image'."""

    def _getMimeTypes(self):
        """Return the MIME types this indexer declares."""
        mime_types = ['image/*']
        return mime_types

    def _getExtensions(self):
        """Return the file extensions this indexer declares (none)."""
        # Better to return no extension and let 'file' do its magic.
        # Not listed on purpose: .png, .jpg, .gif, .jpeg, .bmp, .tiff
        no_extensions = []
        return no_extensions

    def _doIndexContent(self, content):
        """Return the constant index text; ``content`` is not inspected."""
        return 'image'


# Make the indexer available through the shared registry.
cntIndex.register(ImageNoIndex())
|
|
|
|
# other opendocument formats:
|
|
# chart-template chart database
|
|
# formula-template formula graphics-template graphics
|
|
# image
|
|
# presentation-template presentation spreadsheet-template spreadsheet
|
|
|
|
class OpenDoc(indexer):
    """ Index OpenDocument files.

        Q: is it really worth it to index spreadsheets, or do we only get a
        meaningless list of numbers (cell contents) ?
    """

    def _getMimeTypes(self):
        """Return the OpenDocument text MIME types this indexer declares."""
        base = 'application/vnd.oasis.opendocument.'
        flavours = ['text', 'text-web', 'text-template', 'text-master']
        return [base + flavour for flavour in flavours]

    def _getExtensions(self):
        """Return the file extensions this indexer declares."""
        # '.ods' is left out on purpose (see the question in the class doc).
        return ['.odt', '.ott']

    def _doIndexContent(self, content):
        """Return the text of the OpenDocument data in ``content`` as unicode.

        ``content`` is wrapped in an in-memory file and handed to
        ``odt2txt.OpenDocumentTextFile``; the result of its ``toString()``
        is converted with ``_to_unicode``.
        """
        stream = StringIO.StringIO(content)
        document = odt2txt.OpenDocumentTextFile(stream)
        text = _to_unicode(document.toString())
        stream.close()
        return text


# Make the indexer available through the shared registry.
cntIndex.register(OpenDoc())
|
|
|
|
|
|
#eof
|