2009-10-13 05:58:37 +00:00
|
|
|
# -*- coding: utf-8 -*-
|
2008-11-03 19:18:56 +00:00
|
|
|
##############################################################################
|
2011-12-31 07:57:20 +00:00
|
|
|
#
|
2009-10-14 11:15:34 +00:00
|
|
|
# OpenERP, Open Source Management Solution
|
2010-01-12 09:18:39 +00:00
|
|
|
# Copyright (C) 2004-2010 Tiny SPRL (<http://tiny.be>).
|
2008-11-03 19:18:56 +00:00
|
|
|
#
|
|
|
|
# This program is free software: you can redistribute it and/or modify
|
2009-10-14 11:15:34 +00:00
|
|
|
# it under the terms of the GNU Affero General Public License as
|
|
|
|
# published by the Free Software Foundation, either version 3 of the
|
|
|
|
# License, or (at your option) any later version.
|
2008-11-03 19:18:56 +00:00
|
|
|
#
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
2009-10-14 11:15:34 +00:00
|
|
|
# GNU Affero General Public License for more details.
|
2008-11-03 19:18:56 +00:00
|
|
|
#
|
2009-10-14 11:15:34 +00:00
|
|
|
# You should have received a copy of the GNU Affero General Public License
|
2011-12-31 07:57:20 +00:00
|
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
2008-11-03 19:18:56 +00:00
|
|
|
#
|
|
|
|
##############################################################################
|
2010-03-02 09:43:49 +00:00
|
|
|
import logging
|
2008-09-26 14:23:55 +00:00
|
|
|
import os
|
2009-01-14 07:54:20 +00:00
|
|
|
import tempfile
|
2010-10-12 11:20:30 +00:00
|
|
|
from subprocess import Popen, PIPE
|
2012-06-22 06:48:39 +00:00
|
|
|
# Module-level logger, named after this module.
_logger = logging.getLogger(__name__)
|
2009-12-02 05:36:57 +00:00
|
|
|
class NhException(Exception):
    """ Signals that an indexer could not handle the given content or file.

        It is raised by the default indexer._doIndexContent() and
        indexer._doIndexFile() implementations and by
        indexer.indexContent() when every strategy has failed.
    """
|
2009-12-02 05:36:57 +00:00
|
|
|
|
|
|
|
|
2010-06-16 11:51:39 +00:00
|
|
|
class indexer(object):
    """ An indexer knows how to parse the content of some file.

        Typically, one indexer should be instantiated per file
        type.
        Override this class to add more functionality. Note that
        you should only override the Content or the File methods
        that give an optimal result. """

    def _getMimeTypes(self):
        """ Return supported mimetypes """
        return []

    def _getExtensions(self):
        """ Return the supported file extensions.

            contentIndex.doIndex() compares them with the result of
            os.path.splitext(), so they carry the leading dot.
        """
        return []

    def _getDefMime(self, ext):
        """ Return a mimetype for this document type, ideally the
            closest to the extension ext.

            The base implementation ignores ext and returns the first
            entry of _getMimeTypes(), or None when there is none.
        """
        mts = self._getMimeTypes()
        if mts:
            return mts[0]
        return None

    def indexContent(self, content, filename=None, realfile=None):
        """ Use either content or the real file, to index.

            Some parsers will work better with the actual
            content, others parse a file easier. Try the
            optimal.

            The strategies are tried in this order:
              1. _doIndexContent(content), when content is not None;
              2. _doIndexFile(realfile), then _doIndexContent() on the
                 data read from realfile, when realfile is not None;
              3. _doIndexFile() on a temporary copy of content.

            :param content: raw data of the document, or None
            :param filename: original name of the document; only its
                    extension is used, as suffix of the temporary file
            :param realfile: path of a file holding the data, or None
            :return: the value returned by the strategy that succeeded
            :raise NhException: when no strategy can handle the document
        """
        if content is not None:
            try:
                return self._doIndexContent(content)
            except NhException:
                pass

        if realfile is not None:
            try:
                return self._doIndexFile(realfile)
            except NhException:
                pass

            with open(realfile, 'rb') as fp:
                content2 = fp.read()

            # The not-handled exception may be raised here
            return self._doIndexContent(content2)

        # last try, with a tmp file
        if content:
            ext = ''
            if filename:
                ext = os.path.splitext(filename)[1]
            fd, rfname = tempfile.mkstemp(suffix=ext)
            try:
                try:
                    os.write(fd, content)
                finally:
                    # never leak the descriptor, even if the write fails
                    os.close(fd)
                return self._doIndexFile(rfname)
            except NhException:
                pass
            finally:
                # mkstemp() leaves deletion to the caller: remove the file
                # whether or not the indexer could handle it
                os.unlink(rfname)

        raise NhException('No appropriate method to index file.')

    def _doIndexContent(self, content):
        """ Index in-memory content. The base class cannot handle any. """
        raise NhException("Content cannot be handled here.")

    def _doIndexFile(self, fpath):
        """ Index the file at fpath. The base class cannot handle any. """
        raise NhException("Content cannot be handled here.")

    def __repr__(self):
        return "<indexer %s.%s>" % (self.__module__, self.__class__.__name__)
|
2011-12-31 07:57:20 +00:00
|
|
|
|
2009-12-02 05:36:57 +00:00
|
|
|
def mime_match(mime, mdict):
    """ Find the entry of mdict that applies to the mimetype mime.

        An exact key wins; otherwise a wildcard key made of the part
        before the first '/' plus '/*' (e.g. 'text/*') is tried.

        :param mime: mimetype string to look up
        :param mdict: dictionary keyed by mimetype or 'major/*' pattern
        :return: (mime, value) on a match, (None, None) otherwise
    """
    # 'in' instead of dict.has_key(), which no longer exists on Python 3
    if mime in mdict:
        return (mime, mdict[mime])
    if '/' in mime:
        mpat = mime.split('/')[0] + '/*'
        if mpat in mdict:
            return (mime, mdict[mpat])

    return (None, None)
|
2009-04-22 12:08:39 +00:00
|
|
|
|
2010-06-16 11:51:39 +00:00
|
|
|
class contentIndex(object):
    """ Registry of indexer objects, looked up by mimetype or by file
        extension, used to index the content of documents.
    """

    def __init__(self):
        # mimetype (or 'major/*' pattern) -> indexer object
        self.mimes = {}
        # file extension, as returned by os.path.splitext() -> indexer object
        self.exts = {}

    def register(self, obj):
        """ Register obj for every mimetype and extension it declares.

            :param obj: object providing _getMimeTypes() and _getExtensions()
            :raise Exception: when obj declares neither a mimetype nor
                    an extension
        """
        f = False
        for mime in obj._getMimeTypes():
            self.mimes[mime] = obj
            f = True

        for ext in obj._getExtensions():
            self.exts[ext] = obj
            f = True

        if not f:
            raise Exception("Your indexer should at least support a mimetype or extension.")
        _logger.debug('Register content indexer: %r.', obj)

    def doIndex(self, content, filename=None, content_type=None, realfname=None, debug=False):
        """ Pick an indexer for the document and run it.

            The indexer is chosen from, in order: an exact match of
            content_type, the extension of filename, a wildcard match of
            content_type, and finally the mimetype reported by the
            external `file` command for the data.

            :param content: raw data of the document
            :param filename: name of the document, used for its extension
            :param content_type: mimetype announced for the document
            :param realfname: path of a file already holding the data; when
                    missing, a temporary file is created for `file`
            :param debug: accepted but not used by this method
            :return: (mimetype, indexed text); the text is '' when no
                    indexer applies or when indexing fails
        """
        fobj = None
        fname = None
        mime = None
        # 'in' instead of dict.has_key(), which no longer exists on Python 3
        if content_type and content_type in self.mimes:
            mime = content_type
            fobj = self.mimes[content_type]
        elif filename:
            bname, ext = os.path.splitext(filename)
            if ext in self.exts:
                fobj = self.exts[ext]
                mime = fobj._getDefMime(ext)

        if content_type and not fobj:
            mime, fobj = mime_match(content_type, self.mimes)

        if not fobj:
            try:
                if realfname:
                    fname = realfname
                else:
                    try:
                        bname, ext = os.path.splitext(filename or 'test.tmp')
                    except Exception:
                        bname, ext = filename, 'tmp'
                    fd, fname = tempfile.mkstemp(suffix=ext)
                    try:
                        os.write(fd, content)
                    finally:
                        # never leak the descriptor, even if the write fails
                        os.close(fd)

                pop = Popen(['file', '-b', '--mime', fname], shell=False, stdout=PIPE)
                (result, _) = pop.communicate()

                # communicate() yields bytes on Python 3; splitting them on
                # a text ';' would raise, so decode first.
                if not isinstance(result, str):
                    result = result.decode('utf-8', 'replace')
                # Keep only the mimetype, without the charset part or a
                # trailing newline.
                mime2 = result.split(';')[0].strip()
                _logger.debug('File gives us: %s', mime2)
                # Note that the temporary file still exists now.
                mime, fobj = mime_match(mime2, self.mimes)
                if not mime:
                    mime = mime2
            except Exception:
                _logger.exception('Cannot determine mime type.')

        try:
            if fobj:
                res = (mime, fobj.indexContent(content, filename, fname or realfname))
            else:
                # The message says None, but an empty string is returned.
                _logger.debug("Have no object, return (%s, None).", mime)
                res = (mime, '')
        except Exception:
            _logger.exception("Cannot index file %s (%s).",
                              filename, fname or realfname)
            res = (mime, '')

        # If we created a tmp file, unlink it now
        if not realfname and fname:
            try:
                os.unlink(fname)
            except Exception:
                _logger.exception("Cannot unlink %s.", fname)
        return res
|
2009-04-22 12:08:39 +00:00
|
|
|
|
2009-12-02 05:36:57 +00:00
|
|
|
# Module-level contentIndex instance, created when this module is imported.
# NOTE(review): presumably the shared registry that indexer implementations
# register themselves with -- confirm against the callers.
cntIndex = contentIndex()
|
2009-03-03 09:51:57 +00:00
|
|
|
|
|
|
|
# vim:expandtab:smartindent:tabstop=4:softtabstop=4:shiftwidth=4:
|