Unify the way backends are defined and loaded, unify the fs_cleanup function

This commit is contained in:
Roberto Rosario
2014-07-01 00:22:31 -04:00
parent 0255ea67b1
commit a9390d55ba
21 changed files with 208 additions and 235 deletions

View File

@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import logging
import os
import random
import re
@@ -14,8 +15,11 @@ from django.contrib.contenttypes.models import ContentType
from django.utils.datastructures import MultiValueDict
from django.utils.http import urlquote as django_urlquote
from django.utils.http import urlencode as django_urlencode
from django.utils.importlib import import_module
from django.utils.translation import ugettext_lazy as _
logger = logging.getLogger(__name__)
def urlquote(link=None, get=None):
u"""
@@ -427,3 +431,34 @@ def copyfile(source, destination, buffer_size=1024 * 1024):
source_descriptor.close()
destination_descriptor.close()
def _lazy_load(fn):
_cached = []
def _decorated():
if not _cached:
_cached.append(fn())
return _cached[0]
return _decorated
def load_backend(backend_string):
    """
    Import and instantiate the backend class named by a dotted path.

    ``backend_string`` is split on its last dot: the left part is imported
    as a module, the right part is looked up on that module and called with
    no arguments. The resulting instance is returned.

    Raises ImportError when the import fails and AttributeError when the
    attribute lookup fails; both are logged at debug level and re-raised
    unchanged. A string without any dot raises ValueError from the
    unpacking below.
    """
    # Pass the argument to the logger instead of pre-formatting so the
    # string is only built when debug logging is enabled.
    logger.debug('loading: %s', backend_string)
    module_name, klass = backend_string.rsplit('.', 1)

    try:
        return getattr(import_module(module_name), klass)()
    except (ImportError, AttributeError):
        # A missing class is reported the same way as a missing module.
        logger.debug('error importing: %s', backend_string)
        raise
def fs_cleanup(filename):
    """
    Tries to remove the given filename. Ignores non-existent files

    Removal is best effort: no OSError is ever propagated to the caller.
    A missing file is ignored silently; any other OSError is reported
    through the module logger at debug level so it is not lost entirely.
    """
    # Local import so this block does not depend on a file-level import.
    import errno

    try:
        os.remove(filename)
    except OSError as exception:
        if exception.errno != errno.ENOENT:
            logger.debug('unable to remove: %s; %s', filename, exception)

View File

@@ -1,14 +1,10 @@
from __future__ import absolute_import
from django.utils.translation import ugettext_lazy as _
from django.core.exceptions import ImproperlyConfigured
from navigation.api import register_sidebar_template
from project_tools.api import register_tool
from .utils import load_backend
from .conf.settings import GRAPHICS_BACKEND
def is_superuser(context):
    """
    Evaluate ``is_staff or is_superuser`` for the user attached to the
    ``'request'`` entry of ``context`` and return the result.
    """
    user = context['request'].user
    if user.is_staff:
        return user.is_staff
    return user.is_superuser
@@ -18,9 +14,4 @@ formats_list = {'text': _('file formats'), 'view': 'formats_list', 'famfam': 'pi
register_sidebar_template(['formats_list'], 'converter_file_formats_help.html')
try:
backend = load_backend().ConverterClass()
except ImproperlyConfigured:
raise ImproperlyConfigured(u'Missing or incorrect converter backend: %s' % GRAPHICS_BACKEND)
register_tool(formats_list)

View File

@@ -7,16 +7,15 @@ import os
from django.utils.encoding import smart_str
from common.conf.settings import TEMPORARY_DIRECTORY
from common.utils import fs_cleanup
from . import backend
from .exceptions import OfficeConversionError, UnknownFileFormat
from .literals import (DEFAULT_PAGE_NUMBER,
DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION, DEFAULT_FILE_FORMAT)
from .literals import (TRANSFORMATION_CHOICES, TRANSFORMATION_RESIZE,
TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM, DIMENSION_SEPARATOR,
FILE_FORMATS)
from .runtime import office_converter
from .utils import cleanup
from .runtime import backend, office_converter
# Returns the hexadecimal SHA-256 digest of the value passed in.
HASH_FUNCTION = lambda x: hashlib.sha256(x).hexdigest()
@@ -96,7 +95,7 @@ def convert(input_filepath, output_filepath=None, cleanup_files=False, mimetype=
backend.convert_file(input_filepath=input_filepath, output_filepath=output_filepath, transformations=transformations, page=page, file_format=file_format, mimetype=mimetype)
finally:
if cleanup_files:
cleanup(input_filepath)
fs_cleanup(input_filepath)
return output_filepath

View File

@@ -1,21 +1,21 @@
from __future__ import absolute_import
import re
import subprocess
import re
from ...backends import ConverterBase
from ...conf.settings import GM_PATH, GM_SETTINGS
from ...exceptions import ConvertError, UnknownFileFormat, IdentifyError
from ...literals import (TRANSFORMATION_RESIZE,
from . import ConverterBase
from ..conf.settings import GM_PATH, GM_SETTINGS
from ..exceptions import ConvertError, UnknownFileFormat, IdentifyError
from ..literals import (TRANSFORMATION_RESIZE,
TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM)
from ...literals import DIMENSION_SEPARATOR, DEFAULT_PAGE_NUMBER, \
DEFAULT_FILE_FORMAT
from ..literals import (DIMENSION_SEPARATOR, DEFAULT_PAGE_NUMBER,
DEFAULT_FILE_FORMAT)
CONVERTER_ERROR_STRING_NO_DECODER = u'No decode delegate for this image format'
CONVERTER_ERROR_STARTS_WITH = u'starts with'
class ConverterClass(ConverterBase):
class GraphicsMagick(ConverterBase):
def identify_file(self, input_filepath, arguments=None):
command = []
command.append(unicode(GM_PATH))

View File

@@ -1,19 +1,19 @@
from __future__ import absolute_import
import re
import subprocess
import re
from ...backends import ConverterBase
from ...conf.settings import IM_CONVERT_PATH, IM_IDENTIFY_PATH
from ...exceptions import ConvertError, UnknownFileFormat, IdentifyError
from ...literals import (TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE,
from . import ConverterBase
from ..conf.settings import IM_CONVERT_PATH, IM_IDENTIFY_PATH
from ..exceptions import ConvertError, UnknownFileFormat, IdentifyError
from ..literals import (TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE,
TRANSFORMATION_ZOOM, DIMENSION_SEPARATOR, DEFAULT_PAGE_NUMBER,
DEFAULT_FILE_FORMAT)
CONVERTER_ERROR_STRING_NO_DECODER = u'no decode delegate for this image format'
class ConverterClass(ConverterBase):
class ImageMagick(ConverterBase):
def identify_file(self, input_filepath, arguments=None):
command = []
command.append(unicode(IM_IDENTIFY_PATH))

View File

@@ -12,16 +12,18 @@ try:
except RuntimeError:
USE_GHOSTSCRIPT = False
from common.utils import fs_cleanup
from mimetype.api import get_mimetype
from ...exceptions import UnknownFileFormat
from ...backends import ConverterBase
from ...literals import (TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE,
from . import ConverterBase
from ..exceptions import UnknownFileFormat
from ..literals import (TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE,
TRANSFORMATION_ZOOM, DEFAULT_PAGE_NUMBER, DEFAULT_FILE_FORMAT)
from ...utils import cleanup
Image.init()
class ConverterClass(ConverterBase):
class Python(ConverterBase):
def get_page_count(self, input_filepath):
page_count = 1
@@ -94,7 +96,7 @@ class ConverterClass(ConverterBase):
raise UnknownFileFormat
finally:
if tmpfile:
cleanup(tmpfile)
fs_cleanup(tmpfile)
current_page = 0
try:

View File

@@ -1,3 +0,0 @@
from PIL import Image
Image.init()

View File

@@ -12,11 +12,7 @@ register_settings(
{'name': u'IM_IDENTIFY_PATH', 'global_name': u'CONVERTER_IM_IDENTIFY_PATH', 'default': u'/usr/bin/identify', 'description': _(u'File path to imagemagick\'s identify program.'), 'exists': True},
{'name': u'GM_PATH', 'global_name': u'CONVERTER_GM_PATH', 'default': u'/usr/bin/gm', 'description': _(u'File path to graphicsmagick\'s program.'), 'exists': True},
{'name': u'GM_SETTINGS', 'global_name': u'CONVERTER_GM_SETTINGS', 'default': u''},
{'name': u'GRAPHICS_BACKEND', 'global_name': u'CONVERTER_GRAPHICS_BACKEND', 'default': u'converter.backends.python', 'description': _(u'Graphics conversion backend to use. Options are: converter.backends.imagemagick, converter.backends.graphicsmagick and converter.backends.python.')},
{'name': u'GRAPHICS_BACKEND', 'global_name': u'CONVERTER_GRAPHICS_BACKEND', 'default': u'converter.backends.python.Python', 'description': _(u'Graphics conversion backend to use. Options are: converter.backends.imagemagick.ImageMagick, converter.backends.graphicsmagick.GraphicsMagick and converter.backends.python.Python')},
{'name': u'LIBREOFFICE_PATH', 'global_name': u'CONVERTER_LIBREOFFICE_PATH', 'default': u'/usr/bin/libreoffice', 'exists': True, 'description': _(u'Path to the libreoffice program.')},
# {'name': u'OCR_OPTIONS', 'global_name': u'CONVERTER_OCR_OPTIONS', 'default': u'-colorspace Gray -depth 8 -resample 200x200'},
# {'name': u'HIGH_QUALITY_OPTIONS', 'global_name': u'CONVERTER_HIGH_QUALITY_OPTIONS', 'default': u'-density 400'},
# {'name': u'PRINT_QUALITY_OPTIONS', 'global_name': u'CONVERTER_PRINT_QUALITY_OPTIONS', 'default': u'-density 500'},
]
)

View File

@@ -1,10 +1,17 @@
from __future__ import absolute_import
from .office_converter import OfficeConverter
from .exceptions import OfficeBackendError
from django.core.exceptions import ImproperlyConfigured
from common.utils import load_backend
from .conf.settings import GRAPHICS_BACKEND
from .exceptions import OfficeBackendError
from .office_converter import OfficeConverter
# Build the office converter once at import time; when its constructor
# raises OfficeBackendError the name is bound to None instead.
try:
office_converter = OfficeConverter()
except OfficeBackendError:
office_converter = None
# Instantiate the graphics backend named by the GRAPHICS_BACKEND setting.
# There is no fallback here: an exception from load_backend propagates.
backend = load_backend(GRAPHICS_BACKEND)

View File

@@ -1,40 +0,0 @@
import os
from django.utils.importlib import import_module
def _lazy_load(fn):
_cached = []
def _decorated():
if not _cached:
_cached.append(fn())
return _cached[0]
return _decorated
@_lazy_load
def load_backend():
    """
    Import and return the ``base`` module of the configured converter
    backend.

    The GRAPHICS_BACKEND setting is first treated as a short name under
    ``converter.backends``; when that import succeeds a
    PendingDeprecationWarning is issued. When it raises ImportError the
    setting is treated as a fully qualified package name instead, and an
    ImportError from that second attempt propagates to the caller.
    """
    from converter.conf.settings import GRAPHICS_BACKEND as backend_name

    try:
        module = import_module('.base', 'converter.backends.%s' % backend_name)
    except ImportError:
        # Look for a fully qualified converter backend name
        return import_module('.base', backend_name)

    # Only the short-name form reaches this point.
    import warnings
    warnings.warn(
        "Short names for CONVERTER_GRAPHICS_BACKEND are deprecated; prepend with 'converter.backends.'",
        PendingDeprecationWarning
    )
    return module
def cleanup(filename):
    """
    Best-effort removal of ``filename``: any OSError raised by the removal
    is discarded and nothing is returned.
    """
    try:
        os.unlink(filename)
    except OSError:
        return None

View File

@@ -1,48 +1,26 @@
from __future__ import absolute_import
import logging
import os
import subprocess
import sys
from django.utils.importlib import import_module
from django.utils.translation import ugettext as _
from common.conf.settings import TEMPORARY_DIRECTORY
from common.utils import fs_cleanup
from converter.api import convert
from documents.models import DocumentPage
from .backends import ocr_backend
from .conf.settings import UNPAPER_PATH, LANGUAGE
from .exceptions import UnpaperError
from .literals import (DEFAULT_OCR_FILE_FORMAT, UNPAPER_FILE_FORMAT,
DEFAULT_OCR_FILE_EXTENSION)
from .parsers import parse_document_page
from .parsers.exceptions import ParserError, ParserUnknownFile
from .runtime import language_backend, ocr_backend
def get_language_backend():
    """
    Import and return the ``ocr.lang.<LANGUAGE>`` module for the language
    selected in the configuration settings.

    When the import raises ImportError a warning is written to stderr and
    None is returned.
    """
    module_path = u'.'.join((u'ocr', u'lang', LANGUAGE))
    try:
        return import_module(module_path)
    except ImportError:
        sys.stderr.write(u'\nWarning: No OCR app language backend for language: %s\n\n' % LANGUAGE)
    return None
language_backend = get_language_backend()
def cleanup(filename):
    """
    Delete ``filename`` if possible; an OSError from the deletion is
    swallowed so the caller never sees it.
    """
    try:
        os.unlink(filename)
    except OSError:
        # Deliberately ignored: removal is best effort.
        return
logger = logging.getLogger(__name__)
def do_document_ocr(queue_document):
@@ -58,15 +36,21 @@ def do_document_ocr(queue_document):
parse_document_page(document_page)
except (ParserError, ParserUnknownFile):
# Fall back to doing visual OCR
ocr_transformations, warnings = queue_document.get_transformation_list()
document_filepath = document_page.document.get_image_cache_name(page=document_page.page_number, version=document_page.document_version.pk)
unpaper_output_filename = u'%s_unpaper_out_page_%s%s%s' % (document_page.document.uuid, document_page.page_number, os.extsep, UNPAPER_FILE_FORMAT)
unpaper_output_filepath = os.path.join(TEMPORARY_DIRECTORY, unpaper_output_filename)
unpaper_input = convert(document_filepath, file_format=UNPAPER_FILE_FORMAT, transformations=ocr_transformations)
logger.debug('document_filepath: %s' % document_filepath)
unpaper_input = convert(document_filepath, file_format=UNPAPER_FILE_FORMAT)
logger.debug('unpaper_input: %s' % unpaper_input)
execute_unpaper(input_filepath=unpaper_input, output_filepath=unpaper_output_filepath)
logger.debug('unpaper_output_filepath: %s' % unpaper_output_filepath)
# from PIL import Image, ImageOps
# im = Image.open(document_filepath)
# #if im.mode=='RGBA':
@@ -77,8 +61,14 @@ def do_document_ocr(queue_document):
# Convert to TIFF
pre_ocr_filepath = convert(input_filepath=unpaper_output_filepath, file_format=DEFAULT_OCR_FILE_FORMAT)
logger.debug('pre_ocr_filepath: %s' % pre_ocr_filepath)
# Tesseract needs an explicit file extension
pre_ocr_filepath_w_ext = os.extsep.join([pre_ocr_filepath, DEFAULT_OCR_FILE_EXTENSION])
logger.debug('pre_ocr_filepath_w_ext: %s' % pre_ocr_filepath_w_ext)
os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext)
try:
ocr_text = ocr_backend.execute(pre_ocr_filepath_w_ext, LANGUAGE)
@@ -87,10 +77,10 @@ def do_document_ocr(queue_document):
document_page.page_label = _(u'Text from OCR')
document_page.save()
finally:
cleanup(pre_ocr_filepath_w_ext)
cleanup(unpaper_input)
cleanup(document_filepath)
cleanup(unpaper_output_filepath)
fs_cleanup(pre_ocr_filepath_w_ext)
fs_cleanup(unpaper_input)
fs_cleanup(document_filepath)
fs_cleanup(unpaper_output_filepath)
def ocr_cleanup(text):

View File

@@ -1,28 +1,3 @@
from __future__ import absolute_import
import sys
from django.utils.importlib import import_module
from ..conf.settings import BACKEND
class BackendBase(object):
    """
    Base class for backends; subclasses are expected to override
    ``execute``.
    """

    def execute(self, input_filename, language=None):
        """
        Process ``input_filename``, optionally for the given ``language``.

        The base implementation always raises NotImplementedError.
        """
        # NotImplemented is a comparison sentinel, not an exception class;
        # NotImplementedError is the one that can be raised.
        raise NotImplementedError
def get_ocr_backend():
    """
    Import the module named by the BACKEND configuration setting and
    return it.

    When the import raises ImportError a warning is written to stderr and
    the same exception is re-raised.
    """
    try:
        backend_module = import_module(BACKEND)
    except ImportError:
        sys.stderr.write(u'\nWarning: No OCR backend named: %s\n\n' % BACKEND)
        raise
    return backend_module
ocr_backend = get_ocr_backend()

View File

@@ -5,6 +5,8 @@ import os
import subprocess
import tempfile
from common.utils import fs_cleanup
from . import BackendBase
from ..conf.settings import TESSERACT_PATH
from ..exceptions import OCRError
@@ -27,8 +29,8 @@ class Tesseract(BackendBase):
return_code = proc.wait()
if return_code != 0:
error_text = proc.stderr.read()
cleanup(filepath)
cleanup(ocr_output)
fs_cleanup(filepath)
fs_cleanup(ocr_output)
if language:
# If tesseract gives an error with a language parameter
# re-run it with no parameter again
@@ -43,14 +45,3 @@ class Tesseract(BackendBase):
os.unlink(filepath)
return text
# TODO: Redundant, also in api.py
def cleanup(filename):
    """
    Attempt to unlink ``filename``. An OSError raised by the attempt is
    suppressed; the function always returns None.
    """
    try:
        os.unlink(filename)
    except OSError:
        return None
    return None

View File

@@ -16,6 +16,6 @@ register_settings(
{'name': u'QUEUE_PROCESSING_INTERVAL', 'global_name': u'OCR_QUEUE_PROCESSING_INTERVAL', 'default': 10},
{'name': u'UNPAPER_PATH', 'global_name': u'OCR_UNPAPER_PATH', 'default': u'/usr/bin/unpaper', 'description': _(u'File path to unpaper program.'), 'exists': True},
{'name': u'PDFTOTEXT_PATH', 'global_name': u'OCR_PDFTOTEXT_PATH', 'default': u'/usr/bin/pdftotext', 'description': _(u'File path to poppler\'s pdftotext program used to extract text from PDF files.'), 'exists': True},
{'name': u'BACKEND', 'global_name': u'OCR_BACKEND', 'default': u'ocr.backends.tesseract', 'description': _(u'Full path to the backend to be used to do OCR.')},
{'name': u'BACKEND', 'global_name': u'OCR_BACKEND', 'default': u'ocr.backends.tesseract.Tesseract', 'description': _(u'Full path to the backend to be used to do OCR.')},
]
)

View File

@@ -0,0 +1,3 @@
class BackendBase(object):
    """
    Base class for language backends; subclasses are expected to override
    ``check_word``.
    """

    def check_word(self, word):
        """
        Inspect a single ``word``.

        The base implementation always raises NotImplementedError.
        """
        # NotImplemented is a comparison sentinel, not an exception class;
        # NotImplementedError is the one that can be raised.
        raise NotImplementedError

View File

@@ -1,7 +1,12 @@
from __future__ import absolute_import
import re
from . import BackendBase
def check_word(word):
class LanguageBackend(BackendBase):
def check_word(word):
ALL_ALPHANUM = re.compile('([0-9a-z])', re.I)
NON_ALPHANUM = re.compile('([^0-9a-z])', re.I)

View File

@@ -1,8 +1,13 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import re
from . import BackendBase
def check_word(word):
class LanguageBackend(BackendBase):
def check_word(word):
ALL_ALPHANUM = re.compile('([0-9ёйцукенгшщзхъфывапролджэячсмитьбю])', re.I)
NON_ALPHANUM = re.compile('([^0-9ёйцукенгшщзхъфывапролджэячсмитьбю])', re.I)

View File

@@ -1,8 +1,13 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import re
from . import BackendBase
def check_word(word):
class LanguageBackend(BackendBase):
def check_word(word):
ALL_ALPHANUM = re.compile('([0-9a-záéíóúüñ])', re.I)
NON_ALPHANUM = re.compile('([^0-9a-záéíóúüñ])', re.I)

12
mayan/apps/ocr/runtime.py Normal file
View File

@@ -0,0 +1,12 @@
from __future__ import absolute_import
from common.utils import load_backend
from .conf.settings import BACKEND, LANGUAGE
# Resolve the language backend for the configured LANGUAGE by asking
# load_backend for ocr.lang.<LANGUAGE>.LanguageBackend; when that raises
# ImportError, language_backend is bound to None instead.
# NOTE(review): only ImportError is handled here -- confirm whether a
# language module that lacks a LanguageBackend attribute should also map
# to None rather than propagating.
try:
language_backend = load_backend(u'.'.join([u'ocr', u'lang', LANGUAGE, u'LanguageBackend']))
except ImportError:
language_backend = None
# The OCR backend has no fallback: an exception from load_backend propagates.
ocr_backend = load_backend(BACKEND)