Unify the way backends are defined and loaded, unify the fs_cleanup function

2014-07-01 00:22:31 -04:00
parent 0255ea67b1
commit a9390d55ba
21 changed files with 208 additions and 235 deletions
--- a/mayan/apps/common/utils.py
+++ b/mayan/apps/common/utils.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import

+import logging
 import os
 import random
 import re
@@ -14,8 +15,11 @@ from django.contrib.contenttypes.models import ContentType
 from django.utils.datastructures import MultiValueDict
 from django.utils.http import urlquote as django_urlquote
 from django.utils.http import urlencode as django_urlencode
+from django.utils.importlib import import_module
 from django.utils.translation import ugettext_lazy as _

+logger = logging.getLogger(__name__)
+

 def urlquote(link=None, get=None):
    u"""
@@ -427,3 +431,34 @@ def copyfile(source, destination, buffer_size=1024 * 1024):

    source_descriptor.close()
    destination_descriptor.close()
+
+
+def _lazy_load(fn):
+    """
+    Decorator for a zero-argument callable: the first call runs fn and
+    stores its result, every later call returns that stored result
+    """
+    results = []
+
+    def _decorated():
+        if results:
+            return results[0]
+        value = fn()
+        results.append(value)
+        return value
+    return _decorated
+
+
+def load_backend(backend_string):
+    """
+    Import and instantiate a backend class from its dotted path
+
+    backend_string is split at its last dot into a module path and a
+    class name. The module is imported, the class is looked up on it and
+    called with no arguments, and the resulting instance is returned.
+    An ImportError raised along the way is logged and re-raised.
+    """
+    # Pass the value as a logging argument so the message is only
+    # formatted when debug output is actually enabled
+    logger.debug('loading: %s', backend_string)
+    module_name, klass = backend_string.rsplit('.', 1)
+
+    try:
+        return getattr(import_module(module_name), klass)()
+    except ImportError:
+        logger.debug('error importing: %s', backend_string)
+        raise
+
+
+def fs_cleanup(filename):
+    """
+    Delete filename from the filesystem on a best-effort basis; any
+    OSError raised by the removal (such as a missing file) is swallowed
+    """
+    try:
+        os.unlink(filename)
+    except OSError:
+        return None
--- a/mayan/apps/converter/init.py
+++ b/mayan/apps/converter/init.py
@@ -1,14 +1,10 @@
 from __future__ import absolute_import

 from django.utils.translation import ugettext_lazy as _
-from django.core.exceptions import ImproperlyConfigured

 from navigation.api import register_sidebar_template
 from project_tools.api import register_tool

-from .utils import load_backend
-from .conf.settings import GRAPHICS_BACKEND
-

 def is_superuser(context):
+    # Truthy when the user on context['request'] is flagged as staff or
+    # as a superuser; despite the name, staff users also qualify
     return context['request'].user.is_staff or context['request'].user.is_superuser
@@ -18,9 +14,4 @@ formats_list = {'text': _('file formats'), 'view': 'formats_list', 'famfam': 'pi

 register_sidebar_template(['formats_list'], 'converter_file_formats_help.html')

-try:
-    backend = load_backend().ConverterClass()
-except ImproperlyConfigured:
-    raise ImproperlyConfigured(u'Missing or incorrect converter backend: %s' % GRAPHICS_BACKEND)
-
 register_tool(formats_list)
--- a/mayan/apps/converter/api.py
+++ b/mayan/apps/converter/api.py
@@ -7,16 +7,15 @@ import os
 from django.utils.encoding import smart_str

 from common.conf.settings import TEMPORARY_DIRECTORY
+from common.utils import fs_cleanup

-from . import backend
 from .exceptions import OfficeConversionError, UnknownFileFormat
 from .literals import (DEFAULT_PAGE_NUMBER,
    DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION, DEFAULT_FILE_FORMAT)
 from .literals import (TRANSFORMATION_CHOICES, TRANSFORMATION_RESIZE,
    TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM, DIMENSION_SEPARATOR,
    FILE_FORMATS)
-from .runtime import office_converter
-from .utils import cleanup
+from .runtime import backend, office_converter

 HASH_FUNCTION = lambda x: hashlib.sha256(x).hexdigest()

@@ -96,7 +95,7 @@ def convert(input_filepath, output_filepath=None, cleanup_files=False, mimetype=
        backend.convert_file(input_filepath=input_filepath, output_filepath=output_filepath, transformations=transformations, page=page, file_format=file_format, mimetype=mimetype)
    finally:
        if cleanup_files:
-            cleanup(input_filepath)
+            fs_cleanup(input_filepath)

    return output_filepath

--- a/mayan/apps/converter/backends/graphicsmagick/base.py
+++ b/mayan/apps/converter/backends/graphicsmagick/base.py
@@ -1,21 +1,21 @@
 from __future__ import absolute_import

-import re
 import subprocess
+import re

-from ...backends import ConverterBase
-from ...conf.settings import GM_PATH, GM_SETTINGS
-from ...exceptions import ConvertError, UnknownFileFormat, IdentifyError
-from ...literals import (TRANSFORMATION_RESIZE,
+from . import ConverterBase
+from ..conf.settings import GM_PATH, GM_SETTINGS
+from ..exceptions import ConvertError, UnknownFileFormat, IdentifyError
+from ..literals import (TRANSFORMATION_RESIZE,
    TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM)
-from ...literals import DIMENSION_SEPARATOR, DEFAULT_PAGE_NUMBER, \
-    DEFAULT_FILE_FORMAT
+from ..literals import (DIMENSION_SEPARATOR, DEFAULT_PAGE_NUMBER,
+    DEFAULT_FILE_FORMAT)

 CONVERTER_ERROR_STRING_NO_DECODER = u'No decode delegate for this image format'
 CONVERTER_ERROR_STARTS_WITH = u'starts with'


-class ConverterClass(ConverterBase):
+class GraphicsMagick(ConverterBase):
    def identify_file(self, input_filepath, arguments=None):
        command = []
        command.append(unicode(GM_PATH))
--- a/mayan/apps/converter/backends/graphicsmagick/init.py
+++ b/mayan/apps/converter/backends/graphicsmagick/init.py
--- a/mayan/apps/converter/backends/imagemagick/base.py
+++ b/mayan/apps/converter/backends/imagemagick/base.py
@@ -1,19 +1,19 @@
 from __future__ import absolute_import

-import re
 import subprocess
+import re

-from ...backends import ConverterBase
-from ...conf.settings import IM_CONVERT_PATH, IM_IDENTIFY_PATH
-from ...exceptions import ConvertError, UnknownFileFormat, IdentifyError
-from ...literals import (TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE,
+from . import ConverterBase
+from ..conf.settings import IM_CONVERT_PATH, IM_IDENTIFY_PATH
+from ..exceptions import ConvertError, UnknownFileFormat, IdentifyError
+from ..literals import (TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE,
    TRANSFORMATION_ZOOM, DIMENSION_SEPARATOR, DEFAULT_PAGE_NUMBER,
    DEFAULT_FILE_FORMAT)

 CONVERTER_ERROR_STRING_NO_DECODER = u'no decode delegate for this image format'


-class ConverterClass(ConverterBase):
+class ImageMagick(ConverterBase):
    def identify_file(self, input_filepath, arguments=None):
        command = []
        command.append(unicode(IM_IDENTIFY_PATH))
--- a/mayan/apps/converter/backends/imagemagick/init.py
+++ b/mayan/apps/converter/backends/imagemagick/init.py
--- a/mayan/apps/converter/backends/python/base.py
+++ b/mayan/apps/converter/backends/python/base.py
@@ -12,16 +12,18 @@ try:
 except RuntimeError:
    USE_GHOSTSCRIPT = False

+from common.utils import fs_cleanup
 from mimetype.api import get_mimetype

-from ...exceptions import UnknownFileFormat
-from ...backends import ConverterBase
-from ...literals import (TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE,
+from . import ConverterBase
+from ..exceptions import UnknownFileFormat
+from ..literals import (TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE,
    TRANSFORMATION_ZOOM, DEFAULT_PAGE_NUMBER, DEFAULT_FILE_FORMAT)
-from ...utils import cleanup
+
+Image.init()


-class ConverterClass(ConverterBase):
+class Python(ConverterBase):
    def get_page_count(self, input_filepath):
        page_count = 1

@@ -94,7 +96,7 @@ class ConverterClass(ConverterBase):
            raise UnknownFileFormat
        finally:
            if tmpfile:
-                cleanup(tmpfile)
+                fs_cleanup(tmpfile)

        current_page = 0
        try:
--- a/mayan/apps/converter/backends/python/init.py
+++ b/mayan/apps/converter/backends/python/init.py
@@ -1,3 +0,0 @@
-from PIL import Image
-
-Image.init()
--- a/mayan/apps/converter/conf/settings.py
+++ b/mayan/apps/converter/conf/settings.py
@@ -12,11 +12,7 @@ register_settings(
        {'name': u'IM_IDENTIFY_PATH', 'global_name': u'CONVERTER_IM_IDENTIFY_PATH', 'default': u'/usr/bin/identify', 'description': _(u'File path to imagemagick\'s identify program.'), 'exists': True},
        {'name': u'GM_PATH', 'global_name': u'CONVERTER_GM_PATH', 'default': u'/usr/bin/gm', 'description': _(u'File path to graphicsmagick\'s program.'), 'exists': True},
        {'name': u'GM_SETTINGS', 'global_name': u'CONVERTER_GM_SETTINGS', 'default': u''},
-        {'name': u'GRAPHICS_BACKEND', 'global_name': u'CONVERTER_GRAPHICS_BACKEND', 'default': u'converter.backends.python', 'description': _(u'Graphics conversion backend to use.  Options are: converter.backends.imagemagick, converter.backends.graphicsmagick and converter.backends.python.')},
+        {'name': u'GRAPHICS_BACKEND', 'global_name': u'CONVERTER_GRAPHICS_BACKEND', 'default': u'converter.backends.python.Python', 'description': _(u'Graphics conversion backend to use.  Options are: converter.backends.imagemagick.ImageMagick, converter.backends.graphicsmagick.GraphicsMagick and converter.backends.python.Python')},
        {'name': u'LIBREOFFICE_PATH', 'global_name': u'CONVERTER_LIBREOFFICE_PATH', 'default': u'/usr/bin/libreoffice', 'exists': True, 'description': _(u'Path to the libreoffice program.')},
-
-        # {'name': u'OCR_OPTIONS', 'global_name': u'CONVERTER_OCR_OPTIONS', 'default': u'-colorspace Gray -depth 8 -resample 200x200'},
-        # {'name': u'HIGH_QUALITY_OPTIONS', 'global_name': u'CONVERTER_HIGH_QUALITY_OPTIONS', 'default': u'-density 400'},
-        # {'name': u'PRINT_QUALITY_OPTIONS', 'global_name': u'CONVERTER_PRINT_QUALITY_OPTIONS', 'default': u'-density 500'},
    ]
 )
--- a/mayan/apps/converter/runtime.py
+++ b/mayan/apps/converter/runtime.py
@@ -1,10 +1,17 @@
 from __future__ import absolute_import

-from .office_converter import OfficeConverter
-from .exceptions import OfficeBackendError
+from django.core.exceptions import ImproperlyConfigured

+from common.utils import load_backend
+
+from .conf.settings import GRAPHICS_BACKEND
+from .exceptions import OfficeBackendError
+from .office_converter import OfficeConverter

 try:
+    # When OfficeConverter() raises OfficeBackendError the name is bound
+    # to None instead of an instance
     office_converter = OfficeConverter()
 except OfficeBackendError:
     office_converter = None
+
+# Instantiate the graphics backend class named by the GRAPHICS_BACKEND
+# setting at import time; an ImportError from load_backend is not caught
+# here.
+# NOTE(review): ImproperlyConfigured is imported above but unused, while
+# the code removed from converter/init.py wrapped a failed load in it --
+# confirm whether that wrapping should be restored
+backend = load_backend(GRAPHICS_BACKEND)
+
--- a/mayan/apps/converter/utils.py
+++ b/mayan/apps/converter/utils.py
@@ -1,40 +0,0 @@
-import os
-
-from django.utils.importlib import import_module
-
-
-def _lazy_load(fn):
-    _cached = []
-
-    def _decorated():
-        if not _cached:
-            _cached.append(fn())
-        return _cached[0]
-    return _decorated
-
-
-@_lazy_load
-def load_backend():
-    from converter.conf.settings import GRAPHICS_BACKEND as backend_name
-
-    try:
-        module = import_module('.base', 'converter.backends.%s' % backend_name)
-        import warnings
-        warnings.warn(
-            "Short names for CONVERTER_BACKEND are deprecated; prepend with 'converter.backends.'",
-            PendingDeprecationWarning
-        )
-        return module
-    except ImportError, e:
-        # Look for a fully qualified converter backend name
-        return import_module('.base', backend_name)
-
-
-def cleanup(filename):
-    """
-    Tries to remove the given filename. Ignores non-existent files
-    """
-    try:
-        os.remove(filename)
-    except OSError:
-        pass
--- a/mayan/apps/ocr/api.py
+++ b/mayan/apps/ocr/api.py
@@ -1,48 +1,26 @@
 from __future__ import absolute_import

+import logging
 import os
 import subprocess
 import sys

-from django.utils.importlib import import_module
 from django.utils.translation import ugettext as _

 from common.conf.settings import TEMPORARY_DIRECTORY
+from common.utils import fs_cleanup
 from converter.api import convert
 from documents.models import DocumentPage

-from .backends import ocr_backend
 from .conf.settings import UNPAPER_PATH, LANGUAGE
 from .exceptions import UnpaperError
 from .literals import (DEFAULT_OCR_FILE_FORMAT, UNPAPER_FILE_FORMAT,
    DEFAULT_OCR_FILE_EXTENSION)
 from .parsers import parse_document_page
 from .parsers.exceptions import ParserError, ParserUnknownFile
+from .runtime import language_backend, ocr_backend

-
-def get_language_backend():
-    """
-    Return the OCR cleanup language backend using the selected language
-    in the configuration settings
-    """
-    try:
-        module = import_module(u'.'.join([u'ocr', u'lang', LANGUAGE]))
-    except ImportError:
-        sys.stderr.write(u'\nWarning: No OCR app language backend for language: %s\n\n' % LANGUAGE)
-        return None
-    return module
-
-language_backend = get_language_backend()
-
-
-def cleanup(filename):
-    """
-    Try to remove the given filename, ignoring non-existent files
-    """
-    try:
-        os.remove(filename)
-    except OSError:
-        pass
+logger = logging.getLogger(__name__)


 def do_document_ocr(queue_document):
@@ -58,15 +36,21 @@ def do_document_ocr(queue_document):
            parse_document_page(document_page)
        except (ParserError, ParserUnknownFile):
            # Fall back to doing visual OCR
-            ocr_transformations, warnings = queue_document.get_transformation_list()

            document_filepath = document_page.document.get_image_cache_name(page=document_page.page_number, version=document_page.document_version.pk)
            unpaper_output_filename = u'%s_unpaper_out_page_%s%s%s' % (document_page.document.uuid, document_page.page_number, os.extsep, UNPAPER_FILE_FORMAT)
            unpaper_output_filepath = os.path.join(TEMPORARY_DIRECTORY, unpaper_output_filename)

-            unpaper_input = convert(document_filepath, file_format=UNPAPER_FILE_FORMAT, transformations=ocr_transformations)
+            logger.debug('document_filepath: %s' % document_filepath)
+
+            unpaper_input = convert(document_filepath, file_format=UNPAPER_FILE_FORMAT)
+
+            logger.debug('unpaper_input: %s' % unpaper_input)
+
            execute_unpaper(input_filepath=unpaper_input, output_filepath=unpaper_output_filepath)

+            logger.debug('unpaper_output_filepath: %s' % unpaper_output_filepath)
+
            # from PIL import Image, ImageOps
            # im = Image.open(document_filepath)
            # #if im.mode=='RGBA':
@@ -77,8 +61,14 @@ def do_document_ocr(queue_document):

            # Convert to TIFF
            pre_ocr_filepath = convert(input_filepath=unpaper_output_filepath, file_format=DEFAULT_OCR_FILE_FORMAT)
+
+            logger.debug('pre_ocr_filepath: %s' % pre_ocr_filepath)
+
            # Tesseract needs an explicit file extension
            pre_ocr_filepath_w_ext = os.extsep.join([pre_ocr_filepath, DEFAULT_OCR_FILE_EXTENSION])
+
+            logger.debug('pre_ocr_filepath_w_ext: %s' % pre_ocr_filepath_w_ext)
+
            os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext)
            try:
                ocr_text = ocr_backend.execute(pre_ocr_filepath_w_ext, LANGUAGE)
@@ -87,10 +77,10 @@ def do_document_ocr(queue_document):
                document_page.page_label = _(u'Text from OCR')
                document_page.save()
            finally:
-                cleanup(pre_ocr_filepath_w_ext)
-                cleanup(unpaper_input)
-                cleanup(document_filepath)
-                cleanup(unpaper_output_filepath)
+                fs_cleanup(pre_ocr_filepath_w_ext)
+                fs_cleanup(unpaper_input)
+                fs_cleanup(document_filepath)
+                fs_cleanup(unpaper_output_filepath)


 def ocr_cleanup(text):
--- a/mayan/apps/ocr/backends/init.py
+++ b/mayan/apps/ocr/backends/init.py
@@ -1,28 +1,3 @@
-from __future__ import absolute_import
-
-import sys
-
-from django.utils.importlib import import_module
-
-from ..conf.settings import BACKEND
-
-
 class BackendBase(object):
-    def execute(input_filename, language=None):
-        raise NotImplemented
+    """
+    Base class for OCR backends such as Tesseract
+    """
+    def execute(self, input_filename, language=None):
+        """
+        Placeholder for the OCR entry point; always raises
+        NotImplementedError until a subclass provides its own version
+        """
+        # NotImplemented is a comparison sentinel, not an exception;
+        # NotImplementedError is the exception meant for this purpose
+        raise NotImplementedError
-
-
-def get_ocr_backend():
-    """
-    Return the OCR backend using the path specified in the configuration
-    settings
-    """
-    try:
-        module = import_module(BACKEND)
-    except ImportError:
-        sys.stderr.write(u'\nWarning: No OCR backend named: %s\n\n' % BACKEND)
-        raise
-    else:
-        return module
-
-ocr_backend = get_ocr_backend()
--- a/mayan/apps/ocr/backends/tesseract.py
+++ b/mayan/apps/ocr/backends/tesseract.py
@@ -5,6 +5,8 @@ import os
 import subprocess
 import tempfile

+from common.utils import fs_cleanup
+
 from . import BackendBase
 from ..conf.settings import TESSERACT_PATH
 from ..exceptions import OCRError
@@ -27,8 +29,8 @@ class Tesseract(BackendBase):
        return_code = proc.wait()
        if return_code != 0:
            error_text = proc.stderr.read()
-            cleanup(filepath)
-            cleanup(ocr_output)
+            fs_cleanup(filepath)
+            fs_cleanup(ocr_output)
            if language:
                # If tesseract gives an error with a language parameter
                # re-run it with no parameter again
@@ -43,14 +45,3 @@ class Tesseract(BackendBase):
        os.unlink(filepath)

        return text
-
-
-# TODO: Reduntant, also in api.py
-def cleanup(filename):
-    """
-    Try to remove the given filename, ignoring non-existent files
-    """
-    try:
-        os.remove(filename)
-    except OSError:
-        pass
--- a/mayan/apps/ocr/conf/settings.py
+++ b/mayan/apps/ocr/conf/settings.py
@@ -16,6 +16,6 @@ register_settings(
        {'name': u'QUEUE_PROCESSING_INTERVAL', 'global_name': u'OCR_QUEUE_PROCESSING_INTERVAL', 'default': 10},
        {'name': u'UNPAPER_PATH', 'global_name': u'OCR_UNPAPER_PATH', 'default': u'/usr/bin/unpaper', 'description': _(u'File path to unpaper program.'), 'exists': True},
        {'name': u'PDFTOTEXT_PATH', 'global_name': u'OCR_PDFTOTEXT_PATH', 'default': u'/usr/bin/pdftotext', 'description': _(u'File path to poppler\'s pdftotext program used to extract text from PDF files.'), 'exists': True},
-        {'name': u'BACKEND', 'global_name': u'OCR_BACKEND', 'default': u'ocr.backends.tesseract', 'description': _(u'Full path to the backend to be used to do OCR.')},
+        {'name': u'BACKEND', 'global_name': u'OCR_BACKEND', 'default': u'ocr.backends.tesseract.Tesseract', 'description': _(u'Full path to the backend to be used to do OCR.')},
    ]
 )
--- a/mayan/apps/ocr/lang/init.py
+++ b/mayan/apps/ocr/lang/init.py
@@ -0,0 +1,3 @@
+class BackendBase(object):
+    """
+    Base class for the OCR language backends; subclasses override
+    check_word
+    """
+    def check_word(self, word):
+        """
+        Placeholder to be overridden by subclasses; always raises
+        NotImplementedError
+        """
+        # NotImplemented is a comparison sentinel, not an exception;
+        # NotImplementedError is the exception meant for this purpose
+        raise NotImplementedError
--- a/mayan/apps/ocr/lang/eng.py
+++ b/mayan/apps/ocr/lang/eng.py
@@ -1,7 +1,12 @@
+from __future__ import absolute_import
+
 import re

+from . import BackendBase

-def check_word(word):
+
+class LanguageBackend(BackendBase):
+    def check_word(word):
        ALL_ALPHANUM = re.compile('([0-9a-z])', re.I)
        NON_ALPHANUM = re.compile('([^0-9a-z])', re.I)

--- a/mayan/apps/ocr/lang/rus.py
+++ b/mayan/apps/ocr/lang/rus.py
@@ -1,8 +1,13 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import
+
 import re

+from . import BackendBase

-def check_word(word):
+
+class LanguageBackend(BackendBase):
+    def check_word(word):
        ALL_ALPHANUM = re.compile('([0-9ёйцукенгшщзхъфывапролджэячсмитьбю])', re.I)
        NON_ALPHANUM = re.compile('([^0-9ёйцукенгшщзхъфывапролджэячсмитьбю])', re.I)

--- a/mayan/apps/ocr/lang/spa.py
+++ b/mayan/apps/ocr/lang/spa.py
@@ -1,8 +1,13 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import
+
 import re

+from . import BackendBase

-def check_word(word):
+
+class LanguageBackend(BackendBase):
+    def check_word(word):
        ALL_ALPHANUM = re.compile('([0-9a-záéíóúüñ])', re.I)
        NON_ALPHANUM = re.compile('([^0-9a-záéíóúüñ])', re.I)

--- a/mayan/apps/ocr/runtime.py
+++ b/mayan/apps/ocr/runtime.py
@@ -0,0 +1,12 @@
+from __future__ import absolute_import
+
+from common.utils import load_backend
+
+from .conf.settings import BACKEND, LANGUAGE
+
+try:
+    # Build the dotted path ocr.lang.<LANGUAGE>.LanguageBackend and
+    # instantiate that class through load_backend
+    language_backend = load_backend(u'.'.join([u'ocr', u'lang', LANGUAGE, u'LanguageBackend']))
+except ImportError:
+    # ImportError while loading the language backend: fall back to None
+    language_backend = None
+
+# Unlike the language backend, a failure to load the OCR backend named by
+# the BACKEND setting is not caught here and propagates at import time
+ocr_backend = load_backend(BACKEND)