Unify the way backends are defined and loaded, unify the fs_cleanup function
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import
|
||||
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
@@ -14,8 +15,11 @@ from django.contrib.contenttypes.models import ContentType
|
||||
from django.utils.datastructures import MultiValueDict
|
||||
from django.utils.http import urlquote as django_urlquote
|
||||
from django.utils.http import urlencode as django_urlencode
|
||||
from django.utils.importlib import import_module
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def urlquote(link=None, get=None):
|
||||
u"""
|
||||
@@ -427,3 +431,34 @@ def copyfile(source, destination, buffer_size=1024 * 1024):
|
||||
|
||||
source_descriptor.close()
|
||||
destination_descriptor.close()
|
||||
|
||||
|
||||
def _lazy_load(fn):
|
||||
_cached = []
|
||||
|
||||
def _decorated():
|
||||
if not _cached:
|
||||
_cached.append(fn())
|
||||
return _cached[0]
|
||||
return _decorated
|
||||
|
||||
|
||||
def load_backend(backend_string):
    """
    Import and instantiate the backend class named by ``backend_string``.

    ``backend_string`` is a dotted path whose last component is the class
    name and whose leading components are the module to import, for
    example ``'package.module.ClassName'``. The class is called with no
    arguments and the resulting instance is returned.

    Raises ImportError when the path has no module part, when the module
    cannot be imported, or when the module does not define the class, so
    callers only have to handle one exception type for a bad setting.
    """
    logger.debug('loading: %s', backend_string)

    try:
        module_name, klass = backend_string.rsplit('.', 1)
    except ValueError:
        # No dot in the string: there is no module part to import.
        logger.debug('error importing: %s', backend_string)
        raise ImportError(
            '%s is not a dotted path to a backend class' % backend_string
        )

    try:
        backend_class = getattr(import_module(module_name), klass)
    except ImportError:
        logger.debug('error importing: %s', backend_string)
        raise
    except AttributeError:
        # The module imported fine but lacks the requested class; report
        # it the same way as a missing module.
        logger.debug('error importing: %s', backend_string)
        raise ImportError(
            'Module %s does not define a %s class' % (module_name, klass)
        )

    # Instantiate outside the try block so that an AttributeError raised
    # by the backend's own constructor is not mistaken for a bad path.
    return backend_class()
|
||||
|
||||
|
||||
def fs_cleanup(filename):
    """
    Best-effort removal of ``filename`` from the file system.

    Any OSError raised by the removal (for example because the file does
    not exist) is swallowed, so nothing is reported to the caller.
    """
    try:
        os.remove(filename)
    except OSError:
        # Nothing to clean up, or the file could not be removed.
        return None
    return None
|
||||
|
||||
@@ -1,14 +1,10 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
from django.core.exceptions import ImproperlyConfigured
|
||||
|
||||
from navigation.api import register_sidebar_template
|
||||
from project_tools.api import register_tool
|
||||
|
||||
from .utils import load_backend
|
||||
from .conf.settings import GRAPHICS_BACKEND
|
||||
|
||||
|
||||
def is_superuser(context):
    """
    Return the request user's ``is_staff`` flag when it is truthy,
    otherwise the user's ``is_superuser`` flag.

    ``context`` must provide the current request under the ``'request'`` key.
    """
    staff_flag = context['request'].user.is_staff
    if staff_flag:
        return staff_flag
    return context['request'].user.is_superuser
|
||||
@@ -18,9 +14,4 @@ formats_list = {'text': _('file formats'), 'view': 'formats_list', 'famfam': 'pi
|
||||
|
||||
register_sidebar_template(['formats_list'], 'converter_file_formats_help.html')
|
||||
|
||||
try:
|
||||
backend = load_backend().ConverterClass()
|
||||
except ImproperlyConfigured:
|
||||
raise ImproperlyConfigured(u'Missing or incorrect converter backend: %s' % GRAPHICS_BACKEND)
|
||||
|
||||
register_tool(formats_list)
|
||||
|
||||
@@ -7,16 +7,15 @@ import os
|
||||
from django.utils.encoding import smart_str
|
||||
|
||||
from common.conf.settings import TEMPORARY_DIRECTORY
|
||||
from common.utils import fs_cleanup
|
||||
|
||||
from . import backend
|
||||
from .exceptions import OfficeConversionError, UnknownFileFormat
|
||||
from .literals import (DEFAULT_PAGE_NUMBER,
|
||||
DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION, DEFAULT_FILE_FORMAT)
|
||||
from .literals import (TRANSFORMATION_CHOICES, TRANSFORMATION_RESIZE,
|
||||
TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM, DIMENSION_SEPARATOR,
|
||||
FILE_FORMATS)
|
||||
from .runtime import office_converter
|
||||
from .utils import cleanup
|
||||
from .runtime import backend, office_converter
|
||||
|
||||
HASH_FUNCTION = lambda x: hashlib.sha256(x).hexdigest()
|
||||
|
||||
@@ -96,7 +95,7 @@ def convert(input_filepath, output_filepath=None, cleanup_files=False, mimetype=
|
||||
backend.convert_file(input_filepath=input_filepath, output_filepath=output_filepath, transformations=transformations, page=page, file_format=file_format, mimetype=mimetype)
|
||||
finally:
|
||||
if cleanup_files:
|
||||
cleanup(input_filepath)
|
||||
fs_cleanup(input_filepath)
|
||||
|
||||
return output_filepath
|
||||
|
||||
|
||||
@@ -1,21 +1,21 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
import re
|
||||
import subprocess
|
||||
import re
|
||||
|
||||
from ...backends import ConverterBase
|
||||
from ...conf.settings import GM_PATH, GM_SETTINGS
|
||||
from ...exceptions import ConvertError, UnknownFileFormat, IdentifyError
|
||||
from ...literals import (TRANSFORMATION_RESIZE,
|
||||
from . import ConverterBase
|
||||
from ..conf.settings import GM_PATH, GM_SETTINGS
|
||||
from ..exceptions import ConvertError, UnknownFileFormat, IdentifyError
|
||||
from ..literals import (TRANSFORMATION_RESIZE,
|
||||
TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM)
|
||||
from ...literals import DIMENSION_SEPARATOR, DEFAULT_PAGE_NUMBER, \
|
||||
DEFAULT_FILE_FORMAT
|
||||
from ..literals import (DIMENSION_SEPARATOR, DEFAULT_PAGE_NUMBER,
|
||||
DEFAULT_FILE_FORMAT)
|
||||
|
||||
CONVERTER_ERROR_STRING_NO_DECODER = u'No decode delegate for this image format'
|
||||
CONVERTER_ERROR_STARTS_WITH = u'starts with'
|
||||
|
||||
|
||||
class ConverterClass(ConverterBase):
|
||||
class GraphicsMagick(ConverterBase):
|
||||
def identify_file(self, input_filepath, arguments=None):
|
||||
command = []
|
||||
command.append(unicode(GM_PATH))
|
||||
@@ -1,19 +1,19 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
import re
|
||||
import subprocess
|
||||
import re
|
||||
|
||||
from ...backends import ConverterBase
|
||||
from ...conf.settings import IM_CONVERT_PATH, IM_IDENTIFY_PATH
|
||||
from ...exceptions import ConvertError, UnknownFileFormat, IdentifyError
|
||||
from ...literals import (TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE,
|
||||
from . import ConverterBase
|
||||
from ..conf.settings import IM_CONVERT_PATH, IM_IDENTIFY_PATH
|
||||
from ..exceptions import ConvertError, UnknownFileFormat, IdentifyError
|
||||
from ..literals import (TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE,
|
||||
TRANSFORMATION_ZOOM, DIMENSION_SEPARATOR, DEFAULT_PAGE_NUMBER,
|
||||
DEFAULT_FILE_FORMAT)
|
||||
|
||||
CONVERTER_ERROR_STRING_NO_DECODER = u'no decode delegate for this image format'
|
||||
|
||||
|
||||
class ConverterClass(ConverterBase):
|
||||
class ImageMagick(ConverterBase):
|
||||
def identify_file(self, input_filepath, arguments=None):
|
||||
command = []
|
||||
command.append(unicode(IM_IDENTIFY_PATH))
|
||||
@@ -12,16 +12,18 @@ try:
|
||||
except RuntimeError:
|
||||
USE_GHOSTSCRIPT = False
|
||||
|
||||
from common.utils import fs_cleanup
|
||||
from mimetype.api import get_mimetype
|
||||
|
||||
from ...exceptions import UnknownFileFormat
|
||||
from ...backends import ConverterBase
|
||||
from ...literals import (TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE,
|
||||
from . import ConverterBase
|
||||
from ..exceptions import UnknownFileFormat
|
||||
from ..literals import (TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE,
|
||||
TRANSFORMATION_ZOOM, DEFAULT_PAGE_NUMBER, DEFAULT_FILE_FORMAT)
|
||||
from ...utils import cleanup
|
||||
|
||||
Image.init()
|
||||
|
||||
|
||||
class ConverterClass(ConverterBase):
|
||||
class Python(ConverterBase):
|
||||
def get_page_count(self, input_filepath):
|
||||
page_count = 1
|
||||
|
||||
@@ -94,7 +96,7 @@ class ConverterClass(ConverterBase):
|
||||
raise UnknownFileFormat
|
||||
finally:
|
||||
if tmpfile:
|
||||
cleanup(tmpfile)
|
||||
fs_cleanup(tmpfile)
|
||||
|
||||
current_page = 0
|
||||
try:
|
||||
@@ -1,3 +0,0 @@
|
||||
from PIL import Image
|
||||
|
||||
Image.init()
|
||||
@@ -12,11 +12,7 @@ register_settings(
|
||||
{'name': u'IM_IDENTIFY_PATH', 'global_name': u'CONVERTER_IM_IDENTIFY_PATH', 'default': u'/usr/bin/identify', 'description': _(u'File path to imagemagick\'s identify program.'), 'exists': True},
|
||||
{'name': u'GM_PATH', 'global_name': u'CONVERTER_GM_PATH', 'default': u'/usr/bin/gm', 'description': _(u'File path to graphicsmagick\'s program.'), 'exists': True},
|
||||
{'name': u'GM_SETTINGS', 'global_name': u'CONVERTER_GM_SETTINGS', 'default': u''},
|
||||
{'name': u'GRAPHICS_BACKEND', 'global_name': u'CONVERTER_GRAPHICS_BACKEND', 'default': u'converter.backends.python', 'description': _(u'Graphics conversion backend to use. Options are: converter.backends.imagemagick, converter.backends.graphicsmagick and converter.backends.python.')},
|
||||
{'name': u'GRAPHICS_BACKEND', 'global_name': u'CONVERTER_GRAPHICS_BACKEND', 'default': u'converter.backends.python.Python', 'description': _(u'Graphics conversion backend to use. Options are: converter.backends.imagemagick.ImageMagick, converter.backends.graphicsmagick.GraphicsMagick and converter.backends.python.Python')},
|
||||
{'name': u'LIBREOFFICE_PATH', 'global_name': u'CONVERTER_LIBREOFFICE_PATH', 'default': u'/usr/bin/libreoffice', 'exists': True, 'description': _(u'Path to the libreoffice program.')},
|
||||
|
||||
# {'name': u'OCR_OPTIONS', 'global_name': u'CONVERTER_OCR_OPTIONS', 'default': u'-colorspace Gray -depth 8 -resample 200x200'},
|
||||
# {'name': u'HIGH_QUALITY_OPTIONS', 'global_name': u'CONVERTER_HIGH_QUALITY_OPTIONS', 'default': u'-density 400'},
|
||||
# {'name': u'PRINT_QUALITY_OPTIONS', 'global_name': u'CONVERTER_PRINT_QUALITY_OPTIONS', 'default': u'-density 500'},
|
||||
]
|
||||
)
|
||||
|
||||
@@ -1,10 +1,17 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
from .office_converter import OfficeConverter
|
||||
from .exceptions import OfficeBackendError
|
||||
from django.core.exceptions import ImproperlyConfigured
|
||||
|
||||
from common.utils import load_backend
|
||||
|
||||
from .conf.settings import GRAPHICS_BACKEND
|
||||
from .exceptions import OfficeBackendError
|
||||
from .office_converter import OfficeConverter
|
||||
|
||||
try:
|
||||
office_converter = OfficeConverter()
|
||||
except OfficeBackendError:
|
||||
office_converter = None
|
||||
|
||||
backend = load_backend(GRAPHICS_BACKEND)
|
||||
|
||||
|
||||
@@ -1,40 +0,0 @@
|
||||
import os
|
||||
|
||||
from django.utils.importlib import import_module
|
||||
|
||||
|
||||
def _lazy_load(fn):
|
||||
_cached = []
|
||||
|
||||
def _decorated():
|
||||
if not _cached:
|
||||
_cached.append(fn())
|
||||
return _cached[0]
|
||||
return _decorated
|
||||
|
||||
|
||||
@_lazy_load
|
||||
def load_backend():
|
||||
from converter.conf.settings import GRAPHICS_BACKEND as backend_name
|
||||
|
||||
try:
|
||||
module = import_module('.base', 'converter.backends.%s' % backend_name)
|
||||
import warnings
|
||||
warnings.warn(
|
||||
"Short names for CONVERTER_BACKEND are deprecated; prepend with 'converter.backends.'",
|
||||
PendingDeprecationWarning
|
||||
)
|
||||
return module
|
||||
except ImportError, e:
|
||||
# Look for a fully qualified converter backend name
|
||||
return import_module('.base', backend_name)
|
||||
|
||||
|
||||
def cleanup(filename):
|
||||
"""
|
||||
Tries to remove the given filename. Ignores non-existent files
|
||||
"""
|
||||
try:
|
||||
os.remove(filename)
|
||||
except OSError:
|
||||
pass
|
||||
@@ -1,48 +1,26 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
from django.utils.importlib import import_module
|
||||
from django.utils.translation import ugettext as _
|
||||
|
||||
from common.conf.settings import TEMPORARY_DIRECTORY
|
||||
from common.utils import fs_cleanup
|
||||
from converter.api import convert
|
||||
from documents.models import DocumentPage
|
||||
|
||||
from .backends import ocr_backend
|
||||
from .conf.settings import UNPAPER_PATH, LANGUAGE
|
||||
from .exceptions import UnpaperError
|
||||
from .literals import (DEFAULT_OCR_FILE_FORMAT, UNPAPER_FILE_FORMAT,
|
||||
DEFAULT_OCR_FILE_EXTENSION)
|
||||
from .parsers import parse_document_page
|
||||
from .parsers.exceptions import ParserError, ParserUnknownFile
|
||||
from .runtime import language_backend, ocr_backend
|
||||
|
||||
|
||||
def get_language_backend():
|
||||
"""
|
||||
Return the OCR cleanup language backend using the selected language
|
||||
in the configuration settings
|
||||
"""
|
||||
try:
|
||||
module = import_module(u'.'.join([u'ocr', u'lang', LANGUAGE]))
|
||||
except ImportError:
|
||||
sys.stderr.write(u'\nWarning: No OCR app language backend for language: %s\n\n' % LANGUAGE)
|
||||
return None
|
||||
return module
|
||||
|
||||
language_backend = get_language_backend()
|
||||
|
||||
|
||||
def cleanup(filename):
|
||||
"""
|
||||
Try to remove the given filename, ignoring non-existent files
|
||||
"""
|
||||
try:
|
||||
os.remove(filename)
|
||||
except OSError:
|
||||
pass
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def do_document_ocr(queue_document):
|
||||
@@ -58,15 +36,21 @@ def do_document_ocr(queue_document):
|
||||
parse_document_page(document_page)
|
||||
except (ParserError, ParserUnknownFile):
|
||||
# Fall back to doing visual OCR
|
||||
ocr_transformations, warnings = queue_document.get_transformation_list()
|
||||
|
||||
document_filepath = document_page.document.get_image_cache_name(page=document_page.page_number, version=document_page.document_version.pk)
|
||||
unpaper_output_filename = u'%s_unpaper_out_page_%s%s%s' % (document_page.document.uuid, document_page.page_number, os.extsep, UNPAPER_FILE_FORMAT)
|
||||
unpaper_output_filepath = os.path.join(TEMPORARY_DIRECTORY, unpaper_output_filename)
|
||||
|
||||
unpaper_input = convert(document_filepath, file_format=UNPAPER_FILE_FORMAT, transformations=ocr_transformations)
|
||||
logger.debug('document_filepath: %s' % document_filepath)
|
||||
|
||||
unpaper_input = convert(document_filepath, file_format=UNPAPER_FILE_FORMAT)
|
||||
|
||||
logger.debug('unpaper_input: %s' % unpaper_input)
|
||||
|
||||
execute_unpaper(input_filepath=unpaper_input, output_filepath=unpaper_output_filepath)
|
||||
|
||||
logger.debug('unpaper_output_filepath: %s' % unpaper_output_filepath)
|
||||
|
||||
# from PIL import Image, ImageOps
|
||||
# im = Image.open(document_filepath)
|
||||
# #if im.mode=='RGBA':
|
||||
@@ -77,8 +61,14 @@ def do_document_ocr(queue_document):
|
||||
|
||||
# Convert to TIFF
|
||||
pre_ocr_filepath = convert(input_filepath=unpaper_output_filepath, file_format=DEFAULT_OCR_FILE_FORMAT)
|
||||
|
||||
logger.debug('pre_ocr_filepath: %s' % pre_ocr_filepath)
|
||||
|
||||
# Tesseract needs an explicit file extension
|
||||
pre_ocr_filepath_w_ext = os.extsep.join([pre_ocr_filepath, DEFAULT_OCR_FILE_EXTENSION])
|
||||
|
||||
logger.debug('pre_ocr_filepath_w_ext: %s' % pre_ocr_filepath_w_ext)
|
||||
|
||||
os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext)
|
||||
try:
|
||||
ocr_text = ocr_backend.execute(pre_ocr_filepath_w_ext, LANGUAGE)
|
||||
@@ -87,10 +77,10 @@ def do_document_ocr(queue_document):
|
||||
document_page.page_label = _(u'Text from OCR')
|
||||
document_page.save()
|
||||
finally:
|
||||
cleanup(pre_ocr_filepath_w_ext)
|
||||
cleanup(unpaper_input)
|
||||
cleanup(document_filepath)
|
||||
cleanup(unpaper_output_filepath)
|
||||
fs_cleanup(pre_ocr_filepath_w_ext)
|
||||
fs_cleanup(unpaper_input)
|
||||
fs_cleanup(document_filepath)
|
||||
fs_cleanup(unpaper_output_filepath)
|
||||
|
||||
|
||||
def ocr_cleanup(text):
|
||||
|
||||
@@ -1,28 +1,3 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
import sys
|
||||
|
||||
from django.utils.importlib import import_module
|
||||
|
||||
from ..conf.settings import BACKEND
|
||||
|
||||
|
||||
class BackendBase(object):
    """
    Base class for backends; subclasses must override ``execute``.
    """

    def execute(self, input_filename, language=None):
        """
        Process ``input_filename``, optionally for the given ``language``.

        The base implementation always raises NotImplementedError.
        NOTE(review): concrete backends presumably return the extracted
        text -- confirm against the subclasses.
        """
        # NotImplemented is a comparison sentinel, not an exception class;
        # raising it would produce a TypeError instead.
        raise NotImplementedError
|
||||
|
||||
|
||||
def get_ocr_backend():
|
||||
"""
|
||||
Return the OCR backend using the path specified in the configuration
|
||||
settings
|
||||
"""
|
||||
try:
|
||||
module = import_module(BACKEND)
|
||||
except ImportError:
|
||||
sys.stderr.write(u'\nWarning: No OCR backend named: %s\n\n' % BACKEND)
|
||||
raise
|
||||
else:
|
||||
return module
|
||||
|
||||
ocr_backend = get_ocr_backend()
|
||||
|
||||
@@ -5,6 +5,8 @@ import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
|
||||
from common.utils import fs_cleanup
|
||||
|
||||
from . import BackendBase
|
||||
from ..conf.settings import TESSERACT_PATH
|
||||
from ..exceptions import OCRError
|
||||
@@ -27,8 +29,8 @@ class Tesseract(BackendBase):
|
||||
return_code = proc.wait()
|
||||
if return_code != 0:
|
||||
error_text = proc.stderr.read()
|
||||
cleanup(filepath)
|
||||
cleanup(ocr_output)
|
||||
fs_cleanup(filepath)
|
||||
fs_cleanup(ocr_output)
|
||||
if language:
|
||||
# If tesseract gives an error with a language parameter
|
||||
# re-run it with no parameter again
|
||||
@@ -43,14 +45,3 @@ class Tesseract(BackendBase):
|
||||
os.unlink(filepath)
|
||||
|
||||
return text
|
||||
|
||||
|
||||
# TODO: Redundant, also in api.py
|
||||
def cleanup(filename):
|
||||
"""
|
||||
Try to remove the given filename, ignoring non-existent files
|
||||
"""
|
||||
try:
|
||||
os.remove(filename)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
@@ -16,6 +16,6 @@ register_settings(
|
||||
{'name': u'QUEUE_PROCESSING_INTERVAL', 'global_name': u'OCR_QUEUE_PROCESSING_INTERVAL', 'default': 10},
|
||||
{'name': u'UNPAPER_PATH', 'global_name': u'OCR_UNPAPER_PATH', 'default': u'/usr/bin/unpaper', 'description': _(u'File path to unpaper program.'), 'exists': True},
|
||||
{'name': u'PDFTOTEXT_PATH', 'global_name': u'OCR_PDFTOTEXT_PATH', 'default': u'/usr/bin/pdftotext', 'description': _(u'File path to poppler\'s pdftotext program used to extract text from PDF files.'), 'exists': True},
|
||||
{'name': u'BACKEND', 'global_name': u'OCR_BACKEND', 'default': u'ocr.backends.tesseract', 'description': _(u'Full path to the backend to be used to do OCR.')},
|
||||
{'name': u'BACKEND', 'global_name': u'OCR_BACKEND', 'default': u'ocr.backends.tesseract.Tesseract', 'description': _(u'Full path to the backend to be used to do OCR.')},
|
||||
]
|
||||
)
|
||||
|
||||
@@ -0,0 +1,3 @@
|
||||
class BackendBase(object):
    """
    Base class for backends; subclasses must override ``check_word``.
    """

    def check_word(self, word):
        """
        Examine a single ``word``.

        The base implementation always raises NotImplementedError.
        NOTE(review): the expected return value is not visible here --
        confirm against the concrete backends.
        """
        # NotImplemented is a comparison sentinel, not an exception class;
        # raising it would produce a TypeError instead.
        raise NotImplementedError
|
||||
|
||||
@@ -1,7 +1,12 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
import re
|
||||
|
||||
from . import BackendBase
|
||||
|
||||
def check_word(word):
|
||||
|
||||
class LanguageBackend(BackendBase):
|
||||
def check_word(word):
|
||||
ALL_ALPHANUM = re.compile('([0-9a-z])', re.I)
|
||||
NON_ALPHANUM = re.compile('([^0-9a-z])', re.I)
|
||||
|
||||
|
||||
@@ -1,8 +1,13 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import
|
||||
|
||||
import re
|
||||
|
||||
from . import BackendBase
|
||||
|
||||
def check_word(word):
|
||||
|
||||
class LanguageBackend(BackendBase):
|
||||
def check_word(word):
|
||||
ALL_ALPHANUM = re.compile('([0-9ёйцукенгшщзхъфывапролджэячсмитьбю])', re.I)
|
||||
NON_ALPHANUM = re.compile('([^0-9ёйцукенгшщзхъфывапролджэячсмитьбю])', re.I)
|
||||
|
||||
|
||||
@@ -1,8 +1,13 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import
|
||||
|
||||
import re
|
||||
|
||||
from . import BackendBase
|
||||
|
||||
def check_word(word):
|
||||
|
||||
class LanguageBackend(BackendBase):
|
||||
def check_word(word):
|
||||
ALL_ALPHANUM = re.compile('([0-9a-záéíóúüñ])', re.I)
|
||||
NON_ALPHANUM = re.compile('([^0-9a-záéíóúüñ])', re.I)
|
||||
|
||||
|
||||
12
mayan/apps/ocr/runtime.py
Normal file
12
mayan/apps/ocr/runtime.py
Normal file
@@ -0,0 +1,12 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
from common.utils import load_backend
|
||||
|
||||
from .conf.settings import BACKEND, LANGUAGE
|
||||
|
||||
try:
|
||||
language_backend = load_backend(u'.'.join([u'ocr', u'lang', LANGUAGE, u'LanguageBackend']))
|
||||
except ImportError:
|
||||
language_backend = None
|
||||
|
||||
ocr_backend = load_backend(BACKEND)
|
||||
Reference in New Issue
Block a user