Unify the way backends are defined and loaded, unify the fs_cleanup function
This commit is contained in:
@@ -1,6 +1,7 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
from __future__ import absolute_import
|
from __future__ import absolute_import
|
||||||
|
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
import re
|
import re
|
||||||
@@ -14,8 +15,11 @@ from django.contrib.contenttypes.models import ContentType
|
|||||||
from django.utils.datastructures import MultiValueDict
|
from django.utils.datastructures import MultiValueDict
|
||||||
from django.utils.http import urlquote as django_urlquote
|
from django.utils.http import urlquote as django_urlquote
|
||||||
from django.utils.http import urlencode as django_urlencode
|
from django.utils.http import urlencode as django_urlencode
|
||||||
|
from django.utils.importlib import import_module
|
||||||
from django.utils.translation import ugettext_lazy as _
|
from django.utils.translation import ugettext_lazy as _
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def urlquote(link=None, get=None):
|
def urlquote(link=None, get=None):
|
||||||
u"""
|
u"""
|
||||||
@@ -427,3 +431,34 @@ def copyfile(source, destination, buffer_size=1024 * 1024):
|
|||||||
|
|
||||||
source_descriptor.close()
|
source_descriptor.close()
|
||||||
destination_descriptor.close()
|
destination_descriptor.close()
|
||||||
|
|
||||||
|
|
||||||
|
def _lazy_load(fn):
|
||||||
|
_cached = []
|
||||||
|
|
||||||
|
def _decorated():
|
||||||
|
if not _cached:
|
||||||
|
_cached.append(fn())
|
||||||
|
return _cached[0]
|
||||||
|
return _decorated
|
||||||
|
|
||||||
|
|
||||||
|
def load_backend(backend_string):
    """
    Import and instantiate the backend named by a dotted path.

    ``backend_string`` is split at its last dot: the left part is imported
    as a module, the right part is looked up on that module and called
    with no arguments. The resulting instance is returned.

    An ImportError raised while importing or instantiating is logged at
    debug level and re-raised unchanged.
    """
    # Pass the value as a logging argument so the message is only
    # formatted when debug output is actually enabled.
    logger.debug('loading: %s', backend_string)
    module_name, klass = backend_string.rsplit('.', 1)

    try:
        return getattr(import_module(module_name), klass)()
    except ImportError:
        logger.debug('error importing: %s', backend_string)
        raise
|
||||||
|
|
||||||
|
|
||||||
|
def fs_cleanup(filename):
    """
    Best-effort removal of ``filename``.

    Any OSError raised by the removal, including the one raised when the
    file does not exist, is swallowed, so this can be called from cleanup
    paths without guarding.
    """
    try:
        os.unlink(filename)
    except OSError:
        # Deliberately ignored: cleanup must never mask the caller's result.
        return
|
||||||
|
|||||||
@@ -1,14 +1,10 @@
|
|||||||
from __future__ import absolute_import
|
from __future__ import absolute_import
|
||||||
|
|
||||||
from django.utils.translation import ugettext_lazy as _
|
from django.utils.translation import ugettext_lazy as _
|
||||||
from django.core.exceptions import ImproperlyConfigured
|
|
||||||
|
|
||||||
from navigation.api import register_sidebar_template
|
from navigation.api import register_sidebar_template
|
||||||
from project_tools.api import register_tool
|
from project_tools.api import register_tool
|
||||||
|
|
||||||
from .utils import load_backend
|
|
||||||
from .conf.settings import GRAPHICS_BACKEND
|
|
||||||
|
|
||||||
|
|
||||||
def is_superuser(context):
    """
    Tell whether the user attached to the context's request is staff or a
    superuser.

    Returns ``user.is_staff`` when it is truthy, otherwise
    ``user.is_superuser``.
    """
    user = context['request'].user
    return user.is_staff or user.is_superuser
|
||||||
@@ -18,9 +14,4 @@ formats_list = {'text': _('file formats'), 'view': 'formats_list', 'famfam': 'pi
|
|||||||
|
|
||||||
register_sidebar_template(['formats_list'], 'converter_file_formats_help.html')
|
register_sidebar_template(['formats_list'], 'converter_file_formats_help.html')
|
||||||
|
|
||||||
try:
|
|
||||||
backend = load_backend().ConverterClass()
|
|
||||||
except ImproperlyConfigured:
|
|
||||||
raise ImproperlyConfigured(u'Missing or incorrect converter backend: %s' % GRAPHICS_BACKEND)
|
|
||||||
|
|
||||||
register_tool(formats_list)
|
register_tool(formats_list)
|
||||||
|
|||||||
@@ -7,16 +7,15 @@ import os
|
|||||||
from django.utils.encoding import smart_str
|
from django.utils.encoding import smart_str
|
||||||
|
|
||||||
from common.conf.settings import TEMPORARY_DIRECTORY
|
from common.conf.settings import TEMPORARY_DIRECTORY
|
||||||
|
from common.utils import fs_cleanup
|
||||||
|
|
||||||
from . import backend
|
|
||||||
from .exceptions import OfficeConversionError, UnknownFileFormat
|
from .exceptions import OfficeConversionError, UnknownFileFormat
|
||||||
from .literals import (DEFAULT_PAGE_NUMBER,
|
from .literals import (DEFAULT_PAGE_NUMBER,
|
||||||
DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION, DEFAULT_FILE_FORMAT)
|
DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION, DEFAULT_FILE_FORMAT)
|
||||||
from .literals import (TRANSFORMATION_CHOICES, TRANSFORMATION_RESIZE,
|
from .literals import (TRANSFORMATION_CHOICES, TRANSFORMATION_RESIZE,
|
||||||
TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM, DIMENSION_SEPARATOR,
|
TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM, DIMENSION_SEPARATOR,
|
||||||
FILE_FORMATS)
|
FILE_FORMATS)
|
||||||
from .runtime import office_converter
|
from .runtime import backend, office_converter
|
||||||
from .utils import cleanup
|
|
||||||
|
|
||||||
HASH_FUNCTION = lambda x: hashlib.sha256(x).hexdigest()
|
HASH_FUNCTION = lambda x: hashlib.sha256(x).hexdigest()
|
||||||
|
|
||||||
@@ -96,7 +95,7 @@ def convert(input_filepath, output_filepath=None, cleanup_files=False, mimetype=
|
|||||||
backend.convert_file(input_filepath=input_filepath, output_filepath=output_filepath, transformations=transformations, page=page, file_format=file_format, mimetype=mimetype)
|
backend.convert_file(input_filepath=input_filepath, output_filepath=output_filepath, transformations=transformations, page=page, file_format=file_format, mimetype=mimetype)
|
||||||
finally:
|
finally:
|
||||||
if cleanup_files:
|
if cleanup_files:
|
||||||
cleanup(input_filepath)
|
fs_cleanup(input_filepath)
|
||||||
|
|
||||||
return output_filepath
|
return output_filepath
|
||||||
|
|
||||||
|
|||||||
@@ -1,21 +1,21 @@
|
|||||||
from __future__ import absolute_import
|
from __future__ import absolute_import
|
||||||
|
|
||||||
import re
|
|
||||||
import subprocess
|
import subprocess
|
||||||
|
import re
|
||||||
|
|
||||||
from ...backends import ConverterBase
|
from . import ConverterBase
|
||||||
from ...conf.settings import GM_PATH, GM_SETTINGS
|
from ..conf.settings import GM_PATH, GM_SETTINGS
|
||||||
from ...exceptions import ConvertError, UnknownFileFormat, IdentifyError
|
from ..exceptions import ConvertError, UnknownFileFormat, IdentifyError
|
||||||
from ...literals import (TRANSFORMATION_RESIZE,
|
from ..literals import (TRANSFORMATION_RESIZE,
|
||||||
TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM)
|
TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM)
|
||||||
from ...literals import DIMENSION_SEPARATOR, DEFAULT_PAGE_NUMBER, \
|
from ..literals import (DIMENSION_SEPARATOR, DEFAULT_PAGE_NUMBER,
|
||||||
DEFAULT_FILE_FORMAT
|
DEFAULT_FILE_FORMAT)
|
||||||
|
|
||||||
CONVERTER_ERROR_STRING_NO_DECODER = u'No decode delegate for this image format'
|
CONVERTER_ERROR_STRING_NO_DECODER = u'No decode delegate for this image format'
|
||||||
CONVERTER_ERROR_STARTS_WITH = u'starts with'
|
CONVERTER_ERROR_STARTS_WITH = u'starts with'
|
||||||
|
|
||||||
|
|
||||||
class ConverterClass(ConverterBase):
|
class GraphicsMagick(ConverterBase):
|
||||||
def identify_file(self, input_filepath, arguments=None):
|
def identify_file(self, input_filepath, arguments=None):
|
||||||
command = []
|
command = []
|
||||||
command.append(unicode(GM_PATH))
|
command.append(unicode(GM_PATH))
|
||||||
@@ -1,19 +1,19 @@
|
|||||||
from __future__ import absolute_import
|
from __future__ import absolute_import
|
||||||
|
|
||||||
import re
|
|
||||||
import subprocess
|
import subprocess
|
||||||
|
import re
|
||||||
|
|
||||||
from ...backends import ConverterBase
|
from . import ConverterBase
|
||||||
from ...conf.settings import IM_CONVERT_PATH, IM_IDENTIFY_PATH
|
from ..conf.settings import IM_CONVERT_PATH, IM_IDENTIFY_PATH
|
||||||
from ...exceptions import ConvertError, UnknownFileFormat, IdentifyError
|
from ..exceptions import ConvertError, UnknownFileFormat, IdentifyError
|
||||||
from ...literals import (TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE,
|
from ..literals import (TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE,
|
||||||
TRANSFORMATION_ZOOM, DIMENSION_SEPARATOR, DEFAULT_PAGE_NUMBER,
|
TRANSFORMATION_ZOOM, DIMENSION_SEPARATOR, DEFAULT_PAGE_NUMBER,
|
||||||
DEFAULT_FILE_FORMAT)
|
DEFAULT_FILE_FORMAT)
|
||||||
|
|
||||||
CONVERTER_ERROR_STRING_NO_DECODER = u'no decode delegate for this image format'
|
CONVERTER_ERROR_STRING_NO_DECODER = u'no decode delegate for this image format'
|
||||||
|
|
||||||
|
|
||||||
class ConverterClass(ConverterBase):
|
class ImageMagick(ConverterBase):
|
||||||
def identify_file(self, input_filepath, arguments=None):
|
def identify_file(self, input_filepath, arguments=None):
|
||||||
command = []
|
command = []
|
||||||
command.append(unicode(IM_IDENTIFY_PATH))
|
command.append(unicode(IM_IDENTIFY_PATH))
|
||||||
@@ -12,16 +12,18 @@ try:
|
|||||||
except RuntimeError:
|
except RuntimeError:
|
||||||
USE_GHOSTSCRIPT = False
|
USE_GHOSTSCRIPT = False
|
||||||
|
|
||||||
|
from common.utils import fs_cleanup
|
||||||
from mimetype.api import get_mimetype
|
from mimetype.api import get_mimetype
|
||||||
|
|
||||||
from ...exceptions import UnknownFileFormat
|
from . import ConverterBase
|
||||||
from ...backends import ConverterBase
|
from ..exceptions import UnknownFileFormat
|
||||||
from ...literals import (TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE,
|
from ..literals import (TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE,
|
||||||
TRANSFORMATION_ZOOM, DEFAULT_PAGE_NUMBER, DEFAULT_FILE_FORMAT)
|
TRANSFORMATION_ZOOM, DEFAULT_PAGE_NUMBER, DEFAULT_FILE_FORMAT)
|
||||||
from ...utils import cleanup
|
|
||||||
|
Image.init()
|
||||||
|
|
||||||
|
|
||||||
class ConverterClass(ConverterBase):
|
class Python(ConverterBase):
|
||||||
def get_page_count(self, input_filepath):
|
def get_page_count(self, input_filepath):
|
||||||
page_count = 1
|
page_count = 1
|
||||||
|
|
||||||
@@ -94,7 +96,7 @@ class ConverterClass(ConverterBase):
|
|||||||
raise UnknownFileFormat
|
raise UnknownFileFormat
|
||||||
finally:
|
finally:
|
||||||
if tmpfile:
|
if tmpfile:
|
||||||
cleanup(tmpfile)
|
fs_cleanup(tmpfile)
|
||||||
|
|
||||||
current_page = 0
|
current_page = 0
|
||||||
try:
|
try:
|
||||||
@@ -1,3 +0,0 @@
|
|||||||
from PIL import Image
|
|
||||||
|
|
||||||
Image.init()
|
|
||||||
@@ -12,11 +12,7 @@ register_settings(
|
|||||||
{'name': u'IM_IDENTIFY_PATH', 'global_name': u'CONVERTER_IM_IDENTIFY_PATH', 'default': u'/usr/bin/identify', 'description': _(u'File path to imagemagick\'s identify program.'), 'exists': True},
|
{'name': u'IM_IDENTIFY_PATH', 'global_name': u'CONVERTER_IM_IDENTIFY_PATH', 'default': u'/usr/bin/identify', 'description': _(u'File path to imagemagick\'s identify program.'), 'exists': True},
|
||||||
{'name': u'GM_PATH', 'global_name': u'CONVERTER_GM_PATH', 'default': u'/usr/bin/gm', 'description': _(u'File path to graphicsmagick\'s program.'), 'exists': True},
|
{'name': u'GM_PATH', 'global_name': u'CONVERTER_GM_PATH', 'default': u'/usr/bin/gm', 'description': _(u'File path to graphicsmagick\'s program.'), 'exists': True},
|
||||||
{'name': u'GM_SETTINGS', 'global_name': u'CONVERTER_GM_SETTINGS', 'default': u''},
|
{'name': u'GM_SETTINGS', 'global_name': u'CONVERTER_GM_SETTINGS', 'default': u''},
|
||||||
{'name': u'GRAPHICS_BACKEND', 'global_name': u'CONVERTER_GRAPHICS_BACKEND', 'default': u'converter.backends.python', 'description': _(u'Graphics conversion backend to use. Options are: converter.backends.imagemagick, converter.backends.graphicsmagick and converter.backends.python.')},
|
{'name': u'GRAPHICS_BACKEND', 'global_name': u'CONVERTER_GRAPHICS_BACKEND', 'default': u'converter.backends.python.Python', 'description': _(u'Graphics conversion backend to use. Options are: converter.backends.imagemagick.ImageMagick, converter.backends.graphicsmagick.GraphicsMagick and converter.backends.python.Python')},
|
||||||
{'name': u'LIBREOFFICE_PATH', 'global_name': u'CONVERTER_LIBREOFFICE_PATH', 'default': u'/usr/bin/libreoffice', 'exists': True, 'description': _(u'Path to the libreoffice program.')},
|
{'name': u'LIBREOFFICE_PATH', 'global_name': u'CONVERTER_LIBREOFFICE_PATH', 'default': u'/usr/bin/libreoffice', 'exists': True, 'description': _(u'Path to the libreoffice program.')},
|
||||||
|
|
||||||
# {'name': u'OCR_OPTIONS', 'global_name': u'CONVERTER_OCR_OPTIONS', 'default': u'-colorspace Gray -depth 8 -resample 200x200'},
|
|
||||||
# {'name': u'HIGH_QUALITY_OPTIONS', 'global_name': u'CONVERTER_HIGH_QUALITY_OPTIONS', 'default': u'-density 400'},
|
|
||||||
# {'name': u'PRINT_QUALITY_OPTIONS', 'global_name': u'CONVERTER_PRINT_QUALITY_OPTIONS', 'default': u'-density 500'},
|
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,10 +1,17 @@
|
|||||||
from __future__ import absolute_import
|
from __future__ import absolute_import
|
||||||
|
|
||||||
from .office_converter import OfficeConverter
|
from django.core.exceptions import ImproperlyConfigured
|
||||||
from .exceptions import OfficeBackendError
|
|
||||||
|
|
||||||
|
from common.utils import load_backend
|
||||||
|
|
||||||
|
from .conf.settings import GRAPHICS_BACKEND
|
||||||
|
from .exceptions import OfficeBackendError
|
||||||
|
from .office_converter import OfficeConverter
|
||||||
|
|
||||||
# The office converter is optional: it stays None when constructing it
# raises OfficeBackendError.
office_converter = None
try:
    office_converter = OfficeConverter()
except OfficeBackendError:
    pass

# Instantiate the graphics backend named by the GRAPHICS_BACKEND setting.
backend = load_backend(GRAPHICS_BACKEND)
|
||||||
|
|
||||||
|
|||||||
@@ -1,40 +0,0 @@
|
|||||||
import os
|
|
||||||
|
|
||||||
from django.utils.importlib import import_module
|
|
||||||
|
|
||||||
|
|
||||||
def _lazy_load(fn):
|
|
||||||
_cached = []
|
|
||||||
|
|
||||||
def _decorated():
|
|
||||||
if not _cached:
|
|
||||||
_cached.append(fn())
|
|
||||||
return _cached[0]
|
|
||||||
return _decorated
|
|
||||||
|
|
||||||
|
|
||||||
@_lazy_load
|
|
||||||
def load_backend():
|
|
||||||
from converter.conf.settings import GRAPHICS_BACKEND as backend_name
|
|
||||||
|
|
||||||
try:
|
|
||||||
module = import_module('.base', 'converter.backends.%s' % backend_name)
|
|
||||||
import warnings
|
|
||||||
warnings.warn(
|
|
||||||
"Short names for CONVERTER_BACKEND are deprecated; prepend with 'converter.backends.'",
|
|
||||||
PendingDeprecationWarning
|
|
||||||
)
|
|
||||||
return module
|
|
||||||
except ImportError, e:
|
|
||||||
# Look for a fully qualified converter backend name
|
|
||||||
return import_module('.base', backend_name)
|
|
||||||
|
|
||||||
|
|
||||||
def cleanup(filename):
|
|
||||||
"""
|
|
||||||
Tries to remove the given filename. Ignores non-existent files
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
os.remove(filename)
|
|
||||||
except OSError:
|
|
||||||
pass
|
|
||||||
@@ -1,48 +1,26 @@
|
|||||||
from __future__ import absolute_import
|
from __future__ import absolute_import
|
||||||
|
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from django.utils.importlib import import_module
|
|
||||||
from django.utils.translation import ugettext as _
|
from django.utils.translation import ugettext as _
|
||||||
|
|
||||||
from common.conf.settings import TEMPORARY_DIRECTORY
|
from common.conf.settings import TEMPORARY_DIRECTORY
|
||||||
|
from common.utils import fs_cleanup
|
||||||
from converter.api import convert
|
from converter.api import convert
|
||||||
from documents.models import DocumentPage
|
from documents.models import DocumentPage
|
||||||
|
|
||||||
from .backends import ocr_backend
|
|
||||||
from .conf.settings import UNPAPER_PATH, LANGUAGE
|
from .conf.settings import UNPAPER_PATH, LANGUAGE
|
||||||
from .exceptions import UnpaperError
|
from .exceptions import UnpaperError
|
||||||
from .literals import (DEFAULT_OCR_FILE_FORMAT, UNPAPER_FILE_FORMAT,
|
from .literals import (DEFAULT_OCR_FILE_FORMAT, UNPAPER_FILE_FORMAT,
|
||||||
DEFAULT_OCR_FILE_EXTENSION)
|
DEFAULT_OCR_FILE_EXTENSION)
|
||||||
from .parsers import parse_document_page
|
from .parsers import parse_document_page
|
||||||
from .parsers.exceptions import ParserError, ParserUnknownFile
|
from .parsers.exceptions import ParserError, ParserUnknownFile
|
||||||
|
from .runtime import language_backend, ocr_backend
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
def get_language_backend():
|
|
||||||
"""
|
|
||||||
Return the OCR cleanup language backend using the selected language
|
|
||||||
in the configuration settings
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
module = import_module(u'.'.join([u'ocr', u'lang', LANGUAGE]))
|
|
||||||
except ImportError:
|
|
||||||
sys.stderr.write(u'\nWarning: No OCR app language backend for language: %s\n\n' % LANGUAGE)
|
|
||||||
return None
|
|
||||||
return module
|
|
||||||
|
|
||||||
language_backend = get_language_backend()
|
|
||||||
|
|
||||||
|
|
||||||
def cleanup(filename):
|
|
||||||
"""
|
|
||||||
Try to remove the given filename, ignoring non-existent files
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
os.remove(filename)
|
|
||||||
except OSError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
def do_document_ocr(queue_document):
|
def do_document_ocr(queue_document):
|
||||||
@@ -58,15 +36,21 @@ def do_document_ocr(queue_document):
|
|||||||
parse_document_page(document_page)
|
parse_document_page(document_page)
|
||||||
except (ParserError, ParserUnknownFile):
|
except (ParserError, ParserUnknownFile):
|
||||||
# Fall back to doing visual OCR
|
# Fall back to doing visual OCR
|
||||||
ocr_transformations, warnings = queue_document.get_transformation_list()
|
|
||||||
|
|
||||||
document_filepath = document_page.document.get_image_cache_name(page=document_page.page_number, version=document_page.document_version.pk)
|
document_filepath = document_page.document.get_image_cache_name(page=document_page.page_number, version=document_page.document_version.pk)
|
||||||
unpaper_output_filename = u'%s_unpaper_out_page_%s%s%s' % (document_page.document.uuid, document_page.page_number, os.extsep, UNPAPER_FILE_FORMAT)
|
unpaper_output_filename = u'%s_unpaper_out_page_%s%s%s' % (document_page.document.uuid, document_page.page_number, os.extsep, UNPAPER_FILE_FORMAT)
|
||||||
unpaper_output_filepath = os.path.join(TEMPORARY_DIRECTORY, unpaper_output_filename)
|
unpaper_output_filepath = os.path.join(TEMPORARY_DIRECTORY, unpaper_output_filename)
|
||||||
|
|
||||||
unpaper_input = convert(document_filepath, file_format=UNPAPER_FILE_FORMAT, transformations=ocr_transformations)
|
logger.debug('document_filepath: %s' % document_filepath)
|
||||||
|
|
||||||
|
unpaper_input = convert(document_filepath, file_format=UNPAPER_FILE_FORMAT)
|
||||||
|
|
||||||
|
logger.debug('unpaper_input: %s' % unpaper_input)
|
||||||
|
|
||||||
execute_unpaper(input_filepath=unpaper_input, output_filepath=unpaper_output_filepath)
|
execute_unpaper(input_filepath=unpaper_input, output_filepath=unpaper_output_filepath)
|
||||||
|
|
||||||
|
logger.debug('unpaper_output_filepath: %s' % unpaper_output_filepath)
|
||||||
|
|
||||||
# from PIL import Image, ImageOps
|
# from PIL import Image, ImageOps
|
||||||
# im = Image.open(document_filepath)
|
# im = Image.open(document_filepath)
|
||||||
# #if im.mode=='RGBA':
|
# #if im.mode=='RGBA':
|
||||||
@@ -77,8 +61,14 @@ def do_document_ocr(queue_document):
|
|||||||
|
|
||||||
# Convert to TIFF
|
# Convert to TIFF
|
||||||
pre_ocr_filepath = convert(input_filepath=unpaper_output_filepath, file_format=DEFAULT_OCR_FILE_FORMAT)
|
pre_ocr_filepath = convert(input_filepath=unpaper_output_filepath, file_format=DEFAULT_OCR_FILE_FORMAT)
|
||||||
|
|
||||||
|
logger.debug('pre_ocr_filepath: %s' % pre_ocr_filepath)
|
||||||
|
|
||||||
# Tesseract needs an explicit file extension
|
# Tesseract needs an explicit file extension
|
||||||
pre_ocr_filepath_w_ext = os.extsep.join([pre_ocr_filepath, DEFAULT_OCR_FILE_EXTENSION])
|
pre_ocr_filepath_w_ext = os.extsep.join([pre_ocr_filepath, DEFAULT_OCR_FILE_EXTENSION])
|
||||||
|
|
||||||
|
logger.debug('pre_ocr_filepath_w_ext: %s' % pre_ocr_filepath_w_ext)
|
||||||
|
|
||||||
os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext)
|
os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext)
|
||||||
try:
|
try:
|
||||||
ocr_text = ocr_backend.execute(pre_ocr_filepath_w_ext, LANGUAGE)
|
ocr_text = ocr_backend.execute(pre_ocr_filepath_w_ext, LANGUAGE)
|
||||||
@@ -87,10 +77,10 @@ def do_document_ocr(queue_document):
|
|||||||
document_page.page_label = _(u'Text from OCR')
|
document_page.page_label = _(u'Text from OCR')
|
||||||
document_page.save()
|
document_page.save()
|
||||||
finally:
|
finally:
|
||||||
cleanup(pre_ocr_filepath_w_ext)
|
fs_cleanup(pre_ocr_filepath_w_ext)
|
||||||
cleanup(unpaper_input)
|
fs_cleanup(unpaper_input)
|
||||||
cleanup(document_filepath)
|
fs_cleanup(document_filepath)
|
||||||
cleanup(unpaper_output_filepath)
|
fs_cleanup(unpaper_output_filepath)
|
||||||
|
|
||||||
|
|
||||||
def ocr_cleanup(text):
|
def ocr_cleanup(text):
|
||||||
|
|||||||
@@ -1,28 +1,3 @@
|
|||||||
from __future__ import absolute_import
|
|
||||||
|
|
||||||
import sys
|
|
||||||
|
|
||||||
from django.utils.importlib import import_module
|
|
||||||
|
|
||||||
from ..conf.settings import BACKEND
|
|
||||||
|
|
||||||
|
|
||||||
class BackendBase(object):
    """
    Base class for OCR backends.

    Concrete backends subclass this and override ``execute``.
    """

    def execute(self, input_filename, language=None):
        """
        Run OCR on ``input_filename``, optionally for ``language``.

        Raises NotImplementedError here; subclasses must override it.
        """
        # NotImplemented is a comparison sentinel, not an exception;
        # NotImplementedError is the exception meant for abstract methods.
        raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
def get_ocr_backend():
|
|
||||||
"""
|
|
||||||
Return the OCR backend using the path specified in the configuration
|
|
||||||
settings
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
module = import_module(BACKEND)
|
|
||||||
except ImportError:
|
|
||||||
sys.stderr.write(u'\nWarning: No OCR backend named: %s\n\n' % BACKEND)
|
|
||||||
raise
|
|
||||||
else:
|
|
||||||
return module
|
|
||||||
|
|
||||||
ocr_backend = get_ocr_backend()
|
|
||||||
|
|||||||
@@ -5,6 +5,8 @@ import os
|
|||||||
import subprocess
|
import subprocess
|
||||||
import tempfile
|
import tempfile
|
||||||
|
|
||||||
|
from common.utils import fs_cleanup
|
||||||
|
|
||||||
from . import BackendBase
|
from . import BackendBase
|
||||||
from ..conf.settings import TESSERACT_PATH
|
from ..conf.settings import TESSERACT_PATH
|
||||||
from ..exceptions import OCRError
|
from ..exceptions import OCRError
|
||||||
@@ -27,8 +29,8 @@ class Tesseract(BackendBase):
|
|||||||
return_code = proc.wait()
|
return_code = proc.wait()
|
||||||
if return_code != 0:
|
if return_code != 0:
|
||||||
error_text = proc.stderr.read()
|
error_text = proc.stderr.read()
|
||||||
cleanup(filepath)
|
fs_cleanup(filepath)
|
||||||
cleanup(ocr_output)
|
fs_cleanup(ocr_output)
|
||||||
if language:
|
if language:
|
||||||
# If tesseract gives an error with a language parameter
|
# If tesseract gives an error with a language parameter
|
||||||
# re-run it with no parameter again
|
# re-run it with no parameter again
|
||||||
@@ -43,14 +45,3 @@ class Tesseract(BackendBase):
|
|||||||
os.unlink(filepath)
|
os.unlink(filepath)
|
||||||
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
# TODO: Redundant, also in api.py
|
|
||||||
def cleanup(filename):
|
|
||||||
"""
|
|
||||||
Try to remove the given filename, ignoring non-existent files
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
os.remove(filename)
|
|
||||||
except OSError:
|
|
||||||
pass
|
|
||||||
|
|||||||
@@ -16,6 +16,6 @@ register_settings(
|
|||||||
{'name': u'QUEUE_PROCESSING_INTERVAL', 'global_name': u'OCR_QUEUE_PROCESSING_INTERVAL', 'default': 10},
|
{'name': u'QUEUE_PROCESSING_INTERVAL', 'global_name': u'OCR_QUEUE_PROCESSING_INTERVAL', 'default': 10},
|
||||||
{'name': u'UNPAPER_PATH', 'global_name': u'OCR_UNPAPER_PATH', 'default': u'/usr/bin/unpaper', 'description': _(u'File path to unpaper program.'), 'exists': True},
|
{'name': u'UNPAPER_PATH', 'global_name': u'OCR_UNPAPER_PATH', 'default': u'/usr/bin/unpaper', 'description': _(u'File path to unpaper program.'), 'exists': True},
|
||||||
{'name': u'PDFTOTEXT_PATH', 'global_name': u'OCR_PDFTOTEXT_PATH', 'default': u'/usr/bin/pdftotext', 'description': _(u'File path to poppler\'s pdftotext program used to extract text from PDF files.'), 'exists': True},
|
{'name': u'PDFTOTEXT_PATH', 'global_name': u'OCR_PDFTOTEXT_PATH', 'default': u'/usr/bin/pdftotext', 'description': _(u'File path to poppler\'s pdftotext program used to extract text from PDF files.'), 'exists': True},
|
||||||
{'name': u'BACKEND', 'global_name': u'OCR_BACKEND', 'default': u'ocr.backends.tesseract', 'description': _(u'Full path to the backend to be used to do OCR.')},
|
{'name': u'BACKEND', 'global_name': u'OCR_BACKEND', 'default': u'ocr.backends.tesseract.Tesseract', 'description': _(u'Full path to the backend to be used to do OCR.')},
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -0,0 +1,3 @@
|
|||||||
|
class BackendBase(object):
    """
    Base class for OCR language backends.

    Concrete language backends subclass this and override ``check_word``.
    """

    def check_word(self, word):
        """
        Check a single ``word``.

        Raises NotImplementedError here; subclasses must override it.
        """
        # NotImplemented is a comparison sentinel, not an exception;
        # NotImplementedError is the exception meant for abstract methods.
        raise NotImplementedError
|
||||||
|
|||||||
@@ -1,7 +1,12 @@
|
|||||||
|
from __future__ import absolute_import
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
from . import BackendBase
|
||||||
|
|
||||||
def check_word(word):
|
|
||||||
|
class LanguageBackend(BackendBase):
|
||||||
|
def check_word(word):
|
||||||
ALL_ALPHANUM = re.compile('([0-9a-z])', re.I)
|
ALL_ALPHANUM = re.compile('([0-9a-z])', re.I)
|
||||||
NON_ALPHANUM = re.compile('([^0-9a-z])', re.I)
|
NON_ALPHANUM = re.compile('([^0-9a-z])', re.I)
|
||||||
|
|
||||||
|
|||||||
@@ -1,8 +1,13 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
from __future__ import absolute_import
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
from . import BackendBase
|
||||||
|
|
||||||
def check_word(word):
|
|
||||||
|
class LanguageBackend(BackendBase):
|
||||||
|
def check_word(word):
|
||||||
ALL_ALPHANUM = re.compile('([0-9ёйцукенгшщзхъфывапролджэячсмитьбю])', re.I)
|
ALL_ALPHANUM = re.compile('([0-9ёйцукенгшщзхъфывапролджэячсмитьбю])', re.I)
|
||||||
NON_ALPHANUM = re.compile('([^0-9ёйцукенгшщзхъфывапролджэячсмитьбю])', re.I)
|
NON_ALPHANUM = re.compile('([^0-9ёйцукенгшщзхъфывапролджэячсмитьбю])', re.I)
|
||||||
|
|
||||||
|
|||||||
@@ -1,8 +1,13 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
from __future__ import absolute_import
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
from . import BackendBase
|
||||||
|
|
||||||
def check_word(word):
|
|
||||||
|
class LanguageBackend(BackendBase):
|
||||||
|
def check_word(word):
|
||||||
ALL_ALPHANUM = re.compile('([0-9a-záéíóúüñ])', re.I)
|
ALL_ALPHANUM = re.compile('([0-9a-záéíóúüñ])', re.I)
|
||||||
NON_ALPHANUM = re.compile('([^0-9a-záéíóúüñ])', re.I)
|
NON_ALPHANUM = re.compile('([^0-9a-záéíóúüñ])', re.I)
|
||||||
|
|
||||||
|
|||||||
12
mayan/apps/ocr/runtime.py
Normal file
12
mayan/apps/ocr/runtime.py
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
from __future__ import absolute_import
|
||||||
|
|
||||||
|
from common.utils import load_backend
|
||||||
|
|
||||||
|
from .conf.settings import BACKEND, LANGUAGE
|
||||||
|
|
||||||
|
# Dotted path of the cleanup backend for the language chosen in settings.
_language_backend_path = u'.'.join(
    [u'ocr', u'lang', LANGUAGE, u'LanguageBackend']
)

try:
    language_backend = load_backend(_language_backend_path)
except ImportError:
    # The language backend is optional; it is None when the import fails.
    language_backend = None

# The OCR backend is mandatory: any loading error propagates.
ocr_backend = load_backend(BACKEND)
|
||||||
Reference in New Issue
Block a user