diff --git a/mayan/apps/common/utils.py b/mayan/apps/common/utils.py index ad112e974a..7aef479bfd 100644 --- a/mayan/apps/common/utils.py +++ b/mayan/apps/common/utils.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import +import logging import os import random import re @@ -14,8 +15,11 @@ from django.contrib.contenttypes.models import ContentType from django.utils.datastructures import MultiValueDict from django.utils.http import urlquote as django_urlquote from django.utils.http import urlencode as django_urlencode +from django.utils.importlib import import_module from django.utils.translation import ugettext_lazy as _ +logger = logging.getLogger(__name__) + def urlquote(link=None, get=None): u""" @@ -427,3 +431,34 @@ def copyfile(source, destination, buffer_size=1024 * 1024): source_descriptor.close() destination_descriptor.close() + + +def _lazy_load(fn): + _cached = [] + + def _decorated(): + if not _cached: + _cached.append(fn()) + return _cached[0] + return _decorated + + +def load_backend(backend_string): + logger.debug('loading: %s' % backend_string) + module_name, klass = backend_string.rsplit('.', 1) + + try: + return getattr(import_module(module_name), klass)() + except ImportError as exception: + logger.debug('error importing: %s' % backend_string) + raise + + +def fs_cleanup(filename): + """ + Tries to remove the given filename. 
Ignores non-existent files + """ + try: + os.remove(filename) + except OSError: + pass diff --git a/mayan/apps/converter/__init__.py b/mayan/apps/converter/__init__.py index 272ebb60cd..0ffd2288a8 100644 --- a/mayan/apps/converter/__init__.py +++ b/mayan/apps/converter/__init__.py @@ -1,14 +1,10 @@ from __future__ import absolute_import from django.utils.translation import ugettext_lazy as _ -from django.core.exceptions import ImproperlyConfigured from navigation.api import register_sidebar_template from project_tools.api import register_tool -from .utils import load_backend -from .conf.settings import GRAPHICS_BACKEND - def is_superuser(context): return context['request'].user.is_staff or context['request'].user.is_superuser @@ -18,9 +14,4 @@ formats_list = {'text': _('file formats'), 'view': 'formats_list', 'famfam': 'pi register_sidebar_template(['formats_list'], 'converter_file_formats_help.html') -try: - backend = load_backend().ConverterClass() -except ImproperlyConfigured: - raise ImproperlyConfigured(u'Missing or incorrect converter backend: %s' % GRAPHICS_BACKEND) - register_tool(formats_list) diff --git a/mayan/apps/converter/api.py b/mayan/apps/converter/api.py index ba8e1cd7ab..320210ff60 100644 --- a/mayan/apps/converter/api.py +++ b/mayan/apps/converter/api.py @@ -7,16 +7,15 @@ import os from django.utils.encoding import smart_str from common.conf.settings import TEMPORARY_DIRECTORY +from common.utils import fs_cleanup -from . 
import backend from .exceptions import OfficeConversionError, UnknownFileFormat from .literals import (DEFAULT_PAGE_NUMBER, DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION, DEFAULT_FILE_FORMAT) from .literals import (TRANSFORMATION_CHOICES, TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM, DIMENSION_SEPARATOR, FILE_FORMATS) -from .runtime import office_converter -from .utils import cleanup +from .runtime import backend, office_converter HASH_FUNCTION = lambda x: hashlib.sha256(x).hexdigest() @@ -96,7 +95,7 @@ def convert(input_filepath, output_filepath=None, cleanup_files=False, mimetype= backend.convert_file(input_filepath=input_filepath, output_filepath=output_filepath, transformations=transformations, page=page, file_format=file_format, mimetype=mimetype) finally: if cleanup_files: - cleanup(input_filepath) + fs_cleanup(input_filepath) return output_filepath diff --git a/mayan/apps/converter/backends/graphicsmagick/base.py b/mayan/apps/converter/backends/graphicsmagick.py similarity index 93% rename from mayan/apps/converter/backends/graphicsmagick/base.py rename to mayan/apps/converter/backends/graphicsmagick.py index a08d382691..2694e0c73c 100644 --- a/mayan/apps/converter/backends/graphicsmagick/base.py +++ b/mayan/apps/converter/backends/graphicsmagick.py @@ -1,21 +1,21 @@ from __future__ import absolute_import -import re import subprocess +import re -from ...backends import ConverterBase -from ...conf.settings import GM_PATH, GM_SETTINGS -from ...exceptions import ConvertError, UnknownFileFormat, IdentifyError -from ...literals import (TRANSFORMATION_RESIZE, +from . 
import ConverterBase +from ..conf.settings import GM_PATH, GM_SETTINGS +from ..exceptions import ConvertError, UnknownFileFormat, IdentifyError +from ..literals import (TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM) -from ...literals import DIMENSION_SEPARATOR, DEFAULT_PAGE_NUMBER, \ - DEFAULT_FILE_FORMAT +from ..literals import (DIMENSION_SEPARATOR, DEFAULT_PAGE_NUMBER, + DEFAULT_FILE_FORMAT) CONVERTER_ERROR_STRING_NO_DECODER = u'No decode delegate for this image format' CONVERTER_ERROR_STARTS_WITH = u'starts with' -class ConverterClass(ConverterBase): +class GraphicsMagick(ConverterBase): def identify_file(self, input_filepath, arguments=None): command = [] command.append(unicode(GM_PATH)) diff --git a/mayan/apps/converter/backends/graphicsmagick/__init__.py b/mayan/apps/converter/backends/graphicsmagick/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/mayan/apps/converter/backends/imagemagick/base.py b/mayan/apps/converter/backends/imagemagick.py similarity index 93% rename from mayan/apps/converter/backends/imagemagick/base.py rename to mayan/apps/converter/backends/imagemagick.py index 48ae7b16a1..2e9815084d 100644 --- a/mayan/apps/converter/backends/imagemagick/base.py +++ b/mayan/apps/converter/backends/imagemagick.py @@ -1,19 +1,19 @@ from __future__ import absolute_import -import re import subprocess +import re -from ...backends import ConverterBase -from ...conf.settings import IM_CONVERT_PATH, IM_IDENTIFY_PATH -from ...exceptions import ConvertError, UnknownFileFormat, IdentifyError -from ...literals import (TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, +from . 
import ConverterBase +from ..conf.settings import IM_CONVERT_PATH, IM_IDENTIFY_PATH +from ..exceptions import ConvertError, UnknownFileFormat, IdentifyError +from ..literals import (TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM, DIMENSION_SEPARATOR, DEFAULT_PAGE_NUMBER, DEFAULT_FILE_FORMAT) CONVERTER_ERROR_STRING_NO_DECODER = u'no decode delegate for this image format' -class ConverterClass(ConverterBase): +class ImageMagick(ConverterBase): def identify_file(self, input_filepath, arguments=None): command = [] command.append(unicode(IM_IDENTIFY_PATH)) diff --git a/mayan/apps/converter/backends/imagemagick/__init__.py b/mayan/apps/converter/backends/imagemagick/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/mayan/apps/converter/backends/python/base.py b/mayan/apps/converter/backends/python.py similarity index 96% rename from mayan/apps/converter/backends/python/base.py rename to mayan/apps/converter/backends/python.py index fdd3436840..486d692a54 100644 --- a/mayan/apps/converter/backends/python/base.py +++ b/mayan/apps/converter/backends/python.py @@ -12,16 +12,18 @@ try: except RuntimeError: USE_GHOSTSCRIPT = False +from common.utils import fs_cleanup from mimetype.api import get_mimetype -from ...exceptions import UnknownFileFormat -from ...backends import ConverterBase -from ...literals import (TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, +from . 
import ConverterBase +from ..exceptions import UnknownFileFormat +from ..literals import (TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM, DEFAULT_PAGE_NUMBER, DEFAULT_FILE_FORMAT) -from ...utils import cleanup + +Image.init() -class ConverterClass(ConverterBase): +class Python(ConverterBase): def get_page_count(self, input_filepath): page_count = 1 @@ -94,7 +96,7 @@ class ConverterClass(ConverterBase): raise UnknownFileFormat finally: if tmpfile: - cleanup(tmpfile) + fs_cleanup(tmpfile) current_page = 0 try: diff --git a/mayan/apps/converter/backends/python/__init__.py b/mayan/apps/converter/backends/python/__init__.py deleted file mode 100644 index dfeca950f1..0000000000 --- a/mayan/apps/converter/backends/python/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from PIL import Image - -Image.init() diff --git a/mayan/apps/converter/conf/settings.py b/mayan/apps/converter/conf/settings.py index 41e4d5b69d..25359369f7 100644 --- a/mayan/apps/converter/conf/settings.py +++ b/mayan/apps/converter/conf/settings.py @@ -12,11 +12,7 @@ register_settings( {'name': u'IM_IDENTIFY_PATH', 'global_name': u'CONVERTER_IM_IDENTIFY_PATH', 'default': u'/usr/bin/identify', 'description': _(u'File path to imagemagick\'s identify program.'), 'exists': True}, {'name': u'GM_PATH', 'global_name': u'CONVERTER_GM_PATH', 'default': u'/usr/bin/gm', 'description': _(u'File path to graphicsmagick\'s program.'), 'exists': True}, {'name': u'GM_SETTINGS', 'global_name': u'CONVERTER_GM_SETTINGS', 'default': u''}, - {'name': u'GRAPHICS_BACKEND', 'global_name': u'CONVERTER_GRAPHICS_BACKEND', 'default': u'converter.backends.python', 'description': _(u'Graphics conversion backend to use. Options are: converter.backends.imagemagick, converter.backends.graphicsmagick and converter.backends.python.')}, + {'name': u'GRAPHICS_BACKEND', 'global_name': u'CONVERTER_GRAPHICS_BACKEND', 'default': u'converter.backends.python.Python', 'description': _(u'Graphics conversion backend to use. 
Options are: converter.backends.imagemagick.ImageMagick, converter.backends.graphicsmagick.GraphicsMagick and converter.backends.python.Python')}, {'name': u'LIBREOFFICE_PATH', 'global_name': u'CONVERTER_LIBREOFFICE_PATH', 'default': u'/usr/bin/libreoffice', 'exists': True, 'description': _(u'Path to the libreoffice program.')}, - - # {'name': u'OCR_OPTIONS', 'global_name': u'CONVERTER_OCR_OPTIONS', 'default': u'-colorspace Gray -depth 8 -resample 200x200'}, - # {'name': u'HIGH_QUALITY_OPTIONS', 'global_name': u'CONVERTER_HIGH_QUALITY_OPTIONS', 'default': u'-density 400'}, - # {'name': u'PRINT_QUALITY_OPTIONS', 'global_name': u'CONVERTER_PRINT_QUALITY_OPTIONS', 'default': u'-density 500'}, ] ) diff --git a/mayan/apps/converter/runtime.py b/mayan/apps/converter/runtime.py index 9aad502667..972cd3524c 100644 --- a/mayan/apps/converter/runtime.py +++ b/mayan/apps/converter/runtime.py @@ -1,10 +1,17 @@ from __future__ import absolute_import -from .office_converter import OfficeConverter -from .exceptions import OfficeBackendError +from django.core.exceptions import ImproperlyConfigured +from common.utils import load_backend + +from .conf.settings import GRAPHICS_BACKEND +from .exceptions import OfficeBackendError +from .office_converter import OfficeConverter try: office_converter = OfficeConverter() except OfficeBackendError: office_converter = None + +backend = load_backend(GRAPHICS_BACKEND) + diff --git a/mayan/apps/converter/utils.py b/mayan/apps/converter/utils.py deleted file mode 100644 index 88994ed211..0000000000 --- a/mayan/apps/converter/utils.py +++ /dev/null @@ -1,40 +0,0 @@ -import os - -from django.utils.importlib import import_module - - -def _lazy_load(fn): - _cached = [] - - def _decorated(): - if not _cached: - _cached.append(fn()) - return _cached[0] - return _decorated - - -@_lazy_load -def load_backend(): - from converter.conf.settings import GRAPHICS_BACKEND as backend_name - - try: - module = import_module('.base', 'converter.backends.%s' % 
backend_name) - import warnings - warnings.warn( - "Short names for CONVERTER_BACKEND are deprecated; prepend with 'converter.backends.'", - PendingDeprecationWarning - ) - return module - except ImportError, e: - # Look for a fully qualified converter backend name - return import_module('.base', backend_name) - - -def cleanup(filename): - """ - Tries to remove the given filename. Ignores non-existent files - """ - try: - os.remove(filename) - except OSError: - pass diff --git a/mayan/apps/ocr/api.py b/mayan/apps/ocr/api.py index 4324c45b0e..ad407edc0f 100644 --- a/mayan/apps/ocr/api.py +++ b/mayan/apps/ocr/api.py @@ -1,48 +1,26 @@ from __future__ import absolute_import +import logging import os import subprocess import sys -from django.utils.importlib import import_module from django.utils.translation import ugettext as _ from common.conf.settings import TEMPORARY_DIRECTORY +from common.utils import fs_cleanup from converter.api import convert from documents.models import DocumentPage -from .backends import ocr_backend from .conf.settings import UNPAPER_PATH, LANGUAGE from .exceptions import UnpaperError from .literals import (DEFAULT_OCR_FILE_FORMAT, UNPAPER_FILE_FORMAT, DEFAULT_OCR_FILE_EXTENSION) from .parsers import parse_document_page from .parsers.exceptions import ParserError, ParserUnknownFile +from .runtime import language_backend, ocr_backend - -def get_language_backend(): - """ - Return the OCR cleanup language backend using the selected language - in the configuration settings - """ - try: - module = import_module(u'.'.join([u'ocr', u'lang', LANGUAGE])) - except ImportError: - sys.stderr.write(u'\nWarning: No OCR app language backend for language: %s\n\n' % LANGUAGE) - return None - return module - -language_backend = get_language_backend() - - -def cleanup(filename): - """ - Try to remove the given filename, ignoring non-existent files - """ - try: - os.remove(filename) - except OSError: - pass +logger = logging.getLogger(__name__) def 
do_document_ocr(queue_document): @@ -58,15 +36,21 @@ def do_document_ocr(queue_document): parse_document_page(document_page) except (ParserError, ParserUnknownFile): # Fall back to doing visual OCR - ocr_transformations, warnings = queue_document.get_transformation_list() document_filepath = document_page.document.get_image_cache_name(page=document_page.page_number, version=document_page.document_version.pk) unpaper_output_filename = u'%s_unpaper_out_page_%s%s%s' % (document_page.document.uuid, document_page.page_number, os.extsep, UNPAPER_FILE_FORMAT) unpaper_output_filepath = os.path.join(TEMPORARY_DIRECTORY, unpaper_output_filename) - unpaper_input = convert(document_filepath, file_format=UNPAPER_FILE_FORMAT, transformations=ocr_transformations) + logger.debug('document_filepath: %s' % document_filepath) + + unpaper_input = convert(document_filepath, file_format=UNPAPER_FILE_FORMAT) + + logger.debug('unpaper_input: %s' % unpaper_input) + execute_unpaper(input_filepath=unpaper_input, output_filepath=unpaper_output_filepath) + logger.debug('unpaper_output_filepath: %s' % unpaper_output_filepath) + # from PIL import Image, ImageOps # im = Image.open(document_filepath) # #if im.mode=='RGBA': @@ -77,8 +61,14 @@ def do_document_ocr(queue_document): # Convert to TIFF pre_ocr_filepath = convert(input_filepath=unpaper_output_filepath, file_format=DEFAULT_OCR_FILE_FORMAT) + + logger.debug('pre_ocr_filepath: %s' % pre_ocr_filepath) + # Tesseract needs an explicit file extension pre_ocr_filepath_w_ext = os.extsep.join([pre_ocr_filepath, DEFAULT_OCR_FILE_EXTENSION]) + + logger.debug('pre_ocr_filepath_w_ext: %s' % pre_ocr_filepath_w_ext) + os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext) try: ocr_text = ocr_backend.execute(pre_ocr_filepath_w_ext, LANGUAGE) @@ -87,10 +77,10 @@ def do_document_ocr(queue_document): document_page.page_label = _(u'Text from OCR') document_page.save() finally: - cleanup(pre_ocr_filepath_w_ext) - cleanup(unpaper_input) - 
cleanup(document_filepath) - cleanup(unpaper_output_filepath) + fs_cleanup(pre_ocr_filepath_w_ext) + fs_cleanup(unpaper_input) + fs_cleanup(document_filepath) + fs_cleanup(unpaper_output_filepath) def ocr_cleanup(text): diff --git a/mayan/apps/ocr/backends/__init__.py b/mayan/apps/ocr/backends/__init__.py index 5425182c08..42ac15c33b 100644 --- a/mayan/apps/ocr/backends/__init__.py +++ b/mayan/apps/ocr/backends/__init__.py @@ -1,28 +1,3 @@ -from __future__ import absolute_import - -import sys - -from django.utils.importlib import import_module - -from ..conf.settings import BACKEND - - class BackendBase(object): def execute(input_filename, language=None): raise NotImplemented - - -def get_ocr_backend(): - """ - Return the OCR backend using the path specified in the configuration - settings - """ - try: - module = import_module(BACKEND) - except ImportError: - sys.stderr.write(u'\nWarning: No OCR backend named: %s\n\n' % BACKEND) - raise - else: - return module - -ocr_backend = get_ocr_backend() diff --git a/mayan/apps/ocr/backends/tesseract.py b/mayan/apps/ocr/backends/tesseract.py index 8a448773a3..ba679c9b02 100644 --- a/mayan/apps/ocr/backends/tesseract.py +++ b/mayan/apps/ocr/backends/tesseract.py @@ -5,6 +5,8 @@ import os import subprocess import tempfile +from common.utils import fs_cleanup + from . 
import BackendBase from ..conf.settings import TESSERACT_PATH from ..exceptions import OCRError @@ -27,8 +29,8 @@ class Tesseract(BackendBase): return_code = proc.wait() if return_code != 0: error_text = proc.stderr.read() - cleanup(filepath) - cleanup(ocr_output) + fs_cleanup(filepath) + fs_cleanup(ocr_output) if language: # If tesseract gives an error with a language parameter # re-run it with no parameter again @@ -43,14 +45,3 @@ class Tesseract(BackendBase): os.unlink(filepath) return text - - -# TODO: Reduntant, also in api.py -def cleanup(filename): - """ - Try to remove the given filename, ignoring non-existent files - """ - try: - os.remove(filename) - except OSError: - pass diff --git a/mayan/apps/ocr/conf/settings.py b/mayan/apps/ocr/conf/settings.py index 3ec44f1735..e4c7351d5e 100644 --- a/mayan/apps/ocr/conf/settings.py +++ b/mayan/apps/ocr/conf/settings.py @@ -16,6 +16,6 @@ register_settings( {'name': u'QUEUE_PROCESSING_INTERVAL', 'global_name': u'OCR_QUEUE_PROCESSING_INTERVAL', 'default': 10}, {'name': u'UNPAPER_PATH', 'global_name': u'OCR_UNPAPER_PATH', 'default': u'/usr/bin/unpaper', 'description': _(u'File path to unpaper program.'), 'exists': True}, {'name': u'PDFTOTEXT_PATH', 'global_name': u'OCR_PDFTOTEXT_PATH', 'default': u'/usr/bin/pdftotext', 'description': _(u'File path to poppler\'s pdftotext program used to extract text from PDF files.'), 'exists': True}, - {'name': u'BACKEND', 'global_name': u'OCR_BACKEND', 'default': u'ocr.backends.tesseract', 'description': _(u'Full path to the backend to be used to do OCR.')}, + {'name': u'BACKEND', 'global_name': u'OCR_BACKEND', 'default': u'ocr.backends.tesseract.Tesseract', 'description': _(u'Full path to the backend to be used to do OCR.')}, ] ) diff --git a/mayan/apps/ocr/lang/__init__.py b/mayan/apps/ocr/lang/__init__.py index e69de29bb2..52965545af 100644 --- a/mayan/apps/ocr/lang/__init__.py +++ b/mayan/apps/ocr/lang/__init__.py @@ -0,0 +1,3 @@ +class BackendBase(object): + def 
check_word(self, word): + raise NotImplementedError diff --git a/mayan/apps/ocr/lang/eng.py b/mayan/apps/ocr/lang/eng.py index 8a3eb91407..53d14a0741 100644 --- a/mayan/apps/ocr/lang/eng.py +++ b/mayan/apps/ocr/lang/eng.py @@ -1,38 +1,43 @@ +from __future__ import absolute_import + import re +from . import BackendBase -def check_word(word): - ALL_ALPHANUM = re.compile('([0-9a-z])', re.I) - NON_ALPHANUM = re.compile('([^0-9a-z])', re.I) - TOO_MANY_VOWELS = re.compile('[aeiou]{3}', re.I) - TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnpqrstvwxyz]{5}', re.I) - ALL_ALPHA = re.compile('^[a-z]+$', re.I) - SINGLE_LETTER_WORDS = re.compile('^[ai]$', re.I) +class LanguageBackend(BackendBase): + def check_word(self, word): + ALL_ALPHANUM = re.compile('([0-9a-z])', re.I) + NON_ALPHANUM = re.compile('([^0-9a-z])', re.I) - # (L) If a string is longer than 20 characters, it is garbage - if len(word) > 20: - return None + TOO_MANY_VOWELS = re.compile('[aeiou]{3}', re.I) + TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnpqrstvwxyz]{5}', re.I) + ALL_ALPHA = re.compile('^[a-z]+$', re.I) + SINGLE_LETTER_WORDS = re.compile('^[ai]$', re.I) - # (A) If a string's ratio of alphanumeric characters to total - # characters is less than 50%, the string is garbage - if len(ALL_ALPHANUM.findall(word)) < len(word) / 2: - return None + # (L) If a string is longer than 20 characters, it is garbage + if len(word) > 20: + return None - # Remove word if all the letters in the word are non alphanumeric - if len(NON_ALPHANUM.findall(word)) == len(word): - return None + # (A) If a string's ratio of alphanumeric characters to total + # characters is less than 50%, the string is garbage + if len(ALL_ALPHANUM.findall(word)) < len(word) / 2: + return None - # Removed words with too many consecutie vowels - if TOO_MANY_VOWELS.findall(word): - return None + # Remove word if all the letters in the word are non alphanumeric + if len(NON_ALPHANUM.findall(word)) == len(word): + return None - # Removed words with too many consecutie 
consonants - if TOO_MANY_CONSONANTS.findall(word): - return None + # Removed words with too many consecutie vowels + if TOO_MANY_VOWELS.findall(word): + return None - # Only allow specific single letter words - if len(word) == 1 and not SINGLE_LETTER_WORDS.findall(word): - return None + # Removed words with too many consecutie consonants + if TOO_MANY_CONSONANTS.findall(word): + return None - return word + # Only allow specific single letter words + if len(word) == 1 and not SINGLE_LETTER_WORDS.findall(word): + return None + + return word diff --git a/mayan/apps/ocr/lang/rus.py b/mayan/apps/ocr/lang/rus.py index f0553ca6d7..2c56e3a38b 100644 --- a/mayan/apps/ocr/lang/rus.py +++ b/mayan/apps/ocr/lang/rus.py @@ -1,39 +1,44 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import + import re +from . import BackendBase -def check_word(word): - ALL_ALPHANUM = re.compile('([0-9ёйцукенгшщзхъфывапролджэячсмитьбю])', re.I) - NON_ALPHANUM = re.compile('([^0-9ёйцукенгшщзхъфывапролджэячсмитьбю])', re.I) - TOO_MANY_VOWELS = re.compile('[ёуеыаоэяию]{3}', re.I) - TOO_MANY_CONSONANTS = re.compile('[йцкнгшщзхъфвпрлджчсмтьб{5}', re.I) - ALL_ALPHA = re.compile('^[ёйцукенгшщзхъфывапролджэячсмитьбю]+$', re.I) - SINGLE_LETTER_WORDS = re.compile('^[уквояси]$', re.I) +class LanguageBackend(BackendBase): + def check_word(self, word): + ALL_ALPHANUM = re.compile('([0-9ёйцукенгшщзхъфывапролджэячсмитьбю])', re.I) + NON_ALPHANUM = re.compile('([^0-9ёйцукенгшщзхъфывапролджэячсмитьбю])', re.I) - # (L) If a string is longer than 25 characters, it is garbage - if len(word) > 25: - return None + TOO_MANY_VOWELS = re.compile('[ёуеыаоэяию]{3}', re.I) + TOO_MANY_CONSONANTS = re.compile('[йцкнгшщзхъфвпрлджчсмтьб]{5}', re.I) + ALL_ALPHA = re.compile('^[ёйцукенгшщзхъфывапролджэячсмитьбю]+$', re.I) + SINGLE_LETTER_WORDS = re.compile('^[уквояси]$', re.I) - # (A) If a string's ratio of alphanumeric characters to total - # characters is less than 50%, the string is garbage - if 
len(ALL_ALPHANUM.findall(word)) < len(word) / 2: - return None + # (L) If a string is longer than 25 characters, it is garbage + if len(word) > 25: + return None - # Remove word if all the letters in the word are non alphanumeric - if len(NON_ALPHANUM.findall(word)) == len(word): - return None + # (A) If a string's ratio of alphanumeric characters to total + # characters is less than 50%, the string is garbage + if len(ALL_ALPHANUM.findall(word)) < len(word) / 2: + return None - # Removed words with too many consecutie vowels - if TOO_MANY_VOWELS.findall(word): - return None + # Remove word if all the letters in the word are non alphanumeric + if len(NON_ALPHANUM.findall(word)) == len(word): + return None - # Removed words with too many consecutie consonants - if TOO_MANY_CONSONANTS.findall(word): - return None + # Removed words with too many consecutie vowels + if TOO_MANY_VOWELS.findall(word): + return None - # Only allow specific single letter words - if len(word) == 1 and not SINGLE_LETTER_WORDS.findall(word): - return None + # Removed words with too many consecutie consonants + if TOO_MANY_CONSONANTS.findall(word): + return None - return word + # Only allow specific single letter words + if len(word) == 1 and not SINGLE_LETTER_WORDS.findall(word): + return None + + return word diff --git a/mayan/apps/ocr/lang/spa.py b/mayan/apps/ocr/lang/spa.py index fd29d49f8a..e0a604ec39 100644 --- a/mayan/apps/ocr/lang/spa.py +++ b/mayan/apps/ocr/lang/spa.py @@ -1,39 +1,44 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import + import re +from . 
import BackendBase -def check_word(word): - ALL_ALPHANUM = re.compile('([0-9a-záéíóúüñ])', re.I) - NON_ALPHANUM = re.compile('([^0-9a-záéíóúüñ])', re.I) - TOO_MANY_VOWELS = re.compile('[aáeéiíoóuúü]{3}', re.I) - TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnñpqrstvwxyz]{5}', re.I) - ALL_ALPHA = re.compile('^[a-z]+$', re.I) - SINGLE_LETTER_WORDS = re.compile('^[aeoóuy]$', re.I) +class LanguageBackend(BackendBase): + def check_word(self, word): + ALL_ALPHANUM = re.compile('([0-9a-záéíóúüñ])', re.I) + NON_ALPHANUM = re.compile('([^0-9a-záéíóúüñ])', re.I) - # (L) If a string is longer than 20 characters, it is garbage - if len(word) > 20: - return None + TOO_MANY_VOWELS = re.compile('[aáeéiíoóuúü]{3}', re.I) + TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnñpqrstvwxyz]{5}', re.I) + ALL_ALPHA = re.compile('^[a-z]+$', re.I) + SINGLE_LETTER_WORDS = re.compile('^[aeoóuy]$', re.I) - # (A) If a string’s ratio of alphanumeric characters to total - # characters is less than 50%, the string is garbage - if len(ALL_ALPHANUM.findall(word)) < len(word) / 2: - return None + # (L) If a string is longer than 20 characters, it is garbage + if len(word) > 20: + return None - # Remove word if all the letters in the word are non alphanumeric - if len(NON_ALPHANUM.findall(word)) == len(word): - return None + # (A) If a string’s ratio of alphanumeric characters to total + # characters is less than 50%, the string is garbage + if len(ALL_ALPHANUM.findall(word)) < len(word) / 2: + return None - # Removed words with too many consecutie vowels - if TOO_MANY_VOWELS.findall(word): - return None + # Remove word if all the letters in the word are non alphanumeric + if len(NON_ALPHANUM.findall(word)) == len(word): + return None - # Removed words with too many consecutie consonants - if TOO_MANY_CONSONANTS.findall(word): - return None + # Removed words with too many consecutie vowels + if TOO_MANY_VOWELS.findall(word): + return None - # Only allow specific single letter words - if len(word) == 1 and not 
SINGLE_LETTER_WORDS.findall(word): - return None + # Removed words with too many consecutie consonants + if TOO_MANY_CONSONANTS.findall(word): + return None - return word + # Only allow specific single letter words + if len(word) == 1 and not SINGLE_LETTER_WORDS.findall(word): + return None + + return word diff --git a/mayan/apps/ocr/runtime.py b/mayan/apps/ocr/runtime.py new file mode 100644 index 0000000000..0a52ad8fe1 --- /dev/null +++ b/mayan/apps/ocr/runtime.py @@ -0,0 +1,12 @@ +from __future__ import absolute_import + +from common.utils import load_backend + +from .conf.settings import BACKEND, LANGUAGE + +try: + language_backend = load_backend(u'.'.join([u'ocr', u'lang', LANGUAGE, u'LanguageBackend'])) +except ImportError: + language_backend = None + +ocr_backend = load_backend(BACKEND)