diff --git a/apps/ocr/__init__.py b/apps/ocr/__init__.py index 24eb3ad8d4..d0b006548e 100644 --- a/apps/ocr/__init__.py +++ b/apps/ocr/__init__.py @@ -9,18 +9,18 @@ from django.db.models.signals import post_save, post_syncdb from django.dispatch import receiver from django.db.utils import DatabaseError -from navigation.api import (bind_links, register_multi_item_links, - register_multi_item_links) -from documents.models import Document, DocumentVersion +#from navigation.api import (bind_links, register_multi_item_links, +# register_multi_item_links) +#from documents.models import Document, DocumentVersion from maintenance.api import MaintenanceNamespace from project_tools.api import register_tool from acls.api import class_permissions from job_processor.models import JobQueue, JobType from job_processor.exceptions import JobQueuePushError -from .conf.settings import (AUTOMATIC_OCR, QUEUE_PROCESSING_INTERVAL) -from .models import OCRProcessingSingleton -from .api import do_document_ocr +#from .conf.settings import (AUTOMATIC_OCR, QUEUE_PROCESSING_INTERVAL) +#from .models import OCRProcessingSingleton +#from .api import do_document_ocr from .permissions import PERMISSION_OCR_DOCUMENT from .exceptions import AlreadyQueued from . 
import models as ocr_models @@ -55,11 +55,11 @@ def document_post_save(sender, instance, **kwargs): logger.debug('received post save signal') logger.debug('instance: %s' % instance) if kwargs.get('created', False): - if AUTOMATIC_OCR: - try: - instance.submit_for_ocr() - except JobQueuePushError: - pass + #if AUTOMATIC_OCR: + try: + instance.submit_for_ocr() + except JobQueuePushError: + pass register_tool(ocr_tool_link) diff --git a/apps/ocr/api.py b/apps/ocr/api.py index 5af659a4b3..8078ba3843 100644 --- a/apps/ocr/api.py +++ b/apps/ocr/api.py @@ -10,11 +10,11 @@ import sys from django.utils.translation import ugettext as _ from django.utils.importlib import import_module -from common.conf.settings import TEMPORARY_DIRECTORY -from converter.api import convert +#from common.settings import TEMPORARY_DIRECTORY +#from converter.api import convert from documents.models import DocumentPage, DocumentVersion -from .conf.settings import (TESSERACT_PATH, TESSERACT_LANGUAGE, UNPAPER_PATH) +#from .conf.settings import (TESSERACT_PATH, TESSERACT_LANGUAGE, UNPAPER_PATH) from .exceptions import TesseractError, UnpaperError from .parsers import parse_document_page from .parsers.exceptions import ParserError, ParserUnknownFile diff --git a/apps/ocr/conf/__init__.py b/apps/ocr/conf/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/apps/ocr/conf/settings.py b/apps/ocr/conf/settings.py deleted file mode 100644 index 242b777f8d..0000000000 --- a/apps/ocr/conf/settings.py +++ /dev/null @@ -1,72 +0,0 @@ -"""Configuration options for the ocr app""" - -from django.utils.translation import ugettext_lazy as _ - -from smart_settings.api import Setting, SettingNamespace - -namespace = SettingNamespace('ocr', _(u'OCR'), module='ocr.conf.settings', sprite='spellcheck') - -Setting( - namespace=namespace, - name='TESSERACT_PATH', - global_name='OCR_TESSERACT_PATH', - default=u'/usr/bin/tesseract', - exists=True, -) - -Setting( - namespace=namespace, - 
name='TESSERACT_LANGUAGE', - global_name='OCR_TESSERACT_LANGUAGE', - default=u'eng', -) - -Setting( - namespace=namespace, - name='REPLICATION_DELAY', - global_name='OCR_REPLICATION_DELAY', - default=0, - description=_(u'Amount of seconds to delay OCR of documents to allow for the node\'s storage replication overhead.'), -) - -Setting( - namespace=namespace, - name='NODE_CONCURRENT_EXECUTION', - global_name='OCR_NODE_CONCURRENT_EXECUTION', - default=1, - description=_(u'Maximum amount of concurrent document OCRs a node can perform.') -) - -Setting( - namespace=namespace, - name='AUTOMATIC_OCR', - global_name='OCR_AUTOMATIC_OCR', - default=True, - description=_(u'Automatically queue newly created documents for OCR.') -) - -Setting( - namespace=namespace, - name='QUEUE_PROCESSING_INTERVAL', - global_name='OCR_QUEUE_PROCESSING_INTERVAL', - default=10, - description=_(u'Automatically queue newly created documents for OCR.') -) - -Setting( - namespace=namespace, - name='UNPAPER_PATH', - global_name='OCR_UNPAPER_PATH', - default=u'/usr/bin/unpaper', - description=_(u'File path to unpaper program.'), - exists=True -) - -Setting( - namespace=namespace, - name='PDFTOTEXT_PATH', - global_name='OCR_PDFTOTEXT_PATH', - default=u'/usr/bin/pdftotext', - description=_(u'File path to poppler\'s pdftotext program used to extract text from PDF files.'), - exists=True -) diff --git a/apps/ocr/icons.py b/apps/ocr/icons.py new file mode 100644 index 0000000000..e1de9c684f --- /dev/null +++ b/apps/ocr/icons.py @@ -0,0 +1,7 @@ +from __future__ import absolute_import + +from icons.literals import TEXT_DROPCAPS, TEXT_STRIKETHROUGH +from icons import Icon + +icon_submit_document = Icon(TEXT_DROPCAPS) +icon_ocr_cleanup = Icon(TEXT_STRIKETHROUGH) diff --git a/apps/ocr/links.py b/apps/ocr/links.py index c671ee0ea6..26184428a1 100644 --- a/apps/ocr/links.py +++ b/apps/ocr/links.py @@ -8,6 +8,7 @@ from .permissions import (PERMISSION_OCR_DOCUMENT, PERMISSION_OCR_DOCUMENT_DELETE, 
PERMISSION_OCR_QUEUE_ENABLE_DISABLE, PERMISSION_OCR_CLEAN_ALL_PAGES) from .models import OCRProcessingSingleton +from .icons import icon_submit_document, icon_ocr_cleanup def is_enabled(context): return OCRProcessingSingleton.get().is_enabled() @@ -16,12 +17,12 @@ def is_disabled(context): return not OCRProcessingSingleton.get().is_enabled() -ocr_log = Link(text=_(u'queue document list'), view='ocr_log', sprite='text', permissions=[PERMISSION_OCR_DOCUMENT]) -ocr_disable = Link(text=_(u'disable OCR processing'), view='ocr_disable', sprite='control_stop_blue', permissions=[PERMISSION_OCR_QUEUE_ENABLE_DISABLE], conditional_disable=is_disabled) -ocr_enable = Link(text=_(u'enable OCR processing'), view='ocr_enable', sprite='control_play_blue', permissions=[PERMISSION_OCR_QUEUE_ENABLE_DISABLE], conditional_disable=is_enabled) -submit_document = Link(text=_('submit to OCR queue'), view='submit_document', args='object.id', sprite='text_dropcaps', permissions=[PERMISSION_OCR_DOCUMENT]) -submit_document_multiple = Link(text=_('submit to OCR queue'), view='submit_document_multiple', sprite='text_dropcaps', permissions=[PERMISSION_OCR_DOCUMENT]) +#ocr_log = Link(text=_(u'queue document list'), view='ocr_log', sprite='text', permissions=[PERMISSION_OCR_DOCUMENT]) +#ocr_disable = Link(text=_(u'disable OCR processing'), view='ocr_disable', sprite='control_stop_blue', permissions=[PERMISSION_OCR_QUEUE_ENABLE_DISABLE], conditional_disable=is_disabled) +#ocr_enable = Link(text=_(u'enable OCR processing'), view='ocr_enable', sprite='control_play_blue', permissions=[PERMISSION_OCR_QUEUE_ENABLE_DISABLE], conditional_disable=is_enabled) +submit_document = Link(text=_('submit to OCR queue'), view='submit_document', args='object.id', icon=icon_submit_document, permissions=[PERMISSION_OCR_DOCUMENT]) +submit_document_multiple = Link(text=_('submit to OCR queue'), view='submit_document_multiple', icon=icon_submit_document, permissions=[PERMISSION_OCR_DOCUMENT]) -all_document_ocr_cleanup = 
Link(text=_(u'clean up pages content'), view='all_document_ocr_cleanup', sprite='text_strikethrough', permissions=[PERMISSION_OCR_CLEAN_ALL_PAGES], description=_(u'Runs a language filter to remove common OCR mistakes from document pages content.')) +all_document_ocr_cleanup = Link(text=_(u'clean up pages content'), view='all_document_ocr_cleanup', icon=icon_ocr_cleanup, permissions=[PERMISSION_OCR_CLEAN_ALL_PAGES], description=_(u'Runs a language filter to remove common OCR mistakes from document pages content.')) -ocr_tool_link = Link(text=_(u'OCR'), view='ocr_log', sprite='hourglass', icon='text.png', permissions=[PERMISSION_OCR_DOCUMENT]) # children_view_regex=[r'queue_', r'document_queue']) +#ocr_tool_link = Link(text=_(u'OCR'), view='ocr_log', sprite='hourglass', icon='text.png', permissions=[PERMISSION_OCR_DOCUMENT]) # children_view_regex=[r'queue_', r'document_queue']) diff --git a/apps/ocr/parsers/__init__.py b/apps/ocr/parsers/__init__.py index 5d25f3c2fe..897476fc3d 100644 --- a/apps/ocr/parsers/__init__.py +++ b/apps/ocr/parsers/__init__.py @@ -6,16 +6,16 @@ import subprocess from django.utils.translation import ugettext as _ -from converter import office_converter -from converter.office_converter import OfficeConverter -from converter.exceptions import OfficeConversionError -from documents.utils import document_save_to_temp_dir -from common.utils import copyfile -from common.conf.settings import TEMPORARY_DIRECTORY -from common.textparser import TextParser as OriginalTextParser, TEXT_PARSER_MIMETYPES +#from converter import office_converter +#from converter.office_converter import OfficeConverter +#from converter.exceptions import OfficeConversionError +#from documents.utils import document_save_to_temp_dir +#from common.utils import copyfile +#from common.settings import TEMPORARY_DIRECTORY +#from common.textparser import TextParser as OriginalTextParser, TEXT_PARSER_MIMETYPES from ocr.parsers.exceptions import ParserError, ParserUnknownFile -from 
ocr.conf.settings import PDFTOTEXT_PATH +#from ocr.settings import PDFTOTEXT_PATH mimetype_registry = {} diff --git a/apps/ocr/registry.py b/apps/ocr/registry.py new file mode 100644 index 0000000000..7df6b324e8 --- /dev/null +++ b/apps/ocr/registry.py @@ -0,0 +1,63 @@ +from __future__ import absolute_import + +from django.utils.translation import ugettext_lazy as _ + +from smart_settings import LocalScope + +from .icons import icon_submit_document + +label = _(u'OCR') +description = _(u'Handles optical character recognition.') +icon = icon_submit_document +dependencies = ['app_registry', 'icons', 'navigation'] +settings = [ + { + 'name': 'TESSERACT_PATH', + 'default': u'/usr/bin/tesseract', + 'exists': True, + 'scopes': [LocalScope()] + }, + { + 'name': 'TESSERACT_LANGUAGE', + 'default': u'eng', + 'scopes': [LocalScope()] + }, + { + 'name': 'REPLICATION_DELAY', + 'default': 0, + 'description': _(u'Amount of seconds to delay OCR of documents to allow for the node\'s storage replication overhead.'), + 'scopes': [LocalScope()] + }, + { + 'name': 'NODE_CONCURRENT_EXECUTION', + 'default': 1, + 'description': _(u'Maximum amount of concurrent document OCRs a node can perform.'), + 'scopes': [LocalScope()] + }, + { + 'name': 'AUTOMATIC_OCR', + 'default': True, + 'description': _(u'Automatically queue newly created documents for OCR.'), + 'scopes': [LocalScope()] + }, + { + 'name': 'QUEUE_PROCESSING_INTERVAL', + 'default': 10, + 'description': _(u'Interval at which the OCR document queue is processed.'), + 'scopes': [LocalScope()] + }, + { + 'name': 'UNPAPER_PATH', + 'default': u'/usr/bin/unpaper', + 'description': _(u'File path to unpaper program.'), + 'exists': True, + 'scopes': [LocalScope()] + }, + { + 'name': 'PDFTOTEXT_PATH', + 'default': u'/usr/bin/pdftotext', + 'description': _(u'File path to poppler\'s pdftotext program used to extract text from PDF files.'), + 'exists': True, + 'scopes': [LocalScope()] + }, +]