Initial changes to update the OCR app
This commit is contained in:
@@ -9,18 +9,18 @@ from django.db.models.signals import post_save, post_syncdb
|
||||
from django.dispatch import receiver
|
||||
from django.db.utils import DatabaseError
|
||||
|
||||
from navigation.api import (bind_links, register_multi_item_links,
|
||||
register_multi_item_links)
|
||||
from documents.models import Document, DocumentVersion
|
||||
#from navigation.api import (bind_links, register_multi_item_links,
|
||||
# register_multi_item_links)
|
||||
#from documents.models import Document, DocumentVersion
|
||||
from maintenance.api import MaintenanceNamespace
|
||||
from project_tools.api import register_tool
|
||||
from acls.api import class_permissions
|
||||
from job_processor.models import JobQueue, JobType
|
||||
from job_processor.exceptions import JobQueuePushError
|
||||
|
||||
from .conf.settings import (AUTOMATIC_OCR, QUEUE_PROCESSING_INTERVAL)
|
||||
from .models import OCRProcessingSingleton
|
||||
from .api import do_document_ocr
|
||||
#from .conf.settings import (AUTOMATIC_OCR, QUEUE_PROCESSING_INTERVAL)
|
||||
#from .models import OCRProcessingSingleton
|
||||
#from .api import do_document_ocr
|
||||
from .permissions import PERMISSION_OCR_DOCUMENT
|
||||
from .exceptions import AlreadyQueued
|
||||
from . import models as ocr_models
|
||||
@@ -55,11 +55,11 @@ def document_post_save(sender, instance, **kwargs):
|
||||
logger.debug('received post save signal')
|
||||
logger.debug('instance: %s' % instance)
|
||||
if kwargs.get('created', False):
|
||||
if AUTOMATIC_OCR:
|
||||
try:
|
||||
instance.submit_for_ocr()
|
||||
except JobQueuePushError:
|
||||
pass
|
||||
#if AUTOMATIC_OCR:
|
||||
try:
|
||||
instance.submit_for_ocr()
|
||||
except JobQueuePushError:
|
||||
pass
|
||||
|
||||
|
||||
register_tool(ocr_tool_link)
|
||||
|
||||
@@ -10,11 +10,11 @@ import sys
|
||||
from django.utils.translation import ugettext as _
|
||||
from django.utils.importlib import import_module
|
||||
|
||||
from common.conf.settings import TEMPORARY_DIRECTORY
|
||||
from converter.api import convert
|
||||
#from common.settings import TEMPORARY_DIRECTORY
|
||||
#from converter.api import convert
|
||||
from documents.models import DocumentPage, DocumentVersion
|
||||
|
||||
from .conf.settings import (TESSERACT_PATH, TESSERACT_LANGUAGE, UNPAPER_PATH)
|
||||
#from .conf.settings import (TESSERACT_PATH, TESSERACT_LANGUAGE, UNPAPER_PATH)
|
||||
from .exceptions import TesseractError, UnpaperError
|
||||
from .parsers import parse_document_page
|
||||
from .parsers.exceptions import ParserError, ParserUnknownFile
|
||||
|
||||
@@ -1,72 +0,0 @@
|
||||
"""Configuration options for the ocr app"""
|
||||
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from smart_settings.api import Setting, SettingNamespace
|
||||
|
||||
namespace = SettingNamespace('ocr', _(u'OCR'), module='ocr.conf.settings', sprite='spellcheck')
|
||||
|
||||
Setting(
|
||||
namespace=namespace,
|
||||
name='TESSERACT_PATH',
|
||||
global_name='OCR_TESSERACT_PATH',
|
||||
default=u'/usr/bin/tesseract',
|
||||
exists=True,
|
||||
)
|
||||
|
||||
Setting(
|
||||
namespace=namespace,
|
||||
name='TESSERACT_LANGUAGE',
|
||||
global_name='OCR_TESSERACT_LANGUAGE',
|
||||
default=u'eng',
|
||||
)
|
||||
|
||||
Setting(
|
||||
namespace=namespace,
|
||||
name='REPLICATION_DELAY',
|
||||
global_name='OCR_REPLICATION_DELAY',
|
||||
default=0,
|
||||
description=_(u'Amount of seconds to delay OCR of documents to allow for the node\'s storage replication overhead.'),
|
||||
)
|
||||
|
||||
Setting(
|
||||
namespace=namespace,
|
||||
name='NODE_CONCURRENT_EXECUTION',
|
||||
global_name='OCR_NODE_CONCURRENT_EXECUTION',
|
||||
default=1,
|
||||
description=_(u'Maximum amount of concurrent document OCRs a node can perform.')
|
||||
)
|
||||
|
||||
Setting(
|
||||
namespace=namespace,
|
||||
name='AUTOMATIC_OCR',
|
||||
global_name='OCR_AUTOMATIC_OCR',
|
||||
default=True,
|
||||
description=_(u'Automatically queue newly created documents for OCR.')
|
||||
)
|
||||
|
||||
Setting(
|
||||
namespace=namespace,
|
||||
name='QUEUE_PROCESSING_INTERVAL',
|
||||
global_name='OCR_QUEUE_PROCESSING_INTERVAL',
|
||||
default=10,
|
||||
description=_(u'Automatically queue newly created documents for OCR.')
|
||||
)
|
||||
|
||||
Setting(
|
||||
namespace=namespace,
|
||||
name='UNPAPER_PATH',
|
||||
global_name='OCR_UNPAPER_PATH',
|
||||
default=u'/usr/bin/unpaper',
|
||||
description=_(u'File path to unpaper program.'),
|
||||
exists=True
|
||||
)
|
||||
|
||||
Setting(
|
||||
namespace=namespace,
|
||||
name='PDFTOTEXT_PATH',
|
||||
global_name='OCR_PDFTOTEXT_PATH',
|
||||
default=u'/usr/bin/pdftotext',
|
||||
description=_(u'File path to poppler\'s pdftotext program used to extract text from PDF files.'),
|
||||
exists=True
|
||||
)
|
||||
7
apps/ocr/icons.py
Normal file
7
apps/ocr/icons.py
Normal file
@@ -0,0 +1,7 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
from icons.literals import TEXT_DROPCAPS, TEXT_STRIKETHROUGH
|
||||
from icons import Icon
|
||||
|
||||
# Icon used by the "submit to OCR queue" links; also used as the OCR app's registry icon.
icon_submit_document = Icon(TEXT_DROPCAPS)
|
||||
# Icon used by the "clean up pages content" link.
icon_ocr_cleanup = Icon(TEXT_STRIKETHROUGH)
|
||||
@@ -8,6 +8,7 @@ from .permissions import (PERMISSION_OCR_DOCUMENT,
|
||||
PERMISSION_OCR_DOCUMENT_DELETE, PERMISSION_OCR_QUEUE_ENABLE_DISABLE,
|
||||
PERMISSION_OCR_CLEAN_ALL_PAGES)
|
||||
from .models import OCRProcessingSingleton
|
||||
from .icons import icon_submit_document, icon_ocr_cleanup
|
||||
|
||||
def is_enabled(context):
    """Return whether the OCR processing singleton reports itself as enabled.

    ``context`` is accepted but not read by this function.
    """
    ocr_processing = OCRProcessingSingleton.get()
    return ocr_processing.is_enabled()
|
||||
@@ -16,12 +17,12 @@ def is_disabled(context):
|
||||
return not OCRProcessingSingleton.get().is_enabled()
|
||||
|
||||
|
||||
ocr_log = Link(text=_(u'queue document list'), view='ocr_log', sprite='text', permissions=[PERMISSION_OCR_DOCUMENT])
|
||||
ocr_disable = Link(text=_(u'disable OCR processing'), view='ocr_disable', sprite='control_stop_blue', permissions=[PERMISSION_OCR_QUEUE_ENABLE_DISABLE], conditional_disable=is_disabled)
|
||||
ocr_enable = Link(text=_(u'enable OCR processing'), view='ocr_enable', sprite='control_play_blue', permissions=[PERMISSION_OCR_QUEUE_ENABLE_DISABLE], conditional_disable=is_enabled)
|
||||
submit_document = Link(text=_('submit to OCR queue'), view='submit_document', args='object.id', sprite='text_dropcaps', permissions=[PERMISSION_OCR_DOCUMENT])
|
||||
submit_document_multiple = Link(text=_('submit to OCR queue'), view='submit_document_multiple', sprite='text_dropcaps', permissions=[PERMISSION_OCR_DOCUMENT])
|
||||
#ocr_log = Link(text=_(u'queue document list'), view='ocr_log', sprite='text', permissions=[PERMISSION_OCR_DOCUMENT])
|
||||
#ocr_disable = Link(text=_(u'disable OCR processing'), view='ocr_disable', sprite='control_stop_blue', permissions=[PERMISSION_OCR_QUEUE_ENABLE_DISABLE], conditional_disable=is_disabled)
|
||||
#ocr_enable = Link(text=_(u'enable OCR processing'), view='ocr_enable', sprite='control_play_blue', permissions=[PERMISSION_OCR_QUEUE_ENABLE_DISABLE], conditional_disable=is_enabled)
|
||||
submit_document = Link(text=_('submit to OCR queue'), view='submit_document', args='object.id', icon=icon_submit_document, permissions=[PERMISSION_OCR_DOCUMENT])
|
||||
submit_document_multiple = Link(text=_('submit to OCR queue'), view='submit_document_multiple', icon=icon_submit_document, permissions=[PERMISSION_OCR_DOCUMENT])
|
||||
|
||||
all_document_ocr_cleanup = Link(text=_(u'clean up pages content'), view='all_document_ocr_cleanup', sprite='text_strikethrough', permissions=[PERMISSION_OCR_CLEAN_ALL_PAGES], description=_(u'Runs a language filter to remove common OCR mistakes from document pages content.'))
|
||||
all_document_ocr_cleanup = Link(text=_(u'clean up pages content'), view='all_document_ocr_cleanup', icon=icon_ocr_cleanup, permissions=[PERMISSION_OCR_CLEAN_ALL_PAGES], description=_(u'Runs a language filter to remove common OCR mistakes from document pages content.'))
|
||||
|
||||
ocr_tool_link = Link(text=_(u'OCR'), view='ocr_log', sprite='hourglass', icon='text.png', permissions=[PERMISSION_OCR_DOCUMENT]) # children_view_regex=[r'queue_', r'document_queue'])
|
||||
#ocr_tool_link = Link(text=_(u'OCR'), view='ocr_log', sprite='hourglass', icon='text.png', permissions=[PERMISSION_OCR_DOCUMENT]) # children_view_regex=[r'queue_', r'document_queue'])
|
||||
|
||||
@@ -6,16 +6,16 @@ import subprocess
|
||||
|
||||
from django.utils.translation import ugettext as _
|
||||
|
||||
from converter import office_converter
|
||||
from converter.office_converter import OfficeConverter
|
||||
from converter.exceptions import OfficeConversionError
|
||||
from documents.utils import document_save_to_temp_dir
|
||||
from common.utils import copyfile
|
||||
from common.conf.settings import TEMPORARY_DIRECTORY
|
||||
from common.textparser import TextParser as OriginalTextParser, TEXT_PARSER_MIMETYPES
|
||||
#from converter import office_converter
|
||||
#from converter.office_converter import OfficeConverter
|
||||
#from converter.exceptions import OfficeConversionError
|
||||
#from documents.utils import document_save_to_temp_dir
|
||||
#from common.utils import copyfile
|
||||
#from common.settings import TEMPORARY_DIRECTORY
|
||||
#from common.textparser import TextParser as OriginalTextParser, TEXT_PARSER_MIMETYPES
|
||||
|
||||
from ocr.parsers.exceptions import ParserError, ParserUnknownFile
|
||||
from ocr.conf.settings import PDFTOTEXT_PATH
|
||||
#from ocr.settings import PDFTOTEXT_PATH
|
||||
|
||||
|
||||
mimetype_registry = {}
|
||||
|
||||
63
apps/ocr/registry.py
Normal file
63
apps/ocr/registry.py
Normal file
@@ -0,0 +1,63 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from smart_settings import LocalScope
|
||||
|
||||
from .icons import icon_submit_document
|
||||
|
||||
# Registry metadata for the OCR app: display label, description, icon and
# the names of the apps this one lists as dependencies.
label = _(u'OCR')
description = _(u'Handles optical character recognition.')
icon = icon_submit_document
dependencies = ['app_registry', 'icons', 'navigation']

# Setting declarations for the OCR app. Every entry carries a name, a default
# value and a list of scopes; most also carry a translatable description.
# NOTE(review): 'exists': True presumably asks the settings framework to
# verify that the configured path exists -- confirm against smart_settings.
settings = [
    {
        'name': 'TESSERACT_PATH',
        'default': u'/usr/bin/tesseract',
        'exists': True,
        'scopes': [LocalScope()]
    },
    {
        'name': 'TESSERACT_LANGUAGE',
        'default': u'eng',
        'scopes': [LocalScope()]
    },
    {
        'name': 'REPLICATION_DELAY',
        'default': 0,
        'description': _(u'Amount of seconds to delay OCR of documents to allow for the node\'s storage replication overhead.'),
        'scopes': [LocalScope()]
    },
    {
        'name': 'NODE_CONCURRENT_EXECUTION',
        'default': 1,
        'description': _(u'Maximum amount of concurrent document OCRs a node can perform.'),
        'scopes': [LocalScope()]
    },
    {
        'name': 'AUTOMATIC_OCR',
        'default': True,
        'description': _(u'Automatically queue newly created documents for OCR.'),
        'scopes': [LocalScope()]
    },
    {
        'name': 'QUEUE_PROCESSING_INTERVAL',
        'default': 10,
        # The previous description was a copy of the AUTOMATIC_OCR text and
        # did not describe this setting.
        # NOTE(review): the unit is presumably seconds -- confirm against the
        # code that consumes this value before stating it in the description.
        'description': _(u'Interval at which the OCR queue is processed.'),
        'scopes': [LocalScope()]
    },
    {
        'name': 'UNPAPER_PATH',
        'default': u'/usr/bin/unpaper',
        'description': _(u'File path to unpaper program.'),
        'exists': True,
        'scopes': [LocalScope()]
    },
    {
        'name': 'PDFTOTEXT_PATH',
        'default': u'/usr/bin/pdftotext',
        'description': _(u'File path to poppler\'s pdftotext program used to extract text from PDF files.'),
        'exists': True,
        'scopes': [LocalScope()]
    },
]
|
||||
Reference in New Issue
Block a user