Initial changes to update the OCR app

This commit is contained in:
Roberto Rosario
2012-09-10 23:30:13 -04:00
parent 8728167044
commit babdc4e93a
8 changed files with 100 additions and 101 deletions

View File

@@ -9,18 +9,18 @@ from django.db.models.signals import post_save, post_syncdb
from django.dispatch import receiver
from django.db.utils import DatabaseError
from navigation.api import (bind_links, register_multi_item_links,
register_multi_item_links)
from documents.models import Document, DocumentVersion
#from navigation.api import (bind_links, register_multi_item_links,
# register_multi_item_links)
#from documents.models import Document, DocumentVersion
from maintenance.api import MaintenanceNamespace
from project_tools.api import register_tool
from acls.api import class_permissions
from job_processor.models import JobQueue, JobType
from job_processor.exceptions import JobQueuePushError
from .conf.settings import (AUTOMATIC_OCR, QUEUE_PROCESSING_INTERVAL)
from .models import OCRProcessingSingleton
from .api import do_document_ocr
#from .conf.settings import (AUTOMATIC_OCR, QUEUE_PROCESSING_INTERVAL)
#from .models import OCRProcessingSingleton
#from .api import do_document_ocr
from .permissions import PERMISSION_OCR_DOCUMENT
from .exceptions import AlreadyQueued
from . import models as ocr_models
@@ -55,11 +55,11 @@ def document_post_save(sender, instance, **kwargs):
logger.debug('received post save signal')
logger.debug('instance: %s' % instance)
if kwargs.get('created', False):
if AUTOMATIC_OCR:
try:
instance.submit_for_ocr()
except JobQueuePushError:
pass
#if AUTOMATIC_OCR:
try:
instance.submit_for_ocr()
except JobQueuePushError:
pass
register_tool(ocr_tool_link)

View File

@@ -10,11 +10,11 @@ import sys
from django.utils.translation import ugettext as _
from django.utils.importlib import import_module
from common.conf.settings import TEMPORARY_DIRECTORY
from converter.api import convert
#from common.settings import TEMPORARY_DIRECTORY
#from converter.api import convert
from documents.models import DocumentPage, DocumentVersion
from .conf.settings import (TESSERACT_PATH, TESSERACT_LANGUAGE, UNPAPER_PATH)
#from .conf.settings import (TESSERACT_PATH, TESSERACT_LANGUAGE, UNPAPER_PATH)
from .exceptions import TesseractError, UnpaperError
from .parsers import parse_document_page
from .parsers.exceptions import ParserError, ParserUnknownFile

View File

@@ -1,72 +0,0 @@
"""Configuration options for the ocr app"""
from django.utils.translation import ugettext_lazy as _
from smart_settings.api import Setting, SettingNamespace
# Namespace object passed to every Setting(...) declaration below.
namespace = SettingNamespace('ocr', _(u'OCR'), module='ocr.conf.settings', sprite='spellcheck')

# Each Setting(...) call declares one OCR option: ``name`` is the short name,
# ``global_name`` the ``OCR_``-prefixed name, ``default`` the fallback value.
# NOTE(review): ``exists=True`` presumably asks smart_settings to verify that
# the configured path exists on disk -- confirm against smart_settings.api.

Setting(
    namespace=namespace,
    name='TESSERACT_PATH',
    global_name='OCR_TESSERACT_PATH',
    default=u'/usr/bin/tesseract',
    exists=True,
)

Setting(
    namespace=namespace,
    name='TESSERACT_LANGUAGE',
    global_name='OCR_TESSERACT_LANGUAGE',
    default=u'eng',
)

Setting(
    namespace=namespace,
    name='REPLICATION_DELAY',
    global_name='OCR_REPLICATION_DELAY',
    default=0,
    description=_(u'Amount of seconds to delay OCR of documents to allow for the node\'s storage replication overhead.'),
)

Setting(
    namespace=namespace,
    name='NODE_CONCURRENT_EXECUTION',
    global_name='OCR_NODE_CONCURRENT_EXECUTION',
    default=1,
    description=_(u'Maximum amount of concurrent document OCRs a node can perform.')
)

Setting(
    namespace=namespace,
    name='AUTOMATIC_OCR',
    global_name='OCR_AUTOMATIC_OCR',
    default=True,
    description=_(u'Automatically queue newly created documents for OCR.')
)

# The description here used to be a verbatim copy of the AUTOMATIC_OCR text,
# which mislabelled this option wherever descriptions are displayed.
# NOTE(review): the unit of the interval is presumably seconds -- confirm
# against the code that consumes QUEUE_PROCESSING_INTERVAL before stating it.
Setting(
    namespace=namespace,
    name='QUEUE_PROCESSING_INTERVAL',
    global_name='OCR_QUEUE_PROCESSING_INTERVAL',
    default=10,
    description=_(u'Interval at which the OCR queue is processed.')
)

Setting(
    namespace=namespace,
    name='UNPAPER_PATH',
    global_name='OCR_UNPAPER_PATH',
    default=u'/usr/bin/unpaper',
    description=_(u'File path to unpaper program.'),
    exists=True
)

Setting(
    namespace=namespace,
    name='PDFTOTEXT_PATH',
    global_name='OCR_PDFTOTEXT_PATH',
    default=u'/usr/bin/pdftotext',
    description=_(u'File path to poppler\'s pdftotext program used to extract text from PDF files.'),
    exists=True
)

7
apps/ocr/icons.py Normal file
View File

@@ -0,0 +1,7 @@
from __future__ import absolute_import
from icons.literals import TEXT_DROPCAPS, TEXT_STRIKETHROUGH
from icons import Icon
# Icon objects for the OCR app, each wrapping one literal imported from
# icons.literals.
# Built from the TEXT_DROPCAPS literal.
icon_submit_document = Icon(TEXT_DROPCAPS)
# Built from the TEXT_STRIKETHROUGH literal.
icon_ocr_cleanup = Icon(TEXT_STRIKETHROUGH)

View File

@@ -8,6 +8,7 @@ from .permissions import (PERMISSION_OCR_DOCUMENT,
PERMISSION_OCR_DOCUMENT_DELETE, PERMISSION_OCR_QUEUE_ENABLE_DISABLE,
PERMISSION_OCR_CLEAN_ALL_PAGES)
from .models import OCRProcessingSingleton
from .icons import icon_submit_document, icon_ocr_cleanup
def is_enabled(context):
    """Report whether the OCR processing singleton is currently enabled.

    ``context`` is accepted but never read by this function; the answer
    comes solely from ``OCRProcessingSingleton.get().is_enabled()``.
    """
    processing_singleton = OCRProcessingSingleton.get()
    return processing_singleton.is_enabled()
@@ -16,12 +17,12 @@ def is_disabled(context):
return not OCRProcessingSingleton.get().is_enabled()
ocr_log = Link(text=_(u'queue document list'), view='ocr_log', sprite='text', permissions=[PERMISSION_OCR_DOCUMENT])
ocr_disable = Link(text=_(u'disable OCR processing'), view='ocr_disable', sprite='control_stop_blue', permissions=[PERMISSION_OCR_QUEUE_ENABLE_DISABLE], conditional_disable=is_disabled)
ocr_enable = Link(text=_(u'enable OCR processing'), view='ocr_enable', sprite='control_play_blue', permissions=[PERMISSION_OCR_QUEUE_ENABLE_DISABLE], conditional_disable=is_enabled)
submit_document = Link(text=_('submit to OCR queue'), view='submit_document', args='object.id', sprite='text_dropcaps', permissions=[PERMISSION_OCR_DOCUMENT])
submit_document_multiple = Link(text=_('submit to OCR queue'), view='submit_document_multiple', sprite='text_dropcaps', permissions=[PERMISSION_OCR_DOCUMENT])
#ocr_log = Link(text=_(u'queue document list'), view='ocr_log', sprite='text', permissions=[PERMISSION_OCR_DOCUMENT])
#ocr_disable = Link(text=_(u'disable OCR processing'), view='ocr_disable', sprite='control_stop_blue', permissions=[PERMISSION_OCR_QUEUE_ENABLE_DISABLE], conditional_disable=is_disabled)
#ocr_enable = Link(text=_(u'enable OCR processing'), view='ocr_enable', sprite='control_play_blue', permissions=[PERMISSION_OCR_QUEUE_ENABLE_DISABLE], conditional_disable=is_enabled)
submit_document = Link(text=_('submit to OCR queue'), view='submit_document', args='object.id', icon=icon_submit_document, permissions=[PERMISSION_OCR_DOCUMENT])
submit_document_multiple = Link(text=_('submit to OCR queue'), view='submit_document_multiple', icon=icon_submit_document, permissions=[PERMISSION_OCR_DOCUMENT])
all_document_ocr_cleanup = Link(text=_(u'clean up pages content'), view='all_document_ocr_cleanup', sprite='text_strikethrough', permissions=[PERMISSION_OCR_CLEAN_ALL_PAGES], description=_(u'Runs a language filter to remove common OCR mistakes from document pages content.'))
all_document_ocr_cleanup = Link(text=_(u'clean up pages content'), view='all_document_ocr_cleanup', icon=icon_ocr_cleanup, permissions=[PERMISSION_OCR_CLEAN_ALL_PAGES], description=_(u'Runs a language filter to remove common OCR mistakes from document pages content.'))
ocr_tool_link = Link(text=_(u'OCR'), view='ocr_log', sprite='hourglass', icon='text.png', permissions=[PERMISSION_OCR_DOCUMENT]) # children_view_regex=[r'queue_', r'document_queue'])
#ocr_tool_link = Link(text=_(u'OCR'), view='ocr_log', sprite='hourglass', icon='text.png', permissions=[PERMISSION_OCR_DOCUMENT]) # children_view_regex=[r'queue_', r'document_queue'])

View File

@@ -6,16 +6,16 @@ import subprocess
from django.utils.translation import ugettext as _
from converter import office_converter
from converter.office_converter import OfficeConverter
from converter.exceptions import OfficeConversionError
from documents.utils import document_save_to_temp_dir
from common.utils import copyfile
from common.conf.settings import TEMPORARY_DIRECTORY
from common.textparser import TextParser as OriginalTextParser, TEXT_PARSER_MIMETYPES
#from converter import office_converter
#from converter.office_converter import OfficeConverter
#from converter.exceptions import OfficeConversionError
#from documents.utils import document_save_to_temp_dir
#from common.utils import copyfile
#from common.settings import TEMPORARY_DIRECTORY
#from common.textparser import TextParser as OriginalTextParser, TEXT_PARSER_MIMETYPES
from ocr.parsers.exceptions import ParserError, ParserUnknownFile
from ocr.conf.settings import PDFTOTEXT_PATH
#from ocr.settings import PDFTOTEXT_PATH
mimetype_registry = {}

63
apps/ocr/registry.py Normal file
View File

@@ -0,0 +1,63 @@
from __future__ import absolute_import
from django.utils.translation import ugettext_lazy as _
from smart_settings import LocalScope
from .icons import icon_submit_document
# Module-level metadata for the OCR app.
# NOTE(review): presumably read by the app registry listed in
# ``dependencies`` -- confirm against the app_registry loader.
# Lazily translated name of the app.
label = _(u'OCR')
# Lazily translated one-line summary of what the app does.
description = _(u'Handles optical character recognition.')
# The app icon is the same Icon object imported from .icons above.
icon = icon_submit_document
# NOTE(review): these look like names of apps this one depends on -- confirm
# the exact semantics (load order vs. hard requirement) with app_registry.
dependencies = ['app_registry', 'icons', 'navigation']
# Settings declared by the OCR app, one dict per option.
# Every entry carries 'name', 'default' and 'scopes' (a fresh LocalScope()
# instance per entry); 'description' and 'exists' are optional.
# NOTE(review): 'exists': True presumably asks smart_settings to verify that
# the configured path exists on disk -- confirm against smart_settings.
settings = [
    {
        'name': 'TESSERACT_PATH',
        'default': u'/usr/bin/tesseract',
        'exists': True,
        'scopes': [LocalScope()]
    },
    {
        'name': 'TESSERACT_LANGUAGE',
        'default': u'eng',
        'scopes': [LocalScope()]
    },
    {
        'name': 'REPLICATION_DELAY',
        'default': 0,
        'description': _(u'Amount of seconds to delay OCR of documents to allow for the node\'s storage replication overhead.'),
        'scopes': [LocalScope()]
    },
    {
        'name': 'NODE_CONCURRENT_EXECUTION',
        'default': 1,
        'description': _(u'Maximum amount of concurrent document OCRs a node can perform.'),
        'scopes': [LocalScope()]
    },
    {
        'name': 'AUTOMATIC_OCR',
        'default': True,
        'description': _(u'Automatically queue newly created documents for OCR.'),
        'scopes': [LocalScope()]
    },
    {
        # The description here used to be a verbatim copy of the
        # AUTOMATIC_OCR text, which mislabelled this option.
        # NOTE(review): the unit of the interval is presumably seconds --
        # confirm against the consumer of this setting before stating it.
        'name': 'QUEUE_PROCESSING_INTERVAL',
        'default': 10,
        'description': _(u'Interval at which the OCR queue is processed.'),
        'scopes': [LocalScope()]
    },
    {
        'name': 'UNPAPER_PATH',
        'default': u'/usr/bin/unpaper',
        'description': _(u'File path to unpaper program.'),
        'exists': True,
        'scopes': [LocalScope()]
    },
    {
        'name': 'PDFTOTEXT_PATH',
        'default': u'/usr/bin/pdftotext',
        'description': _(u'File path to poppler\'s pdftotext program used to extract text from PDF files.'),
        'exists': True,
        'scopes': [LocalScope()]
    },
]