Remove OCR cleanup view
This commit is contained in:
@@ -31,94 +31,14 @@ except sh.CommandNotFound:
|
||||
logger.debug('unpaper not found')
|
||||
UNPAPER = None
|
||||
|
||||
|
||||
def do_document_ocr(document_version):
|
||||
"""
|
||||
Try first to extract text from document pages using the registered
|
||||
parser, if the parser fails or if there is no parser registered for
|
||||
the document mimetype do a visual OCR by calling the corresponding
|
||||
OCR backend
|
||||
"""
|
||||
for document_page in document_version.pages.all():
|
||||
"""
|
||||
for document_page in document_version.pages.all():
|
||||
try:
|
||||
# Try to extract text by means of a parser
|
||||
parse_document_page(document_page)
|
||||
except (ParserError, ParserUnknownFile):
|
||||
# Fall back to doing visual OCR
|
||||
|
||||
# TODO: disabling for now
|
||||
"""
|
||||
unpaper_input = convert(document_filepath, file_format=UNPAPER_FILE_FORMAT)
|
||||
|
||||
logger.debug('unpaper_input: %s', unpaper_input)
|
||||
|
||||
unpaper_output = execute_unpaper(input_filepath=unpaper_input)
|
||||
|
||||
logger.debug('unpaper_output: %s', unpaper_output)
|
||||
|
||||
# Convert to TIFF
|
||||
pre_ocr_filepath = convert(input_filepath=unpaper_output, file_format=DEFAULT_OCR_FILE_FORMAT)
|
||||
|
||||
logger.debug('pre_ocr_filepath: %s', pre_ocr_filepath)
|
||||
|
||||
# Tesseract needs an explicit file extension
|
||||
pre_ocr_filepath_w_ext = os.extsep.join([pre_ocr_filepath, DEFAULT_OCR_FILE_EXTENSION])
|
||||
|
||||
logger.debug('pre_ocr_filepath_w_ext: %s', pre_ocr_filepath_w_ext)
|
||||
|
||||
os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext)
|
||||
try:
|
||||
ocr_text = ocr_backend.execute(pre_ocr_filepath_w_ext, document_version.document.language)
|
||||
|
||||
document_page.content = ocr_cleanup(document_version.document.language, ocr_text)
|
||||
document_page.page_label = _('Text from OCR')
|
||||
document_page.save()
|
||||
finally:
|
||||
fs_cleanup(pre_ocr_filepath_w_ext)
|
||||
fs_cleanup(unpaper_input)
|
||||
fs_cleanup(document_filepath)
|
||||
fs_cleanup(unpaper_output)
|
||||
"""
|
||||
|
||||
|
||||
def ocr_cleanup(language, text):
|
||||
"""
|
||||
Cleanup the OCR's output passing it thru the selected language's
|
||||
cleanup filter
|
||||
"""
|
||||
try:
|
||||
language_backend = import_string('.'.join(['ocr', 'lang', language, 'LanguageBackend']))()
|
||||
except ImportError:
|
||||
language_backend = None
|
||||
|
||||
output = []
|
||||
for line in text.splitlines():
|
||||
line = line.strip()
|
||||
for word in line.split():
|
||||
if language_backend:
|
||||
try:
|
||||
result = language_backend.check_word(word)
|
||||
except Exception as exception:
|
||||
logger.error(exception)
|
||||
raise Exception('ocr_cleanup() %s' % unicode(exception))
|
||||
else:
|
||||
result = word
|
||||
if result:
|
||||
output.append(result)
|
||||
output.append('\n')
|
||||
|
||||
return ' '.join(output)
|
||||
|
||||
|
||||
def clean_pages():
|
||||
"""
|
||||
Tool that executes the OCR cleanup code on all of the existing
|
||||
documents
|
||||
"""
|
||||
for page in DocumentPage.objects.all():
|
||||
if page.content:
|
||||
page.content = ocr_cleanup(page.document.language, page.content)
|
||||
page.save()
|
||||
"""
|
||||
|
||||
|
||||
def execute_unpaper(input_filepath, output_filepath=None):
|
||||
|
||||
@@ -20,8 +20,7 @@ from rest_api.classes import APIEndPoint
|
||||
|
||||
from .handlers import post_version_upload_ocr
|
||||
from .links import (
|
||||
link_document_all_ocr_cleanup, link_document_submit,
|
||||
link_document_submit_multiple, link_entry_delete,
|
||||
link_document_submit, link_document_submit_multiple, link_entry_delete,
|
||||
link_entry_delete_multiple, link_entry_list, link_entry_re_queue,
|
||||
link_entry_re_queue_multiple
|
||||
)
|
||||
@@ -91,8 +90,6 @@ class OCRApp(apps.AppConfig):
|
||||
else:
|
||||
namespace.add_property('unpaper', _('unpaper version'), unpaper('-V').stdout, report=True)
|
||||
|
||||
register_maintenance_links([link_document_all_ocr_cleanup], namespace='ocr', title=_('OCR'))
|
||||
|
||||
register_model_list_columns(DocumentVersionOCRError, [
|
||||
{
|
||||
'name': _('Document'), 'attribute': encapsulate(lambda entry: document_link(entry.document_version.document))
|
||||
|
||||
@@ -5,15 +5,9 @@ from django.utils.translation import ugettext_lazy as _
|
||||
from navigation import Link
|
||||
|
||||
from .permissions import (
|
||||
PERMISSION_OCR_CLEAN_ALL_PAGES, PERMISSION_OCR_DOCUMENT,
|
||||
PERMISSION_OCR_DOCUMENT_DELETE
|
||||
PERMISSION_OCR_DOCUMENT, PERMISSION_OCR_DOCUMENT_DELETE
|
||||
)
|
||||
|
||||
link_document_all_ocr_cleanup = Link(
|
||||
description=_('Runs a language filter to remove common OCR mistakes from document pages content.'),
|
||||
permissions=[PERMISSION_OCR_CLEAN_ALL_PAGES],
|
||||
text=_('Clean up pages content'), view='ocr:document_all_ocr_cleanup'
|
||||
)
|
||||
link_document_submit = Link(permissions=[PERMISSION_OCR_DOCUMENT], text=_('Submit to OCR queue'), view='ocr:document_submit', args='object.id')
|
||||
link_document_submit_multiple = Link(text=_('Submit to OCR queue'), view='ocr:document_submit_multiple')
|
||||
link_entry_delete = Link(permissions=[PERMISSION_OCR_DOCUMENT_DELETE], text=_('Delete'), view='ocr:entry_delete', args='object.id')
|
||||
|
||||
@@ -7,4 +7,3 @@ from permissions.models import Permission, PermissionNamespace
|
||||
ocr_namespace = PermissionNamespace('ocr', _('OCR'))
|
||||
PERMISSION_OCR_DOCUMENT = Permission.objects.register(ocr_namespace, 'ocr_document', _('Submit documents for OCR'))
|
||||
PERMISSION_OCR_DOCUMENT_DELETE = Permission.objects.register(ocr_namespace, 'ocr_document_delete', _('Delete documents from OCR queue'))
|
||||
PERMISSION_OCR_CLEAN_ALL_PAGES = Permission.objects.register(ocr_namespace, 'ocr_clean_all_pages', _('Can execute the OCR clean up on all document pages'))
|
||||
|
||||
@@ -8,7 +8,6 @@ urlpatterns = patterns(
|
||||
'ocr.views',
|
||||
url(r'^document/(?P<pk>\d+)/submit/$', 'document_submit', (), 'document_submit'),
|
||||
url(r'^document/multiple/submit/$', 'document_submit_multiple', (), 'document_submit_multiple'),
|
||||
url(r'^document/all/clean_up/$', 'document_all_ocr_cleanup', (), 'document_all_ocr_cleanup'),
|
||||
|
||||
url(r'^all/$', 'entry_list', (), 'entry_list'),
|
||||
url(r'^(?P<pk>\d+)/delete/$', 'entry_delete', (), 'entry_delete'),
|
||||
|
||||
@@ -13,11 +13,9 @@ from acls.models import AccessEntry
|
||||
from documents.models import Document, DocumentVersion
|
||||
from permissions.models import Permission
|
||||
|
||||
from .api import clean_pages
|
||||
from .models import DocumentVersionOCRError
|
||||
from .permissions import (
|
||||
PERMISSION_OCR_CLEAN_ALL_PAGES, PERMISSION_OCR_DOCUMENT,
|
||||
PERMISSION_OCR_DOCUMENT_DELETE
|
||||
PERMISSION_OCR_DOCUMENT, PERMISSION_OCR_DOCUMENT_DELETE
|
||||
)
|
||||
|
||||
|
||||
@@ -44,30 +42,6 @@ def document_submit_multiple(request):
|
||||
return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse(settings.LOGIN_REDIRECT_URL)))
|
||||
|
||||
|
||||
def document_all_ocr_cleanup(request):
|
||||
Permission.objects.check_permissions(request.user, [PERMISSION_OCR_CLEAN_ALL_PAGES])
|
||||
|
||||
previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', reverse(settings.LOGIN_REDIRECT_URL))))
|
||||
next = request.POST.get('next', request.GET.get('next', request.META.get('HTTP_REFERER', reverse(settings.LOGIN_REDIRECT_URL))))
|
||||
|
||||
if request.method != 'POST':
|
||||
return render_to_response('appearance/generic_confirm.html', {
|
||||
'previous': previous,
|
||||
'next': next,
|
||||
'title': _('Are you sure you wish to clean up all the pages content?'),
|
||||
'message': _('On large databases this operation may take some time to execute.'),
|
||||
}, context_instance=RequestContext(request))
|
||||
else:
|
||||
try:
|
||||
# TODO: turn this into a Celery task
|
||||
clean_pages()
|
||||
messages.success(request, _('Document pages content clean up complete.'))
|
||||
except Exception as exception:
|
||||
messages.error(request, _('Document pages content clean up error: %s') % exception)
|
||||
|
||||
return HttpResponseRedirect(next)
|
||||
|
||||
|
||||
def entry_list(request):
|
||||
Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT])
|
||||
|
||||
|
||||
Reference in New Issue
Block a user