Remove OCR cleanup view
This commit is contained in:
@@ -31,94 +31,14 @@ except sh.CommandNotFound:
|
|||||||
logger.debug('unpaper not found')
|
logger.debug('unpaper not found')
|
||||||
UNPAPER = None
|
UNPAPER = None
|
||||||
|
|
||||||
|
"""
|
||||||
def do_document_ocr(document_version):
|
for document_page in document_version.pages.all():
|
||||||
"""
|
|
||||||
Try first to extract text from document pages using the registered
|
|
||||||
parser, if the parser fails or if there is no parser registered for
|
|
||||||
the document mimetype do a visual OCR by calling the corresponding
|
|
||||||
OCR backend
|
|
||||||
"""
|
|
||||||
for document_page in document_version.pages.all():
|
|
||||||
try:
|
|
||||||
# Try to extract text by means of a parser
|
|
||||||
parse_document_page(document_page)
|
|
||||||
except (ParserError, ParserUnknownFile):
|
|
||||||
# Fall back to doing visual OCR
|
|
||||||
|
|
||||||
# TODO: disabling for now
|
|
||||||
"""
|
|
||||||
unpaper_input = convert(document_filepath, file_format=UNPAPER_FILE_FORMAT)
|
|
||||||
|
|
||||||
logger.debug('unpaper_input: %s', unpaper_input)
|
|
||||||
|
|
||||||
unpaper_output = execute_unpaper(input_filepath=unpaper_input)
|
|
||||||
|
|
||||||
logger.debug('unpaper_output: %s', unpaper_output)
|
|
||||||
|
|
||||||
# Convert to TIFF
|
|
||||||
pre_ocr_filepath = convert(input_filepath=unpaper_output, file_format=DEFAULT_OCR_FILE_FORMAT)
|
|
||||||
|
|
||||||
logger.debug('pre_ocr_filepath: %s', pre_ocr_filepath)
|
|
||||||
|
|
||||||
# Tesseract needs an explicit file extension
|
|
||||||
pre_ocr_filepath_w_ext = os.extsep.join([pre_ocr_filepath, DEFAULT_OCR_FILE_EXTENSION])
|
|
||||||
|
|
||||||
logger.debug('pre_ocr_filepath_w_ext: %s', pre_ocr_filepath_w_ext)
|
|
||||||
|
|
||||||
os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext)
|
|
||||||
try:
|
|
||||||
ocr_text = ocr_backend.execute(pre_ocr_filepath_w_ext, document_version.document.language)
|
|
||||||
|
|
||||||
document_page.content = ocr_cleanup(document_version.document.language, ocr_text)
|
|
||||||
document_page.page_label = _('Text from OCR')
|
|
||||||
document_page.save()
|
|
||||||
finally:
|
|
||||||
fs_cleanup(pre_ocr_filepath_w_ext)
|
|
||||||
fs_cleanup(unpaper_input)
|
|
||||||
fs_cleanup(document_filepath)
|
|
||||||
fs_cleanup(unpaper_output)
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
def ocr_cleanup(language, text):
|
|
||||||
"""
|
|
||||||
Cleanup the OCR's output passing it thru the selected language's
|
|
||||||
cleanup filter
|
|
||||||
"""
|
|
||||||
try:
|
try:
|
||||||
language_backend = import_string('.'.join(['ocr', 'lang', language, 'LanguageBackend']))()
|
# Try to extract text by means of a parser
|
||||||
except ImportError:
|
parse_document_page(document_page)
|
||||||
language_backend = None
|
except (ParserError, ParserUnknownFile):
|
||||||
|
# Fall back to doing visual OCR
|
||||||
output = []
|
"""
|
||||||
for line in text.splitlines():
|
|
||||||
line = line.strip()
|
|
||||||
for word in line.split():
|
|
||||||
if language_backend:
|
|
||||||
try:
|
|
||||||
result = language_backend.check_word(word)
|
|
||||||
except Exception as exception:
|
|
||||||
logger.error(exception)
|
|
||||||
raise Exception('ocr_cleanup() %s' % unicode(exception))
|
|
||||||
else:
|
|
||||||
result = word
|
|
||||||
if result:
|
|
||||||
output.append(result)
|
|
||||||
output.append('\n')
|
|
||||||
|
|
||||||
return ' '.join(output)
|
|
||||||
|
|
||||||
|
|
||||||
def clean_pages():
|
|
||||||
"""
|
|
||||||
Tool that executes the OCR cleanup code on all of the existing
|
|
||||||
documents
|
|
||||||
"""
|
|
||||||
for page in DocumentPage.objects.all():
|
|
||||||
if page.content:
|
|
||||||
page.content = ocr_cleanup(page.document.language, page.content)
|
|
||||||
page.save()
|
|
||||||
|
|
||||||
|
|
||||||
def execute_unpaper(input_filepath, output_filepath=None):
|
def execute_unpaper(input_filepath, output_filepath=None):
|
||||||
|
|||||||
@@ -20,8 +20,7 @@ from rest_api.classes import APIEndPoint
|
|||||||
|
|
||||||
from .handlers import post_version_upload_ocr
|
from .handlers import post_version_upload_ocr
|
||||||
from .links import (
|
from .links import (
|
||||||
link_document_all_ocr_cleanup, link_document_submit,
|
link_document_submit, link_document_submit_multiple, link_entry_delete,
|
||||||
link_document_submit_multiple, link_entry_delete,
|
|
||||||
link_entry_delete_multiple, link_entry_list, link_entry_re_queue,
|
link_entry_delete_multiple, link_entry_list, link_entry_re_queue,
|
||||||
link_entry_re_queue_multiple
|
link_entry_re_queue_multiple
|
||||||
)
|
)
|
||||||
@@ -91,8 +90,6 @@ class OCRApp(apps.AppConfig):
|
|||||||
else:
|
else:
|
||||||
namespace.add_property('unpaper', _('unpaper version'), unpaper('-V').stdout, report=True)
|
namespace.add_property('unpaper', _('unpaper version'), unpaper('-V').stdout, report=True)
|
||||||
|
|
||||||
register_maintenance_links([link_document_all_ocr_cleanup], namespace='ocr', title=_('OCR'))
|
|
||||||
|
|
||||||
register_model_list_columns(DocumentVersionOCRError, [
|
register_model_list_columns(DocumentVersionOCRError, [
|
||||||
{
|
{
|
||||||
'name': _('Document'), 'attribute': encapsulate(lambda entry: document_link(entry.document_version.document))
|
'name': _('Document'), 'attribute': encapsulate(lambda entry: document_link(entry.document_version.document))
|
||||||
|
|||||||
@@ -5,15 +5,9 @@ from django.utils.translation import ugettext_lazy as _
|
|||||||
from navigation import Link
|
from navigation import Link
|
||||||
|
|
||||||
from .permissions import (
|
from .permissions import (
|
||||||
PERMISSION_OCR_CLEAN_ALL_PAGES, PERMISSION_OCR_DOCUMENT,
|
PERMISSION_OCR_DOCUMENT, PERMISSION_OCR_DOCUMENT_DELETE
|
||||||
PERMISSION_OCR_DOCUMENT_DELETE
|
|
||||||
)
|
)
|
||||||
|
|
||||||
link_document_all_ocr_cleanup = Link(
|
|
||||||
description=_('Runs a language filter to remove common OCR mistakes from document pages content.'),
|
|
||||||
permissions=[PERMISSION_OCR_CLEAN_ALL_PAGES],
|
|
||||||
text=_('Clean up pages content'), view='ocr:document_all_ocr_cleanup'
|
|
||||||
)
|
|
||||||
link_document_submit = Link(permissions=[PERMISSION_OCR_DOCUMENT], text=_('Submit to OCR queue'), view='ocr:document_submit', args='object.id')
|
link_document_submit = Link(permissions=[PERMISSION_OCR_DOCUMENT], text=_('Submit to OCR queue'), view='ocr:document_submit', args='object.id')
|
||||||
link_document_submit_multiple = Link(text=_('Submit to OCR queue'), view='ocr:document_submit_multiple')
|
link_document_submit_multiple = Link(text=_('Submit to OCR queue'), view='ocr:document_submit_multiple')
|
||||||
link_entry_delete = Link(permissions=[PERMISSION_OCR_DOCUMENT_DELETE], text=_('Delete'), view='ocr:entry_delete', args='object.id')
|
link_entry_delete = Link(permissions=[PERMISSION_OCR_DOCUMENT_DELETE], text=_('Delete'), view='ocr:entry_delete', args='object.id')
|
||||||
|
|||||||
@@ -7,4 +7,3 @@ from permissions.models import Permission, PermissionNamespace
|
|||||||
ocr_namespace = PermissionNamespace('ocr', _('OCR'))
|
ocr_namespace = PermissionNamespace('ocr', _('OCR'))
|
||||||
PERMISSION_OCR_DOCUMENT = Permission.objects.register(ocr_namespace, 'ocr_document', _('Submit documents for OCR'))
|
PERMISSION_OCR_DOCUMENT = Permission.objects.register(ocr_namespace, 'ocr_document', _('Submit documents for OCR'))
|
||||||
PERMISSION_OCR_DOCUMENT_DELETE = Permission.objects.register(ocr_namespace, 'ocr_document_delete', _('Delete documents from OCR queue'))
|
PERMISSION_OCR_DOCUMENT_DELETE = Permission.objects.register(ocr_namespace, 'ocr_document_delete', _('Delete documents from OCR queue'))
|
||||||
PERMISSION_OCR_CLEAN_ALL_PAGES = Permission.objects.register(ocr_namespace, 'ocr_clean_all_pages', _('Can execute the OCR clean up on all document pages'))
|
|
||||||
|
|||||||
@@ -8,7 +8,6 @@ urlpatterns = patterns(
|
|||||||
'ocr.views',
|
'ocr.views',
|
||||||
url(r'^document/(?P<pk>\d+)/submit/$', 'document_submit', (), 'document_submit'),
|
url(r'^document/(?P<pk>\d+)/submit/$', 'document_submit', (), 'document_submit'),
|
||||||
url(r'^document/multiple/submit/$', 'document_submit_multiple', (), 'document_submit_multiple'),
|
url(r'^document/multiple/submit/$', 'document_submit_multiple', (), 'document_submit_multiple'),
|
||||||
url(r'^document/all/clean_up/$', 'document_all_ocr_cleanup', (), 'document_all_ocr_cleanup'),
|
|
||||||
|
|
||||||
url(r'^all/$', 'entry_list', (), 'entry_list'),
|
url(r'^all/$', 'entry_list', (), 'entry_list'),
|
||||||
url(r'^(?P<pk>\d+)/delete/$', 'entry_delete', (), 'entry_delete'),
|
url(r'^(?P<pk>\d+)/delete/$', 'entry_delete', (), 'entry_delete'),
|
||||||
|
|||||||
@@ -13,11 +13,9 @@ from acls.models import AccessEntry
|
|||||||
from documents.models import Document, DocumentVersion
|
from documents.models import Document, DocumentVersion
|
||||||
from permissions.models import Permission
|
from permissions.models import Permission
|
||||||
|
|
||||||
from .api import clean_pages
|
|
||||||
from .models import DocumentVersionOCRError
|
from .models import DocumentVersionOCRError
|
||||||
from .permissions import (
|
from .permissions import (
|
||||||
PERMISSION_OCR_CLEAN_ALL_PAGES, PERMISSION_OCR_DOCUMENT,
|
PERMISSION_OCR_DOCUMENT, PERMISSION_OCR_DOCUMENT_DELETE
|
||||||
PERMISSION_OCR_DOCUMENT_DELETE
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -44,30 +42,6 @@ def document_submit_multiple(request):
|
|||||||
return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse(settings.LOGIN_REDIRECT_URL)))
|
return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse(settings.LOGIN_REDIRECT_URL)))
|
||||||
|
|
||||||
|
|
||||||
def document_all_ocr_cleanup(request):
|
|
||||||
Permission.objects.check_permissions(request.user, [PERMISSION_OCR_CLEAN_ALL_PAGES])
|
|
||||||
|
|
||||||
previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', reverse(settings.LOGIN_REDIRECT_URL))))
|
|
||||||
next = request.POST.get('next', request.GET.get('next', request.META.get('HTTP_REFERER', reverse(settings.LOGIN_REDIRECT_URL))))
|
|
||||||
|
|
||||||
if request.method != 'POST':
|
|
||||||
return render_to_response('appearance/generic_confirm.html', {
|
|
||||||
'previous': previous,
|
|
||||||
'next': next,
|
|
||||||
'title': _('Are you sure you wish to clean up all the pages content?'),
|
|
||||||
'message': _('On large databases this operation may take some time to execute.'),
|
|
||||||
}, context_instance=RequestContext(request))
|
|
||||||
else:
|
|
||||||
try:
|
|
||||||
# TODO: turn this into a Celery task
|
|
||||||
clean_pages()
|
|
||||||
messages.success(request, _('Document pages content clean up complete.'))
|
|
||||||
except Exception as exception:
|
|
||||||
messages.error(request, _('Document pages content clean up error: %s') % exception)
|
|
||||||
|
|
||||||
return HttpResponseRedirect(next)
|
|
||||||
|
|
||||||
|
|
||||||
def entry_list(request):
|
def entry_list(request):
|
||||||
Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT])
|
Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT])
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user