diff --git a/mayan/apps/ocr/api.py b/mayan/apps/ocr/api.py index 8899765c5b..6447643ae9 100644 --- a/mayan/apps/ocr/api.py +++ b/mayan/apps/ocr/api.py @@ -31,94 +31,14 @@ except sh.CommandNotFound: logger.debug('unpaper not found') UNPAPER = None - -def do_document_ocr(document_version): - """ - Try first to extract text from document pages using the registered - parser, if the parser fails or if there is no parser registered for - the document mimetype do a visual OCR by calling the corresponding - OCR backend - """ - for document_page in document_version.pages.all(): - try: - # Try to extract text by means of a parser - parse_document_page(document_page) - except (ParserError, ParserUnknownFile): - # Fall back to doing visual OCR - - # TODO: disabling for now - """ - unpaper_input = convert(document_filepath, file_format=UNPAPER_FILE_FORMAT) - - logger.debug('unpaper_input: %s', unpaper_input) - - unpaper_output = execute_unpaper(input_filepath=unpaper_input) - - logger.debug('unpaper_output: %s', unpaper_output) - - # Convert to TIFF - pre_ocr_filepath = convert(input_filepath=unpaper_output, file_format=DEFAULT_OCR_FILE_FORMAT) - - logger.debug('pre_ocr_filepath: %s', pre_ocr_filepath) - - # Tesseract needs an explicit file extension - pre_ocr_filepath_w_ext = os.extsep.join([pre_ocr_filepath, DEFAULT_OCR_FILE_EXTENSION]) - - logger.debug('pre_ocr_filepath_w_ext: %s', pre_ocr_filepath_w_ext) - - os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext) - try: - ocr_text = ocr_backend.execute(pre_ocr_filepath_w_ext, document_version.document.language) - - document_page.content = ocr_cleanup(document_version.document.language, ocr_text) - document_page.page_label = _('Text from OCR') - document_page.save() - finally: - fs_cleanup(pre_ocr_filepath_w_ext) - fs_cleanup(unpaper_input) - fs_cleanup(document_filepath) - fs_cleanup(unpaper_output) - """ - - -def ocr_cleanup(language, text): - """ - Cleanup the OCR's output passing it thru the selected language's - cleanup filter - """ +""" +for document_page in document_version.pages.all(): try: - language_backend = import_string('.'.join(['ocr', 'lang', language, 'LanguageBackend']))() - except ImportError: - language_backend = None - - output = [] - for line in text.splitlines(): - line = line.strip() - for word in line.split(): - if language_backend: - try: - result = language_backend.check_word(word) - except Exception as exception: - logger.error(exception) - raise Exception('ocr_cleanup() %s' % unicode(exception)) - else: - result = word - if result: - output.append(result) - output.append('\n') - - return ' '.join(output) - - -def clean_pages(): - """ - Tool that executes the OCR cleanup code on all of the existing - documents - """ - for page in DocumentPage.objects.all(): - if page.content: - page.content = ocr_cleanup(page.document.language, page.content) - page.save() + # Try to extract text by means of a parser + parse_document_page(document_page) + except (ParserError, ParserUnknownFile): + # Fall back to doing visual OCR +""" def execute_unpaper(input_filepath, output_filepath=None): diff --git a/mayan/apps/ocr/apps.py b/mayan/apps/ocr/apps.py index d749b0fcc5..6b5d5daa69 100644 --- a/mayan/apps/ocr/apps.py +++ b/mayan/apps/ocr/apps.py @@ -20,8 +20,7 @@ from rest_api.classes import APIEndPoint from .handlers import post_version_upload_ocr from .links import ( - link_document_all_ocr_cleanup, link_document_submit, - link_document_submit_multiple, link_entry_delete, + link_document_submit, link_document_submit_multiple, link_entry_delete, link_entry_delete_multiple, link_entry_list, link_entry_re_queue, link_entry_re_queue_multiple ) @@ -91,8 +90,6 @@ class OCRApp(apps.AppConfig): else: namespace.add_property('unpaper', _('unpaper version'), unpaper('-V').stdout, report=True) - register_maintenance_links([link_document_all_ocr_cleanup], namespace='ocr', title=_('OCR')) - register_model_list_columns(DocumentVersionOCRError, [ { 'name': _('Document'), 'attribute': encapsulate(lambda entry: document_link(entry.document_version.document)) diff --git a/mayan/apps/ocr/links.py b/mayan/apps/ocr/links.py index 4fe67f3315..80973dc417 100644 --- a/mayan/apps/ocr/links.py +++ b/mayan/apps/ocr/links.py @@ -5,15 +5,9 @@ from django.utils.translation import ugettext_lazy as _ from navigation import Link from .permissions import ( - PERMISSION_OCR_CLEAN_ALL_PAGES, PERMISSION_OCR_DOCUMENT, - PERMISSION_OCR_DOCUMENT_DELETE + PERMISSION_OCR_DOCUMENT, PERMISSION_OCR_DOCUMENT_DELETE ) -link_document_all_ocr_cleanup = Link( - description=_('Runs a language filter to remove common OCR mistakes from document pages content.'), - permissions=[PERMISSION_OCR_CLEAN_ALL_PAGES], - text=_('Clean up pages content'), view='ocr:document_all_ocr_cleanup' -) link_document_submit = Link(permissions=[PERMISSION_OCR_DOCUMENT], text=_('Submit to OCR queue'), view='ocr:document_submit', args='object.id') link_document_submit_multiple = Link(text=_('Submit to OCR queue'), view='ocr:document_submit_multiple') link_entry_delete = Link(permissions=[PERMISSION_OCR_DOCUMENT_DELETE], text=_('Delete'), view='ocr:entry_delete', args='object.id') diff --git a/mayan/apps/ocr/permissions.py b/mayan/apps/ocr/permissions.py index b6bf977a6c..c0811fef54 100644 --- a/mayan/apps/ocr/permissions.py +++ b/mayan/apps/ocr/permissions.py @@ -7,4 +7,3 @@ from permissions.models import Permission, PermissionNamespace ocr_namespace = PermissionNamespace('ocr', _('OCR')) PERMISSION_OCR_DOCUMENT = Permission.objects.register(ocr_namespace, 'ocr_document', _('Submit documents for OCR')) PERMISSION_OCR_DOCUMENT_DELETE = Permission.objects.register(ocr_namespace, 'ocr_document_delete', _('Delete documents from OCR queue')) -PERMISSION_OCR_CLEAN_ALL_PAGES = Permission.objects.register(ocr_namespace, 'ocr_clean_all_pages', _('Can execute the OCR clean up on all document pages')) diff --git a/mayan/apps/ocr/urls.py b/mayan/apps/ocr/urls.py index 983dd167d5..02a0bb1120 100644 --- a/mayan/apps/ocr/urls.py +++ b/mayan/apps/ocr/urls.py @@ -8,7 +8,6 @@ urlpatterns = patterns( 'ocr.views', url(r'^document/(?P\d+)/submit/$', 'document_submit', (), 'document_submit'), url(r'^document/multiple/submit/$', 'document_submit_multiple', (), 'document_submit_multiple'), - url(r'^document/all/clean_up/$', 'document_all_ocr_cleanup', (), 'document_all_ocr_cleanup'), url(r'^all/$', 'entry_list', (), 'entry_list'), url(r'^(?P\d+)/delete/$', 'entry_delete', (), 'entry_delete'), diff --git a/mayan/apps/ocr/views.py b/mayan/apps/ocr/views.py index 45299a9e72..d39a19985b 100644 --- a/mayan/apps/ocr/views.py +++ b/mayan/apps/ocr/views.py @@ -13,11 +13,9 @@ from acls.models import AccessEntry from documents.models import Document, DocumentVersion from permissions.models import Permission -from .api import clean_pages from .models import DocumentVersionOCRError from .permissions import ( - PERMISSION_OCR_CLEAN_ALL_PAGES, PERMISSION_OCR_DOCUMENT, - PERMISSION_OCR_DOCUMENT_DELETE + PERMISSION_OCR_DOCUMENT, PERMISSION_OCR_DOCUMENT_DELETE ) @@ -44,30 +42,6 @@ def document_submit_multiple(request): return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse(settings.LOGIN_REDIRECT_URL))) -def document_all_ocr_cleanup(request): - Permission.objects.check_permissions(request.user, [PERMISSION_OCR_CLEAN_ALL_PAGES]) - - previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', reverse(settings.LOGIN_REDIRECT_URL)))) - next = request.POST.get('next', request.GET.get('next', request.META.get('HTTP_REFERER', reverse(settings.LOGIN_REDIRECT_URL)))) - - if request.method != 'POST': - return render_to_response('appearance/generic_confirm.html', { - 'previous': previous, - 'next': next, - 'title': _('Are you sure you wish to clean up all the pages content?'), - 'message': _('On large databases this operation may take some time to execute.'), - }, context_instance=RequestContext(request)) - else: - try: - # TODO: turn this into a Celery task - clean_pages() - messages.success(request, _('Document pages content clean up complete.')) - except Exception as exception: - messages.error(request, _('Document pages content clean up error: %s') % exception) - - return HttpResponseRedirect(next) - - def entry_list(request): Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT])