Remove OCR cleanup view

2015-06-09 03:29:15 -04:00
parent 5275061f9f
commit 74bd69b61c
6 changed files with 10 additions and 127 deletions
--- a/mayan/apps/ocr/api.py
+++ b/mayan/apps/ocr/api.py
@@ -31,94 +31,14 @@ except sh.CommandNotFound:
    logger.debug('unpaper not found')
    UNPAPER = None

-
-def do_document_ocr(document_version):
-    """
-    Try first to extract text from document pages using the registered
-    parser, if the parser fails or if there is no parser registered for
-    the document mimetype do a visual OCR by calling the corresponding
-    OCR backend
-    """
-    for document_page in document_version.pages.all():
-        try:
-            # Try to extract text by means of a parser
-            parse_document_page(document_page)
-        except (ParserError, ParserUnknownFile):
-            # Fall back to doing visual OCR
-
-            # TODO: disabling for now
-            """
-            unpaper_input = convert(document_filepath, file_format=UNPAPER_FILE_FORMAT)
-
-            logger.debug('unpaper_input: %s', unpaper_input)
-
-            unpaper_output = execute_unpaper(input_filepath=unpaper_input)
-
-            logger.debug('unpaper_output: %s', unpaper_output)
-
-            # Convert to TIFF
-            pre_ocr_filepath = convert(input_filepath=unpaper_output, file_format=DEFAULT_OCR_FILE_FORMAT)
-
-            logger.debug('pre_ocr_filepath: %s', pre_ocr_filepath)
-
-            # Tesseract needs an explicit file extension
-            pre_ocr_filepath_w_ext = os.extsep.join([pre_ocr_filepath, DEFAULT_OCR_FILE_EXTENSION])
-
-            logger.debug('pre_ocr_filepath_w_ext: %s', pre_ocr_filepath_w_ext)
-
-            os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext)
-            try:
-                ocr_text = ocr_backend.execute(pre_ocr_filepath_w_ext, document_version.document.language)
-
-                document_page.content = ocr_cleanup(document_version.document.language, ocr_text)
-                document_page.page_label = _('Text from OCR')
-                document_page.save()
-            finally:
-                fs_cleanup(pre_ocr_filepath_w_ext)
-                fs_cleanup(unpaper_input)
-                fs_cleanup(document_filepath)
-                fs_cleanup(unpaper_output)
-            """
-
-
-def ocr_cleanup(language, text):
-    """
-    Cleanup the OCR's output passing it thru the selected language's
-    cleanup filter
-    """
+"""
+for document_page in document_version.pages.all():
    try:
-        language_backend = import_string('.'.join(['ocr', 'lang', language, 'LanguageBackend']))()
-    except ImportError:
-        language_backend = None
-
-    output = []
-    for line in text.splitlines():
-        line = line.strip()
-        for word in line.split():
-            if language_backend:
-                try:
-                    result = language_backend.check_word(word)
-                except Exception as exception:
-                    logger.error(exception)
-                    raise Exception('ocr_cleanup() %s' % unicode(exception))
-            else:
-                result = word
-            if result:
-                output.append(result)
-        output.append('\n')
-
-    return ' '.join(output)
-
-
-def clean_pages():
-    """
-    Tool that executes the OCR cleanup code on all of the existing
-    documents
-    """
-    for page in DocumentPage.objects.all():
-        if page.content:
-            page.content = ocr_cleanup(page.document.language, page.content)
-            page.save()
+        # Try to extract text by means of a parser
+        parse_document_page(document_page)
+    except (ParserError, ParserUnknownFile):
+        # Fall back to doing visual OCR
+"""


 def execute_unpaper(input_filepath, output_filepath=None):