diff --git a/apps/ocr/api.py b/apps/ocr/api.py index 14c71e8542..6711b610f5 100644 --- a/apps/ocr/api.py +++ b/apps/ocr/api.py @@ -19,6 +19,10 @@ from ocr.exceptions import TesseractError, PdftotextError def get_language_backend(): + """ + Return the OCR cleanup language backend using the selected tesseract + language in the configuration settings + """ try: module = import_module(u'.'.join([u'ocr', u'lang', TESSERACT_LANGUAGE])) except ImportError: @@ -30,7 +34,9 @@ backend = get_language_backend() def cleanup(filename): - ''' tries to remove the given filename. Ignores non-existent files ''' + """ + Try to remove the given filename, ignoring non-existent files + """ try: os.remove(filename) except OSError: @@ -38,6 +44,9 @@ def cleanup(filename): def run_tesseract(input_filename, output_filename_base, lang=None): + """ + Execute the command line binary of tesseract + """ command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(output_filename_base)] if lang is not None: command += [u'-l', lang] @@ -50,6 +59,9 @@ def run_tesseract(input_filename, output_filename_base, lang=None): def run_pdftotext(input_filename, output_filename, page_number=None): + """ + Execute the command line binary of pdftotext + """ command = [unicode(PDFTOTEXT_PATH)] if page_number: command.extend(['-nopgbrk', '-f', unicode(page_number), '-l', unicode(page_number)]) @@ -62,6 +74,11 @@ def run_pdftotext(input_filename, output_filename, page_number=None): def do_document_ocr(document): + """ + Do OCR on all the pages of the given document object, first + trying to extract text from PDF using pdftotext then by calling + tesseract + """ for page_index, document_page in enumerate(document.documentpage_set.all()): desc, filepath = tempfile.mkstemp() imagefile = None @@ -100,6 +117,11 @@ def do_document_ocr(document): def ocr_cleanup(text): + """ + Clean up the OCR's output by passing it through the selected language's + cleanup filter + """ + output = [] for line in text.splitlines(): line 
= line.strip() @@ -116,6 +138,10 @@ def ocr_cleanup(text): def clean_pages(): + """ + Tool that executes the OCR cleanup code on all of the existing + documents + """ for page in DocumentPage.objects.all(): if page.content: page.content = ocr_cleanup(page.content)