Added docstring descriptions

2011-05-03 20:58:58 -04:00
parent 6960bff9ef
commit 1e0d8d1f25
1 changed files with 27 additions and 1 deletions
--- a/apps/ocr/api.py
+++ b/apps/ocr/api.py
@@ -19,6 +19,10 @@ from ocr.exceptions import TesseractError, PdftotextError


 def get_language_backend():
+    """
+    Return the OCR cleanup language backend using the selected tesseract
+    language in the configuration settings
+    """
    try:
        module = import_module(u'.'.join([u'ocr', u'lang', TESSERACT_LANGUAGE]))
    except ImportError:
@@ -30,7 +34,9 @@ backend = get_language_backend()


 def cleanup(filename):
-    ''' tries to remove the given filename. Ignores non-existent files '''
+    """
+    Try to remove the given filename, ignoring non-existent files
+    """
    try:
        os.remove(filename)
    except OSError:
@@ -38,6 +44,9 @@ def cleanup(filename):


 def run_tesseract(input_filename, output_filename_base, lang=None):
+    """
+    Execute the command line binary of tesseract
+    """
    command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(output_filename_base)]
    if lang is not None:
        command += [u'-l', lang]
@@ -50,6 +59,9 @@ def run_tesseract(input_filename, output_filename_base, lang=None):


 def run_pdftotext(input_filename, output_filename, page_number=None):
+    """
+    Execute the command line binary of pdftotext
+    """
    command = [unicode(PDFTOTEXT_PATH)]
    if page_number:
        command.extend(['-nopgbrk', '-f', unicode(page_number), '-l', unicode(page_number)])
@@ -62,6 +74,11 @@ def run_pdftotext(input_filename, output_filename, page_number=None):


 def do_document_ocr(document):
+    """
+    Do OCR on all the pages of the given document object, first
+    trying to extract text from PDF using pdftotext then by calling
+    tesseract
+    """
    for page_index, document_page in enumerate(document.documentpage_set.all()):
        desc, filepath = tempfile.mkstemp()
        imagefile = None
@@ -100,6 +117,11 @@ def do_document_ocr(document):


 def ocr_cleanup(text):
+    """
+    Clean up the OCR's output by passing it through the selected language's
+    cleanup filter
+    """
+    
    output = []
    for line in text.splitlines():
        line = line.strip()
@@ -116,6 +138,10 @@ def ocr_cleanup(text):


 def clean_pages():
+    """
+    Tool that executes the OCR cleanup code on all of the existing
+    documents
+    """
    for page in DocumentPage.objects.all():
        if page.content:
            page.content = ocr_cleanup(page.content)