Apply transformation before doing OCR, added unpaper to the OCR pre processing pipe

2011-02-16 03:32:21 -04:00
parent 15afaadc4c
commit b1e2f64617
8 changed files with 106 additions and 30 deletions
--- a/README.md
+++ b/README.md
@@ -38,8 +38,9 @@ Or execute pip install -r requirements/production.txt to install the dependencie
 Executables:

 * ImageMagick - Convert, Edit, Or Compose Bitmap Images
-* tesseract-ocr - An OCR Engine that was developed at HP Labs between 1985 and 1995... and now at Google.
 * libmagic
+* tesseract-ocr - An OCR Engine that was developed at HP Labs between 1985 and 1995... and now at Google.
+* unpaper - post-processing scanned and photocopied book pages

 License
 -------
--- a/apps/converter/api.py
+++ b/apps/converter/api.py
@@ -7,6 +7,7 @@ import shutil
 from django.template.defaultfilters import slugify

 from converter.conf.settings import CONVERT_PATH
+from converter.conf.settings import UNPAPER_PATH
 from converter.conf.settings import IDENTIFY_PATH
 from converter.conf.settings import OCR_OPTIONS
 from converter.conf.settings import DEFAULT_OPTIONS
@@ -15,7 +16,7 @@ from converter.conf.settings import HIGH_QUALITY_OPTIONS

 #from converter.conf.settings import UNOCONV_PATH

-from converter import TEMPORARY_DIRECTORY
+from converter import TEMPORARY_DIRECTORY, TRANFORMATION_CHOICES
 from utils import from_descriptor_to_tempfile


@@ -50,17 +51,28 @@ def get_errors(error_string):


 #TODO: Timeout & kill child
-def execute_convert(input_filepath, arguments, output_filepath, quality=QUALITY_DEFAULT):
+def execute_convert(input_filepath, output_filepath, quality=QUALITY_DEFAULT, arguments=None):
    command = []
    command.append(CONVERT_PATH)
    command.extend(shlex.split(str(QUALITY_SETTINGS[quality])))
    command.append(input_filepath)
-    command.extend(shlex.split(str(arguments)))
+    if arguments:
+        command.extend(shlex.split(str(arguments)))
    command.append(output_filepath)
-
    proc = subprocess.Popen(command, stderr=subprocess.PIPE)
    return (proc.wait(), proc.stderr.read())

+
+def execute_unpaper(input_filepath, output_filepath):
+    command = []
+    command.append(UNPAPER_PATH)
+    command.append('--overwrite')
+    command.append(input_filepath)
+    command.append(output_filepath)
+    proc = subprocess.Popen(command, stderr=subprocess.PIPE)
+    return (proc.wait(), proc.stderr.read())
+
+
 def execute_unoconv(input_filepath, output_filepath, arguments=''):
    command = [UNOCONV_PATH]
    command.extend(['--stdout'])
@@ -135,7 +147,7 @@ def convert(input_filepath, size, quality=QUALITY_DEFAULT, cache=True, page=0, f
    try:
        input_arg = '%s[%s]' % (input_filepath, page)
        extra_options += ' -resize %s' % size
-        status, error_string = execute_convert(input_arg, extra_options, '%s:%s' % (format, output_filepath), quality=quality)
+        status, error_string = execute_convert(input_filepath=input_arg, arguments=extra_options, output_filepath='%s:%s' % (format, output_filepath), quality=quality)
        if status:
            errors = get_errors(error_string)
            raise ConvertError(status, errors)
@@ -170,12 +182,44 @@ def convert_document_for_ocr(document, page=0, format='tif'):
    #Convert for OCR
    temp_filename, separator = os.path.splitext(os.path.basename(input_filepath))
    temp_path = os.path.join(TEMPORARY_DIRECTORY, temp_filename)
-    output_arg = '%s_ocr%s%s%s' % (temp_path, page, os.extsep, format)
+    transformation_output_file = '%s_trans%s%s%s' % (temp_path, page, os.extsep, format)
+    unpaper_input_file = '%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep)
+    unpaper_output_file = '%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep)
+    convert_output_file = '%s_ocr%s%s%s' % (temp_path, page, os.extsep, format)
+    
    input_arg = '%s[%s]' % (input_filepath, page)
+
+    transformation_list = []
    try:
-        status, error_string = execute_convert(input_arg, OCR_OPTIONS, output_arg)
+        #Catch invalid or non existing pages
+        document_page = document.documentpage_set.get(document=document, page_number=page+1)
+        for page_transformation in document_page.documentpagetransformation_set.all():
+            try:
+                if page_transformation.transformation in TRANFORMATION_CHOICES:
+                    output = TRANFORMATION_CHOICES[page_transformation.transformation] % eval(page_transformation.arguments)
+                    transformation_list.append(output)
+            except Exception, e:
+                if request.user.is_staff:
+                    messages.warning(request, _(u'Error for transformation %(transformation)s:, %(error)s') % 
+                        {'transformation':page_transformation.get_transformation_display(),
+                        'error':e})
+                else:
+                    pass
+    except ObjectDoesNotExist:
+        pass
+
+    tranformation_string = ' '.join(transformation_list)
+    try:
+        status, error_string = execute_convert(input_filepath=input_arg, quality=QUALITY_HIGH, arguments=tranformation_string, output_filepath=transformation_output_file)
+        status, error_string = execute_convert(input_filepath=transformation_output_file, arguments=OCR_OPTIONS, output_filepath=unpaper_input_file)
+        status, error_string = execute_unpaper(input_filepath=unpaper_input_file, output_filepath=unpaper_output_file)
+        status, error_string = execute_convert(input_filepath=unpaper_output_file, output_filepath=convert_output_file)
+        
        if status:
            errors = get_errors(error_string)
            raise ConvertError(status, errors)
    finally:
-        return output_arg
+        cleanup(transformation_output_file)
+        cleanup(unpaper_input_file)
+        cleanup(unpaper_output_file)
+        return convert_output_file
--- a/apps/converter/conf/settings.py
+++ b/apps/converter/conf/settings.py
@@ -1,6 +1,7 @@
 from django.conf import settings

 CONVERT_PATH = getattr(settings, 'CONVERTER_CONVERT_PATH', u'/usr/bin/convert')
+UNPAPER_PATH = getattr(settings, 'CONVERTER_UNPAPER_PATH', u'/usr/bin/unpaper')
 IDENTIFY_PATH = getattr(settings, 'CONVERTER_IDENTIFY_PATH', u'/usr/bin/identify')
 OCR_OPTIONS = getattr(settings, 'CONVERTER_OCR_OPTIONS', u'-colorspace Gray -depth 8 -resample 200x200')
 DEFAULT_OPTIONS = getattr(settings, 'CONVERTER_DEFAULT_OPTIONS', u'')
--- a/apps/ocr/api.py
+++ b/apps/ocr/api.py
@@ -22,10 +22,20 @@ def cleanup(filename):
        pass

 class TesseractError(Exception):
-    def __init__(self, status, message):
-        self.status = status
-        self.message = message
+    pass
+#    def __init__(self, status, message):
+#        self.status = status
+#        self.message = message

+def get_errors(error_string):
+    '''
+    returns all lines in the error_string that start with the string "error"
+
+    '''
+    lines = error_string.splitlines()
+    return lines[1]
+    #error_lines = (line for line in lines if line.find('error') >= 0)
+    #return '\n'.join(error_lines)

 def run_tesseract(input_filename, output_filename_base, lang=None):
    command = [TESSERACT_PATH, input_filename, output_filename_base]
@@ -44,18 +54,19 @@ def ocr_document(document):
            status, error_string = run_tesseract(imagefile, filepath)
            if status:
                errors = get_errors(error_string)
-                raise TesseractError(status, errors)
+                raise TesseractError(errors)
        finally:
            ocr_output = os.extsep.join([filepath, 'txt'])
-            f = file(ocr_output)
-            try:
-                document_page, created = DocumentPage.objects.get_or_create(document=document,
-                    page_number=page_index+1)
-                document_page.content = f.read().strip()
-                document_page.page_label = _(u'Text from OCR')
-                document_page.save()
-            finally:
-                f.close()
-                cleanup(filepath)
-                cleanup(ocr_output)
-                cleanup(imagefile)
+
+        f = file(ocr_output)
+        try:
+            document_page, created = DocumentPage.objects.get_or_create(document=document,
+                page_number=page_index+1)
+            document_page.content = f.read().strip()
+            document_page.page_label = _(u'Text from OCR')
+            document_page.save()
+        finally:
+            f.close()
+            #cleanup(filepath)
+            #cleanup(ocr_output)
+            #cleanup(imagefile)
--- a/docs/CREDITS
+++ b/docs/CREDITS
@@ -4,7 +4,8 @@ Python
    Copyright (c) 1995-2001 Corporation for National Research Initiatives.
    Copyright (c) 1991-1995 Stichting Mathematisch Centrum, Amsterdam.

-Django - A high-level Python Web framework that encourages rapid development and clean, pragmatic design.
+Django - A high-level Python Web framework that encourages rapid 
+         development and clean, pragmatic design.
    Copyright Django Software Foundation
    http://www.djangoproject.com/

@@ -32,7 +33,8 @@ django-extensions - Extensions for Django
    Copyright Bas van Oostveen (v.oostveen@gmail.com)
    http://code.google.com/p/django-command-extensions/

-django-rosetta - A Django application that eases the translation of Django projects
+django-rosetta - A Django application that eases the translation of
+                 Django projects
    Copyright Marco Bonetti (mbonetti@gmail.com)
    http://code.google.com/p/django-rosetta/

@@ -48,7 +50,8 @@ django-filetransfers - File upload/download abstraction
    Waldemar Kornewald
    http://www.allbuttonspressed.com/projects/django-filetransfers

-tesseract - An OCR Engine that was developed at HP Labs between 1985 and 1995... and now at Google.
+tesseract - An OCR Engine that was developed at HP Labs between 1985 and
+            1995... and now at Google.
    http://code.google.com/p/tesseract-ocr/

 Image file 1068504_92921456 "Mayan piramid" (Stock Exchange)
@@ -63,3 +66,12 @@ Fat cow icon set
 Python-magic - python-magic is a simple wrapper for libmagic
    Adam Hupp <adam at hupp.org>
    https://github.com/ahupp/python-magic
+
+Fancybox - FancyBox is a tool for displaying images, html content and
+           multi-media in a Mac-style "lightbox" that floats overtop of
+           web page. 
+    http://fancybox.net
+
+unpaper - post-processing scanned and photocopied book pages
+    Jens Gulden 2005-2007 - unpaper@jensgulden.de.
+    http://unpaper.berlios.de/
--- a/docs/Changelog.txt
+++ b/docs/Changelog.txt
@@ -10,3 +10,5 @@
    To update a previous database do: [d.update_page_count() for d in Document.objects.all()]
 * Added support for document page transformation (no GUI yet)
 * Added views to create, edit and grant/revoke permissions to roles
+* Apply default transformations to document before OCR
+* Added unpaper to the OCR conversion pipe
--- a/docs/TODO
+++ b/docs/TODO
@@ -31,6 +31,8 @@
 * Assign default role to new users                                     - DONE
 * DB stored transformations                                            - DONE
 * Recognize multi-page documents                                       - DONE
+* Add unpaper to pre OCR document cleanup                              - DONE
+* Role editing view under setup                                        - STARTED
 * Document list filtering by metadata
 * Filterform date filtering widget
 * Validate GET data before saving file
@@ -53,10 +55,10 @@
 * Field for document language or autodetect
 * Count pages in a PDF file http://pybrary.net/pyPdf/
 * Download a document in diffent formats: (jpg, png, pdf)
+* Download a document in diffent formats: (jpg, png, pdf)
 * Cache.cleanup function to delete cached images when document hash changes
 * Divide navigation links search by object and by view
 * Add show_summary method to model to display as results of a search
-* Add unpaper to pre OCR document cleanup
 * Support distributed OCR queues (RabbitMQ & Celery?)
 * DXF viewer - http://code.google.com/p/dxf-reader/source/browse/#svn%2Ftrunk
 * Support spreadsheets, wordprocessing docs using openoffice in server mode
@@ -64,10 +66,12 @@
 * Handle ziped or rar archives
 * Display preferences 'document transformations' (Rotation, default zoom)
 * Gallery view for document groups
-* Role editing view under setup
 * Download metadata group documents as a single zip file
 * Download original document or transformed document
 * Include annotations in transformed documents downloads
 * Document view temp transformations
 * Implement permissions decorators
 * block Setup menu item to non staff and non superuser users
+* Don't append an extension separator if extension is nonexistent
+* Don't do OCR on word processing or spreadsheet documents, strip tags and store text
+* Storage backend to storage backend copy support, to move/migrate document to new storage backend
--- a/settings.py
+++ b/settings.py
@@ -216,6 +216,7 @@ LOGIN_EXEMPT_URLS = (
 #CONVERTER_CONVERT_PATH = u'/usr/bin/convert'
 #CONVERTER_OCR_OPTIONS = u'-colorspace Gray -depth 8 -resample 200x200'
 #CONVERTER_IDENTIFY_PATH = u'/usr/bin/identify'
+#CONVERTER_UNPAPER_PATH = u'/usr/bin/unpaper'

 # OCR
 #OCR_TESSERACT_PATH = u'/usr/bin/tesseract'