Finished adding language-specific OCR cleanup code

2011-04-07 12:23:26 -04:00
parent d86c521858
commit d54fd98ec5
9 changed files with 134 additions and 54 deletions
--- a/apps/main/init.py
+++ b/apps/main/init.py
@@ -6,6 +6,7 @@ from permissions import role_list

 from documents import document_find_all_duplicates
 from filesystem_serving import filesystem_serving_recreate_all_links
+from ocr import all_document_ocr_cleanup

 from main.conf.settings import SIDE_BAR_SEARCH

@@ -17,7 +18,7 @@ main_menu = [
    {'text':_(u'home'), 'view':'home', 'famfam':'house', 'position':0},
    {'text':_(u'tools'), 'view':'tools_menu', 'links': [
        document_find_all_duplicates, filesystem_serving_recreate_all_links,
-        statistics, diagnostics,
+        all_document_ocr_cleanup, statistics, diagnostics,
        ],'famfam':'wrench', 'name':'tools','position':7},

    {'text':_(u'setup'), 'view':'check_settings', 'links': [
--- a/apps/main/views.py
+++ b/apps/main/views.py
@@ -128,7 +128,8 @@ def blank_menu(request):
        'title':_(u'Tools menu'),
        'paragraphs':[
            _(u'"Find all duplicates": Search all the documents\' checksums and return a list of the exact matches.'),
-            _(u'"Recreate index links": Deletes and creates from scratch all the file system indexing links.')
+            _(u'"Recreate index links": Deletes and creates from scratch all the file system indexing links.'),
+            _(u'"Clean up pages content": Runs a language filter to remove common OCR mistakes from document pages content.')
        ],
        },
    context_instance=RequestContext(request))    
--- a/apps/ocr/init.py
+++ b/apps/ocr/init.py
@@ -19,11 +19,13 @@ from literals import QUEUEDOCUMENT_STATE_PROCESSING, \
 PERMISSION_OCR_DOCUMENT = 'ocr_document'
 PERMISSION_OCR_DOCUMENT_DELETE = 'ocr_document_delete'
 PERMISSION_OCR_QUEUE_ENABLE_DISABLE = 'ocr_queue_enable_disable'
+PERMISSION_OCR_CLEAN_ALL_PAGES = 'ocr_clean_all_pages'

 register_permissions('ocr', [
    {'name':PERMISSION_OCR_DOCUMENT, 'label':_(u'Submit document for OCR')},
    {'name':PERMISSION_OCR_DOCUMENT_DELETE, 'label':_(u'Delete document for OCR queue')},
    {'name':PERMISSION_OCR_QUEUE_ENABLE_DISABLE, 'label':_(u'Can enable/disable an OCR queue')},
+    {'name':PERMISSION_OCR_CLEAN_ALL_PAGES, 'label':_(u'Can execute an OCR clean up on all document pages')},
 ])

 #Links
@@ -36,6 +38,8 @@ queue_document_multiple_delete = {'text':_(u'delete'), 'view':'queue_document_mu
 document_queue_disable = {'text':_(u'stop queue'), 'view':'document_queue_disable', 'args':'object.id', 'famfam':'control_stop_blue', 'permissions':{'namespace':'ocr', 'permissions':[PERMISSION_OCR_QUEUE_ENABLE_DISABLE]}}
 document_queue_enable = {'text':_(u'activate queue'), 'view':'document_queue_enable', 'args':'object.id', 'famfam':'control_play_blue', 'permissions':{'namespace':'ocr', 'permissions':[PERMISSION_OCR_QUEUE_ENABLE_DISABLE]}}

+all_document_ocr_cleanup = {'text':_(u'clean up pages content'), 'view':'all_document_ocr_cleanup', 'famfam':'text_strikethrough', 'permissions':{'namespace':'ocr', 'permissions':[PERMISSION_OCR_CLEAN_ALL_PAGES]}}
+
 register_links(Document, [submit_document], menu_name='sidebar')
 register_links(DocumentQueue, [document_queue_disable, document_queue_enable])

--- a/apps/ocr/api.py
+++ b/apps/ocr/api.py
@@ -1,20 +1,32 @@
-# -*- coding: iso-8859-1 -*-
 #Some code from http://wiki.github.com/hoffstaetter/python-tesseract

 import codecs
 import os
 import subprocess
-import re
 import tempfile
+import sys

 from django.utils.translation import ugettext as _
+from django.utils.importlib import import_module

 from common import TEMPORARY_DIRECTORY
 from converter.api import convert_document_for_ocr
+from documents.models import DocumentPage

 from ocr.conf.settings import TESSERACT_PATH
 from ocr.conf.settings import TESSERACT_LANGUAGE

+    
+def get_language_backend():  # Import the cleanup module ocr.lang.<TESSERACT_LANGUAGE>; returns the module, or None when the import fails
+    try:
+        module = import_module(u'.'.join([u'ocr',u'lang', TESSERACT_LANGUAGE]))  # dotted path built from the configured language code
+    except ImportError:
+        sys.stderr.write('\nError: No OCR app language backend for language: %s\n\n' % TESSERACT_LANGUAGE)  # reported on stderr only; the error is not re-raised
+        return None
+    return module
+
+backend = get_language_backend()  # resolved once at import time; ocr_cleanup() keeps words unchanged when this is None
+

 class TesseractError(Exception):
    pass
@@ -48,8 +60,8 @@ def do_document_ocr(document):
            run_tesseract(imagefile, filepath, TESSERACT_LANGUAGE)
            ocr_output = os.extsep.join([filepath, 'txt'])
            f = codecs.open(ocr_output, 'r', 'utf-8')
-            document_page = document.documentpage_set.get(page_number=page_index+1)
-            document_page.content = f.read().strip()
+            document_page = document.documentpage_set.get(page_number=page_index + 1)
+            document_page.content = ocr_cleanup(f.read().strip())
            document_page.page_label = _(u'Text from OCR')
            document_page.save()
            f.close()
@@ -68,59 +80,19 @@ def ocr_cleanup(text):
    for line in text.splitlines():
        line = line.strip()
        for word in line.split():
-            result = check_word(word)
+            if backend:
+                result = backend.check_word(word)
+            else:
+                result = word
            if result:
                output.append(result)
        output.append('\n')
            
    return u' '.join(output)

-
-def check_word(word):
-    ALL_ALPHANUM = re.compile('([0-9a-záéíóúüñ])', re.I)
-    NON_ALPHANUM = re.compile('([^0-9a-záéíóúüñ])', re.I)
-
-    TOO_MANY_VOWELS = re.compile('[aáeéiíoóuúü]{3}', re.I)
-    TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnñpqrstvwxyz]{5}', re.I)
-    ALL_ALPHA = re.compile('^[a-z]+$', re.I)
-    SINGLE_LETTER_WORDS = re.compile('^[aeoóuy]$', re.I)
-
-    #(L) If a string is longer than 20 characters, it is
-    #garbage:
-    if len(word) > 20:
-        return None
-
-    #(A) If a string’s ratio of alphanumeric characters to total 
-    #characters. is less than 50%, the string is garbage:
-    if len(ALL_ALPHANUM.findall(word)) < len(word) / 2:
-        return None
-
-    #Remove word if all the letters in the word are non alphanumeric
-    if len(NON_ALPHANUM.findall(word)) == len(word):
-        return None
-    
-    #Removed words with too many consecutie vowels
-    if TOO_MANY_VOWELS.findall(word):
-        return None 
-
-    #Removed words with too many consecutie consonants
-    if TOO_MANY_CONSONANTS.findall(word):
-        return None 
-
-    #Only allow specific single letter words
-    if len(word) == 1 and not SINGLE_LETTER_WORDS.findall(word):
-        return None
-        
-    return word
-    
-    
-
-from ocr.api import ocr_cleanup
-from documents.models import DocumentPage
+   
 def clean_pages():
    for page in DocumentPage.objects.all():
        if page.content:
            page.content = ocr_cleanup(page.content)
-            #print page.content
-            print page.pk
            page.save()
--- a/apps/ocr/lang/init.py
+++ b/apps/ocr/lang/init.py
--- a/apps/ocr/lang/eng.py
+++ b/apps/ocr/lang/eng.py
@@ -0,0 +1,38 @@
+import re
+
+def check_word(word):  # Return word unchanged if it passes every heuristic below, or None if it looks like OCR garbage
+    ALL_ALPHANUM = re.compile('([0-9a-z])', re.I)
+    NON_ALPHANUM = re.compile('([^0-9a-z])', re.I)
+
+    TOO_MANY_VOWELS = re.compile('[aeiou]{3}', re.I)
+    TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnpqrstvwxyz]{5}', re.I)
+    ALL_ALPHA = re.compile('^[a-z]+$', re.I)  # NOTE(review): compiled but not referenced anywhere in this function
+    SINGLE_LETTER_WORDS = re.compile('^[ai]$', re.I)
+
+    #(L) If a string is longer than 20 characters, it is garbage
+    if len(word) > 20:
+        return None
+
+    #(A) If a string's ratio of alphanumeric characters to total
+    #characters is less than 50%, the string is garbage
+    if len(ALL_ALPHANUM.findall(word)) < len(word) / 2:  # NOTE(review): with Python 2 int division the threshold is floored for odd lengths - confirm intended
+        return None
+
+    #Remove the word if every character in it is non-alphanumeric
+    if len(NON_ALPHANUM.findall(word)) == len(word):
+        return None
+    
+    #Remove words containing three or more consecutive vowels
+    if TOO_MANY_VOWELS.findall(word):
+        return None 
+
+    #Remove words containing five or more consecutive consonants
+    if TOO_MANY_CONSONANTS.findall(word):
+        return None 
+
+    #Only allow the single letter words matched by SINGLE_LETTER_WORDS
+    if len(word) == 1 and not SINGLE_LETTER_WORDS.findall(word):
+        return None
+        
+    return word
+    
--- a/apps/ocr/lang/spa.py
+++ b/apps/ocr/lang/spa.py
@@ -0,0 +1,39 @@
+# -*- coding: iso-8859-1 -*-
+import re
+
+def check_word(word):  # Return word unchanged if it passes every heuristic below, or None if it looks like OCR garbage
+    ALL_ALPHANUM = re.compile('([0-9a-záéíóúüñ])', re.I)
+    NON_ALPHANUM = re.compile('([^0-9a-záéíóúüñ])', re.I)
+
+    TOO_MANY_VOWELS = re.compile('[aáeéiíoóuúü]{3}', re.I)
+    TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnñpqrstvwxyz]{5}', re.I)
+    ALL_ALPHA = re.compile('^[a-z]+$', re.I)  # NOTE(review): compiled but not referenced anywhere in this function
+    SINGLE_LETTER_WORDS = re.compile('^[aeoóuy]$', re.I)
+
+    #(L) If a string is longer than 20 characters, it is garbage
+    if len(word) > 20:
+        return None
+
+    #(A) If a string's ratio of alphanumeric characters to total
+    #characters is less than 50%, the string is garbage
+    if len(ALL_ALPHANUM.findall(word)) < len(word) / 2:  # NOTE(review): with Python 2 int division the threshold is floored for odd lengths - confirm intended
+        return None
+
+    #Remove the word if every character in it is non-alphanumeric
+    if len(NON_ALPHANUM.findall(word)) == len(word):
+        return None
+    
+    #Remove words containing three or more consecutive vowels (accented forms included)
+    if TOO_MANY_VOWELS.findall(word):
+        return None 
+
+    #Remove words containing five or more consecutive consonants
+    if TOO_MANY_CONSONANTS.findall(word):
+        return None 
+
+    #Only allow the single letter words matched by SINGLE_LETTER_WORDS
+    if len(word) == 1 and not SINGLE_LETTER_WORDS.findall(word):
+        return None
+        
+    return word
+    
--- a/apps/ocr/urls.py
+++ b/apps/ocr/urls.py
@@ -11,4 +11,6 @@ urlpatterns = patterns('ocr.views',

    url(r'^ocr/queue/(?P<document_queue_id>\d+)/enable/$', 'document_queue_enable', (), 'document_queue_enable'),
    url(r'^ocr/queue/(?P<document_queue_id>\d+)/disable/$', 'document_queue_disable', (), 'document_queue_disable'),
+    
+    url(r'^ocr/document/all/clean_up/$', 'all_document_ocr_cleanup', (), 'all_document_ocr_cleanup'),
 )
--- a/apps/ocr/views.py
+++ b/apps/ocr/views.py
@@ -12,13 +12,14 @@ from permissions.api import check_permissions
 from documents.models import Document

 from ocr import PERMISSION_OCR_DOCUMENT, PERMISSION_OCR_DOCUMENT_DELETE, \
-    PERMISSION_OCR_QUEUE_ENABLE_DISABLE
+    PERMISSION_OCR_QUEUE_ENABLE_DISABLE, PERMISSION_OCR_CLEAN_ALL_PAGES
+
 from models import DocumentQueue, QueueDocument
 from literals import QUEUEDOCUMENT_STATE_PENDING, \
    QUEUEDOCUMENT_STATE_PROCESSING, QUEUEDOCUMENT_STATE_ERROR, \
    DOCUMENTQUEUE_STATE_STOPPED, DOCUMENTQUEUE_STATE_ACTIVE
 from exceptions import AlreadyQueued
-
+from api import clean_pages

 def _display_thumbnail(ocr_document):
    try:
@@ -249,4 +250,26 @@ def document_queue_enable(request, document_queue_id):
        'title':_(u'Are you sure you wish to activate document queue: %s') % document_queue,
        'next':next,
        'previous':previous,
-    }, context_instance=RequestContext(request))        
+    }, context_instance=RequestContext(request))
+
+
+def all_document_ocr_cleanup(request):  # Tools view: renders a confirmation page for non-POST requests, runs clean_pages() on POST
+    check_permissions(request.user, 'ocr', [PERMISSION_OCR_CLEAN_ALL_PAGES])  # presumably rejects users lacking the permission - verify in permissions.api
+
+    previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', None)))  # lookup order: POST value, then GET value, then the Referer header
+    next = request.POST.get('next', request.GET.get('next', request.META.get('HTTP_REFERER', None)))  # NOTE(review): None when no 'next' is supplied and no Referer header is sent - confirm the redirect below copes
+    
+    if request.method != 'POST':
+        return render_to_response('generic_confirm.html', {
+            'previous':previous,
+            'next':next,
+            'message':_(u'On large databases this operation may take some time to execute.'),
+        }, context_instance=RequestContext(request))
+    else:     
+        try:
+            clean_pages()
+            messages.success(request, _(u'Document pages content clean up complete.'))
+        except Exception, e:  # any failure is shown to the user as an error message instead of propagating
+            messages.error(request, _(u'Document pages content clean up error: %s') % e)
+            
+        return HttpResponseRedirect(next)