Finished adding language-specific OCR cleanup code

This commit is contained in:
Roberto Rosario
2011-04-07 12:23:26 -04:00
parent d86c521858
commit d54fd98ec5
9 changed files with 134 additions and 54 deletions

View File

@@ -6,6 +6,7 @@ from permissions import role_list
from documents import document_find_all_duplicates
from filesystem_serving import filesystem_serving_recreate_all_links
from ocr import all_document_ocr_cleanup
from main.conf.settings import SIDE_BAR_SEARCH
@@ -17,7 +18,7 @@ main_menu = [
{'text':_(u'home'), 'view':'home', 'famfam':'house', 'position':0},
{'text':_(u'tools'), 'view':'tools_menu', 'links': [
document_find_all_duplicates, filesystem_serving_recreate_all_links,
statistics, diagnostics,
all_document_ocr_cleanup, statistics, diagnostics,
],'famfam':'wrench', 'name':'tools','position':7},
{'text':_(u'setup'), 'view':'check_settings', 'links': [

View File

@@ -128,7 +128,8 @@ def blank_menu(request):
'title':_(u'Tools menu'),
'paragraphs':[
_(u'"Find all duplicates": Search all the documents\' checksums and return a list of the exact matches.'),
_(u'"Recreate index links": Deletes and creates from scratch all the file system indexing links.')
_(u'"Recreate index links": Deletes and creates from scratch all the file system indexing links.'),
_(u'"Clean up pages content": Runs a language filter to remove common OCR mistakes from document pages content.')
],
},
context_instance=RequestContext(request))

View File

@@ -19,11 +19,13 @@ from literals import QUEUEDOCUMENT_STATE_PROCESSING, \
# Permission names of the OCR app.  Each constant holds the string identifier
# that the register_permissions() call below registers in the 'ocr' namespace.
PERMISSION_OCR_DOCUMENT = 'ocr_document'
PERMISSION_OCR_DOCUMENT_DELETE = 'ocr_document_delete'
PERMISSION_OCR_QUEUE_ENABLE_DISABLE = 'ocr_queue_enable_disable'
PERMISSION_OCR_CLEAN_ALL_PAGES = 'ocr_clean_all_pages'
# Register every OCR permission together with its translatable label.
register_permissions('ocr', [
{'name':PERMISSION_OCR_DOCUMENT, 'label':_(u'Submit document for OCR')},
{'name':PERMISSION_OCR_DOCUMENT_DELETE, 'label':_(u'Delete document for OCR queue')},
{'name':PERMISSION_OCR_QUEUE_ENABLE_DISABLE, 'label':_(u'Can enable/disable an OCR queue')},
{'name':PERMISSION_OCR_CLEAN_ALL_PAGES, 'label':_(u'Can execute an OCR clean up on all document pages')},
])
#Links
@@ -36,6 +38,8 @@ queue_document_multiple_delete = {'text':_(u'delete'), 'view':'queue_document_mu
# Link descriptors: plain dicts carrying the translatable link text, the name
# of the target view, an optional 'args' expression, a 'famfam' key
# (presumably an icon name -- confirm against the navigation code) and the
# permissions, by namespace, attached to the link.
document_queue_disable = {'text':_(u'stop queue'), 'view':'document_queue_disable', 'args':'object.id', 'famfam':'control_stop_blue', 'permissions':{'namespace':'ocr', 'permissions':[PERMISSION_OCR_QUEUE_ENABLE_DISABLE]}}
document_queue_enable = {'text':_(u'activate queue'), 'view':'document_queue_enable', 'args':'object.id', 'famfam':'control_play_blue', 'permissions':{'namespace':'ocr', 'permissions':[PERMISSION_OCR_QUEUE_ENABLE_DISABLE]}}
# Not registered against a model here; this descriptor is imported by the
# main app and listed in its tools menu.
all_document_ocr_cleanup = {'text':_(u'clean up pages content'), 'view':'all_document_ocr_cleanup', 'famfam':'text_strikethrough', 'permissions':{'namespace':'ocr', 'permissions':[PERMISSION_OCR_CLEAN_ALL_PAGES]}}
# Attach links to models: the submit link goes to the 'sidebar' menu of
# Document, the queue stop/activate links to DocumentQueue.
register_links(Document, [submit_document], menu_name='sidebar')
register_links(DocumentQueue, [document_queue_disable, document_queue_enable])

View File

@@ -1,20 +1,32 @@
# -*- coding: iso-8859-1 -*-
#Some code from http://wiki.github.com/hoffstaetter/python-tesseract
import codecs
import os
import subprocess
import re
import tempfile
import sys
from django.utils.translation import ugettext as _
from django.utils.importlib import import_module
from common import TEMPORARY_DIRECTORY
from converter.api import convert_document_for_ocr
from documents.models import DocumentPage
from ocr.conf.settings import TESSERACT_PATH
from ocr.conf.settings import TESSERACT_LANGUAGE
def get_language_backend():
    """Import and return the OCR language backend module.

    The module is looked up as ``ocr.lang.<TESSERACT_LANGUAGE>``.  When it
    cannot be imported an error is written to stderr and ``None`` is
    returned, so callers must be prepared to work without a backend.
    """
    module_path = u'.'.join([u'ocr', u'lang', TESSERACT_LANGUAGE])
    try:
        return import_module(module_path)
    except ImportError:
        sys.stderr.write('\nError: No OCR app language backend for language: %s\n\n' % TESSERACT_LANGUAGE)
        return None


# Resolved once at import time; None when no backend exists for the language.
backend = get_language_backend()
class TesseractError(Exception):
    """Exception type for Tesseract related OCR failures.

    NOTE(review): the raise sites are outside the visible part of the file.
    """
@@ -48,8 +60,8 @@ def do_document_ocr(document):
run_tesseract(imagefile, filepath, TESSERACT_LANGUAGE)
ocr_output = os.extsep.join([filepath, 'txt'])
f = codecs.open(ocr_output, 'r', 'utf-8')
document_page = document.documentpage_set.get(page_number=page_index+1)
document_page.content = f.read().strip()
document_page = document.documentpage_set.get(page_number=page_index + 1)
document_page.content = ocr_cleanup(f.read().strip())
document_page.page_label = _(u'Text from OCR')
document_page.save()
f.close()
@@ -68,59 +80,19 @@ def ocr_cleanup(text):
for line in text.splitlines():
line = line.strip()
for word in line.split():
result = check_word(word)
if backend:
result = backend.check_word(word)
else:
result = word
if result:
output.append(result)
output.append('\n')
return u' '.join(output)
# The accented characters are written as \x escapes inside unicode literals so
# the patterns do not depend on the encoding this source file is saved in.
_ACCENTED_VOWELS = u'\xe1\xe9\xed\xf3\xfa\xfc'  # a e i o u with acute accent, u with diaeresis
_N_TILDE = u'\xf1'  # n with tilde
# re.UNICODE is needed so that re.I also folds the accented upper case letters.
_WORD_FLAGS = re.I | re.UNICODE

# Compiled once at import time: check_word() runs for every word of every
# page, so the patterns must not be rebuilt on each call.
_MAX_WORD_LENGTH = 20
_ALPHANUM = re.compile(u'[0-9a-z' + _ACCENTED_VOWELS + _N_TILDE + u']', _WORD_FLAGS)
_TOO_MANY_VOWELS = re.compile(u'[aeiou' + _ACCENTED_VOWELS + u']{3}', _WORD_FLAGS)
_TOO_MANY_CONSONANTS = re.compile(u'[bcdfghjklmn' + _N_TILDE + u'pqrstvwxyz]{5}', _WORD_FLAGS)
_SINGLE_LETTER_WORDS = re.compile(u'^[aeo\xf3uy]$', _WORD_FLAGS)


def check_word(word):
    """Return *word* if it passes the Spanish OCR garbage filters, else None.

    A word is rejected when any of the following holds:

    * it is empty or longer than 20 characters;
    * less than 50% of its characters are alphanumeric (this also covers
      words made only of non alphanumeric characters);
    * it contains 3 consecutive vowels or 5 consecutive consonants;
    * it is a single character other than a, e, o, accented o, u or y.

    *word* is expected to be text (unicode), as produced by the OCR reader.
    """
    length = len(word)
    if not length or length > _MAX_WORD_LENGTH:
        return None

    # Compare 2 * count against the length instead of dividing: under
    # Python 2 ``len(word) / 2`` truncates, which let odd length garbage
    # such as u'a--' (33% alphanumeric) through.
    if 2 * len(_ALPHANUM.findall(word)) < length:
        return None

    if _TOO_MANY_VOWELS.search(word):
        return None

    if _TOO_MANY_CONSONANTS.search(word):
        return None

    # Only specific single letter words are allowed.
    if length == 1 and not _SINGLE_LETTER_WORDS.search(word):
        return None

    return word
from ocr.api import ocr_cleanup
from documents.models import DocumentPage
def clean_pages():
    """Run the OCR clean up filter over the content of every document page.

    Every ``DocumentPage`` that has content gets that content passed through
    ``ocr_cleanup()`` and is saved back.  Pages without content are left
    untouched.  Exceptions raised by the clean up or by ``save()`` propagate
    to the caller.
    """
    for page in DocumentPage.objects.all():
        if page.content:
            page.content = ocr_cleanup(page.content)
            # No progress output here: this runs inside a web request, where
            # writing to stdout is at best noise.
            page.save()

View File

38
apps/ocr/lang/eng.py Normal file
View File

@@ -0,0 +1,38 @@
import re
# Compiled once at import time: check_word() runs for every word of every
# page, so the patterns must not be rebuilt on each call.
_MAX_WORD_LENGTH = 20
_ALPHANUM = re.compile('[0-9a-z]', re.I)
_TOO_MANY_VOWELS = re.compile('[aeiou]{3}', re.I)
_TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnpqrstvwxyz]{5}', re.I)
_SINGLE_LETTER_WORDS = re.compile('^[ai]$', re.I)


def check_word(word):
    """Return *word* if it passes the English OCR garbage filters, else None.

    A word is rejected when any of the following holds:

    * it is empty or longer than 20 characters;
    * less than 50% of its characters are alphanumeric (this also covers
      words made only of non alphanumeric characters);
    * it contains 3 consecutive vowels or 5 consecutive consonants;
    * it is a single character other than "a" or "i" (either case).
    """
    length = len(word)
    if not length or length > _MAX_WORD_LENGTH:
        return None

    # Compare 2 * count against the length instead of dividing: under
    # Python 2 ``len(word) / 2`` truncates, which let odd length garbage
    # such as 'a--' (33% alphanumeric) through.
    if 2 * len(_ALPHANUM.findall(word)) < length:
        return None

    if _TOO_MANY_VOWELS.search(word):
        return None

    if _TOO_MANY_CONSONANTS.search(word):
        return None

    # Only specific single letter words are allowed.
    if length == 1 and not _SINGLE_LETTER_WORDS.search(word):
        return None

    return word

39
apps/ocr/lang/spa.py Normal file
View File

@@ -0,0 +1,39 @@
# -*- coding: iso-8859-1 -*-
import re
# The accented characters are written as \x escapes inside unicode literals so
# the patterns do not depend on the encoding this source file is saved in.
_ACCENTED_VOWELS = u'\xe1\xe9\xed\xf3\xfa\xfc'  # a e i o u with acute accent, u with diaeresis
_N_TILDE = u'\xf1'  # n with tilde
# re.UNICODE is needed so that re.I also folds the accented upper case letters.
_WORD_FLAGS = re.I | re.UNICODE

# Compiled once at import time: check_word() runs for every word of every
# page, so the patterns must not be rebuilt on each call.
_MAX_WORD_LENGTH = 20
_ALPHANUM = re.compile(u'[0-9a-z' + _ACCENTED_VOWELS + _N_TILDE + u']', _WORD_FLAGS)
_TOO_MANY_VOWELS = re.compile(u'[aeiou' + _ACCENTED_VOWELS + u']{3}', _WORD_FLAGS)
_TOO_MANY_CONSONANTS = re.compile(u'[bcdfghjklmn' + _N_TILDE + u'pqrstvwxyz]{5}', _WORD_FLAGS)
_SINGLE_LETTER_WORDS = re.compile(u'^[aeo\xf3uy]$', _WORD_FLAGS)


def check_word(word):
    """Return *word* if it passes the Spanish OCR garbage filters, else None.

    A word is rejected when any of the following holds:

    * it is empty or longer than 20 characters;
    * less than 50% of its characters are alphanumeric (this also covers
      words made only of non alphanumeric characters);
    * it contains 3 consecutive vowels or 5 consecutive consonants;
    * it is a single character other than a, e, o, accented o, u or y.

    *word* is expected to be text (unicode), as produced by the OCR reader.
    """
    length = len(word)
    if not length or length > _MAX_WORD_LENGTH:
        return None

    # Compare 2 * count against the length instead of dividing: under
    # Python 2 ``len(word) / 2`` truncates, which let odd length garbage
    # such as u'a--' (33% alphanumeric) through.
    if 2 * len(_ALPHANUM.findall(word)) < length:
        return None

    if _TOO_MANY_VOWELS.search(word):
        return None

    if _TOO_MANY_CONSONANTS.search(word):
        return None

    # Only specific single letter words are allowed.
    if length == 1 and not _SINGLE_LETTER_WORDS.search(word):
        return None

    return word

View File

@@ -11,4 +11,6 @@ urlpatterns = patterns('ocr.views',
url(r'^ocr/queue/(?P<document_queue_id>\d+)/enable/$', 'document_queue_enable', (), 'document_queue_enable'),
url(r'^ocr/queue/(?P<document_queue_id>\d+)/disable/$', 'document_queue_disable', (), 'document_queue_disable'),
url(r'^ocr/document/all/clean_up/$', 'all_document_ocr_cleanup', (), 'all_document_ocr_cleanup'),
)

View File

@@ -12,13 +12,14 @@ from permissions.api import check_permissions
from documents.models import Document
from ocr import PERMISSION_OCR_DOCUMENT, PERMISSION_OCR_DOCUMENT_DELETE, \
PERMISSION_OCR_QUEUE_ENABLE_DISABLE
PERMISSION_OCR_QUEUE_ENABLE_DISABLE, PERMISSION_OCR_CLEAN_ALL_PAGES
from models import DocumentQueue, QueueDocument
from literals import QUEUEDOCUMENT_STATE_PENDING, \
QUEUEDOCUMENT_STATE_PROCESSING, QUEUEDOCUMENT_STATE_ERROR, \
DOCUMENTQUEUE_STATE_STOPPED, DOCUMENTQUEUE_STATE_ACTIVE
from exceptions import AlreadyQueued
from api import clean_pages
def _display_thumbnail(ocr_document):
try:
@@ -249,4 +250,26 @@ def document_queue_enable(request, document_queue_id):
'title':_(u'Are you sure you wish to activate document queue: %s') % document_queue,
'next':next,
'previous':previous,
}, context_instance=RequestContext(request))
}, context_instance=RequestContext(request))
def all_document_ocr_cleanup(request):
    """Confirm and then run the OCR clean up over all document pages.

    Requires the PERMISSION_OCR_CLEAN_ALL_PAGES permission of the 'ocr'
    namespace.  Any request that is not a POST gets the generic confirmation
    page; a POST runs ``clean_pages()``, reports success or the error through
    the messages framework and redirects to the 'next' URL.

    'previous' and 'next' are read from POST, then GET, and finally fall back
    to the HTTP referer (``None`` when none of them is present).
    """
    check_permissions(request.user, 'ocr', [PERMISSION_OCR_CLEAN_ALL_PAGES])

    referer = request.META.get('HTTP_REFERER', None)
    previous = request.POST.get('previous', request.GET.get('previous', referer))
    next_url = request.POST.get('next', request.GET.get('next', referer))

    if request.method == 'POST':
        try:
            clean_pages()
            messages.success(request, _(u'Document pages content clean up complete.'))
        except Exception as e:
            # Report the failure to the user instead of returning an error page.
            messages.error(request, _(u'Document pages content clean up error: %s') % e)
        return HttpResponseRedirect(next_url)

    # Not a POST: ask for confirmation first.
    context = {
        'previous': previous,
        'next': next_url,
        'message': _(u'On large databases this operation may take some time to execute.'),
    }
    return render_to_response('generic_confirm.html', context,
        context_instance=RequestContext(request))