diff --git a/apps/main/__init__.py b/apps/main/__init__.py
index ea6a16184a..dde7aac847 100644
--- a/apps/main/__init__.py
+++ b/apps/main/__init__.py
@@ -6,6 +6,7 @@ from permissions import role_list
 from documents import document_find_all_duplicates
 from filesystem_serving import filesystem_serving_recreate_all_links
+from ocr import all_document_ocr_cleanup
 
 from main.conf.settings import SIDE_BAR_SEARCH
 
 
@@ -17,7 +18,7 @@ main_menu = [
     {'text':_(u'home'), 'view':'home', 'famfam':'house', 'position':0},
     {'text':_(u'tools'), 'view':'tools_menu', 'links': [
         document_find_all_duplicates, filesystem_serving_recreate_all_links,
-        statistics, diagnostics,
+        all_document_ocr_cleanup, statistics, diagnostics,
     ],'famfam':'wrench', 'name':'tools','position':7},
     {'text':_(u'setup'), 'view':'check_settings', 'links': [
diff --git a/apps/main/views.py b/apps/main/views.py
index 5d6ab0f335..82c8644c35 100644
--- a/apps/main/views.py
+++ b/apps/main/views.py
@@ -128,7 +128,8 @@ def blank_menu(request):
         'title':_(u'Tools menu'),
         'paragraphs':[
             _(u'"Find all duplicates": Search all the documents\' checksums and return a list of the exact matches.'),
-            _(u'"Recreate index links": Deletes and creates from scratch all the file system indexing links.')
+            _(u'"Recreate index links": Deletes and creates from scratch all the file system indexing links.'),
+            _(u'"Clean up pages content": Runs a language filter to remove common OCR mistakes from document pages content.')
         ],
     }, context_instance=RequestContext(request))
 
diff --git a/apps/ocr/__init__.py b/apps/ocr/__init__.py
index d9796f3518..7e921c87bb 100644
--- a/apps/ocr/__init__.py
+++ b/apps/ocr/__init__.py
@@ -19,11 +19,13 @@ from literals import QUEUEDOCUMENT_STATE_PROCESSING, \
 PERMISSION_OCR_DOCUMENT = 'ocr_document'
 PERMISSION_OCR_DOCUMENT_DELETE = 'ocr_document_delete'
 PERMISSION_OCR_QUEUE_ENABLE_DISABLE = 'ocr_queue_enable_disable'
+PERMISSION_OCR_CLEAN_ALL_PAGES = 'ocr_clean_all_pages'
 
 register_permissions('ocr', [
     {'name':PERMISSION_OCR_DOCUMENT, 'label':_(u'Submit document for OCR')},
     {'name':PERMISSION_OCR_DOCUMENT_DELETE, 'label':_(u'Delete document for OCR queue')},
     {'name':PERMISSION_OCR_QUEUE_ENABLE_DISABLE, 'label':_(u'Can enable/disable an OCR queue')},
+    {'name':PERMISSION_OCR_CLEAN_ALL_PAGES, 'label':_(u'Can execute an OCR clean up on all document pages')},
 ])
 
 #Links
@@ -36,6 +38,8 @@ queue_document_multiple_delete = {'text':_(u'delete'), 'view':'queue_document_mu
 document_queue_disable = {'text':_(u'stop queue'), 'view':'document_queue_disable', 'args':'object.id', 'famfam':'control_stop_blue', 'permissions':{'namespace':'ocr', 'permissions':[PERMISSION_OCR_QUEUE_ENABLE_DISABLE]}}
 document_queue_enable = {'text':_(u'activate queue'), 'view':'document_queue_enable', 'args':'object.id', 'famfam':'control_play_blue', 'permissions':{'namespace':'ocr', 'permissions':[PERMISSION_OCR_QUEUE_ENABLE_DISABLE]}}
 
+all_document_ocr_cleanup = {'text':_(u'clean up pages content'), 'view':'all_document_ocr_cleanup', 'famfam':'text_strikethrough', 'permissions':{'namespace':'ocr', 'permissions':[PERMISSION_OCR_CLEAN_ALL_PAGES]}}
+
 register_links(Document, [submit_document], menu_name='sidebar')
 register_links(DocumentQueue, [document_queue_disable, document_queue_enable])
 
diff --git a/apps/ocr/api.py b/apps/ocr/api.py
index 6b0d88ebcb..ad1f7e3a96 100644
--- a/apps/ocr/api.py
+++ b/apps/ocr/api.py
@@ -1,20 +1,32 @@
-# -*- coding: iso-8859-1 -*-
 #Some code from http://wiki.github.com/hoffstaetter/python-tesseract
 import codecs
 import os
 import subprocess
-import re
 import tempfile
+import sys
 
 from django.utils.translation import ugettext as _
+from django.utils.importlib import import_module
 
 from common import TEMPORARY_DIRECTORY
 
 from converter.api import convert_document_for_ocr
+from documents.models import DocumentPage
 
 from ocr.conf.settings import TESSERACT_PATH
 from ocr.conf.settings import TESSERACT_LANGUAGE
 
 
+
+def get_language_backend():
+    try:
+        module = import_module(u'.'.join([u'ocr', u'lang', TESSERACT_LANGUAGE]))
+    except ImportError:
+        sys.stderr.write('\nError: No OCR app language backend for language: %s\n\n' % TESSERACT_LANGUAGE)
+        return None
+    return module
+
+backend = get_language_backend()
+
 class TesseractError(Exception):
     pass
@@ -48,8 +60,8 @@ def do_document_ocr(document):
            run_tesseract(imagefile, filepath, TESSERACT_LANGUAGE)
            ocr_output = os.extsep.join([filepath, 'txt'])
            f = codecs.open(ocr_output, 'r', 'utf-8')
-           document_page = document.documentpage_set.get(page_number=page_index+1)
-           document_page.content = f.read().strip()
+           document_page = document.documentpage_set.get(page_number=page_index + 1)
+           document_page.content = ocr_cleanup(f.read().strip())
            document_page.page_label = _(u'Text from OCR')
            document_page.save()
            f.close()
@@ -68,59 +80,19 @@ def ocr_cleanup(text):
     for line in text.splitlines():
         line = line.strip()
         for word in line.split():
-            result = check_word(word)
+            if backend:
+                result = backend.check_word(word)
+            else:
+                result = word
             if result:
                 output.append(result)
         output.append('\n')
 
     return u' '.join(output)
-
-def check_word(word):
-    ALL_ALPHANUM = re.compile('([0-9a-záéíóúüñ])', re.I)
-    NON_ALPHANUM = re.compile('([^0-9a-záéíóúüñ])', re.I)
-
-    TOO_MANY_VOWELS = re.compile('[aáeéiíoóuúü]{3}', re.I)
-    TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnñpqrstvwxyz]{5}', re.I)
-    ALL_ALPHA = re.compile('^[a-z]+$', re.I)
-    SINGLE_LETTER_WORDS = re.compile('^[aeoóuy]$', re.I)
-
-    #(L) If a string is longer than 20 characters, it is
-    #garbage:
-    if len(word) > 20:
-        return None
-
-    #(A) If a string’s ratio of alphanumeric characters to total
-    #characters. is less than 50%, the string is garbage:
-    if len(ALL_ALPHANUM.findall(word)) < len(word) / 2:
-        return None
-
-    #Remove word if all the letters in the word are non alphanumeric
-    if len(NON_ALPHANUM.findall(word)) == len(word):
-        return None
-
-    #Removed words with too many consecutie vowels
-    if TOO_MANY_VOWELS.findall(word):
-        return None
-
-    #Removed words with too many consecutie consonants
-    if TOO_MANY_CONSONANTS.findall(word):
-        return None
-
-    #Only allow specific single letter words
-    if len(word) == 1 and not SINGLE_LETTER_WORDS.findall(word):
-        return None
-
-    return word
-
-
-
-from ocr.api import ocr_cleanup
-from documents.models import DocumentPage
+
 
 def clean_pages():
     for page in DocumentPage.objects.all():
         if page.content:
             page.content = ocr_cleanup(page.content)
-            #print page.content
-            print page.pk
             page.save()
diff --git a/apps/ocr/lang/__init__.py b/apps/ocr/lang/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/apps/ocr/lang/eng.py b/apps/ocr/lang/eng.py
new file mode 100644
index 0000000000..721d61943f
--- /dev/null
+++ b/apps/ocr/lang/eng.py
@@ -0,0 +1,38 @@
+import re
+
+def check_word(word):
+    ALL_ALPHANUM = re.compile('([0-9a-z])', re.I)
+    NON_ALPHANUM = re.compile('([^0-9a-z])', re.I)
+
+    TOO_MANY_VOWELS = re.compile('[aeiou]{3}', re.I)
+    TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnpqrstvwxyz]{5}', re.I)
+    ALL_ALPHA = re.compile('^[a-z]+$', re.I)
+    SINGLE_LETTER_WORDS = re.compile('^[ai]$', re.I)
+
+    #(L) If a string is longer than 20 characters, it is garbage
+    if len(word) > 20:
+        return None
+
+    #(A) If a string’s ratio of alphanumeric characters to total
+    #characters is less than 50%, the string is garbage
+    if len(ALL_ALPHANUM.findall(word)) < len(word) / 2:
+        return None
+
+    #Remove word if all the letters in the word are non alphanumeric
+    if len(NON_ALPHANUM.findall(word)) == len(word):
+        return None
+
+    #Remove words with too many consecutive vowels
+    if TOO_MANY_VOWELS.findall(word):
+        return None
+
+    #Remove words with too many consecutive consonants
+    if TOO_MANY_CONSONANTS.findall(word):
+        return None
+
+    #Only allow specific single letter words
+    if len(word) == 1 and not SINGLE_LETTER_WORDS.findall(word):
+        return None
+
+    return word
+
diff --git a/apps/ocr/lang/spa.py b/apps/ocr/lang/spa.py
new file mode 100644
index 0000000000..149b2d9201
--- /dev/null
+++ b/apps/ocr/lang/spa.py
@@ -0,0 +1,39 @@
+# -*- coding: iso-8859-1 -*-
+import re
+
+def check_word(word):
+    ALL_ALPHANUM = re.compile('([0-9a-záéíóúüñ])', re.I)
+    NON_ALPHANUM = re.compile('([^0-9a-záéíóúüñ])', re.I)
+
+    TOO_MANY_VOWELS = re.compile('[aáeéiíoóuúü]{3}', re.I)
+    TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnñpqrstvwxyz]{5}', re.I)
+    ALL_ALPHA = re.compile('^[a-z]+$', re.I)
+    SINGLE_LETTER_WORDS = re.compile('^[aeoóuy]$', re.I)
+
+    #(L) If a string is longer than 20 characters, it is garbage
+    if len(word) > 20:
+        return None
+
+    #(A) If a string’s ratio of alphanumeric characters to total
+    #characters is less than 50%, the string is garbage
+    if len(ALL_ALPHANUM.findall(word)) < len(word) / 2:
+        return None
+
+    #Remove word if all the letters in the word are non alphanumeric
+    if len(NON_ALPHANUM.findall(word)) == len(word):
+        return None
+
+    #Remove words with too many consecutive vowels
+    if TOO_MANY_VOWELS.findall(word):
+        return None
+
+    #Remove words with too many consecutive consonants
+    if TOO_MANY_CONSONANTS.findall(word):
+        return None
+
+    #Only allow specific single letter words
+    if len(word) == 1 and not SINGLE_LETTER_WORDS.findall(word):
+        return None
+
+    return word
+
diff --git a/apps/ocr/urls.py b/apps/ocr/urls.py
index a6e58c09dc..b65f910e26 100644
--- a/apps/ocr/urls.py
+++ b/apps/ocr/urls.py
@@ -11,4 +11,6 @@ urlpatterns = patterns('ocr.views',
     url(r'^ocr/queue/(?P<document_queue_id>\d+)/enable/$', 'document_queue_enable', (), 'document_queue_enable'),
     url(r'^ocr/queue/(?P<document_queue_id>\d+)/disable/$', 'document_queue_disable', (), 'document_queue_disable'),
 
+
+    url(r'^ocr/document/all/clean_up/$', 'all_document_ocr_cleanup', (), 'all_document_ocr_cleanup'),
 )
diff --git a/apps/ocr/views.py b/apps/ocr/views.py
index 66281214a0..b32f5b8399 100644
--- a/apps/ocr/views.py
+++ b/apps/ocr/views.py
@@ -12,13 +12,14 @@ from permissions.api import check_permissions
 from documents.models import Document
 
 from ocr import PERMISSION_OCR_DOCUMENT, PERMISSION_OCR_DOCUMENT_DELETE, \
-    PERMISSION_OCR_QUEUE_ENABLE_DISABLE
+    PERMISSION_OCR_QUEUE_ENABLE_DISABLE, PERMISSION_OCR_CLEAN_ALL_PAGES
+
 from models import DocumentQueue, QueueDocument
 from literals import QUEUEDOCUMENT_STATE_PENDING, \
     QUEUEDOCUMENT_STATE_PROCESSING, QUEUEDOCUMENT_STATE_ERROR, \
     DOCUMENTQUEUE_STATE_STOPPED, DOCUMENTQUEUE_STATE_ACTIVE
 from exceptions import AlreadyQueued
-
+from api import clean_pages
 
 def _display_thumbnail(ocr_document):
     try:
@@ -249,4 +250,26 @@ def document_queue_enable(request, document_queue_id):
         'title':_(u'Are you sure you wish to activate document queue: %s') % document_queue,
         'next':next,
         'previous':previous,
-    }, context_instance=RequestContext(request))
+    }, context_instance=RequestContext(request))
+
+
+def all_document_ocr_cleanup(request):
+    check_permissions(request.user, 'ocr', [PERMISSION_OCR_CLEAN_ALL_PAGES])
+
+    previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', None)))
+    next = request.POST.get('next', request.GET.get('next', request.META.get('HTTP_REFERER', None)))
+
+    if request.method != 'POST':
+        return render_to_response('generic_confirm.html', {
+            'previous':previous,
+            'next':next,
+            'message':_(u'On large databases this operation may take some time to execute.'),
+        }, context_instance=RequestContext(request))
+    else:
+        try:
+            clean_pages()
+            messages.success(request, _(u'Document pages content clean up complete.'))
+        except Exception, e:
+            messages.error(request, _(u'Document pages content clean up error: %s') % e)
+
+        return HttpResponseRedirect(next)
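
Note on the new ocr/lang/ package: get_language_backend() in apps/ocr/api.py imports ocr.lang.<TESSERACT_LANGUAGE> once at module load, and ocr_cleanup() passes words through unchanged when no matching module exists. Supporting another Tesseract language therefore only requires dropping a module into apps/ocr/lang/ that exposes check_word(word), returning the word to keep it or None to discard it. The sketch below is illustrative only; the file name fra.py and its particular filters are assumptions, not part of this change:

    # -*- coding: utf-8 -*-
    # apps/ocr/lang/fra.py -- hypothetical backend following the eng.py/spa.py contract
    import re

    def check_word(word):
        # Character classes here are only an example, not a vetted French filter
        TOO_MANY_VOWELS = re.compile('[aàâeéèêëiîïoôuùûü]{3}', re.I)
        TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnpqrstvwxz]{5}', re.I)

        # Strings longer than 20 characters are treated as OCR garbage
        if len(word) > 20:
            return None

        # Drop words with too many consecutive vowels or consonants
        if TOO_MANY_VOWELS.findall(word) or TOO_MANY_CONSONANTS.findall(word):
            return None

        return word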