Finished adding language-specific OCR cleanup code
This commit is contained in:
@@ -6,6 +6,7 @@ from permissions import role_list
|
||||
|
||||
from documents import document_find_all_duplicates
|
||||
from filesystem_serving import filesystem_serving_recreate_all_links
|
||||
from ocr import all_document_ocr_cleanup
|
||||
|
||||
from main.conf.settings import SIDE_BAR_SEARCH
|
||||
|
||||
@@ -17,7 +18,7 @@ main_menu = [
|
||||
{'text':_(u'home'), 'view':'home', 'famfam':'house', 'position':0},
|
||||
{'text':_(u'tools'), 'view':'tools_menu', 'links': [
|
||||
document_find_all_duplicates, filesystem_serving_recreate_all_links,
|
||||
statistics, diagnostics,
|
||||
all_document_ocr_cleanup, statistics, diagnostics,
|
||||
],'famfam':'wrench', 'name':'tools','position':7},
|
||||
|
||||
{'text':_(u'setup'), 'view':'check_settings', 'links': [
|
||||
|
||||
@@ -128,7 +128,8 @@ def blank_menu(request):
|
||||
'title':_(u'Tools menu'),
|
||||
'paragraphs':[
|
||||
_(u'"Find all duplicates": Search all the documents\' checksums and return a list of the exact matches.'),
|
||||
_(u'"Recreate index links": Deletes and creates from scratch all the file system indexing links.')
|
||||
_(u'"Recreate index links": Deletes and creates from scratch all the file system indexing links.'),
|
||||
_(u'"Clean up pages content": Runs a language filter to remove common OCR mistakes from document pages content.')
|
||||
],
|
||||
},
|
||||
context_instance=RequestContext(request))
|
||||
|
||||
@@ -19,11 +19,13 @@ from literals import QUEUEDOCUMENT_STATE_PROCESSING, \
|
||||
PERMISSION_OCR_DOCUMENT = 'ocr_document'
|
||||
PERMISSION_OCR_DOCUMENT_DELETE = 'ocr_document_delete'
|
||||
PERMISSION_OCR_QUEUE_ENABLE_DISABLE = 'ocr_queue_enable_disable'
|
||||
PERMISSION_OCR_CLEAN_ALL_PAGES = 'ocr_clean_all_pages'
|
||||
|
||||
register_permissions('ocr', [
|
||||
{'name':PERMISSION_OCR_DOCUMENT, 'label':_(u'Submit document for OCR')},
|
||||
{'name':PERMISSION_OCR_DOCUMENT_DELETE, 'label':_(u'Delete document for OCR queue')},
|
||||
{'name':PERMISSION_OCR_QUEUE_ENABLE_DISABLE, 'label':_(u'Can enable/disable an OCR queue')},
|
||||
{'name':PERMISSION_OCR_CLEAN_ALL_PAGES, 'label':_(u'Can execute an OCR clean up on all document pages')},
|
||||
])
|
||||
|
||||
#Links
|
||||
@@ -36,6 +38,8 @@ queue_document_multiple_delete = {'text':_(u'delete'), 'view':'queue_document_mu
|
||||
document_queue_disable = {'text':_(u'stop queue'), 'view':'document_queue_disable', 'args':'object.id', 'famfam':'control_stop_blue', 'permissions':{'namespace':'ocr', 'permissions':[PERMISSION_OCR_QUEUE_ENABLE_DISABLE]}}
|
||||
document_queue_enable = {'text':_(u'activate queue'), 'view':'document_queue_enable', 'args':'object.id', 'famfam':'control_play_blue', 'permissions':{'namespace':'ocr', 'permissions':[PERMISSION_OCR_QUEUE_ENABLE_DISABLE]}}
|
||||
|
||||
all_document_ocr_cleanup = {'text':_(u'clean up pages content'), 'view':'all_document_ocr_cleanup', 'famfam':'text_strikethrough', 'permissions':{'namespace':'ocr', 'permissions':[PERMISSION_OCR_CLEAN_ALL_PAGES]}}
|
||||
|
||||
register_links(Document, [submit_document], menu_name='sidebar')
|
||||
register_links(DocumentQueue, [document_queue_disable, document_queue_enable])
|
||||
|
||||
|
||||
@@ -1,20 +1,32 @@
|
||||
# -*- coding: iso-8859-1 -*-
|
||||
#Some code from http://wiki.github.com/hoffstaetter/python-tesseract
|
||||
|
||||
import codecs
|
||||
import os
|
||||
import subprocess
|
||||
import re
|
||||
import tempfile
|
||||
import sys
|
||||
|
||||
from django.utils.translation import ugettext as _
|
||||
from django.utils.importlib import import_module
|
||||
|
||||
from common import TEMPORARY_DIRECTORY
|
||||
from converter.api import convert_document_for_ocr
|
||||
from documents.models import DocumentPage
|
||||
|
||||
from ocr.conf.settings import TESSERACT_PATH
|
||||
from ocr.conf.settings import TESSERACT_LANGUAGE
|
||||
|
||||
|
||||
def get_language_backend():
    """Import and return the OCR clean up module for the configured language.

    The module is looked up under the dotted path built from ``ocr``,
    ``lang`` and the value of ``TESSERACT_LANGUAGE``.  When that import
    raises ImportError an error line is written to stderr and None is
    returned, so callers must be prepared for a missing backend.
    """
    module_path = u'.'.join([u'ocr', u'lang', TESSERACT_LANGUAGE])
    try:
        return import_module(module_path)
    except ImportError:
        sys.stderr.write('\nError: No OCR app language backend for language: %s\n\n' % TESSERACT_LANGUAGE)
        return None
|
||||
|
||||
backend = get_language_backend()
|
||||
|
||||
|
||||
class TesseractError(Exception):
|
||||
pass
|
||||
@@ -48,8 +60,8 @@ def do_document_ocr(document):
|
||||
run_tesseract(imagefile, filepath, TESSERACT_LANGUAGE)
|
||||
ocr_output = os.extsep.join([filepath, 'txt'])
|
||||
f = codecs.open(ocr_output, 'r', 'utf-8')
|
||||
document_page = document.documentpage_set.get(page_number=page_index+1)
|
||||
document_page.content = f.read().strip()
|
||||
document_page = document.documentpage_set.get(page_number=page_index + 1)
|
||||
document_page.content = ocr_cleanup(f.read().strip())
|
||||
document_page.page_label = _(u'Text from OCR')
|
||||
document_page.save()
|
||||
f.close()
|
||||
@@ -68,59 +80,19 @@ def ocr_cleanup(text):
|
||||
for line in text.splitlines():
|
||||
line = line.strip()
|
||||
for word in line.split():
|
||||
result = check_word(word)
|
||||
if backend:
|
||||
result = backend.check_word(word)
|
||||
else:
|
||||
result = word
|
||||
if result:
|
||||
output.append(result)
|
||||
output.append('\n')
|
||||
|
||||
return u' '.join(output)
|
||||
|
||||
|
||||
# Patterns are compiled once at import time instead of on every call.
# They are unicode literals combined with re.UNICODE so that re.IGNORECASE
# also folds the accented letters (e.g. an upper case N with tilde matches
# its lower case form); byte-string patterns only fold ASCII letters.
_WORD_ALPHANUM = re.compile(u'[0-9a-záéíóúüñ]', re.I | re.U)
_WORD_TOO_MANY_VOWELS = re.compile(u'[aáeéiíoóuúü]{3}', re.I | re.U)
_WORD_TOO_MANY_CONSONANTS = re.compile(u'[bcdfghjklmnñpqrstvwxyz]{5}', re.I | re.U)
_WORD_SINGLE_LETTER = re.compile(u'^[aeoóuy]$', re.I | re.U)
_WORD_MAX_LENGTH = 20


def check_word(word):
    """Return ``word`` if it passes the Spanish OCR garbage filters, else None.

    A word is rejected (None is returned) when any of these holds:

    * it is longer than 20 characters;
    * it contains no alphanumeric character at all (this includes the
      empty string);
    * fewer than 50% of its characters are alphanumeric;
    * it contains 3 or more consecutive vowels;
    * it contains 5 or more consecutive consonants;
    * it is a single character that is not one of the accepted one-letter
      words (a, e, o, accented o, u, y).

    Matching is case insensitive.  ``word`` is expected to be a text
    (unicode) string.
    """
    # (L) If a string is longer than 20 characters, it is garbage
    if len(word) > _WORD_MAX_LENGTH:
        return None

    alphanumeric_count = len(_WORD_ALPHANUM.findall(word))

    # Remove the word if none of its characters is alphanumeric
    if alphanumeric_count == 0:
        return None

    # (A) If a string's ratio of alphanumeric characters to total
    # characters is less than 50%, the string is garbage.  Both sides are
    # kept as integers: ``len(word) / 2`` truncates under Python 2 and
    # lets words such as 'a--' (33% alphanumeric) slip through.
    if 2 * alphanumeric_count < len(word):
        return None

    # Remove words with too many consecutive vowels
    if _WORD_TOO_MANY_VOWELS.search(word):
        return None

    # Remove words with too many consecutive consonants
    if _WORD_TOO_MANY_CONSONANTS.search(word):
        return None

    # Only allow specific single letter words
    if len(word) == 1 and not _WORD_SINGLE_LETTER.match(word):
        return None

    return word
|
||||
|
||||
|
||||
|
||||
from ocr.api import ocr_cleanup
|
||||
from documents.models import DocumentPage
|
||||
|
||||
def clean_pages():
    """Run the OCR clean up filter over the content of every document page.

    Iterates over all DocumentPage objects; for each page that has content,
    the content is replaced by the result of ocr_cleanup() and the page is
    saved.  Pages without content are skipped.

    The debugging ``print`` of each page's primary key was removed: this
    function is called from a web view, where writing to stdout is noise at
    best and an error under servers that restrict stdout.
    """
    for page in DocumentPage.objects.all():
        if page.content:
            page.content = ocr_cleanup(page.content)
            page.save()
|
||||
|
||||
0
apps/ocr/lang/__init__.py
Normal file
0
apps/ocr/lang/__init__.py
Normal file
38
apps/ocr/lang/eng.py
Normal file
38
apps/ocr/lang/eng.py
Normal file
@@ -0,0 +1,38 @@
|
||||
import re
|
||||
|
||||
# Patterns are compiled once at import time instead of on every call.
_WORD_ALPHANUM = re.compile(r'[0-9a-z]', re.I)
_WORD_TOO_MANY_VOWELS = re.compile(r'[aeiou]{3}', re.I)
_WORD_TOO_MANY_CONSONANTS = re.compile(r'[bcdfghjklmnpqrstvwxyz]{5}', re.I)
_WORD_SINGLE_LETTER = re.compile(r'^[ai]$', re.I)
_WORD_MAX_LENGTH = 20


def check_word(word):
    """Return ``word`` if it passes the English OCR garbage filters, else None.

    A word is rejected (None is returned) when any of these holds:

    * it is longer than 20 characters;
    * it contains no alphanumeric character at all (this includes the
      empty string);
    * fewer than 50% of its characters are alphanumeric;
    * it contains 3 or more consecutive vowels;
    * it contains 5 or more consecutive consonants;
    * it is a single character other than the one-letter words 'a' and 'i'.

    Matching is case insensitive.
    """
    # (L) If a string is longer than 20 characters, it is garbage
    if len(word) > _WORD_MAX_LENGTH:
        return None

    alphanumeric_count = len(_WORD_ALPHANUM.findall(word))

    # Remove the word if none of its characters is alphanumeric
    if alphanumeric_count == 0:
        return None

    # (A) If a string's ratio of alphanumeric characters to total
    # characters is less than 50%, the string is garbage.  Both sides are
    # kept as integers: ``len(word) / 2`` truncates under Python 2 and
    # lets words such as 'a--' (33% alphanumeric) slip through.
    if 2 * alphanumeric_count < len(word):
        return None

    # Remove words with too many consecutive vowels
    if _WORD_TOO_MANY_VOWELS.search(word):
        return None

    # Remove words with too many consecutive consonants
    if _WORD_TOO_MANY_CONSONANTS.search(word):
        return None

    # Only allow specific single letter words
    if len(word) == 1 and not _WORD_SINGLE_LETTER.match(word):
        return None

    return word
|
||||
|
||||
39
apps/ocr/lang/spa.py
Normal file
39
apps/ocr/lang/spa.py
Normal file
@@ -0,0 +1,39 @@
|
||||
# -*- coding: iso-8859-1 -*-
|
||||
import re
|
||||
|
||||
# Patterns are compiled once at import time instead of on every call.
# They are unicode literals combined with re.UNICODE so that re.IGNORECASE
# also folds the accented letters (e.g. an upper case N with tilde matches
# its lower case form); byte-string patterns only fold ASCII letters.
_WORD_ALPHANUM = re.compile(u'[0-9a-záéíóúüñ]', re.I | re.U)
_WORD_TOO_MANY_VOWELS = re.compile(u'[aáeéiíoóuúü]{3}', re.I | re.U)
_WORD_TOO_MANY_CONSONANTS = re.compile(u'[bcdfghjklmnñpqrstvwxyz]{5}', re.I | re.U)
_WORD_SINGLE_LETTER = re.compile(u'^[aeoóuy]$', re.I | re.U)
_WORD_MAX_LENGTH = 20


def check_word(word):
    """Return ``word`` if it passes the Spanish OCR garbage filters, else None.

    A word is rejected (None is returned) when any of these holds:

    * it is longer than 20 characters;
    * it contains no alphanumeric character at all (this includes the
      empty string);
    * fewer than 50% of its characters are alphanumeric;
    * it contains 3 or more consecutive vowels;
    * it contains 5 or more consecutive consonants;
    * it is a single character that is not one of the accepted one-letter
      words (a, e, o, accented o, u, y).

    Matching is case insensitive.  ``word`` is expected to be a text
    (unicode) string.
    """
    # (L) If a string is longer than 20 characters, it is garbage
    if len(word) > _WORD_MAX_LENGTH:
        return None

    alphanumeric_count = len(_WORD_ALPHANUM.findall(word))

    # Remove the word if none of its characters is alphanumeric
    if alphanumeric_count == 0:
        return None

    # (A) If a string's ratio of alphanumeric characters to total
    # characters is less than 50%, the string is garbage.  Both sides are
    # kept as integers: ``len(word) / 2`` truncates under Python 2 and
    # lets words such as 'a--' (33% alphanumeric) slip through.
    if 2 * alphanumeric_count < len(word):
        return None

    # Remove words with too many consecutive vowels
    if _WORD_TOO_MANY_VOWELS.search(word):
        return None

    # Remove words with too many consecutive consonants
    if _WORD_TOO_MANY_CONSONANTS.search(word):
        return None

    # Only allow specific single letter words
    if len(word) == 1 and not _WORD_SINGLE_LETTER.match(word):
        return None

    return word
|
||||
|
||||
@@ -11,4 +11,6 @@ urlpatterns = patterns('ocr.views',
|
||||
|
||||
url(r'^ocr/queue/(?P<document_queue_id>\d+)/enable/$', 'document_queue_enable', (), 'document_queue_enable'),
|
||||
url(r'^ocr/queue/(?P<document_queue_id>\d+)/disable/$', 'document_queue_disable', (), 'document_queue_disable'),
|
||||
|
||||
url(r'^ocr/document/all/clean_up/$', 'all_document_ocr_cleanup', (), 'all_document_ocr_cleanup'),
|
||||
)
|
||||
|
||||
@@ -12,13 +12,14 @@ from permissions.api import check_permissions
|
||||
from documents.models import Document
|
||||
|
||||
from ocr import PERMISSION_OCR_DOCUMENT, PERMISSION_OCR_DOCUMENT_DELETE, \
|
||||
PERMISSION_OCR_QUEUE_ENABLE_DISABLE
|
||||
PERMISSION_OCR_QUEUE_ENABLE_DISABLE, PERMISSION_OCR_CLEAN_ALL_PAGES
|
||||
|
||||
from models import DocumentQueue, QueueDocument
|
||||
from literals import QUEUEDOCUMENT_STATE_PENDING, \
|
||||
QUEUEDOCUMENT_STATE_PROCESSING, QUEUEDOCUMENT_STATE_ERROR, \
|
||||
DOCUMENTQUEUE_STATE_STOPPED, DOCUMENTQUEUE_STATE_ACTIVE
|
||||
from exceptions import AlreadyQueued
|
||||
|
||||
from api import clean_pages
|
||||
|
||||
def _display_thumbnail(ocr_document):
|
||||
try:
|
||||
@@ -249,4 +250,26 @@ def document_queue_enable(request, document_queue_id):
|
||||
'title':_(u'Are you sure you wish to activate document queue: %s') % document_queue,
|
||||
'next':next,
|
||||
'previous':previous,
|
||||
}, context_instance=RequestContext(request))
|
||||
}, context_instance=RequestContext(request))
|
||||
|
||||
|
||||
def all_document_ocr_cleanup(request):
|
||||
check_permissions(request.user, 'ocr', [PERMISSION_OCR_CLEAN_ALL_PAGES])
|
||||
|
||||
previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', None)))
|
||||
next = request.POST.get('next', request.GET.get('next', request.META.get('HTTP_REFERER', None)))
|
||||
|
||||
if request.method != 'POST':
|
||||
return render_to_response('generic_confirm.html', {
|
||||
'previous':previous,
|
||||
'next':next,
|
||||
'message':_(u'On large databases this operation may take some time to execute.'),
|
||||
}, context_instance=RequestContext(request))
|
||||
else:
|
||||
try:
|
||||
clean_pages()
|
||||
messages.success(request, _(u'Document pages content clean up complete.'))
|
||||
except Exception, e:
|
||||
messages.error(request, _(u'Document pages content clean up error: %s') % e)
|
||||
|
||||
return HttpResponseRedirect(next)
|
||||
|
||||
Reference in New Issue
Block a user