Initial commit for the ocr_cleanup branch
This commit is contained in:
@@ -1,8 +1,10 @@
|
|||||||
|
# -*- coding: iso-8859-1 -*-
|
||||||
#Some code from http://wiki.github.com/hoffstaetter/python-tesseract
|
#Some code from http://wiki.github.com/hoffstaetter/python-tesseract
|
||||||
|
|
||||||
import codecs
|
import codecs
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
|
import re
|
||||||
import tempfile
|
import tempfile
|
||||||
|
|
||||||
from django.utils.translation import ugettext as _
|
from django.utils.translation import ugettext as _
|
||||||
@@ -59,3 +61,66 @@ def do_document_ocr(document):
|
|||||||
finally:
|
finally:
|
||||||
cleanup(filepath)
|
cleanup(filepath)
|
||||||
cleanup(imagefile)
|
cleanup(imagefile)
|
||||||
|
|
||||||
|
|
||||||
|
def ocr_cleanup(text):
    """Filter OCR output word by word, dropping words check_word() rejects.

    The text is processed line by line. Every whitespace-separated word is
    passed to check_word(); words for which it returns a false value are
    discarded. A newline marker is emitted after each input line.

    All surviving words and the newline markers are joined with a single
    space, so every newline in the result is preceded by a space (and
    followed by one when more text follows).

    Returns the cleaned text as a unicode string.
    """
    # NOTE(review): the indentation of this function was lost in the source
    # under review; this assumes the newline marker is appended once per
    # input line (outer loop), not once per word - confirm.
    tokens = []
    for raw_line in text.splitlines():
        candidates = raw_line.strip().split()
        accepted = (check_word(candidate) for candidate in candidates)
        tokens.extend(kept for kept in accepted if kept)
        tokens.append('\n')

    return u' '.join(tokens)
|
||||||
|
|
||||||
|
|
||||||
|
# Patterns used by check_word(). They are compiled once at import time
# instead of on every call, and are unicode patterns compiled with
# re.UNICODE so that re.I also folds the case of the accented letters.
_ALPHANUM = re.compile(u'[0-9a-záéíóúüñ]', re.I | re.UNICODE)
_TOO_MANY_VOWELS = re.compile(u'[aáeéiíoóuúü]{3}', re.I | re.UNICODE)
_TOO_MANY_CONSONANTS = re.compile(
    u'[bcdfghjklmnñpqrstvwxyz]{5}', re.I | re.UNICODE
)
_SINGLE_LETTER_WORDS = re.compile(u'^[aeoóuy]$', re.I | re.UNICODE)
_MAX_WORD_LENGTH = 20


def check_word(word):
    """Return `word` if it looks like a real word, or None if it is garbage.

    A word is rejected when any of the following holds:

    * it is empty;
    * it is longer than 20 characters;
    * fewer than 50% of its characters are alphanumeric (digits, a-z and
      the accented letters listed in the pattern above);
    * it contains 3 or more consecutive vowels;
    * it contains 5 or more consecutive consonants;
    * it is a single character that is not one of the allowed
      single-letter words.

    The word is returned unchanged when it passes every check.
    """
    # The original "every character is non-alphanumeric" check rejected the
    # empty string (0 == 0); keep that explicit.
    if not word:
        return None

    # (L) If a string is longer than 20 characters, it is garbage.
    if len(word) > _MAX_WORD_LENGTH:
        return None

    # (A) If a string's ratio of alphanumeric characters to total characters
    # is less than 50%, the string is garbage. The comparison is done on
    # doubled integers: `len(word) / 2` truncates on Python 2, which let
    # odd-length words such as 'a--' (33% alphanumeric) slip through.
    # This also rejects words with no alphanumeric characters at all.
    if 2 * len(_ALPHANUM.findall(word)) < len(word):
        return None

    # Remove words with too many consecutive vowels.
    if _TOO_MANY_VOWELS.search(word):
        return None

    # Remove words with too many consecutive consonants.
    if _TOO_MANY_CONSONANTS.search(word):
        return None

    # Only allow specific single letter words.
    if len(word) == 1 and not _SINGLE_LETTER_WORDS.search(word):
        return None

    return word
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
from ocr.api import ocr_cleanup
|
||||||
|
from documents.models import DocumentPage
|
||||||
|
def clean_pages():
    """Re-run ocr_cleanup() over the content of every DocumentPage.

    Pages whose content is empty are skipped. For every other page the
    content is replaced by the cleaned text, the page's primary key is
    printed, and the page is saved.
    """
    # NOTE(review): the indentation of this function was lost in the source
    # under review; this assumes the print and the save both belong to the
    # "page has content" branch - confirm.
    for document_page in DocumentPage.objects.all():
        if not document_page.content:
            continue

        document_page.content = ocr_cleanup(document_page.content)
        # Parenthesised single argument prints the same under the Python 2
        # print statement.
        print(document_page.pk)
        document_page.save()
|
||||||
|
|||||||
Reference in New Issue
Block a user