Initial commit for the ocr_cleanup branch

2011-04-07 04:07:59 -04:00
parent cea79ea240
commit d1ff305a3f
1 changed files with 65 additions and 0 deletions
--- a/apps/ocr/api.py
+++ b/apps/ocr/api.py
@@ -1,8 +1,10 @@
 # -*- coding: iso-8859-1 -*-
 #Some code from http://wiki.github.com/hoffstaetter/python-tesseract
 import codecs
 import os
 import subprocess
 import re
 import tempfile
 from django.utils.translation import ugettext as _
@@ -59,3 +61,66 @@ def do_document_ocr(document):
        finally:
            cleanup(filepath)
            cleanup(imagefile)
 def ocr_cleanup(text):
    """Strip OCR garbage from *text*, one word at a time.

    The text is split into lines and each line into whitespace-separated
    words.  Every word is passed through check_word(); words it rejects
    are dropped.  The surviving words of a line are joined with single
    spaces and every input line (including ones left empty) is emitted
    terminated by a newline, so the line structure of the input is kept.

    Returns a unicode string; an empty input yields an empty string.
    """
    cleaned_lines = []
    for line in text.splitlines():
        # split() with no argument already ignores leading/trailing
        # whitespace, so no separate strip() is needed.
        kept_words = [word for word in line.split() if check_word(word)]
        cleaned_lines.append(u' '.join(kept_words))
    # Join words and line breaks separately: putting the newline marker in
    # the same space-joined list left a stray space on both sides of every
    # line break.
    return u''.join(cleaned + u'\n' for cleaned in cleaned_lines)
 # Patterns used by check_word().  They are compiled once at import time
 # rather than on every call, since check_word() runs once per word of OCR
 # output.  re.UNICODE is passed explicitly so that case-insensitive
 # matching of the accented letters does not depend on the interpreter
 # default (it is only the default for text patterns on Python 3).
 _OCR_ALPHANUM = re.compile('([0-9a-záéíóúüñ])', re.I | re.U)
 _OCR_VOWEL_RUN = re.compile('[aáeéiíoóuúü]{3}', re.I | re.U)
 _OCR_CONSONANT_RUN = re.compile('[bcdfghjklmnñpqrstvwxyz]{5}', re.I | re.U)
 _OCR_SINGLE_LETTER_WORD = re.compile('^[aeoóuy]$', re.I | re.U)
 _OCR_MAX_WORD_LENGTH = 20


 def check_word(word):
    """Return *word* if it looks like a real word, otherwise None.

    A word is treated as OCR garbage when any of these holds:

    * it is longer than 20 characters;
    * it contains no alphanumeric character at all (this also covers the
      empty string);
    * fewer than 50% of its characters are alphanumeric;
    * it contains three or more consecutive vowels;
    * it contains five or more consecutive consonants;
    * it is a single character other than a, e, o, ó, u or y.

    "Alphanumeric" here means digits, a-z and the accented letters
    á é í ó ú ü ñ, matched case-insensitively.
    """
    if len(word) > _OCR_MAX_WORD_LENGTH:
        return None

    alnum_count = len(_OCR_ALPHANUM.findall(word))
    if alnum_count == 0:
        return None
    # Compare 2 * count against the length instead of dividing the length
    # by two: "/" is integer division on Python 2 and true division on
    # Python 3, which made the 50% threshold differ for odd lengths.
    if 2 * alnum_count < len(word):
        return None

    # Only the presence of a run matters, so search() is enough.
    if _OCR_VOWEL_RUN.search(word):
        return None
    if _OCR_CONSONANT_RUN.search(word):
        return None

    # Only specific single-letter words are allowed.
    if len(word) == 1 and not _OCR_SINGLE_LETTER_WORD.search(word):
        return None

    return word
 from ocr.api import ocr_cleanup
 from documents.models import DocumentPage
 def clean_pages():
    """Run ocr_cleanup() over the content of every DocumentPage.

    Pages whose content is falsy are skipped.  For every other page the
    cleaned text replaces page.content, the page's primary key is printed
    and the page is saved.
    """
    for page in DocumentPage.objects.all():
        if not page.content:
            continue
        page.content = ocr_cleanup(page.content)
        # Parenthesised single-argument form prints the same value on
        # Python 2 and is also valid syntax on Python 3.
        print(page.pk)
        page.save()