# Some code from http://wiki.github.com/hoffstaetter/python-tesseract
#
# Recovered from the patch "Initial commit for the ocr_cleanup branch"
# (commit d1ff305a3f48233763b3c6f4bebb1be72a238454, Roberto Rosario,
# 2011-04-07), which adds the definitions below to apps/ocr/api.py.
# The patch text had lost its line breaks; only the added code is
# reproduced here, not the unchanged context of the target module (its
# existing imports and the tail of do_document_ocr()).
import re

# Words longer than this are treated as OCR garbage.
_MAX_WORD_LENGTH = 20

# The patterns are unicode literals written with escapes so that they do
# not depend on the encoding declared for this source file, and
# re.UNICODE makes re.IGNORECASE apply to the accented letters too.
# Escapes used: a-acute \xe1, e-acute \xe9, i-acute \xed, o-acute \xf3,
# u-acute \xfa, u-diaeresis \xfc, n-tilde \xf1.
_REGEX_FLAGS = re.IGNORECASE | re.UNICODE

# A single "alphanumeric" character: digit, ASCII letter or one of the
# accented letters listed above.
_ALPHANUM = re.compile(
    u'[0-9a-z\xe1\xe9\xed\xf3\xfa\xfc\xf1]', _REGEX_FLAGS)
# Three consecutive vowels.
_TOO_MANY_VOWELS = re.compile(
    u'[a\xe1e\xe9i\xedo\xf3u\xfa\xfc]{3}', _REGEX_FLAGS)
# Five consecutive consonants.
_TOO_MANY_CONSONANTS = re.compile(
    u'[bcdfghjklmn\xf1pqrstvwxyz]{5}', _REGEX_FLAGS)
# The only one-character words that are accepted.
_SINGLE_LETTER_WORDS = re.compile(u'^[aeo\xf3uy]$', _REGEX_FLAGS)


def ocr_cleanup(text):
    """Return *text* with the words rejected by check_word() removed.

    The text is processed line by line.  Each input line yields one
    output line made of the surviving words separated by single spaces
    and terminated by a newline; a line whose words are all rejected
    becomes an empty line.  An empty *text* yields an empty string.
    """
    cleaned_lines = []
    for line in text.splitlines():
        kept_words = [word for word in line.split() if check_word(word)]
        # Join the words of one line first so that no stray space ends
        # up before or after the line break.
        cleaned_lines.append(u' '.join(kept_words) + u'\n')

    return u''.join(cleaned_lines)


def check_word(word):
    """Return *word* if it looks like a real word, otherwise None.

    A word is rejected when any of the following holds:

    * it is longer than 20 characters;
    * fewer than 50% of its characters are alphanumeric;
    * it contains no alphanumeric character at all (this also covers
      the empty string);
    * it contains three or more consecutive vowels;
    * it contains five or more consecutive consonants;
    * it is a single character other than a, e, o, o-acute, u or y
      (either case).
    """
    # (L) If a string is longer than 20 characters, it is garbage.
    if len(word) > _MAX_WORD_LENGTH:
        return None

    # (A) If a string's ratio of alphanumeric characters to total
    # characters is less than 50%, the string is garbage.  Comparing
    # 2 * count against the length avoids the truncating integer
    # division that let odd-length words slip through.
    if len(_ALPHANUM.findall(word)) * 2 < len(word):
        return None

    # Remove the word if none of its characters is alphanumeric.
    if _ALPHANUM.search(word) is None:
        return None

    # Remove words with too many consecutive vowels.
    if _TOO_MANY_VOWELS.search(word):
        return None

    # Remove words with too many consecutive consonants.
    if _TOO_MANY_CONSONANTS.search(word):
        return None

    # Only allow specific single letter words.
    if len(word) == 1 and not _SINGLE_LETTER_WORDS.match(word):
        return None

    return word


def clean_pages():
    """Run ocr_cleanup() over the content of every DocumentPage.

    Pages whose content is empty are left untouched.  The primary key
    of each rewritten page is printed as a progress indicator.
    """
    # The patch imported this at the bottom of the module; importing it
    # here keeps the module importable on its own.  The patch's other
    # bottom import, "from ocr.api import ocr_cleanup", is dropped
    # because ocr_cleanup is defined above (presumably ocr.api is this
    # very module -- verify against the project layout).
    from documents.models import DocumentPage

    for page in DocumentPage.objects.all():
        # NOTE(review): the original indentation was lost; printing and
        # saving are assumed to belong to this branch -- confirm.
        if page.content:
            page.content = ocr_cleanup(page.content)
            # Parenthesised single-argument form behaves the same as a
            # print statement and as the print() function.
            print(page.pk)
            page.save()