Initial commit for the ocr_cleanup branch

2011-04-07 04:07:59 -04:00
parent cea79ea240
commit d1ff305a3f
1 changed files with 65 additions and 0 deletions
--- a/apps/ocr/api.py
+++ b/apps/ocr/api.py
@@ -1,8 +1,10 @@
+# -*- coding: iso-8859-1 -*-
 #Some code from http://wiki.github.com/hoffstaetter/python-tesseract

 import codecs
 import os
 import subprocess
+import re
 import tempfile

 from django.utils.translation import ugettext as _
@@ -59,3 +61,66 @@ def do_document_ocr(document):
        finally:
            cleanup(filepath)
            cleanup(imagefile)
+
+
+def ocr_cleanup(text):
+    """
+    Strip garbage tokens from OCR output while keeping its line structure.
+
+    Each line of ``text`` is split on whitespace and every token is passed
+    through check_word(); tokens it rejects are dropped.  The surviving
+    tokens of a line are joined with single spaces and every input line,
+    including one left with no tokens, is terminated by a newline.
+
+    Returns a unicode string; input without any line yields an empty one.
+    """
+    cleaned_lines = []
+    for line in text.splitlines():
+        # split() without arguments already ignores leading and trailing
+        # whitespace, so the line does not need to be stripped first.
+        checked = (check_word(word) for word in line.split())
+        kept = [word for word in checked if word]
+        cleaned_lines.append(u' '.join(kept) + u'\n')
+
+    # Words are joined per line and lines are concatenated afterwards.
+    # Joining words and newline markers in one pass with ' ' would leave a
+    # stray space before and after every line break.
+    return u''.join(cleaned_lines)
+
+
+# Accented letters accepted besides ASCII: a/e/i/o/u with acute accent and
+# u with diaeresis (vowels), plus n with tilde (consonant).  They are written
+# as escapes so the patterns do not depend on the encoding of this file.
+_ACCENTED_VOWELS = u'\xe1\xe9\xed\xf3\xfa\xfc'
+_N_TILDE = u'\xf1'
+
+# re.UNICODE makes the case-insensitive matching cover the accented letters
+# too; without it Python 2 only folds the case of ASCII characters.
+_WORD_FLAGS = re.IGNORECASE | re.UNICODE
+
+# Compiled once at import time instead of on every call.
+_ALPHANUM = re.compile(
+    u'[0-9a-z%s%s]' % (_ACCENTED_VOWELS, _N_TILDE), _WORD_FLAGS)
+_TOO_MANY_VOWELS = re.compile(
+    u'[aeiou%s]{3}' % _ACCENTED_VOWELS, _WORD_FLAGS)
+_TOO_MANY_CONSONANTS = re.compile(
+    u'[bcdfghjklmnpqrstvwxyz%s]{5}' % _N_TILDE, _WORD_FLAGS)
+_SINGLE_LETTER_WORDS = re.compile(u'^[aeouy\xf3]$', _WORD_FLAGS)
+
+
+def check_word(word):
+    """
+    Decide whether a single OCR token looks like a real word.
+
+    Returns ``word`` unchanged when it passes every check, or ``None`` when
+    it is considered garbage.  A token is garbage when it:
+
+    * is empty or longer than 20 characters;
+    * has fewer alphanumeric characters than half of its length (which
+      also covers tokens without any alphanumeric character);
+    * contains three or more consecutive vowels;
+    * contains five or more consecutive consonants;
+    * is a single character other than a, e, o, o-acute, u or y
+      (in either case).
+    """
+    if not word or len(word) > 20:
+        return None
+
+    # Compare twice the count against the length instead of dividing:
+    # under Python 2 ``len(word) / 2`` is a floor division, which let
+    # tokens such as 'a..' (one alphanumeric out of three) slip through.
+    if 2 * len(_ALPHANUM.findall(word)) < len(word):
+        return None
+
+    # search() stops at the first offending run; the full list of matches
+    # that findall() would build is not needed for a yes/no answer.
+    if _TOO_MANY_VOWELS.search(word) or _TOO_MANY_CONSONANTS.search(word):
+        return None
+
+    # Only allow specific single letter words
+    if len(word) == 1 and not _SINGLE_LETTER_WORDS.search(word):
+        return None
+
+    return word
+    
+    
+
+from ocr.api import ocr_cleanup
+from documents.models import DocumentPage
+def clean_pages():
+    """
+    Run ocr_cleanup() over the stored text of every DocumentPage.
+
+    Pages whose content is empty are left untouched.  Every other page has
+    its content replaced by the cleaned text, its primary key printed, and
+    is then saved.
+    """
+    all_pages = DocumentPage.objects.all()
+    for document_page in all_pages:
+        if not document_page.content:
+            continue
+
+        document_page.content = ocr_cleanup(document_page.content)
+        print(document_page.pk)
+        document_page.save()