mayan-edms/apps/ocr/api.py
# -*- coding: iso-8859-1 -*-
#Some code from http://wiki.github.com/hoffstaetter/python-tesseract
import codecs
import os
import subprocess
import re
import tempfile
from django.utils.translation import ugettext as _
from common import TEMPORARY_DIRECTORY
from converter.api import convert_document_for_ocr
from ocr.conf.settings import TESSERACT_PATH
from ocr.conf.settings import TESSERACT_LANGUAGE


class TesseractError(Exception):
    pass


def cleanup(filename):
    '''Tries to remove the given filename. Ignores non-existent files.'''
    try:
        os.remove(filename)
    except OSError:
        pass


def run_tesseract(input_filename, output_filename_base, lang=None):
    command = [TESSERACT_PATH, input_filename, output_filename_base]
    if lang is not None:
        command += ['-l', lang]
    proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    return_code = proc.wait()
    if return_code != 0:
        error_text = proc.stderr.read()
        raise TesseractError(error_text)
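
# Illustrative usage of run_tesseract (a sketch, not part of the original
# module; the file names below are hypothetical). Tesseract writes its
# result to the output base name with a '.txt' extension appended:
#
#   run_tesseract('/tmp/page_0001.tif', '/tmp/page_0001', lang='eng')
#   fd = codecs.open('/tmp/page_0001.txt', 'r', 'utf-8')
#   text = fd.read()
#   fd.close()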


def do_document_ocr(document):
    '''Run each page of the given document through Tesseract and store the
    recognized text on the corresponding DocumentPage.'''
    for page_index, document_page in enumerate(document.documentpage_set.all()):
        # Render the page to an image Tesseract can read
        imagefile = convert_document_for_ocr(document, page=page_index)
        desc, filepath = tempfile.mkstemp()
        try:
            run_tesseract(imagefile, filepath, TESSERACT_LANGUAGE)
            # Tesseract appends '.txt' to the output base name
            ocr_output = os.extsep.join([filepath, 'txt'])
            f = codecs.open(ocr_output, 'r', 'utf-8')
            document_page = document.documentpage_set.get(page_number=page_index + 1)
            document_page.content = f.read().strip()
            document_page.page_label = _(u'Text from OCR')
            document_page.save()
            f.close()
            cleanup(ocr_output)
        except TesseractError, e:
            cleanup(filepath)
            cleanup(imagefile)
            raise TesseractError(e)
        finally:
            cleanup(filepath)
            cleanup(imagefile)
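
# Illustrative usage of do_document_ocr (a sketch, not part of the original
# module; it assumes the standard documents.models.Document model and an
# existing document with pk=1):
#
#   from documents.models import Document
#   do_document_ocr(Document.objects.get(pk=1))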


def ocr_cleanup(text):
    '''Run every word of the OCR output through check_word, dropping the
    words that look like OCR garbage.'''
    output = []
    for line in text.splitlines():
        line = line.strip()
        for word in line.split():
            result = check_word(word)
            if result:
                output.append(result)
        output.append('\n')
    return u' '.join(output)


def check_word(word):
    '''Return the word if it looks like real text, or None if it looks like
    OCR garbage.'''
    ALL_ALPHANUM = re.compile('([0-9a-záéíóúüñ])', re.I)
    NON_ALPHANUM = re.compile('([^0-9a-záéíóúüñ])', re.I)
    TOO_MANY_VOWELS = re.compile('[aáeéiíoóuúü]{3}', re.I)
    TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnñpqrstvwxyz]{5}', re.I)
    ALL_ALPHA = re.compile('^[a-z]+$', re.I)
    SINGLE_LETTER_WORDS = re.compile('^[aeoóuy]$', re.I)

    # (L) If a string is longer than 20 characters, it is garbage
    if len(word) > 20:
        return None

    # (A) If a string's ratio of alphanumeric characters to total characters
    # is less than 50%, the string is garbage
    if len(ALL_ALPHANUM.findall(word)) < len(word) / 2:
        return None

    # Remove the word if all of its characters are non-alphanumeric
    if len(NON_ALPHANUM.findall(word)) == len(word):
        return None

    # Remove words with too many consecutive vowels
    if TOO_MANY_VOWELS.findall(word):
        return None

    # Remove words with too many consecutive consonants
    if TOO_MANY_CONSONANTS.findall(word):
        return None

    # Only allow specific single letter words
    if len(word) == 1 and not SINGLE_LETTER_WORDS.findall(word):
        return None

    return word
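
# Illustrative behavior of the filters above (a sketch; the sample string is
# made up): a word with five consecutive consonants and a stray single letter
# are dropped, while a plausible word and a number are kept:
#
#   ocr_cleanup(u'lnvoice tttttotal 123.45 q')  # -> u'lnvoice 123.45 \n'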


from ocr.api import ocr_cleanup
from documents.models import DocumentPage


def clean_pages():
    '''Re-run ocr_cleanup over the OCR text already stored on every document
    page.'''
    for page in DocumentPage.objects.all():
        if page.content:
            page.content = ocr_cleanup(page.content)
            #print page.content
            print page.pk
            page.save()
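
# Illustrative invocation (a sketch; the module location is an assumption,
# since this helper imports ocr_cleanup from ocr.api and so appears to live
# in a separate maintenance snippet): run it once from a Django shell to
# re-filter the stored OCR text of every DocumentPage:
#
#   >>> clean_pages()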