127 lines
3.7 KiB
Python
127 lines
3.7 KiB
Python
# -*- coding: iso-8859-1 -*-
|
||
#Some code from http://wiki.github.com/hoffstaetter/python-tesseract
|
||
|
||
import codecs
|
||
import os
|
||
import subprocess
|
||
import re
|
||
import tempfile
|
||
|
||
from django.utils.translation import ugettext as _
|
||
|
||
from common import TEMPORARY_DIRECTORY
|
||
from converter.api import convert_document_for_ocr
|
||
|
||
from ocr.conf.settings import TESSERACT_PATH
|
||
from ocr.conf.settings import TESSERACT_LANGUAGE
|
||
|
||
|
||
class TesseractError(Exception):
|
||
pass
|
||
|
||
|
||
def cleanup(filename):
|
||
''' tries to remove the given filename. Ignores non-existent files '''
|
||
try:
|
||
os.remove(filename)
|
||
except OSError:
|
||
pass
|
||
|
||
|
||
def run_tesseract(input_filename, output_filename_base, lang=None):
|
||
command = [TESSERACT_PATH, input_filename, output_filename_base]
|
||
if lang is not None:
|
||
command += ['-l', lang]
|
||
|
||
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
|
||
return_code = proc.wait()
|
||
if return_code != 0:
|
||
error_text = proc.stderr.read()
|
||
raise TesseractError(error_text)
|
||
|
||
|
||
def do_document_ocr(document):
|
||
for page_index, document_page in enumerate(document.documentpage_set.all()):
|
||
imagefile = convert_document_for_ocr(document, page=page_index)
|
||
desc, filepath = tempfile.mkstemp()
|
||
try:
|
||
run_tesseract(imagefile, filepath, TESSERACT_LANGUAGE)
|
||
ocr_output = os.extsep.join([filepath, 'txt'])
|
||
f = codecs.open(ocr_output, 'r', 'utf-8')
|
||
document_page = document.documentpage_set.get(page_number=page_index+1)
|
||
document_page.content = f.read().strip()
|
||
document_page.page_label = _(u'Text from OCR')
|
||
document_page.save()
|
||
f.close()
|
||
cleanup(ocr_output)
|
||
except TesseractError, e:
|
||
cleanup(filepath)
|
||
cleanup(imagefile)
|
||
raise TesseractError(e)
|
||
finally:
|
||
cleanup(filepath)
|
||
cleanup(imagefile)
|
||
|
||
|
||
def ocr_cleanup(text):
|
||
output = []
|
||
for line in text.splitlines():
|
||
line = line.strip()
|
||
for word in line.split():
|
||
result = check_word(word)
|
||
if result:
|
||
output.append(result)
|
||
output.append('\n')
|
||
|
||
return u' '.join(output)
|
||
|
||
|
||
def check_word(word):
|
||
ALL_ALPHANUM = re.compile('([0-9a-záéíóúüñ])', re.I)
|
||
NON_ALPHANUM = re.compile('([^0-9a-záéíóúüñ])', re.I)
|
||
|
||
TOO_MANY_VOWELS = re.compile('[aáeéiíoóuúü]{3}', re.I)
|
||
TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnñpqrstvwxyz]{5}', re.I)
|
||
ALL_ALPHA = re.compile('^[a-z]+$', re.I)
|
||
SINGLE_LETTER_WORDS = re.compile('^[aeoóuy]$', re.I)
|
||
|
||
#(L) If a string is longer than 20 characters, it is
|
||
#garbage:
|
||
if len(word) > 20:
|
||
return None
|
||
|
||
#(A) If a string’s ratio of alphanumeric characters to total
|
||
#characters. is less than 50%, the string is garbage:
|
||
if len(ALL_ALPHANUM.findall(word)) < len(word) / 2:
|
||
return None
|
||
|
||
#Remove word if all the letters in the word are non alphanumeric
|
||
if len(NON_ALPHANUM.findall(word)) == len(word):
|
||
return None
|
||
|
||
#Removed words with too many consecutie vowels
|
||
if TOO_MANY_VOWELS.findall(word):
|
||
return None
|
||
|
||
#Removed words with too many consecutie consonants
|
||
if TOO_MANY_CONSONANTS.findall(word):
|
||
return None
|
||
|
||
#Only allow specific single letter words
|
||
if len(word) == 1 and not SINGLE_LETTER_WORDS.findall(word):
|
||
return None
|
||
|
||
return word
|
||
|
||
|
||
|
||
from ocr.api import ocr_cleanup
|
||
from documents.models import DocumentPage
|
||
def clean_pages():
|
||
for page in DocumentPage.objects.all():
|
||
if page.content:
|
||
page.content = ocr_cleanup(page.content)
|
||
#print page.content
|
||
print page.pk
|
||
page.save()
|