#Some code from http://wiki.github.com/hoffstaetter/python-tesseract

import codecs
import os
import subprocess
import tempfile

from documents.models import Document
from converter.api import convert_document_for_ocr

from ocr.conf.settings import TESSERACT_PATH
from ocr.conf.settings import TESSERACT_LANGUAGE

# NOTE(review): cleanup(filename) and the `_` translation helper are used
# below but are only partially visible / not visible in the reviewed text;
# they are referenced unchanged and must remain defined in this module.


class TesseractError(Exception):
    """Raised when the tesseract process exits with a non-zero status."""


def run_tesseract(input_filename, output_filename_base, lang=None):
    """Run the executable configured as TESSERACT_PATH on one image.

    input_filename       -- path of the image handed to tesseract
    output_filename_base -- output path passed to tesseract as its second
                            argument
    lang                 -- optional value for tesseract's ``-l`` switch;
                            the switch is omitted when this is None

    Returns None. Raises TesseractError, carrying whatever the process
    wrote to stderr, when the exit status is not zero.
    """
    command = [TESSERACT_PATH, input_filename, output_filename_base]
    if lang is not None:
        command += ['-l', lang]

    proc = subprocess.Popen(command, stderr=subprocess.PIPE,
                            stdout=subprocess.PIPE)
    # communicate() drains both pipes while waiting for the child.  A bare
    # wait() can block forever once the child fills a pipe buffer that
    # nobody is reading.
    error_text = proc.communicate()[1]
    if proc.returncode != 0:
        raise TesseractError(error_text)


def do_document_ocr(document):
    """OCR every page of ``document`` and store the text on its pages.

    For each page the document is converted to an image, tesseract is run
    on it with TESSERACT_LANGUAGE, and the stripped text is saved as the
    ``content`` of the page whose ``page_number`` is the 1-based index.
    The page label is set to the translated string 'Text from OCR'.

    Raises TesseractError (from run_tesseract) when tesseract fails.  The
    temporary file, the OCR output file and the converted image are
    removed whether or not the page succeeded.
    """
    # Only the index is needed from the iteration; the page itself is
    # fetched again by page_number further down.
    for page_index, unused_page in enumerate(document.documentpage_set.all()):
        imagefile = convert_document_for_ocr(document, page=page_index)
        desc, filepath = tempfile.mkstemp()
        # mkstemp() returns an already-open OS-level descriptor; only the
        # path is used here, so close it instead of leaking one per page.
        os.close(desc)
        # The recognised text is read from "<filepath>.txt".
        ocr_output = os.extsep.join([filepath, 'txt'])
        try:
            run_tesseract(imagefile, filepath, TESSERACT_LANGUAGE)
            with codecs.open(ocr_output, 'r', 'utf-8') as f:
                text = f.read().strip()
            document_page = document.documentpage_set.get(
                page_number=page_index + 1)
            document_page.content = text
            document_page.page_label = _(u'Text from OCR')
            document_page.save()
        finally:
            # A single cleanup path for success and failure; errors from
            # run_tesseract propagate unchanged with their traceback.
            cleanup(filepath)
            cleanup(ocr_output)
            cleanup(imagefile)