From 8579c5081d58cb59a35d3cde4ec331a7a7d5cbbb Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Tue, 19 Jul 2011 20:56:21 -0400 Subject: [PATCH] Improved OCR file conversion --- apps/ocr/api.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/apps/ocr/api.py b/apps/ocr/api.py index 585ef9ac2b..35f00ce63a 100644 --- a/apps/ocr/api.py +++ b/apps/ocr/api.py @@ -16,11 +16,12 @@ from documents.utils import document_save_to_temp_dir from ocr.conf.settings import TESSERACT_PATH from ocr.conf.settings import TESSERACT_LANGUAGE -from ocr.exceptions import TesseractError +from ocr.exceptions import TesseractError, UnpaperError from ocr.conf.settings import UNPAPER_PATH from ocr.parsers import parse_document_page from ocr.parsers.exceptions import ParserError, ParserUnknownFile -from ocr.literals import DEFAULT_OCR_FILE_FORMAT, UNPAPER_FILE_FORMAT +from ocr.literals import DEFAULT_OCR_FILE_FORMAT, UNPAPER_FILE_FORMAT, \ + DEFAULT_OCR_FILE_EXTENSION def get_language_backend(): @@ -100,7 +101,7 @@ def do_document_ocr(queue_document): # Convert to TIFF pre_ocr_filepath = output_filepath=convert(input_filepath=unpaper_output_filepath, file_format=DEFAULT_OCR_FILE_FORMAT) # Tesseract needs an explicit file extension - pre_ocr_filepath_w_ext = os.extsep.join([pre_ocr_filepath, DEFAULT_OCR_FILE_FORMAT]) + pre_ocr_filepath_w_ext = os.extsep.join([pre_ocr_filepath, DEFAULT_OCR_FILE_EXTENSION]) os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext) try: fd, ocr_output = run_tesseract(pre_ocr_filepath_w_ext, TESSERACT_LANGUAGE)