diff --git a/README.md b/README.md index 7cdb366d54..8561c7e094 100755 --- a/README.md +++ b/README.md @@ -38,8 +38,9 @@ Or execute pip install -r requirements/production.txt to install the dependencie Executables: * ImageMagick - Convert, Edit, Or Compose Bitmap Images -* tesseract-ocr - An OCR Engine that was developed at HP Labs between 1985 and 1995... and now at Google. * libmagic +* tesseract-ocr - An OCR Engine that was developed at HP Labs between 1985 and 1995... and now at Google. +* unpaper - post-processing scanned and photocopied book pages License ------- diff --git a/apps/converter/api.py b/apps/converter/api.py index 643ab89cd6..8be618fe72 100755 --- a/apps/converter/api.py +++ b/apps/converter/api.py @@ -7,6 +7,7 @@ import shutil from django.template.defaultfilters import slugify from converter.conf.settings import CONVERT_PATH +from converter.conf.settings import UNPAPER_PATH from converter.conf.settings import IDENTIFY_PATH from converter.conf.settings import OCR_OPTIONS from converter.conf.settings import DEFAULT_OPTIONS @@ -15,7 +16,7 @@ from converter.conf.settings import HIGH_QUALITY_OPTIONS #from converter.conf.settings import UNOCONV_PATH -from converter import TEMPORARY_DIRECTORY +from converter import TEMPORARY_DIRECTORY, TRANFORMATION_CHOICES from utils import from_descriptor_to_tempfile @@ -50,17 +51,28 @@ def get_errors(error_string): #TODO: Timeout & kill child -def execute_convert(input_filepath, arguments, output_filepath, quality=QUALITY_DEFAULT): +def execute_convert(input_filepath, output_filepath, quality=QUALITY_DEFAULT, arguments=None): command = [] command.append(CONVERT_PATH) command.extend(shlex.split(str(QUALITY_SETTINGS[quality]))) command.append(input_filepath) - command.extend(shlex.split(str(arguments))) + if arguments: + command.extend(shlex.split(str(arguments))) command.append(output_filepath) - proc = subprocess.Popen(command, stderr=subprocess.PIPE) return (proc.wait(), proc.stderr.read()) + +def 
execute_unpaper(input_filepath, output_filepath): + command = [] + command.append(UNPAPER_PATH) + command.append('--overwrite') + command.append(input_filepath) + command.append(output_filepath) + proc = subprocess.Popen(command, stderr=subprocess.PIPE) + return (proc.wait(), proc.stderr.read()) + + def execute_unoconv(input_filepath, output_filepath, arguments=''): command = [UNOCONV_PATH] command.extend(['--stdout']) @@ -135,7 +147,7 @@ def convert(input_filepath, size, quality=QUALITY_DEFAULT, cache=True, page=0, f try: input_arg = '%s[%s]' % (input_filepath, page) extra_options += ' -resize %s' % size - status, error_string = execute_convert(input_arg, extra_options, '%s:%s' % (format, output_filepath), quality=quality) + status, error_string = execute_convert(input_filepath=input_arg, arguments=extra_options, output_filepath='%s:%s' % (format, output_filepath), quality=quality) if status: errors = get_errors(error_string) raise ConvertError(status, errors) @@ -170,12 +182,44 @@ def convert_document_for_ocr(document, page=0, format='tif'): #Convert for OCR temp_filename, separator = os.path.splitext(os.path.basename(input_filepath)) temp_path = os.path.join(TEMPORARY_DIRECTORY, temp_filename) - output_arg = '%s_ocr%s%s%s' % (temp_path, page, os.extsep, format) + transformation_output_file = '%s_trans%s%s%s' % (temp_path, page, os.extsep, format) + unpaper_input_file = '%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep) + unpaper_output_file = '%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep) + convert_output_file = '%s_ocr%s%s%s' % (temp_path, page, os.extsep, format) + input_arg = '%s[%s]' % (input_filepath, page) + + transformation_list = [] try: - status, error_string = execute_convert(input_arg, OCR_OPTIONS, output_arg) + #Catch invalid or non existing pages + document_page = document.documentpage_set.get(document=document, page_number=page+1) + for page_transformation in document_page.documentpagetransformation_set.all(): + try: + if 
page_transformation.transformation in TRANFORMATION_CHOICES: + output = TRANFORMATION_CHOICES[page_transformation.transformation] % eval(page_transformation.arguments) + transformation_list.append(output) + except Exception, e: + if request.user.is_staff: + messages.warning(request, _(u'Error for transformation %(transformation)s:, %(error)s') % + {'transformation':page_transformation.get_transformation_display(), + 'error':e}) + else: + pass + except ObjectDoesNotExist: + pass + + tranformation_string = ' '.join(transformation_list) + try: + status, error_string = execute_convert(input_filepath=input_arg, quality=QUALITY_HIGH, arguments=tranformation_string, output_filepath=transformation_output_file) + status, error_string = execute_convert(input_filepath=transformation_output_file, arguments=OCR_OPTIONS, output_filepath=unpaper_input_file) + status, error_string = execute_unpaper(input_filepath=unpaper_input_file, output_filepath=unpaper_output_file) + status, error_string = execute_convert(input_filepath=unpaper_output_file, output_filepath=convert_output_file) + if status: errors = get_errors(error_string) raise ConvertError(status, errors) finally: - return output_arg + cleanup(transformation_output_file) + cleanup(unpaper_input_file) + cleanup(unpaper_output_file) + return convert_output_file diff --git a/apps/converter/conf/settings.py b/apps/converter/conf/settings.py index f7cd3901dd..1570deebe0 100755 --- a/apps/converter/conf/settings.py +++ b/apps/converter/conf/settings.py @@ -1,6 +1,7 @@ from django.conf import settings CONVERT_PATH = getattr(settings, 'CONVERTER_CONVERT_PATH', u'/usr/bin/convert') +UNPAPER_PATH = getattr(settings, 'CONVERTER_UNPAPER_PATH', u'/usr/bin/unpaper') IDENTIFY_PATH = getattr(settings, 'CONVERTER_IDENTIFY_PATH', u'/usr/bin/identify') OCR_OPTIONS = getattr(settings, 'CONVERTER_OCR_OPTIONS', u'-colorspace Gray -depth 8 -resample 200x200') DEFAULT_OPTIONS = getattr(settings, 'CONVERTER_DEFAULT_OPTIONS', u'') diff --git 
a/apps/ocr/api.py b/apps/ocr/api.py index 6ba305c8b0..800cf72fbb 100755 --- a/apps/ocr/api.py +++ b/apps/ocr/api.py @@ -22,10 +22,20 @@ def cleanup(filename): pass class TesseractError(Exception): - def __init__(self, status, message): - self.status = status - self.message = message + pass +# def __init__(self, status, message): +# self.status = status +# self.message = message +def get_errors(error_string): + ''' + returns all lines in the error_string that start with the string "error" + + ''' + lines = error_string.splitlines() + return lines[1] + #error_lines = (line for line in lines if line.find('error') >= 0) + #return '\n'.join(error_lines) def run_tesseract(input_filename, output_filename_base, lang=None): command = [TESSERACT_PATH, input_filename, output_filename_base] @@ -44,18 +54,19 @@ def ocr_document(document): status, error_string = run_tesseract(imagefile, filepath) if status: errors = get_errors(error_string) - raise TesseractError(status, errors) + raise TesseractError(errors) finally: ocr_output = os.extsep.join([filepath, 'txt']) - f = file(ocr_output) - try: - document_page, created = DocumentPage.objects.get_or_create(document=document, - page_number=page_index+1) - document_page.content = f.read().strip() - document_page.page_label = _(u'Text from OCR') - document_page.save() - finally: - f.close() - cleanup(filepath) - cleanup(ocr_output) - cleanup(imagefile) + + f = file(ocr_output) + try: + document_page, created = DocumentPage.objects.get_or_create(document=document, + page_number=page_index+1) + document_page.content = f.read().strip() + document_page.page_label = _(u'Text from OCR') + document_page.save() + finally: + f.close() + #cleanup(filepath) + #cleanup(ocr_output) + #cleanup(imagefile) diff --git a/docs/CREDITS b/docs/CREDITS index 6f1e229511..e109ae11d3 100755 --- a/docs/CREDITS +++ b/docs/CREDITS @@ -4,7 +4,8 @@ Python Copyright (c) 1995-2001 Corporation for National Research Initiatives. 
Copyright (c) 1991-1995 Stichting Mathematisch Centrum, Amsterdam. -Django - A high-level Python Web framework that encourages rapid development and clean, pragmatic design. +Django - A high-level Python Web framework that encourages rapid + development and clean, pragmatic design. Copyright Django Software Foundation http://www.djangoproject.com/ @@ -32,7 +33,8 @@ django-extensions - Extensions for Django Copyright Bas van Oostveen (v.oostveen@gmail.com) http://code.google.com/p/django-command-extensions/ -django-rosetta - A Django application that eases the translation of Django projects +django-rosetta - A Django application that eases the translation of + Django projects Copyright Marco Bonetti (mbonetti@gmail.com) http://code.google.com/p/django-rosetta/ @@ -48,7 +50,8 @@ django-filetransfers - File upload/download abstraction Waldemar Kornewald http://www.allbuttonspressed.com/projects/django-filetransfers -tesseract - An OCR Engine that was developed at HP Labs between 1985 and 1995... and now at Google. +tesseract - An OCR Engine that was developed at HP Labs between 1985 and + 1995... and now at Google. http://code.google.com/p/tesseract-ocr/ Image file 1068504_92921456 "Mayan piramid" (Stock Exchange) @@ -63,3 +66,12 @@ Fat cow icon set Python-magic - python-magic is a simple wrapper for libmagic Adam Hupp https://github.com/ahupp/python-magic + +Fancybox - FancyBox is a tool for displaying images, html content and + multi-media in a Mac-style "lightbox" that floats overtop of + web page. + http://fancybox.net + +unpaper - post-processing scanned and photocopied book pages + Jens Gulden 2005-2007 - unpaper@jensgulden.de. 
+ http://unpaper.berlios.de/ diff --git a/docs/Changelog.txt b/docs/Changelog.txt index bd01a98096..7554a9b691 100644 --- a/docs/Changelog.txt +++ b/docs/Changelog.txt @@ -10,3 +10,5 @@ To update a previous database do: [d.update_page_count() for d in Document.objects.all()] * Added support for document page transformation (no GUI yet) * Added views to create, edit and grant/revoke permissions to roles +* Apply default transformations to document before OCR +* Added unpaper to the OCR conversion pipe diff --git a/docs/TODO b/docs/TODO index a72c96f986..3b98897afb 100755 --- a/docs/TODO +++ b/docs/TODO @@ -31,6 +31,8 @@ * Assign default role to new users - DONE * DB stored transformations - DONE * Recognize multi-page documents - DONE +* Add unpaper to pre OCR document cleanup - DONE +* Role editing view under setup - STARTED * Document list filtering by metadata * Filterform date filtering widget * Validate GET data before saving file @@ -53,10 +55,10 @@ * Field for document language or autodetect * Count pages in a PDF file http://pybrary.net/pyPdf/ * Download a document in diffent formats: (jpg, png, pdf) +* Download a document in diffent formats: (jpg, png, pdf) * Cache.cleanup function to delete cached images when document hash changes * Divide navigation links search by object and by view * Add show_summary method to model to display as results of a search -* Add unpaper to pre OCR document cleanup * Support distributed OCR queues (RabbitMQ & Celery?)
* DXF viewer - http://code.google.com/p/dxf-reader/source/browse/#svn%2Ftrunk * Support spreadsheets, wordprocessing docs using openoffice in server mode @@ -64,10 +66,12 @@ * Handle ziped or rar archives * Display preferences 'document transformations' (Rotation, default zoom) * Gallery view for document groups -* Role editing view under setup * Download metadata group documents as a single zip file * Download original document or transformed document * Include annotations in transformed documents downloads * Document view temp transformations * Implement permissions decorators * block Setup menu item to non staff and non superuser users +* Don't append an extension separator if extension is nonexistent +* Don't do OCR on word processing or spreadsheet documents, strip tags and store text +* Storage backend to storage backend copy support, to move/migrate document to new storage backend diff --git a/settings.py b/settings.py index 0c6d83b713..ba076de626 100755 --- a/settings.py +++ b/settings.py @@ -216,6 +216,7 @@ LOGIN_EXEMPT_URLS = ( #CONVERTER_CONVERT_PATH = u'/usr/bin/convert' #CONVERTER_OCR_OPTIONS = u'-colorspace Gray -depth 8 -resample 200x200' #CONVERTER_IDENTIFY_PATH = u'/usr/bin/identify' +#CONVERTER_UNPAPER_PATH = u'/usr/bin/unpaper' # OCR #OCR_TESSERACT_PATH = u'/usr/bin/tesseract'