Apply transformation before doing OCR, added unpaper to the OCR pre processing pipe
This commit is contained in:
@@ -38,8 +38,9 @@ Or execute pip install -r requirements/production.txt to install the dependencie
|
||||
Executables:
|
||||
|
||||
* ImageMagick - Convert, Edit, Or Compose Bitmap Images
|
||||
* tesseract-ocr - An OCR Engine that was developed at HP Labs between 1985 and 1995... and now at Google.
|
||||
* libmagic
|
||||
* tesseract-ocr - An OCR Engine that was developed at HP Labs between 1985 and 1995... and now at Google.
|
||||
* unpaper - post-processing scanned and photocopied book pages
|
||||
|
||||
License
|
||||
-------
|
||||
|
||||
@@ -7,6 +7,7 @@ import shutil
|
||||
from django.template.defaultfilters import slugify
|
||||
|
||||
from converter.conf.settings import CONVERT_PATH
|
||||
from converter.conf.settings import UNPAPER_PATH
|
||||
from converter.conf.settings import IDENTIFY_PATH
|
||||
from converter.conf.settings import OCR_OPTIONS
|
||||
from converter.conf.settings import DEFAULT_OPTIONS
|
||||
@@ -15,7 +16,7 @@ from converter.conf.settings import HIGH_QUALITY_OPTIONS
|
||||
|
||||
#from converter.conf.settings import UNOCONV_PATH
|
||||
|
||||
from converter import TEMPORARY_DIRECTORY
|
||||
from converter import TEMPORARY_DIRECTORY, TRANFORMATION_CHOICES
|
||||
from utils import from_descriptor_to_tempfile
|
||||
|
||||
|
||||
@@ -50,17 +51,28 @@ def get_errors(error_string):
|
||||
|
||||
|
||||
#TODO: Timeout & kill child
|
||||
def execute_convert(input_filepath, arguments, output_filepath, quality=QUALITY_DEFAULT):
|
||||
def execute_convert(input_filepath, output_filepath, quality=QUALITY_DEFAULT, arguments=None):
|
||||
command = []
|
||||
command.append(CONVERT_PATH)
|
||||
command.extend(shlex.split(str(QUALITY_SETTINGS[quality])))
|
||||
command.append(input_filepath)
|
||||
command.extend(shlex.split(str(arguments)))
|
||||
if arguments:
|
||||
command.extend(shlex.split(str(arguments)))
|
||||
command.append(output_filepath)
|
||||
|
||||
proc = subprocess.Popen(command, stderr=subprocess.PIPE)
|
||||
return (proc.wait(), proc.stderr.read())
|
||||
|
||||
|
||||
def execute_unpaper(input_filepath, output_filepath):
    '''
    Run the unpaper executable (UNPAPER_PATH) on input_filepath, asking it
    to write its result to output_filepath.

    The '--overwrite' flag is always passed to unpaper.

    Returns a tuple of (exit status, captured stderr output), the same
    shape returned by execute_convert.
    '''
    command = [UNPAPER_PATH, '--overwrite', input_filepath, output_filepath]

    proc = subprocess.Popen(command, stderr=subprocess.PIPE)
    # communicate() reads stderr while waiting for the child to exit.
    # Calling wait() first and read() afterwards can deadlock once the
    # child writes more to stderr than the OS pipe buffer can hold.
    stderr_output = proc.communicate()[1]
    return (proc.returncode, stderr_output)
|
||||
|
||||
|
||||
def execute_unoconv(input_filepath, output_filepath, arguments=''):
|
||||
command = [UNOCONV_PATH]
|
||||
command.extend(['--stdout'])
|
||||
@@ -135,7 +147,7 @@ def convert(input_filepath, size, quality=QUALITY_DEFAULT, cache=True, page=0, f
|
||||
try:
|
||||
input_arg = '%s[%s]' % (input_filepath, page)
|
||||
extra_options += ' -resize %s' % size
|
||||
status, error_string = execute_convert(input_arg, extra_options, '%s:%s' % (format, output_filepath), quality=quality)
|
||||
status, error_string = execute_convert(input_filepath=input_arg, arguments=extra_options, output_filepath='%s:%s' % (format, output_filepath), quality=quality)
|
||||
if status:
|
||||
errors = get_errors(error_string)
|
||||
raise ConvertError(status, errors)
|
||||
@@ -170,12 +182,44 @@ def convert_document_for_ocr(document, page=0, format='tif'):
|
||||
#Convert for OCR
|
||||
temp_filename, separator = os.path.splitext(os.path.basename(input_filepath))
|
||||
temp_path = os.path.join(TEMPORARY_DIRECTORY, temp_filename)
|
||||
output_arg = '%s_ocr%s%s%s' % (temp_path, page, os.extsep, format)
|
||||
transformation_output_file = '%s_trans%s%s%s' % (temp_path, page, os.extsep, format)
|
||||
unpaper_input_file = '%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep)
|
||||
unpaper_output_file = '%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep)
|
||||
convert_output_file = '%s_ocr%s%s%s' % (temp_path, page, os.extsep, format)
|
||||
|
||||
input_arg = '%s[%s]' % (input_filepath, page)
|
||||
|
||||
transformation_list = []
|
||||
try:
|
||||
status, error_string = execute_convert(input_arg, OCR_OPTIONS, output_arg)
|
||||
#Catch invalid or non existing pages
|
||||
document_page = document.documentpage_set.get(document=document, page_number=page+1)
|
||||
for page_transformation in document_page.documentpagetransformation_set.all():
|
||||
try:
|
||||
if page_transformation.transformation in TRANFORMATION_CHOICES:
|
||||
output = TRANFORMATION_CHOICES[page_transformation.transformation] % eval(page_transformation.arguments)
|
||||
transformation_list.append(output)
|
||||
except Exception, e:
|
||||
if request.user.is_staff:
|
||||
messages.warning(request, _(u'Error for transformation %(transformation)s:, %(error)s') %
|
||||
{'transformation':page_transformation.get_transformation_display(),
|
||||
'error':e})
|
||||
else:
|
||||
pass
|
||||
except ObjectDoesNotExist:
|
||||
pass
|
||||
|
||||
tranformation_string = ' '.join(transformation_list)
|
||||
try:
|
||||
status, error_string = execute_convert(input_filepath=input_arg, quality=QUALITY_HIGH, arguments=tranformation_string, output_filepath=transformation_output_file)
|
||||
status, error_string = execute_convert(input_filepath=transformation_output_file, arguments=OCR_OPTIONS, output_filepath=unpaper_input_file)
|
||||
status, error_string = execute_unpaper(input_filepath=unpaper_input_file, output_filepath=unpaper_output_file)
|
||||
status, error_string = execute_convert(input_filepath=unpaper_output_file, output_filepath=convert_output_file)
|
||||
|
||||
if status:
|
||||
errors = get_errors(error_string)
|
||||
raise ConvertError(status, errors)
|
||||
finally:
|
||||
return output_arg
|
||||
cleanup(transformation_output_file)
|
||||
cleanup(unpaper_input_file)
|
||||
cleanup(unpaper_output_file)
|
||||
return convert_output_file
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
from django.conf import settings
|
||||
|
||||
CONVERT_PATH = getattr(settings, 'CONVERTER_CONVERT_PATH', u'/usr/bin/convert')
|
||||
UNPAPER_PATH = getattr(settings, 'CONVERTER_UNPAPER_PATH', u'/usr/bin/unpaper')
|
||||
IDENTIFY_PATH = getattr(settings, 'CONVERTER_IDENTIFY_PATH', u'/usr/bin/identify')
|
||||
OCR_OPTIONS = getattr(settings, 'CONVERTER_OCR_OPTIONS', u'-colorspace Gray -depth 8 -resample 200x200')
|
||||
DEFAULT_OPTIONS = getattr(settings, 'CONVERTER_DEFAULT_OPTIONS', u'')
|
||||
|
||||
@@ -22,10 +22,20 @@ def cleanup(filename):
|
||||
pass
|
||||
|
||||
class TesseractError(Exception):
|
||||
def __init__(self, status, message):
|
||||
self.status = status
|
||||
self.message = message
|
||||
pass
|
||||
# def __init__(self, status, message):
|
||||
# self.status = status
|
||||
# self.message = message
|
||||
|
||||
def get_errors(error_string):
    '''
    Extract a one line error summary from the stderr output of the OCR
    executable.

    Returns the second line of error_string when there are at least two
    lines. When there is only one line that line is returned, and when
    error_string is empty an empty string is returned, so that a short
    error output does not raise IndexError and hide the original failure.
    '''
    lines = error_string.splitlines()
    if len(lines) > 1:
        return lines[1]

    if lines:
        # Only one line of output: it is the best summary available
        return lines[0]

    return ''
    # Alternative kept from the original: return every line mentioning 'error'
    #error_lines = (line for line in lines if line.find('error') >= 0)
    #return '\n'.join(error_lines)
|
||||
|
||||
def run_tesseract(input_filename, output_filename_base, lang=None):
|
||||
command = [TESSERACT_PATH, input_filename, output_filename_base]
|
||||
@@ -44,18 +54,19 @@ def ocr_document(document):
|
||||
status, error_string = run_tesseract(imagefile, filepath)
|
||||
if status:
|
||||
errors = get_errors(error_string)
|
||||
raise TesseractError(status, errors)
|
||||
raise TesseractError(errors)
|
||||
finally:
|
||||
ocr_output = os.extsep.join([filepath, 'txt'])
|
||||
f = file(ocr_output)
|
||||
try:
|
||||
document_page, created = DocumentPage.objects.get_or_create(document=document,
|
||||
page_number=page_index+1)
|
||||
document_page.content = f.read().strip()
|
||||
document_page.page_label = _(u'Text from OCR')
|
||||
document_page.save()
|
||||
finally:
|
||||
f.close()
|
||||
cleanup(filepath)
|
||||
cleanup(ocr_output)
|
||||
cleanup(imagefile)
|
||||
|
||||
f = file(ocr_output)
|
||||
try:
|
||||
document_page, created = DocumentPage.objects.get_or_create(document=document,
|
||||
page_number=page_index+1)
|
||||
document_page.content = f.read().strip()
|
||||
document_page.page_label = _(u'Text from OCR')
|
||||
document_page.save()
|
||||
finally:
|
||||
f.close()
|
||||
#cleanup(filepath)
|
||||
#cleanup(ocr_output)
|
||||
#cleanup(imagefile)
|
||||
|
||||
18
docs/CREDITS
18
docs/CREDITS
@@ -4,7 +4,8 @@ Python
|
||||
Copyright (c) 1995-2001 Corporation for National Research Initiatives.
|
||||
Copyright (c) 1991-1995 Stichting Mathematisch Centrum, Amsterdam.
|
||||
|
||||
Django - A high-level Python Web framework that encourages rapid development and clean, pragmatic design.
|
||||
Django - A high-level Python Web framework that encourages rapid
|
||||
development and clean, pragmatic design.
|
||||
Copyright Django Software Foundation
|
||||
http://www.djangoproject.com/
|
||||
|
||||
@@ -32,7 +33,8 @@ django-extensions - Extensions for Django
|
||||
Copyright Bas van Oostveen (v.oostveen@gmail.com)
|
||||
http://code.google.com/p/django-command-extensions/
|
||||
|
||||
django-rosetta - A Django application that eases the translation of Django projects
|
||||
django-rosetta - A Django application that eases the translation of
|
||||
Django projects
|
||||
Copyright Marco Bonetti (mbonetti@gmail.com)
|
||||
http://code.google.com/p/django-rosetta/
|
||||
|
||||
@@ -48,7 +50,8 @@ django-filetransfers - File upload/download abstraction
|
||||
Waldemar Kornewald
|
||||
http://www.allbuttonspressed.com/projects/django-filetransfers
|
||||
|
||||
tesseract - An OCR Engine that was developed at HP Labs between 1985 and 1995... and now at Google.
|
||||
tesseract - An OCR Engine that was developed at HP Labs between 1985 and
|
||||
1995... and now at Google.
|
||||
http://code.google.com/p/tesseract-ocr/
|
||||
|
||||
Image file 1068504_92921456 "Mayan piramid" (Stock Exchange)
|
||||
@@ -63,3 +66,12 @@ Fat cow icon set
|
||||
Python-magic - python-magic is a simple wrapper for libmagic
|
||||
Adam Hupp <adam at hupp.org>
|
||||
https://github.com/ahupp/python-magic
|
||||
|
||||
Fancybox - FancyBox is a tool for displaying images, html content and
|
||||
multi-media in a Mac-style "lightbox" that floats overtop of
|
||||
web page.
|
||||
http://fancybox.net
|
||||
|
||||
unpaper - post-processing scanned and photocopied book pages
|
||||
Jens Gulden 2005-2007 - unpaper@jensgulden.de.
|
||||
http://unpaper.berlios.de/
|
||||
|
||||
@@ -10,3 +10,5 @@
|
||||
To update a previous database do: [d.update_page_count() for d in Document.objects.all()]
|
||||
* Added support for document page transformation (no GUI yet)
|
||||
* Added views to create, edit and grant/revoke permissions to roles
|
||||
* Apply default transformations to document before OCR
|
||||
* Added unpaper to the OCR conversion pipeline
|
||||
|
||||
@@ -31,6 +31,8 @@
|
||||
* Assign default role to new users - DONE
|
||||
* DB stored transformations - DONE
|
||||
* Recognize multi-page documents - DONE
|
||||
* Add unpaper to pre OCR document cleanup - DONE
|
||||
* Role editing view under setup - STARTED
|
||||
* Document list filtering by metadata
|
||||
* Filterform date filtering widget
|
||||
* Validate GET data before saving file
|
||||
@@ -53,10 +55,10 @@
|
||||
* Field for document language or autodetect
|
||||
* Count pages in a PDF file http://pybrary.net/pyPdf/
|
||||
* Download a document in different formats: (jpg, png, pdf)
|
||||
* Download a document in different formats: (jpg, png, pdf)
|
||||
* Cache.cleanup function to delete cached images when document hash changes
|
||||
* Divide navigation links search by object and by view
|
||||
* Add show_summary method to model to display as results of a search
|
||||
* Add unpaper to pre OCR document cleanup
|
||||
* Support distributed OCR queues (RabbitMQ & Celery?)
|
||||
* DXF viewer - http://code.google.com/p/dxf-reader/source/browse/#svn%2Ftrunk
|
||||
* Support spreadsheets, wordprocessing docs using openoffice in server mode
|
||||
@@ -64,10 +66,12 @@
|
||||
* Handle zipped or rar archives
|
||||
* Display preferences 'document transformations' (Rotation, default zoom)
|
||||
* Gallery view for document groups
|
||||
* Role editing view under setup
|
||||
* Download metadata group documents as a single zip file
|
||||
* Download original document or transformed document
|
||||
* Include annotations in transformed documents downloads
|
||||
* Document view temp transformations
|
||||
* Implement permissions decorators
|
||||
* block Setup menu item to non staff and non superuser users
|
||||
* Don't append an extension separator if the extension is nonexistent
|
||||
* Don't do OCR on word processing or spreadsheet documents; strip tags and store the text
|
||||
* Storage backend to storage backend copy support, to move/migrate document to new storage backend
|
||||
|
||||
@@ -216,6 +216,7 @@ LOGIN_EXEMPT_URLS = (
|
||||
#CONVERTER_CONVERT_PATH = u'/usr/bin/convert'
|
||||
#CONVERTER_OCR_OPTIONS = u'-colorspace Gray -depth 8 -resample 200x200'
|
||||
#CONVERTER_IDENTIFY_PATH = u'/usr/bin/identify'
|
||||
#CONVERTER_UNPAPER_PATH = u'/usr/bin/unpaper'
|
||||
|
||||
# OCR
|
||||
#OCR_TESSERACT_PATH = u'/usr/bin/tesseract'
|
||||
|
||||
Reference in New Issue
Block a user