Merge branch 'migrate_to_ocr_app' into fix_ocr_convert

This commit is contained in:
Roberto Rosario
2011-07-18 04:08:19 -04:00
14 changed files with 204 additions and 119 deletions

View File

@@ -18,6 +18,8 @@ Python:
* django-mptt - Utilities for implementing a modified pre-order traversal tree in django
* python-magic - A python wrapper for libmagic
* django-taggit - Simple tagging for django
* slate - The simplest way to extract text from PDFs in Python
Execute pip install -r requirements/production.txt to install the python/django dependencies automatically.

View File

@@ -5,8 +5,6 @@ import hashlib
from common import TEMPORARY_DIRECTORY
from documents.utils import document_save_to_temp_dir
from converter.conf.settings import UNPAPER_PATH
from converter.conf.settings import OCR_OPTIONS
from converter.conf.settings import UNOCONV_PATH
from converter.exceptions import UnpaperError, OfficeConversionError
from converter.literals import DEFAULT_PAGE_NUMBER, \
@@ -36,21 +34,6 @@ def cleanup(filename):
pass
def execute_unpaper(input_filepath, output_filepath):
    """
    Executes the program unpaper using subprocess's Popen

    input_filepath -- path of the file handed to unpaper as its input
    output_filepath -- path of the file handed to unpaper as its output

    Raises UnpaperError, carrying the first line unpaper wrote to
    stderr, when the program exits with a non zero return code
    """
    command = [UNPAPER_PATH, u'--overwrite', input_filepath, output_filepath]
    proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE)
    # communicate() drains stderr while waiting for the process to end;
    # calling wait() with a PIPE attached can block forever once the
    # child fills the pipe buffer
    stderr_output = proc.communicate()[1]
    if proc.returncode != 0:
        # Keep reporting only the first stderr line, line ending included
        if stderr_output:
            first_line = stderr_output.splitlines(True)[0]
        else:
            first_line = stderr_output
        raise UnpaperError(first_line)
def execute_unoconv(input_filepath, arguments=''):
"""
Executes the program unoconv using subprocess's Popen
@@ -164,38 +147,6 @@ def get_document_dimensions(document, *args, **kwargs):
return [0, 0]
def convert_document_for_ocr(document, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_OCR_FILE_FORMAT):
    """
    Prepare one page of a document for OCR and return the path of the
    resulting file

    The document is saved to the temporary directory, the page's
    transformations are applied, the result is converted using
    OCR_OPTIONS, passed through unpaper and finally converted to
    file_format.  The intermediate files are removed even when one of
    the steps fails; the returned file is left for the caller.
    """
    # Extract document file
    source_path = document_save_to_temp_dir(document, document.uuid)

    # Every working file shares the saved document's base name
    base_name, extension = os.path.splitext(os.path.basename(source_path))
    prefix = os.path.join(TEMPORARY_DIRECTORY, base_name)

    transformed_path = u'%s_trans%s%s%s' % (prefix, page, os.extsep, file_format)
    unpaper_in_path = u'%s_unpaper_in%s%spnm' % (prefix, page, os.extsep)
    unpaper_out_path = u'%s_unpaper_out%s%spnm' % (prefix, page, os.extsep)
    result_path = u'%s_ocr%s%s%s' % (prefix, page, os.extsep, file_format)

    try:
        page_object = document.documentpage_set.get(page_number=page)
        page_transformations, page_warnings = page_object.get_transformation_list()

        # Apply default transformations
        backend.convert_file(
            input_filepath=source_path,
            page=page,
            quality=QUALITY_HIGH,
            transformations=page_transformations,
            output_filepath=transformed_path
        )
        # Do OCR operations
        backend.convert_file(
            input_filepath=transformed_path,
            arguments=OCR_OPTIONS,
            output_filepath=unpaper_in_path
        )
        # Process by unpaper
        execute_unpaper(input_filepath=unpaper_in_path, output_filepath=unpaper_out_path)
        # Convert to the requested output format
        backend.convert_file(input_filepath=unpaper_out_path, output_filepath=result_path)
    finally:
        # Intermediate files are discarded whether or not a step failed
        for leftover in (transformed_path, unpaper_in_path, unpaper_out_path):
            cleanup(leftover)

    return result_path
def get_available_transformations_choices():
result = []
for transformation in backend.get_available_transformations():

View File

@@ -108,7 +108,7 @@ class ConverterClass(ConverterBase):
def get_available_transformations(self):
return [
TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, \
TRANSFORMATION_DENSITY, TRANSFORMATION_ZOOM
TRANSFORMATION_ZOOM
]
def get_page_count(self, input_filepath):

View File

@@ -106,7 +106,7 @@ class ConverterClass(ConverterBase):
def get_available_transformations(self):
return [
TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, \
TRANSFORMATION_DENSITY, TRANSFORMATION_ZOOM
TRANSFORMATION_ZOOM
]

View File

@@ -1,3 +1,4 @@
import slate
from PIL import Image
from django.utils.translation import ugettext_lazy as _
@@ -9,12 +10,28 @@ from converter.literals import TRANSFORMATION_RESIZE, \
TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM
from converter.literals import QUALITY_DEFAULT, DEFAULT_PAGE_NUMBER, \
DEFAULT_FILE_FORMAT
from converter.utils import get_mimetype
class ConverterClass(ConverterBase):
def get_page_count(self, input_filepath):
page_count = 1
im = Image.open(input_filepath)
mimetype, encoding = get_mimetype(input_filepath)
if mimetype == 'application/pdf':
# If file is a PDF open it with slate to determine the page
# count
with open(input_filepath) as fd:
pages = slate.PDF(fd)
return len(pages)
try:
im = Image.open(input_filepath)
except IOError: #cannot identify image file
# Return a page count of 1, to at least allow the document
# to be created
return 1
try:
while 1:
im.seek(im.tell()+1)

View File

@@ -9,12 +9,11 @@ register_settings(
settings=[
{'name': u'IM_CONVERT_PATH', 'global_name': u'CONVERTER_IM_CONVERT_PATH', 'default': u'/usr/bin/convert', 'description': _(u'File path to imagemagick\'s convert program.'), 'exists': True},
{'name': u'IM_IDENTIFY_PATH', 'global_name': u'CONVERTER_IM_IDENTIFY_PATH', 'default': u'/usr/bin/identify', 'description': _(u'File path to imagemagick\'s identify program.'), 'exists': True},
{'name': u'UNPAPER_PATH', 'global_name': u'CONVERTER_UNPAPER_PATH', 'default': u'/usr/bin/unpaper', 'description': _(u'File path to unpaper program.'), 'exists': True},
{'name': u'GM_PATH', 'global_name': u'CONVERTER_GM_PATH', 'default': u'/usr/bin/gm', 'description': _(u'File path to graphicsmagick\'s program.'), 'exists': True},
{'name': u'GM_SETTINGS', 'global_name': u'CONVERTER_GM_SETTINGS', 'default': u''},
{'name': u'GRAPHICS_BACKEND', 'global_name': u'CONVERTER_GRAPHICS_BACKEND', 'default': u'converter.backends.python', 'description': _(u'Graphics conversion backend to use. Options are: converter.backends.imagemagick, converter.backends.graphicsmagick and converter.backends.python.')},
{'name': u'UNOCONV_PATH', 'global_name': u'CONVERTER_UNOCONV_PATH', 'default': u'/usr/bin/unoconv', 'exists': True},
{'name': u'OCR_OPTIONS', 'global_name': u'CONVERTER_OCR_OPTIONS', 'default': u'-colorspace Gray -depth 8 -resample 200x200'},
#{'name': u'OCR_OPTIONS', 'global_name': u'CONVERTER_OCR_OPTIONS', 'default': u'-colorspace Gray -depth 8 -resample 200x200'},
{'name': u'DEFAULT_OPTIONS', 'global_name': u'CONVERTER_DEFAULT_OPTIONS', 'default': u''},
{'name': u'LOW_QUALITY_OPTIONS', 'global_name': u'CONVERTER_LOW_QUALITY_OPTIONS', 'default': u''},
{'name': u'HIGH_QUALITY_OPTIONS', 'global_name': u'CONVERTER_HIGH_QUALITY_OPTIONS', 'default': u'-density 400'},

View File

@@ -3,7 +3,15 @@ import os
from django.core.exceptions import ImproperlyConfigured
from django.utils.importlib import import_module
# Prefer python-magic (libmagic) for mimetype detection and fall back to
# the standard library's mimetypes module when it cannot be loaded.
try:
    from python_magic import magic
    USE_PYTHON_MAGIC = True
except Exception:
    # Any failure to load python-magic selects the fallback, but unlike
    # a bare except this lets KeyboardInterrupt and SystemExit through
    import mimetypes
    mimetypes.init()
    USE_PYTHON_MAGIC = False
#http://stackoverflow.com/questions/123198/how-do-i-copy-a-file-in-python
def copyfile(source, dest, buffer_size=1024 * 1024):
"""
@@ -72,3 +80,32 @@ def load_backend():
raise ImproperlyConfigured(error_msg)
else:
raise # If there's some other error, this must be an error in Mayan itself.
def get_mimetype(filepath):
    """
    Determine a file's mimetype by calling the system's libmagic
    library via python-magic or fallback to use python's mimetypes
    library

    Returns a (mimetype, encoding) tuple.  With python-magic both values
    are empty strings when filepath does not exist.  With the mimetypes
    fallback the values are whatever mimetypes.guess_type() returns for
    the file name, and the file itself is never opened.
    """
    file_mimetype = u''
    file_mime_encoding = u''

    if USE_PYTHON_MAGIC:
        if os.path.exists(filepath):
            # Read the content once, in binary mode, and reuse the
            # buffer for both detections; the with block closes the
            # file and lets an error from open() surface unchanged
            with open(filepath, 'rb') as source:
                content = source.read()

            mime = magic.Magic(mime=True)
            file_mimetype = mime.from_buffer(content)
            mime_encoding = magic.Magic(mime_encoding=True)
            file_mime_encoding = mime_encoding.from_buffer(content)
    else:
        # Only the file name is used for the guess
        filename = os.path.basename(filepath)
        file_mimetype, file_mime_encoding = mimetypes.guess_type(filename)

    return file_mimetype, file_mime_encoding

View File

@@ -9,13 +9,15 @@ import sys
from django.utils.translation import ugettext as _
from django.utils.importlib import import_module
from converter.api import convert_document_for_ocr
from converter.api import convert
from documents.models import DocumentPage
from ocr.conf.settings import TESSERACT_PATH
from ocr.conf.settings import TESSERACT_LANGUAGE
from ocr.conf.settings import PDFTOTEXT_PATH
from ocr.exceptions import TesseractError, PdftotextError
from ocr.exceptions import TesseractError
from ocr.conf.settings import UNPAPER_PATH
from ocr.parsers import parse_document_page
from ocr.parsers.exceptions import ParserError, ParserUnknownFile
def get_language_backend():
@@ -30,7 +32,7 @@ def get_language_backend():
return None
return module
backend = get_language_backend()
language_backend = get_language_backend()
def cleanup(filename):
@@ -58,62 +60,38 @@ def run_tesseract(input_filename, output_filename_base, lang=None):
raise TesseractError(error_text)
def run_pdftotext(input_filename, output_filename, page_number=None):
    """
    Execute the command line binary of pdftotext

    input_filename -- path handed to pdftotext as the file to read
    output_filename -- path handed to pdftotext as the file to write
    page_number -- when given, passed as both the -f and -l arguments
    together with -nopgbrk

    Raises PdftotextError, carrying the program's stderr output, when
    pdftotext exits with a non zero return code
    """
    command = [unicode(PDFTOTEXT_PATH)]
    if page_number:
        command.extend([u'-nopgbrk', u'-f', unicode(page_number), u'-l', unicode(page_number)])
    command.extend([unicode(input_filename), unicode(output_filename)])

    proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    # communicate() reads both pipes until the process ends; wait() with
    # PIPEs attached can block forever once either pipe buffer fills up
    stdout_text, error_text = proc.communicate()
    if proc.returncode != 0:
        raise PdftotextError(error_text)
def do_document_ocr(document):
"""
Do OCR on all the pages of the given document object, first
trying to extract text from PDF using pdftotext then by calling
tesseract
first try to extract text from the document pages using the registered
parser; if the parser fails, or if there is no parser registered for
the document mimetype, do a visual OCR by calling tesseract
"""
for document_page in document.documentpage_set.all():
desc, filepath = tempfile.mkstemp()
imagefile = None
source = u''
try:
if document.file_mimetype == u'application/pdf':
pdf_filename = os.extsep.join([filepath, u'pdf'])
document.save_to_file(pdf_filename)
run_pdftotext(pdf_filename, filepath, document_page.page_number)
cleanup(pdf_filename)
if os.stat(filepath).st_size == 0:
#PDF page had no text, run tesseract on the page
imagefile = convert_document_for_ocr(document, page=document_page.page_number)
run_tesseract(imagefile, filepath, TESSERACT_LANGUAGE)
ocr_output = os.extsep.join([filepath, u'txt'])
source = _(u'Text from OCR')
else:
ocr_output = filepath
source = _(u'Text extracted from PDF')
else:
imagefile = convert_document_for_ocr(document, page=document_page.page_number)
run_tesseract(imagefile, filepath, TESSERACT_LANGUAGE)
ocr_output = os.extsep.join([filepath, u'txt'])
source = _(u'Text from OCR')
f = codecs.open(ocr_output, 'r', 'utf-8')
document_page.content = ocr_cleanup(f.read().strip())
document_page.page_label = source
document_page.save()
f.close()
cleanup(ocr_output)
finally:
os.close(desc)
cleanup(filepath)
if imagefile:
cleanup(imagefile)
# Try to extract text by means of a parser
parse_document_page(document_page)
except (ParserError, ParserUnknownFile):
# Fall back to doing visual OCR
pass
#desc, filepath = tempfile.mkstemp()
#imagefile = None
#source = u''
#imagefile = convert_document_for_ocr(document, page=document_page.page_number)
#run_tesseract(imagefile, filepath, TESSERACT_LANGUAGE)
#ocr_output = os.extsep.join([filepath, u'txt'])
#source = _(u'Text from OCR')
#f = codecs.open(ocr_output, 'r', 'utf-8')
#document_page.content = ocr_cleanup(f.read().strip())
#document_page.page_label = source
#document_page.save()
#f.close()
#cleanup(ocr_output)
#finally:
# pass
#os.close(desc)
#cleanup(filepath)
#if imagefile:
# cleanup(imagefile)
def ocr_cleanup(text):
@@ -126,8 +104,8 @@ def ocr_cleanup(text):
for line in text.splitlines():
line = line.strip()
for word in line.split():
if backend:
result = backend.check_word(word)
if language_backend:
result = language_backend.check_word(word)
else:
result = word
if result:
@@ -146,3 +124,53 @@ def clean_pages():
if page.content:
page.content = ocr_cleanup(page.content)
page.save()
def execute_unpaper(input_filepath, output_filepath):
    """
    Executes the program unpaper using subprocess's Popen

    input_filepath -- path of the file handed to unpaper as its input
    output_filepath -- path of the file handed to unpaper as its output

    Raises UnpaperError, carrying the first line unpaper wrote to
    stderr, when the program exits with a non zero return code
    """
    # NOTE(review): UnpaperError is not among this module's visible
    # imports; it is imported here so the error path cannot fail with a
    # NameError - confirm converter.exceptions is its intended home
    from converter.exceptions import UnpaperError

    command = [UNPAPER_PATH, u'--overwrite', input_filepath, output_filepath]
    proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE)
    # communicate() drains stderr while waiting for the process to end;
    # calling wait() with a PIPE attached can block forever once the
    # child fills the pipe buffer
    stderr_output = proc.communicate()[1]
    if proc.returncode != 0:
        # Keep reporting only the first stderr line, line ending included
        if stderr_output:
            first_line = stderr_output.splitlines(True)[0]
        else:
            first_line = stderr_output
        raise UnpaperError(first_line)
'''
def convert_document_for_ocr(document, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_OCR_FILE_FORMAT):
#Extract document file
input_filepath = document_save_to_temp_dir(document, document.uuid)
#Convert for OCR
temp_filename, separator = os.path.splitext(os.path.basename(input_filepath))
temp_path = os.path.join(TEMPORARY_DIRECTORY, temp_filename)
transformation_output_file = u'%s_trans%s%s%s' % (temp_path, page, os.extsep, file_format)
unpaper_input_file = u'%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep)
unpaper_output_file = u'%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep)
convert_output_file = u'%s_ocr%s%s%s' % (temp_path, page, os.extsep, file_format)
try:
document_page = document.documentpage_set.get(page_number=page)
transformations, warnings = document_page.get_transformation_list()
#Apply default transformations
backend.convert_file(input_filepath=input_filepath, page=page, quality=QUALITY_HIGH, transformations=transformations, output_filepath=transformation_output_file)
#Do OCR operations
backend.convert_file(input_filepath=transformation_output_file, arguments=OCR_OPTIONS, output_filepath=unpaper_input_file)
# Process by unpaper
execute_unpaper(input_filepath=unpaper_input_file, output_filepath=unpaper_output_file)
# Convert to tif
backend.convert_file(input_filepath=unpaper_output_file, output_filepath=convert_output_file)
finally:
cleanup(transformation_output_file)
cleanup(unpaper_input_file)
cleanup(unpaper_output_file)
return convert_output_file
'''

View File

@@ -13,8 +13,9 @@ register_settings(
{'name': u'REPLICATION_DELAY', 'global_name': u'OCR_REPLICATION_DELAY', 'default': 10, 'description': _(u'Amount of seconds to delay OCR of documents to allow for the node\'s storage replication overhead.')},
{'name': u'NODE_CONCURRENT_EXECUTION', 'global_name': u'OCR_NODE_CONCURRENT_EXECUTION', 'default': 1, 'description': _(u'Maximum amount of concurrent document OCRs a node can perform.')},
{'name': u'AUTOMATIC_OCR', 'global_name': u'OCR_AUTOMATIC_OCR', 'default': False, 'description': _(u'Automatically queue newly created documents for OCR.')},
{'name': u'PDFTOTEXT_PATH', 'global_name': u'OCR_PDFTOTEXT_PATH', 'default': u'/usr/bin/pdftotext', 'exists': True},
{'name': u'QUEUE_PROCESSING_INTERVAL', 'global_name': u'OCR_QUEUE_PROCESSING_INTERVAL', 'default': 10},
{'name': u'CACHE_URI', 'global_name': u'OCR_CACHE_URI', 'default': None, 'description': _(u'URI in the form: "memcached://127.0.0.1:11211/" to specify a cache backend to use for locking. Multiple hosts can be specified separated by a semicolon.')}
{'name': u'CACHE_URI', 'global_name': u'OCR_CACHE_URI', 'default': None, 'description': _(u'URI in the form: "memcached://127.0.0.1:11211/" to specify a cache backend to use for locking. Multiple hosts can be specified separated by a semicolon.')},
{'name': u'UNPAPER_PATH', 'global_name': u'OCR_UNPAPER_PATH', 'default': u'/usr/bin/unpaper', 'description': _(u'File path to unpaper program.'), 'exists': True},
{'name': u'PARSERS_PDFTOTEXT_PATH', 'global_name': u'OCR_PARSERS_PDFTOTEXT_PATH', 'default': u'/usr/bin/pdftotext', 'exists': True},
]
)

View File

@@ -4,7 +4,3 @@ class AlreadyQueued(Exception):
class TesseractError(Exception):
    """
    Raised by run_tesseract, carrying the error text of the failed run
    """
    pass
class PdftotextError(Exception):
    """
    Raised by run_pdftotext when pdftotext exits with a non zero return
    code; carries the program's stderr output
    """
    pass

View File

@@ -0,0 +1,40 @@
import codecs
import os
import subprocess
import tempfile
import sys
import slate
from django.utils.translation import ugettext as _
from ocr.parsers.exceptions import ParserError, ParserUnknownFile
mimetype_registry = {}
def register_parser(mimetype, function):
    """
    Register function as the text parser for documents of the given
    mimetype, replacing any parser registered for it before
    """
    entry = {'function': function}
    mimetype_registry[mimetype] = entry
def pdf_parser(document_page):
    """
    Extract the text of a PDF document page using slate and save it as
    the content of document_page

    Raises ParserError when the page's text is just a form feed
    character or when slate returns fewer pages than the page number
    being requested
    """
    fd = document_page.document.open()
    try:
        pdf_pages = slate.PDF(fd)
    finally:
        # Close the document file even if slate fails to parse it
        fd.close()

    try:
        page_text = pdf_pages[document_page.page_number - 1]
    except IndexError:
        # Report a missing page as a parser failure, the kind of error
        # callers of the parsers are prepared to handle
        raise ParserError

    if page_text == '\x0c':
        # Nothing but a form feed: the page has no extractable text
        raise ParserError

    document_page.content = page_text
    document_page.page_label = _(u'Text extracted from PDF')
    document_page.save()
def parse_document_page(document_page):
    """
    Run the parser registered for the mimetype of the page's document

    Raises ParserUnknownFile when no parser is registered for that
    mimetype.  Exceptions raised by the parser itself propagate
    unchanged.
    """
    try:
        parser = mimetype_registry[document_page.document.file_mimetype]['function']
    except KeyError:
        raise ParserUnknownFile

    # Called outside the try block so a KeyError raised inside a parser
    # is not mistaken for a missing registration
    parser(document_page)
register_parser('application/pdf', pdf_parser)

View File

@@ -0,0 +1,10 @@
class ParserError(Exception):
    """
    Raised when a text parser fails to understand a file it has been
    passed or the resulting parsed text is invalid
    """
    pass
class ParserUnknownFile(Exception):
    """
    Raised by parse_document_page when no parser is registered for the
    mimetype of the document being parsed
    """
    pass

View File

@@ -9,3 +9,5 @@ django-celery==2.2.2
django-sentry==1.6.0
django-taggit==0.9.3
-e git://github.com/django-mptt/django-mptt.git@0af02a95877041b2fd6d458bd95413dc1666c321#egg=django-mptt
slate==0.3
PIL==1.1.7

View File

@@ -6,3 +6,5 @@ django-celery==2.2.2
django-sentry==1.6.0
django-taggit==0.9.3
-e git://github.com/django-mptt/django-mptt.git@0af02a95877041b2fd6d458bd95413dc1666c321#egg=django-mptt
slate==0.3
PIL==1.1.7