diff --git a/README.md b/README.md index e1e21d1eae..662d33ab2a 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,8 @@ Python: * django-mptt - Utilities for implementing a modified pre-order traversal tree in django * python-magic - A python wrapper for libmagic * django-taggit - Simple tagging for django +* slate - The simplest way to extract text from PDFs in Python + Execute pip install -r requirements/production.txt to install the python/django dependencies automatically. diff --git a/apps/converter/api.py b/apps/converter/api.py index 71a188a36d..665a980c27 100644 --- a/apps/converter/api.py +++ b/apps/converter/api.py @@ -5,8 +5,6 @@ import hashlib from common import TEMPORARY_DIRECTORY from documents.utils import document_save_to_temp_dir -from converter.conf.settings import UNPAPER_PATH -from converter.conf.settings import OCR_OPTIONS from converter.conf.settings import UNOCONV_PATH from converter.exceptions import UnpaperError, OfficeConversionError from converter.literals import DEFAULT_PAGE_NUMBER, \ @@ -36,21 +34,6 @@ def cleanup(filename): pass -def execute_unpaper(input_filepath, output_filepath): - """ - Executes the program unpaper using subprocess's Popen - """ - command = [] - command.append(UNPAPER_PATH) - command.append(u'--overwrite') - command.append(input_filepath) - command.append(output_filepath) - proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE) - return_code = proc.wait() - if return_code != 0: - raise UnpaperError(proc.stderr.readline()) - - def execute_unoconv(input_filepath, arguments=''): """ Executes the program unoconv using subprocess's Popen @@ -164,38 +147,6 @@ def get_document_dimensions(document, *args, **kwargs): return [0, 0] -def convert_document_for_ocr(document, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_OCR_FILE_FORMAT): - #Extract document file - input_filepath = document_save_to_temp_dir(document, document.uuid) - - #Convert for OCR - temp_filename, separator = 
os.path.splitext(os.path.basename(input_filepath)) - temp_path = os.path.join(TEMPORARY_DIRECTORY, temp_filename) - transformation_output_file = u'%s_trans%s%s%s' % (temp_path, page, os.extsep, file_format) - unpaper_input_file = u'%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep) - unpaper_output_file = u'%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep) - convert_output_file = u'%s_ocr%s%s%s' % (temp_path, page, os.extsep, file_format) - - try: - document_page = document.documentpage_set.get(page_number=page) - transformations, warnings = document_page.get_transformation_list() - - #Apply default transformations - backend.convert_file(input_filepath=input_filepath, page=page, quality=QUALITY_HIGH, transformations=transformations, output_filepath=transformation_output_file) - #Do OCR operations - backend.convert_file(input_filepath=transformation_output_file, arguments=OCR_OPTIONS, output_filepath=unpaper_input_file) - # Process by unpaper - execute_unpaper(input_filepath=unpaper_input_file, output_filepath=unpaper_output_file) - # Convert to tif - backend.convert_file(input_filepath=unpaper_output_file, output_filepath=convert_output_file) - finally: - cleanup(transformation_output_file) - cleanup(unpaper_input_file) - cleanup(unpaper_output_file) - - return convert_output_file - - def get_available_transformations_choices(): result = [] for transformation in backend.get_available_transformations(): diff --git a/apps/converter/backends/graphicsmagick/base.py b/apps/converter/backends/graphicsmagick/base.py index 4d3910391b..54ebbaaa95 100644 --- a/apps/converter/backends/graphicsmagick/base.py +++ b/apps/converter/backends/graphicsmagick/base.py @@ -108,7 +108,7 @@ class ConverterClass(ConverterBase): def get_available_transformations(self): return [ TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, \ - TRANSFORMATION_DENSITY, TRANSFORMATION_ZOOM + TRANSFORMATION_ZOOM ] def get_page_count(self, input_filepath): diff --git 
a/apps/converter/backends/imagemagick/base.py b/apps/converter/backends/imagemagick/base.py index c9977fb3b4..4f924316ed 100644 --- a/apps/converter/backends/imagemagick/base.py +++ b/apps/converter/backends/imagemagick/base.py @@ -106,7 +106,7 @@ class ConverterClass(ConverterBase): def get_available_transformations(self): return [ TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, \ - TRANSFORMATION_DENSITY, TRANSFORMATION_ZOOM + TRANSFORMATION_ZOOM ] diff --git a/apps/converter/backends/python/base.py b/apps/converter/backends/python/base.py index 616e997d3f..25448346ff 100644 --- a/apps/converter/backends/python/base.py +++ b/apps/converter/backends/python/base.py @@ -1,3 +1,4 @@ +import slate from PIL import Image from django.utils.translation import ugettext_lazy as _ @@ -9,12 +10,28 @@ from converter.literals import TRANSFORMATION_RESIZE, \ TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM from converter.literals import QUALITY_DEFAULT, DEFAULT_PAGE_NUMBER, \ DEFAULT_FILE_FORMAT +from converter.utils import get_mimetype + class ConverterClass(ConverterBase): def get_page_count(self, input_filepath): page_count = 1 - im = Image.open(input_filepath) - + + mimetype, encoding = get_mimetype(input_filepath) + if mimetype == 'application/pdf': + # If file is a PDF open it with slate to determine the page + # count + with open(input_filepath) as fd: + pages = slate.PDF(fd) + return len(pages) + + try: + im = Image.open(input_filepath) + except IOError: #cannot identify image file + # Return a page count of 1, to atleast allow the document + # to be created + return 1 + try: while 1: im.seek(im.tell()+1) diff --git a/apps/converter/conf/settings.py b/apps/converter/conf/settings.py index fcaa1ec9b0..95aee33b92 100644 --- a/apps/converter/conf/settings.py +++ b/apps/converter/conf/settings.py @@ -9,12 +9,11 @@ register_settings( settings=[ {'name': u'IM_CONVERT_PATH', 'global_name': u'CONVERTER_IM_CONVERT_PATH', 'default': u'/usr/bin/convert', 'description': _(u'File path to 
imagemagick\'s convert program.'), 'exists': True}, {'name': u'IM_IDENTIFY_PATH', 'global_name': u'CONVERTER_IM_IDENTIFY_PATH', 'default': u'/usr/bin/identify', 'description': _(u'File path to imagemagick\'s identify program.'), 'exists': True}, - {'name': u'UNPAPER_PATH', 'global_name': u'CONVERTER_UNPAPER_PATH', 'default': u'/usr/bin/unpaper', 'description': _(u'File path to unpaper program.'), 'exists': True}, {'name': u'GM_PATH', 'global_name': u'CONVERTER_GM_PATH', 'default': u'/usr/bin/gm', 'description': _(u'File path to graphicsmagick\'s program.'), 'exists': True}, {'name': u'GM_SETTINGS', 'global_name': u'CONVERTER_GM_SETTINGS', 'default': u''}, {'name': u'GRAPHICS_BACKEND', 'global_name': u'CONVERTER_GRAPHICS_BACKEND', 'default': u'converter.backends.python', 'description': _(u'Graphics conversion backend to use. Options are: converter.backends.imagemagick, converter.backends.graphicsmagick and converter.backends.python.')}, {'name': u'UNOCONV_PATH', 'global_name': u'CONVERTER_UNOCONV_PATH', 'default': u'/usr/bin/unoconv', 'exists': True}, - {'name': u'OCR_OPTIONS', 'global_name': u'CONVERTER_OCR_OPTIONS', 'default': u'-colorspace Gray -depth 8 -resample 200x200'}, + #{'name': u'OCR_OPTIONS', 'global_name': u'CONVERTER_OCR_OPTIONS', 'default': u'-colorspace Gray -depth 8 -resample 200x200'}, {'name': u'DEFAULT_OPTIONS', 'global_name': u'CONVERTER_DEFAULT_OPTIONS', 'default': u''}, {'name': u'LOW_QUALITY_OPTIONS', 'global_name': u'CONVERTER_LOW_QUALITY_OPTIONS', 'default': u''}, {'name': u'HIGH_QUALITY_OPTIONS', 'global_name': u'CONVERTER_HIGH_QUALITY_OPTIONS', 'default': u'-density 400'}, diff --git a/apps/converter/utils.py b/apps/converter/utils.py index 5fc106a940..4653b6dc9d 100644 --- a/apps/converter/utils.py +++ b/apps/converter/utils.py @@ -3,7 +3,15 @@ import os from django.core.exceptions import ImproperlyConfigured from django.utils.importlib import import_module - +try: + from python_magic import magic + USE_PYTHON_MAGIC = True +except: + 
def get_mimetype(filepath):
    """
    Determine a file's mimetype by calling the system's libmagic
    library via python-magic or fallback to use python's mimetypes
    library

    Returns a (mimetype, encoding) tuple.  When python-magic is in use
    and filepath does not exist both values are empty strings.  When
    the mimetypes fallback is in use the values come straight from
    mimetypes.guess_type() and either one may be None.

    Errors raised while opening or reading the file are propagated
    unchanged to the caller.
    """
    file_mimetype = u''
    file_mime_encoding = u''

    if USE_PYTHON_MAGIC:
        if os.path.exists(filepath):
            # Open in binary mode: the content is handed to libmagic as
            # raw bytes.  The with block closes the file on every path,
            # and a failing open() now surfaces its own error instead of
            # an UnboundLocalError from a cleanup clause.
            with open(filepath, 'rb') as source:
                data = source.read()

            # The same buffer is used for both lookups so the file is
            # only read once.
            file_mimetype = magic.Magic(mime=True).from_buffer(data)
            file_mime_encoding = magic.Magic(mime_encoding=True).from_buffer(data)
    else:
        # The fallback only looks at the file name, the file itself is
        # never opened.
        filename = os.path.basename(filepath)
        file_mimetype, file_mime_encoding = mimetypes.guess_type(filename)

    return file_mimetype, file_mime_encoding
get_language_backend() +language_backend = get_language_backend() def cleanup(filename): @@ -58,62 +60,38 @@ def run_tesseract(input_filename, output_filename_base, lang=None): raise TesseractError(error_text) -def run_pdftotext(input_filename, output_filename, page_number=None): - """ - Execute the command line binary of pdftotext - """ - command = [unicode(PDFTOTEXT_PATH)] - if page_number: - command.extend([u'-nopgbrk', u'-f', unicode(page_number), u'-l', unicode(page_number)]) - command.extend([unicode(input_filename), unicode(output_filename)]) - proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) - return_code = proc.wait() - if return_code != 0: - error_text = proc.stderr.read() - raise PdftotextError(error_text) - - def do_document_ocr(document): """ - Do OCR on all the pages of the given document object, first - trying to extract text from PDF using pdftotext then by calling - tesseract + first try to extract text from document pages using the registered + parser if the parser fails or if there is no parser registered for + the document mimetype do a visual OCR by calling tesseract """ for document_page in document.documentpage_set.all(): - desc, filepath = tempfile.mkstemp() - imagefile = None - source = u'' try: - if document.file_mimetype == u'application/pdf': - pdf_filename = os.extsep.join([filepath, u'pdf']) - document.save_to_file(pdf_filename) - run_pdftotext(pdf_filename, filepath, document_page.page_number) - cleanup(pdf_filename) - if os.stat(filepath).st_size == 0: - #PDF page had no text, run tesseract on the page - imagefile = convert_document_for_ocr(document, page=document_page.page_number) - run_tesseract(imagefile, filepath, TESSERACT_LANGUAGE) - ocr_output = os.extsep.join([filepath, u'txt']) - source = _(u'Text from OCR') - else: - ocr_output = filepath - source = _(u'Text extracted from PDF') - else: - imagefile = convert_document_for_ocr(document, page=document_page.page_number) - 
def execute_unpaper(input_filepath, output_filepath):
    """
    Executes the program unpaper using subprocess's Popen

    The command run is: UNPAPER_PATH --overwrite input_filepath
    output_filepath

    Raises UnpaperError when unpaper exits with a non zero return code;
    the exception carries everything unpaper wrote to stderr.
    """
    # UnpaperError is declared in converter.exceptions.  It is imported
    # here so the failure path does not end in a NameError.
    # NOTE(review): drop this if the module already imports it at the top.
    from converter.exceptions import UnpaperError

    command = [UNPAPER_PATH, u'--overwrite', input_filepath, output_filepath]
    proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE)

    # communicate() drains stderr while waiting for the child to exit.
    # Calling wait() with a piped stderr can deadlock once the child has
    # written enough to fill the pipe buffer.
    stderr_output = proc.communicate()[1]
    if proc.returncode != 0:
        raise UnpaperError(stderr_output)
replication overhead.')}, {'name': u'NODE_CONCURRENT_EXECUTION', 'global_name': u'OCR_NODE_CONCURRENT_EXECUTION', 'default': 1, 'description': _(u'Maximum amount of concurrent document OCRs a node can perform.')}, {'name': u'AUTOMATIC_OCR', 'global_name': u'OCR_AUTOMATIC_OCR', 'default': False, 'description': _(u'Automatically queue newly created documents for OCR.')}, - {'name': u'PDFTOTEXT_PATH', 'global_name': u'OCR_PDFTOTEXT_PATH', 'default': u'/usr/bin/pdftotext', 'exists': True}, {'name': u'QUEUE_PROCESSING_INTERVAL', 'global_name': u'OCR_QUEUE_PROCESSING_INTERVAL', 'default': 10}, - {'name': u'CACHE_URI', 'global_name': u'OCR_CACHE_URI', 'default': None, 'description': _(u'URI in the form: "memcached://127.0.0.1:11211/" to specify a cache backend to use for locking. Multiple hosts can be specified separated by a semicolon.')} + {'name': u'CACHE_URI', 'global_name': u'OCR_CACHE_URI', 'default': None, 'description': _(u'URI in the form: "memcached://127.0.0.1:11211/" to specify a cache backend to use for locking. 
# Maps a mimetype string to a dictionary holding the parser callable
# under the 'function' key
mimetype_registry = {}


def register_parser(mimetype, function):
    """
    Register function as the text parser for the given mimetype,
    replacing any parser previously registered for that mimetype.

    parse_document_page() calls function with the document page as its
    only argument.
    """
    mimetype_registry[mimetype] = {'function': function}


def pdf_parser(document_page):
    """
    Extract the text of a PDF document page with slate and save it as
    the content of document_page.

    Raises ParserError when slate returns no entry for the page or when
    the entry is just a form feed character.
    """
    fd = document_page.document.open()
    try:
        pdf_pages = slate.PDF(fd)
    finally:
        # Release the file even when slate fails to parse it
        fd.close()

    try:
        # page_number is offset by one to index slate's list of pages
        page_text = pdf_pages[document_page.page_number - 1]
    except IndexError:
        # The page is missing from slate's output; report it as a parser
        # failure so the caller can fall back to another method
        raise ParserError

    # A lone form feed is presumably what slate yields for a page with
    # no extractable text -- TODO confirm against slate's output
    if page_text == '\x0c':
        raise ParserError

    document_page.content = page_text
    document_page.page_label = _(u'Text extracted from PDF')
    document_page.save()


def parse_document_page(document_page):
    """
    Run the parser registered for the mimetype of the page's document.

    Raises ParserUnknownFile when no parser is registered for that
    mimetype.  Exceptions raised by the parser itself are propagated
    unchanged.
    """
    try:
        parser = mimetype_registry[document_page.document.file_mimetype]
    except KeyError:
        raise ParserUnknownFile

    # Called outside of the try block so a KeyError raised inside a
    # parser is not mistaken for a missing registration
    parser['function'](document_page)


register_parser('application/pdf', pdf_parser)
when a text parser fails to understand a file it has been passed + or the resulting parsed text is invalid + """ + pass + + +class ParserUnknownFile(Exception): + pass diff --git a/requirements/development.txt b/requirements/development.txt index 00def8c63a..3acf630b4e 100644 --- a/requirements/development.txt +++ b/requirements/development.txt @@ -9,3 +9,5 @@ django-celery==2.2.2 django-sentry==1.6.0 django-taggit==0.9.3 -e git://github.com/django-mptt/django-mptt.git@0af02a95877041b2fd6d458bd95413dc1666c321#egg=django-mptt +slate==0.3 +PIL==1.1.7 diff --git a/requirements/production.txt b/requirements/production.txt index 1f1d3a0881..02219abaee 100644 --- a/requirements/production.txt +++ b/requirements/production.txt @@ -6,3 +6,5 @@ django-celery==2.2.2 django-sentry==1.6.0 django-taggit==0.9.3 -e git://github.com/django-mptt/django-mptt.git@0af02a95877041b2fd6d458bd95413dc1666c321#egg=django-mptt +slate==0.3 +PIL==1.1.7