diff --git a/mayan/apps/converter/backends/python.py b/mayan/apps/converter/backends/python.py index 9f8173bc83..b2344ee02c 100644 --- a/mayan/apps/converter/backends/python.py +++ b/mayan/apps/converter/backends/python.py @@ -10,13 +10,16 @@ try: except ImportError: from StringIO import StringIO -import slate from PIL import Image +from pdfminer.pdfpage import PDFPage import sh +from django.utils.translation import ugettext_lazy as _ + from common.utils import fs_cleanup from ..classes import ConverterBase +from ..exceptions import PageCountError from ..settings import setting_pdftoppm_path try: @@ -78,12 +81,14 @@ class Python(ConverterBase): file_object = self.file_object try: - pages = slate.PDF(file_object) + page_count = len(list(PDFPage.get_pages(file_object))) except Exception as exception: - logger.error('Slate exception; %s', exception) - raise + error_message = _('Exception determining PDF page count; %s') % exception + logger.error(error_message) + raise PageCountError(error_message) else: - return len(pages) + logger.debug('Document contains %d pages', page_count) + return page_count finally: file_object.seek(0) else: diff --git a/mayan/apps/converter/exceptions.py b/mayan/apps/converter/exceptions.py index 617643b40e..62087a602b 100644 --- a/mayan/apps/converter/exceptions.py +++ b/mayan/apps/converter/exceptions.py @@ -29,3 +29,6 @@ class OfficeConversionError(ConvertError): class InvalidOfficeFormat(ConvertError): pass + +class PageCountError(ConvertError): + pass diff --git a/mayan/apps/ocr/api.py b/mayan/apps/ocr/api.py deleted file mode 100644 index b6e8badc93..0000000000 --- a/mayan/apps/ocr/api.py +++ /dev/null @@ -1,56 +0,0 @@ -from __future__ import unicode_literals - -import logging -import os -import tempfile - -import sh - -from common.settings import setting_temporary_directory - -from .exceptions import UnpaperError -from .parsers import parse_document_page -from .parsers.exceptions import ParserError, ParserUnknownFile -from .settings import UNPAPER_PATH - -logger = logging.getLogger(__name__) - -try: - UNPAPER = sh.Command(UNPAPER_PATH).bake( - overwrite=True, no_multi_pages=True - ) -except sh.CommandNotFound: - logger.debug('unpaper not found') - UNPAPER = None - -""" -for document_page in document_version.pages.all(): - try: - # Try to extract text by means of a parser - parse_document_page(document_page) - except (ParserError, ParserUnknownFile): - # Fall back to doing visual OCR -""" - - -def execute_unpaper(input_filepath, output_filepath=None): - """ - Executes the program unpaper using subprocess's Popen - """ - if UNPAPER: - if not output_filepath: - fd, output_filepath = tempfile.mkstemp( - dir=setting_temporary_directory.value - ) - - try: - UNPAPER(input_filepath, output_filepath) - except sh.ErrorReturnCode as exception: - logger.error(exception) - raise UnpaperError(exception.stderr) - else: - return output_filepath - finally: - os.close(fd) - else: - return input_filepath diff --git a/mayan/apps/ocr/parsers/__init__.py b/mayan/apps/ocr/parsers/__init__.py deleted file mode 100644 index 79795d2fa1..0000000000 --- a/mayan/apps/ocr/parsers/__init__.py +++ /dev/null @@ -1,157 +0,0 @@ -from __future__ import unicode_literals - -import logging -import os -import slate -import subprocess -import tempfile - -from common.settings import setting_temporary_directory -from common.utils import copyfile - -from ..settings import setting_pdftotext_path - -from .exceptions import ParserError, ParserUnknownFile - - -mimetype_registry = {} -logger = logging.getLogger(__name__) - - -def register_parser(mimetypes, parsers): - for mimetype in mimetypes: - for parser in parsers: - try: - parser_instance = parser() - except ParserError: - # If parser fails initialization is not added to the list for - # this mimetype - pass - else: - mimetype_registry.setdefault(mimetype, []).append( - parser_instance - ) - - -def parse_document_page(document_page, descriptor=None, mimetype=None): - logger.debug('executing') - logger.debug('document_page: %s', document_page) - logger.debug('document mimetype: %s', document_page.document.file_mimetype) - - if not mimetype: - mimetype = document_page.document.file_mimetype - if mimetype.startswith('text/'): - if mimetype not in CONVERTER_OFFICE_FILE_MIMETYPES: - mimetype = 'text/plain' - logger.debug('fallback to mimetype text/plain') - logger.debug('used mimetype: %s', mimetype) - - try: - for parser in mimetype_registry[mimetype]: - try: - parser.parse(document_page, descriptor) - except ParserError: - # If parser raises error, try next parser in the list - pass - else: - # If parser was successfull there is no need to try - # others in the list for this mimetype - return - - raise ParserError('Parser list exhausted') - except KeyError: - raise ParserUnknownFile - - -class Parser(object): - """ - Parser base class - """ - - def parse(self, document_page, descriptor=None): - raise NotImplementedError( - 'Your %s class has not defined a parse() method, which is required.', - self.__class__.__name__ - ) - - -class SlateParser(Parser): - """ - Parser for PDF files using the slate library for Python - """ - def parse(self, document_page, descriptor=None): - logger.debug('Starting SlateParser') - - if not descriptor: - descriptor = document_page.document_version.open() - - pdf_pages = slate.PDF(descriptor) - descriptor.close() - - if pdf_pages[document_page.page_number - 1] == b'\x0c': - logger.debug('The Slate parser didn\'t return any output') - raise ParserError('No output') - - document_page.content = pdf_pages[document_page.page_number - 1] - document_page.save() - - -class PopplerParser(Parser): - """ - PDF parser using the pdftotext execute from the poppler package - """ - def __init__(self): - self.pdftotext_path = setting_pdftotext_path.value if setting_pdftotext_path.value else '/usr/bin/pdftotext' - if not os.path.exists(self.pdftotext_path): - raise ParserError('cannot find pdftotext executable') - logger.debug('self.pdftotext_path: %s', self.pdftotext_path) - - def parse(self, document_page, descriptor=None): - logger.debug('parsing PDF with PopplerParser') - pagenum = str(document_page.page_number) - - if descriptor: - destination_descriptor, temp_filepath = tempfile.mkstemp( - dir=setting_temporary_directory.value - ) - copyfile(descriptor, temp_filepath) - document_file = temp_filepath - else: - document_file = document_page.document.document_save_to_temp_dir( - document_page.document.checksum - ) - - logger.debug('document_file: %s', document_file) - - logger.debug('parsing PDF page %s', pagenum) - - command = [] - command.append(self.pdftotext_path) - command.append('-f') - command.append(pagenum) - command.append('-l') - command.append(pagenum) - command.append(document_file) - command.append('-') - - proc = subprocess.Popen( - command, close_fds=True, stderr=subprocess.PIPE, - stdout=subprocess.PIPE - ) - return_code = proc.wait() - if return_code != 0: - logger.error(proc.stderr.readline()) - raise ParserError - - output = proc.stdout.read() - if output == b'\x0c': - logger.debug('Parser didn\'t return any output') - raise ParserError('No output') - - document_page.content = output - document_page.save() - - -register_parser( - mimetypes=['application/pdf'], parsers=[PopplerParser, SlateParser] -) diff --git a/mayan/apps/ocr/parsers/exceptions.py b/mayan/apps/ocr/parsers/exceptions.py deleted file mode 100644 index e06875f222..0000000000 --- a/mayan/apps/ocr/parsers/exceptions.py +++ /dev/null @@ -1,10 +0,0 @@ -class ParserError(Exception): - """ - Raised when a text parser fails to understand a file it been passed - or the resulting parsed text is invalid - """ - pass - - -class ParserUnknownFile(Exception): - pass diff --git a/requirements/common.txt b/requirements/common.txt index 0291652b57..5530cb2ae8 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -23,7 +23,7 @@ djangorestframework==2.4.4 fusepy==2.0.2 -pdfminer==20110227 +pdfminer==20140328 pycountry==1.10 pytesseract==0.1.6 python-dateutil==2.4.2 @@ -32,4 +32,3 @@ python-magic==0.4.6 pytz==2015.4 sh==1.11 -slate==0.3