diff --git a/docs/releases/2.0.rst b/docs/releases/2.0.rst index 7340edfee5..d7b7d9d03e 100644 --- a/docs/releases/2.0.rst +++ b/docs/releases/2.0.rst @@ -34,6 +34,7 @@ What's new in Mayan EDMS v2.0 * psutil * python-hkp * sendfile + * slate * New document converter * New class based transformations diff --git a/mayan/apps/ocr/apps.py b/mayan/apps/ocr/apps.py index 2f627084b3..bf264eb279 100644 --- a/mayan/apps/ocr/apps.py +++ b/mayan/apps/ocr/apps.py @@ -33,7 +33,7 @@ from .links import ( from .models import DocumentVersionOCRError from .permissions import permission_ocr_document, permission_ocr_content_view from .settings import ( - setting_pdftotext_path, setting_tesseract_path, setting_unpaper_path + setting_pdftotext_path, setting_tesseract_path ) from .tasks import task_do_ocr @@ -176,20 +176,3 @@ class OCRApp(MayanAppConfig): 'tesseract', _('tesseract version'), tesseract('-v').stderr, report=True ) - - try: - unpaper = sh.Command(setting_unpaper_path.value) - except sh.CommandNotFound: - namespace.add_property( - 'unpaper', _('unpaper version'), _('not found'), report=True - ) - except Exception: - namespace.add_property( - 'unpaper', _('unpaper version'), _('error getting version'), - report=True - ) - else: - namespace.add_property( - 'unpaper', _('unpaper version'), unpaper('-V').stdout, - report=True - ) diff --git a/mayan/apps/ocr/classes.py b/mayan/apps/ocr/classes.py index 4a1e90c71f..0f5a4471e7 100644 --- a/mayan/apps/ocr/classes.py +++ b/mayan/apps/ocr/classes.py @@ -5,8 +5,6 @@ import logging from converter import converter_class from .models import DocumentPageContent -from .parsers import parse_document_page -from .parsers.exceptions import ParserError, ParserUnknownFile logger = logging.getLogger(__name__) @@ -18,21 +16,31 @@ class OCRBackendBase(object): language = document_version.document.language - for page in document_version.pages.all(): - image = page.get_image() + for document_page in document_version.pages.all(): + 
def process_document_page(self, document_page, language=None):
    """
    Run the OCR backend over one page and store the resulting text.

    The page image is obtained from ``document_page.get_image()`` and
    passed to ``self.execute()`` together with ``language``. The returned
    text is saved on the ``DocumentPageContent`` row of the page, which is
    created if it does not exist yet. The image is closed whether or not
    the recognition or the save succeeds.
    """
    logger.info(
        'Processing page: %d of document version: %s',
        document_page.page_number, document_page.document_version
    )

    page_image = document_page.get_image()

    try:
        # get_or_create() returns (instance, created); only the instance
        # is needed here.
        page_content = DocumentPageContent.objects.get_or_create(
            document_page=document_page
        )[0]
        page_content.content = self.execute(
            file_object=page_image, language=language
        )
        page_content.save()
    finally:
        # Release the image even if execute() or save() raised.
        page_image.close()

    logger.info(
        'Finished processing page: %d of document version: %s',
        document_page.page_number, document_page.document_version
    )
class Parser(object):
    """
    Parser base class

    Subclasses implement ``execute()`` to extract the text of a single
    page. Parser classes are associated to MIME types with ``register()``
    and tried in registration order by ``parse_document_version()``.
    """

    _registry = {}

    @classmethod
    def register(cls, mimetypes, parser_classes):
        """
        Append every class in ``parser_classes`` to the parser list of
        each MIME type in ``mimetypes``.
        """
        for mimetype in mimetypes:
            for parser_class in parser_classes:
                cls._registry.setdefault(
                    mimetype, []
                ).append(parser_class)

    @classmethod
    def parse_document_version(cls, document_version):
        """
        Parse ``document_version`` with the first registered parser that
        succeeds for its MIME type.

        This dispatcher used to be named ``process_document_version``, the
        same name as the instance method below, which replaced it in the
        class namespace and made it unreachable. It has its own name so
        both entry points are usable.

        Raises ``NoMIMETypeMatch`` when no parser is registered for the
        MIME type or when every registered parser raised ``ParserError``.
        """
        # NOTE(review): assumes the document version exposes its MIME type
        # as ``mimetype`` -- confirm against the DocumentVersion model.
        try:
            parser_classes = cls._registry[document_version.mimetype]
        except KeyError:
            # Only the registry lookup is guarded, so a KeyError raised
            # from inside a parser is not mistaken for a missing MIME type.
            raise NoMIMETypeMatch

        for parser_class in parser_classes:
            try:
                parser = parser_class()
                parser.process_document_version(document_version)
            except ParserError:
                # If parser raises error, try next parser in the list
                logger.debug(
                    'Parser %s failed for document version: %s',
                    parser_class.__name__, document_version
                )
            else:
                # If parser was successful there is no need to try
                # others in the list for this mimetype
                return

        raise NoMIMETypeMatch('Parser MIME type list exhausted')

    def process_document_version(self, document_version):
        """
        Parse every page of ``document_version`` with this parser.
        """
        logger.info('Starting parsing for document version: %s', document_version)
        logger.debug('document version: %d', document_version.pk)

        for document_page in document_version.pages.all():
            self.process_document_page(document_page=document_page)

    def process_document_page(self, document_page):
        """
        Extract the text of ``document_page`` with ``execute()`` and save
        it on the ``DocumentPageContent`` row of the page, creating the row
        when needed.

        Any exception raised while extracting or saving is logged and
        re-raised as ``ParserError``. The source file is always closed.
        """
        logger.info(
            'Processing page: %d of document version: %s',
            document_page.page_number, document_page.document_version
        )

        file_object = document_page.document_version.get_intermidiate_file()

        try:
            document_page_content, created = DocumentPageContent.objects.get_or_create(
                document_page=document_page
            )
            document_page_content.content = self.execute(
                file_object=file_object, page_number=document_page.page_number
            )
            document_page_content.save()
        except Exception as exception:
            error_message = _('Exception parsing page; %s') % exception
            logger.error(error_message)
            raise ParserError(error_message)
        finally:
            file_object.close()

        logger.info(
            'Finished processing page: %d of document version: %s',
            document_page.page_number, document_page.document_version
        )

    def execute(self, file_object, page_number):
        """
        Return the text of page ``page_number`` of ``file_object``.
        Must be implemented by subclasses.
        """
        raise NotImplementedError(
            'Your %s class has not defined the required execute() method.' %
            self.__class__.__name__
        )


class PopplerParser(Parser):
    """
    PDF parser using the pdftotext execute from the poppler package
    """

    def __init__(self):
        """
        Read the pdftotext path from the settings and raise
        ``ParserError`` if no file exists at that path.
        """
        self.pdftotext_path = setting_pdftotext_path.value
        if not os.path.exists(self.pdftotext_path):
            error_message = _('Cannot find pdftotext executable at: %s') % self.pdftotext_path
            logger.error(error_message)
            raise ParserError(error_message)

        logger.debug('self.pdftotext_path: %s', self.pdftotext_path)

    def execute(self, file_object, page_number):
        """
        Copy ``file_object`` to a temporary file, run pdftotext on the
        single page ``page_number`` and return what it wrote to stdout.

        Returns an empty string when the output is only a form feed.
        Raises ``ParserError`` when pdftotext exits with a non-zero code.
        """
        logger.debug('Parsing PDF page: %d', page_number)

        destination_descriptor, temp_filepath = tempfile.mkstemp(
            dir=setting_temporary_directory.value
        )
        # mkstemp() hands back an open OS level descriptor. Only the path
        # is used below, so close the descriptor right away to avoid
        # leaking one per parsed page.
        os.close(destination_descriptor)

        try:
            copyfile(file_object, temp_filepath)

            command = [
                self.pdftotext_path,
                '-f', str(page_number),
                '-l', str(page_number),
                temp_filepath,
                '-',
            ]

            proc = subprocess.Popen(
                command, close_fds=True, stderr=subprocess.PIPE,
                stdout=subprocess.PIPE
            )
            # communicate() drains both pipes while waiting. wait() alone
            # can block forever once a pipe buffer fills up.
            output, error_output = proc.communicate()
        finally:
            # The temporary copy is not needed once pdftotext has exited.
            try:
                os.remove(temp_filepath)
            except OSError:
                logger.debug(
                    'Unable to remove temporary file: %s', temp_filepath
                )

        if proc.returncode != 0:
            logger.error(error_output)
            raise ParserError(error_output)

        if output == b'\x0c':
            logger.debug('Parser didn\'t return any output')
            return ''

        return output


class PDFMinerParser(Parser):
    """
    Parser for PDF files using the PDFMiner library for Python
    """

    def execute(self, file_object, page_number):
        """
        Return the text PDFMiner extracts from page ``page_number``
        (1-based) of ``file_object``.

        Raises ``ParserError`` when the requested page is not found.
        """
        logger.debug('Parsing PDF page: %d', page_number)

        with BytesIO() as string_buffer:
            rsrcmgr = PDFResourceManager()
            device = TextConverter(rsrcmgr, outfp=string_buffer, laparams=LAParams())
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # PDFMiner page numbers are zero based
                pages = PDFPage.get_pages(
                    file_object, maxpages=1, pagenos=(page_number - 1,)
                )
                # next() with a default works on Python 2 and 3 and avoids
                # a bare StopIteration when the page is missing.
                page = next(pages, None)
                if page is None:
                    raise ParserError(
                        'Page %d not found in PDF' % page_number
                    )
                interpreter.process_page(page)
            finally:
                device.close()

            logger.debug('Finished parsing PDF: %d', page_number)

            # Read the buffer before the ``with`` block closes it
            return string_buffer.getvalue()


Parser.register(
    mimetypes=('application/pdf',), parser_classes=(PopplerParser, PDFMinerParser)
)
class ParserTestCase(TestCase):
    """
    Check that each PDF parser extracts the expected text from the test
    document.
    """

    def setUp(self):
        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE
        )

        # NOTE(review): presumably disabled so the page content checked
        # below is written only by the parser under test -- confirm.
        ocr_settings = self.document_type.ocr_settings
        ocr_settings.auto_ocr = False
        ocr_settings.save()

        # The test document is a PDF: open it in binary mode so its bytes
        # are not decoded or newline translated.
        with open(TEST_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=File(file_object)
            )

    def tearDown(self):
        self.document.delete()
        self.document_type.delete()

    def _assert_parser_extracts_text(self, parser):
        # Shared body of the parser tests: parse the latest version and
        # look for a known phrase in the first page.
        parser.process_document_version(self.document.latest_version)

        self.assertIn(
            'Mayan EDMS Documentation',
            self.document.pages.first().ocr_content.content
        )

    def test_pdfminer_parser(self):
        self._assert_parser_extracts_text(PDFMinerParser())

    def test_poppler_parser(self):
        self._assert_parser_extracts_text(PopplerParser())