Update the PDF text parser classes. Remove SlateParser and replace it with a PDFMiner-based parser.

This commit is contained in:
Roberto Rosario
2015-07-31 02:09:48 -04:00
parent 1361ea9b42
commit 8382df91a6
7 changed files with 254 additions and 36 deletions

View File

@@ -34,6 +34,7 @@ What's new in Mayan EDMS v2.0
* psutil
* python-hkp
* sendfile
* slate
* New document converter
* New class based transformations

View File

@@ -33,7 +33,7 @@ from .links import (
from .models import DocumentVersionOCRError
from .permissions import permission_ocr_document, permission_ocr_content_view
from .settings import (
setting_pdftotext_path, setting_tesseract_path, setting_unpaper_path
setting_pdftotext_path, setting_tesseract_path
)
from .tasks import task_do_ocr
@@ -176,20 +176,3 @@ class OCRApp(MayanAppConfig):
'tesseract', _('tesseract version'), tesseract('-v').stderr,
report=True
)
try:
unpaper = sh.Command(setting_unpaper_path.value)
except sh.CommandNotFound:
namespace.add_property(
'unpaper', _('unpaper version'), _('not found'), report=True
)
except Exception:
namespace.add_property(
'unpaper', _('unpaper version'), _('error getting version'),
report=True
)
else:
namespace.add_property(
'unpaper', _('unpaper version'), unpaper('-V').stdout,
report=True
)

View File

@@ -5,8 +5,6 @@ import logging
from converter import converter_class
from .models import DocumentPageContent
from .parsers import parse_document_page
from .parsers.exceptions import ParserError, ParserUnknownFile
logger = logging.getLogger(__name__)
@@ -18,21 +16,31 @@ class OCRBackendBase(object):
language = document_version.document.language
for page in document_version.pages.all():
image = page.get_image()
for document_page in document_version.pages.all():
self.process_document_page(document_page=document_page, language=language)
def process_document_page(self, document_page, language=None):
logger.info(
'Processing page: %d of document version: %s',
page.page_number, document_version
document_page.page_number, document_page.document_version
)
document_page_content, created = DocumentPageContent.objects.get_or_create(document_page=page)
document_page_content.content = self.execute(
file_object=image, language=language
)
document_page_content.save()
image.close()
image = document_page.get_image()
try:
document_page_content, created = DocumentPageContent.objects.get_or_create(
document_page=document_page
)
document_page_content.content = self.execute(
file_object=image, language=language
)
document_page_content.save()
finally:
image.close()
logger.info(
'Finished processing page: %d of document version: %s',
page.page_number, document_version
document_page.page_number, document_page.document_version
)
def execute(self, file_object, language=None, transformations=None):

View File

@@ -8,8 +8,16 @@ class OCRError(Exception):
pass
class UnpaperError(Exception):
class ParserError(Exception):
"""
Raised by unpaper
Base exception for file parsers
"""
pass
class NoMIMETypeMatch(ParserError):
    """
    Raised when no parser is registered for the specified MIME type
    """

171
mayan/apps/ocr/parsers.py Normal file
View File

@@ -0,0 +1,171 @@
from __future__ import unicode_literals
from io import BytesIO
import logging
import os
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
import subprocess
import tempfile
from django.utils.translation import ugettext_lazy as _
from common.settings import setting_temporary_directory
from common.utils import copyfile
from .exceptions import ParserError, NoMIMETypeMatch
from .models import DocumentPageContent
from .settings import setting_pdftotext_path
logger = logging.getLogger(__name__)
class Parser(object):
    """
    Parser base class

    Holds a class level registry that maps MIME types to an ordered list of
    parser classes. Subclasses must implement execute() to return the text
    of a single page.
    """

    # MIME type -> list of parser classes, in registration order. The
    # dictionary lives on the base class, so it is shared by every subclass.
    _registry = {}

    @classmethod
    def register(cls, mimetypes, parser_classes):
        """
        Append each class in parser_classes to the registry entry of every
        MIME type in mimetypes.
        """
        for mimetype in mimetypes:
            for parser_class in parser_classes:
                cls._registry.setdefault(
                    mimetype, []
                ).append(parser_class)

    @classmethod
    def parse_document_version(cls, document_version):
        """
        Try the parsers registered for the MIME type of document_version,
        in registration order, and stop at the first one that succeeds.

        This dispatcher was originally also named process_document_version,
        which made the instance method of the same name replace it in the
        class body. It has its own name so both are reachable.

        Raises NoMIMETypeMatch when no parser is registered for the MIME
        type or when every registered parser raised ParserError.
        """
        # NOTE(review): assumes the document version exposes its MIME type
        # as the `mimetype` attribute - confirm against the documents app.
        mimetype = document_version.mimetype

        # Only the registry lookup is guarded, so that a KeyError raised
        # inside a parser is not misreported as a missing MIME type.
        try:
            parser_classes = cls._registry[mimetype]
        except KeyError:
            raise NoMIMETypeMatch

        for parser_class in parser_classes:
            try:
                parser = parser_class()
                parser.process_document_version(document_version)
            except ParserError:
                # If parser raises error, try next parser in the list
                pass
            else:
                # If parser was successful there is no need to try
                # others in the list for this mimetype
                return

        raise NoMIMETypeMatch('Parser MIME type list exhausted')

    def process_document_version(self, document_version):
        """
        Parse every page of document_version with this parser instance.
        """
        logger.info('Starting parsing for document version: %s', document_version)
        logger.debug('document version: %d', document_version.pk)

        for document_page in document_version.pages.all():
            self.process_document_page(document_page=document_page)

    def process_document_page(self, document_page):
        """
        Extract the text of document_page with execute() and store it in
        the DocumentPageContent instance of that page, creating it if
        needed.

        Any exception raised while parsing or saving is logged and
        re-raised as ParserError. The intermediate file is always closed.
        """
        logger.info(
            'Processing page: %d of document version: %s',
            document_page.page_number, document_page.document_version
        )

        file_object = document_page.document_version.get_intermidiate_file()

        try:
            document_page_content, created = DocumentPageContent.objects.get_or_create(
                document_page=document_page
            )
            document_page_content.content = self.execute(
                file_object=file_object, page_number=document_page.page_number
            )
            document_page_content.save()
        except Exception as exception:
            error_message = _('Exception parsing page; %s') % exception
            logger.error(error_message)
            raise ParserError(error_message)
        finally:
            file_object.close()

        logger.info(
            'Finished processing page: %d of document version: %s',
            document_page.page_number, document_page.document_version
        )

    def execute(self, file_object, page_number):
        """
        Return the text of page page_number of file_object. Must be
        implemented by subclasses.
        """
        raise NotImplementedError(
            'Your %s class has not defined the required execute() method.' %
            self.__class__.__name__
        )
class PopplerParser(Parser):
    """
    PDF parser using the pdftotext executable from the poppler package
    """

    def __init__(self):
        """
        Read the pdftotext path from the settings and raise ParserError if
        nothing exists at that path.
        """
        self.pdftotext_path = setting_pdftotext_path.value
        if not os.path.exists(self.pdftotext_path):
            error_message = _('Cannot find pdftotext executable at: %s') % self.pdftotext_path
            logger.error(error_message)
            raise ParserError(error_message)

        logger.debug('self.pdftotext_path: %s', self.pdftotext_path)

    def execute(self, file_object, page_number):
        """
        Copy file_object to a temporary file, run pdftotext on the single
        page page_number and return what it wrote to standard output.

        Returns an empty string when the output is only a form feed.
        Raises ParserError when pdftotext exits with a non zero code.
        """
        logger.debug('Parsing PDF page: %d', page_number)

        destination_descriptor, temp_filepath = tempfile.mkstemp(
            dir=setting_temporary_directory.value
        )
        # Only the path is needed; close the descriptor so it is not leaked.
        os.close(destination_descriptor)

        try:
            copyfile(file_object, temp_filepath)

            command = [
                self.pdftotext_path,
                '-f', str(page_number),
                '-l', str(page_number),
                temp_filepath,
                '-',
            ]

            proc = subprocess.Popen(
                command, close_fds=True, stderr=subprocess.PIPE,
                stdout=subprocess.PIPE
            )
            # communicate() drains both pipes while waiting; wait() alone
            # can block forever once a pipe buffer fills up.
            output, error_output = proc.communicate()
        finally:
            # Best effort cleanup of the temporary copy.
            try:
                os.remove(temp_filepath)
            except OSError as exception:
                logger.debug(
                    'Unable to remove temporary file: %s; %s',
                    temp_filepath, exception
                )

        if proc.returncode != 0:
            logger.error(error_output)
            raise ParserError

        if output == b'\x0c':
            logger.debug('Parser didn\'t return any output')
            return ''

        return output
class PDFMinerParser(Parser):
    """
    Parser for PDF files using the PDFMiner library for Python
    """

    def execute(self, file_object, page_number):
        """
        Render page page_number (1 based) of file_object to text with
        PDFMiner and return the contents of the output buffer.
        """
        logger.debug('Parsing PDF page: %d', page_number)

        with BytesIO() as string_buffer:
            rsrcmgr = PDFResourceManager()
            device = TextConverter(
                rsrcmgr, outfp=string_buffer, laparams=LAParams()
            )
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # pagenos is zero based while page_number starts at 1.
                pages = PDFPage.get_pages(
                    file_object, maxpages=1, pagenos=(page_number - 1,)
                )
                # next() works on Python 2 and 3; the .next() method is
                # Python 2 only.
                interpreter.process_page(next(pages))
            finally:
                # Close the converter even if the page fails to process.
                device.close()

            logger.debug('Finished parsing PDF: %d', page_number)

            return string_buffer.getvalue()
# Register the PDF parsers; the registry keeps them in the order given here.
Parser.register(
    mimetypes=('application/pdf',),
    parser_classes=(PopplerParser, PDFMinerParser)
)

View File

@@ -9,10 +9,6 @@ setting_tesseract_path = namespace.add_setting(
global_name='OCR_TESSERACT_PATH', default='/usr/bin/tesseract',
help_text=_('File path to tesseract program.'), is_path=True
)
setting_unpaper_path = namespace.add_setting(
global_name='OCR_UNPAPER_PATH', default='/usr/bin/unpaper',
help_text=_('File path to unpaper program.'), is_path=True
)
setting_pdftotext_path = namespace.add_setting(
global_name='OCR_PDFTOTEXT_PATH', default='/usr/bin/pdftotext',
help_text=_(

View File

@@ -0,0 +1,51 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from django.core.files.base import File
from django.test import TestCase
from documents.models import DocumentType
from documents.settings import setting_language_choices
from documents.test_models import (
TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE, TEST_SMALL_DOCUMENT_PATH
)
from .parsers import PDFMinerParser, PopplerParser
class ParserTestCase(TestCase):
    """
    Check that each PDF parser extracts the expected text from the first
    page of the test document.
    """

    def setUp(self):
        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE
        )

        # Turn off auto_ocr for this document type before uploading the
        # test document.
        ocr_settings = self.document_type.ocr_settings
        ocr_settings.auto_ocr = False
        ocr_settings.save()

        # The test document is a PDF: open it in binary mode so its bytes
        # are read unmodified on every platform and Python version.
        with open(TEST_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=File(file_object)
            )

    def tearDown(self):
        self.document.delete()
        self.document_type.delete()

    def test_pdfminer_parser(self):
        parser = PDFMinerParser()

        parser.process_document_version(self.document.latest_version)

        # assertIn reports both operands on failure, unlike assertTrue.
        self.assertIn(
            'Mayan EDMS Documentation',
            self.document.pages.first().ocr_content.content
        )

    def test_poppler_parser(self):
        parser = PopplerParser()

        parser.process_document_version(self.document.latest_version)

        self.assertIn(
            'Mayan EDMS Documentation',
            self.document.pages.first().ocr_content.content
        )