Update the PDF text parser classes. Remove SlateParser and replace it with a PDFMiner-based parser.
This commit is contained in:
@@ -34,6 +34,7 @@ What's new in Mayan EDMS v2.0
|
||||
* psutil
|
||||
* python-hkp
|
||||
* sendfile
|
||||
* slate
|
||||
|
||||
* New document converter
|
||||
* New class based transformations
|
||||
|
||||
@@ -33,7 +33,7 @@ from .links import (
|
||||
from .models import DocumentVersionOCRError
|
||||
from .permissions import permission_ocr_document, permission_ocr_content_view
|
||||
from .settings import (
|
||||
setting_pdftotext_path, setting_tesseract_path, setting_unpaper_path
|
||||
setting_pdftotext_path, setting_tesseract_path
|
||||
)
|
||||
from .tasks import task_do_ocr
|
||||
|
||||
@@ -176,20 +176,3 @@ class OCRApp(MayanAppConfig):
|
||||
'tesseract', _('tesseract version'), tesseract('-v').stderr,
|
||||
report=True
|
||||
)
|
||||
|
||||
try:
|
||||
unpaper = sh.Command(setting_unpaper_path.value)
|
||||
except sh.CommandNotFound:
|
||||
namespace.add_property(
|
||||
'unpaper', _('unpaper version'), _('not found'), report=True
|
||||
)
|
||||
except Exception:
|
||||
namespace.add_property(
|
||||
'unpaper', _('unpaper version'), _('error getting version'),
|
||||
report=True
|
||||
)
|
||||
else:
|
||||
namespace.add_property(
|
||||
'unpaper', _('unpaper version'), unpaper('-V').stdout,
|
||||
report=True
|
||||
)
|
||||
|
||||
@@ -5,8 +5,6 @@ import logging
|
||||
from converter import converter_class
|
||||
|
||||
from .models import DocumentPageContent
|
||||
from .parsers import parse_document_page
|
||||
from .parsers.exceptions import ParserError, ParserUnknownFile
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -18,21 +16,31 @@ class OCRBackendBase(object):
|
||||
|
||||
language = document_version.document.language
|
||||
|
||||
for page in document_version.pages.all():
|
||||
image = page.get_image()
|
||||
for document_page in document_version.pages.all():
|
||||
self.process_document_page(document_page=document_page, language=language)
|
||||
|
||||
def process_document_page(self, document_page, language=None):
|
||||
logger.info(
|
||||
'Processing page: %d of document version: %s',
|
||||
page.page_number, document_version
|
||||
document_page.page_number, document_page.document_version
|
||||
)
|
||||
document_page_content, created = DocumentPageContent.objects.get_or_create(document_page=page)
|
||||
document_page_content.content = self.execute(
|
||||
file_object=image, language=language
|
||||
)
|
||||
document_page_content.save()
|
||||
image.close()
|
||||
|
||||
image = document_page.get_image()
|
||||
|
||||
try:
|
||||
document_page_content, created = DocumentPageContent.objects.get_or_create(
|
||||
document_page=document_page
|
||||
)
|
||||
document_page_content.content = self.execute(
|
||||
file_object=image, language=language
|
||||
)
|
||||
document_page_content.save()
|
||||
finally:
|
||||
image.close()
|
||||
|
||||
logger.info(
|
||||
'Finished processing page: %d of document version: %s',
|
||||
page.page_number, document_version
|
||||
document_page.page_number, document_page.document_version
|
||||
)
|
||||
|
||||
def execute(self, file_object, language=None, transformations=None):
|
||||
|
||||
@@ -8,8 +8,16 @@ class OCRError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class UnpaperError(Exception):
|
||||
class ParserError(Exception):
|
||||
"""
|
||||
Raised by unpaper
|
||||
Base exception for file parsers
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class NoMIMETypeMatch(ParserError):
|
||||
"""
|
||||
There is no parser registered for the specified MIME type
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
171
mayan/apps/ocr/parsers.py
Normal file
171
mayan/apps/ocr/parsers.py
Normal file
@@ -0,0 +1,171 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from io import BytesIO
|
||||
import logging
|
||||
import os
|
||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||
from pdfminer.pdfpage import PDFPage
|
||||
from pdfminer.converter import TextConverter
|
||||
from pdfminer.layout import LAParams
|
||||
import subprocess
|
||||
import tempfile
|
||||
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from common.settings import setting_temporary_directory
|
||||
from common.utils import copyfile
|
||||
|
||||
from .exceptions import ParserError, NoMIMETypeMatch
|
||||
from .models import DocumentPageContent
|
||||
from .settings import setting_pdftotext_path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Parser(object):
    """
    Parser base class

    Keeps a registry that maps MIME types to the parser classes able to
    handle them. Subclasses implement execute() to extract the text of a
    single page; the remaining methods take care of iterating over pages
    and of storing the extracted text.
    """

    # MIME type -> list of parser classes, kept in registration order
    _registry = {}

    @classmethod
    def register(cls, mimetypes, parser_classes):
        """
        Register every class in parser_classes for every MIME type in
        mimetypes. Registration order is preserved.
        """
        for mimetype in mimetypes:
            cls._registry.setdefault(mimetype, []).extend(parser_classes)

    @classmethod
    def parse_document_version(cls, document_version):
        """
        Parse document_version with the first registered parser that
        succeeds for its MIME type.

        This dispatcher needs its own name: when it shared the name
        process_document_version with the instance method below, the
        later definition replaced it in the class namespace and it could
        never be called.

        Raises NoMIMETypeMatch when no parser is registered for the MIME
        type or when every registered parser raised ParserError.
        """
        # NOTE(review): assumes the document version exposes its MIME type
        # as `.mimetype` - confirm against the documents app model.
        try:
            parser_classes = cls._registry[document_version.mimetype]
        except KeyError:
            raise NoMIMETypeMatch

        for parser_class in parser_classes:
            try:
                # Instantiation is inside the try block because a parser
                # may raise ParserError from its constructor
                parser = parser_class()
                parser.process_document_version(document_version)
            except ParserError:
                # If parser raises error, try next parser in the list
                continue
            else:
                # If parser was successful there is no need to try
                # others in the list for this mimetype
                return

        raise NoMIMETypeMatch('Parser MIME type list exhausted')

    def process_document_version(self, document_version):
        """
        Parse every page of document_version with this parser.
        """
        logger.info('Starting parsing for document version: %s', document_version)
        logger.debug('document version: %d', document_version.pk)

        for document_page in document_version.pages.all():
            self.process_document_page(document_page=document_page)

    def process_document_page(self, document_page):
        """
        Extract the text of document_page using execute() and store it in
        the page's DocumentPageContent instance.

        Any exception raised while extracting or saving is logged and
        raised again as ParserError.
        """
        logger.info(
            'Processing page: %d of document version: %s',
            document_page.page_number, document_page.document_version
        )

        file_object = document_page.document_version.get_intermidiate_file()

        try:
            document_page_content, created = DocumentPageContent.objects.get_or_create(
                document_page=document_page
            )
            document_page_content.content = self.execute(
                file_object=file_object, page_number=document_page.page_number
            )
            document_page_content.save()
        except Exception as exception:
            error_message = _('Exception parsing page; %s') % exception
            logger.error(error_message)
            raise ParserError(error_message)
        finally:
            # Always release the intermediate file, even on failure
            file_object.close()

        logger.info(
            'Finished processing page: %d of document version: %s',
            document_page.page_number, document_page.document_version
        )

    def execute(self, file_object, page_number):
        """
        Return the text of page page_number of file_object. Must be
        implemented by subclasses.
        """
        raise NotImplementedError(
            'Your %s class has not defined the required execute() method.' %
            self.__class__.__name__
        )
|
||||
|
||||
|
||||
class PopplerParser(Parser):
    """
    PDF parser using the pdftotext executable from the poppler package
    """

    def __init__(self):
        """
        Read the configured pdftotext path and raise ParserError when no
        file exists at that location.
        """
        self.pdftotext_path = setting_pdftotext_path.value
        if not os.path.exists(self.pdftotext_path):
            error_message = _('Cannot find pdftotext executable at: %s') % self.pdftotext_path
            logger.error(error_message)
            raise ParserError(error_message)

        logger.debug('self.pdftotext_path: %s', self.pdftotext_path)

    def execute(self, file_object, page_number):
        """
        Return the text pdftotext extracts from page page_number of
        file_object, or an empty string when the page has no text.

        Raises ParserError when pdftotext exits with a non-zero status.
        """
        logger.debug('Parsing PDF page: %d', page_number)

        destination_descriptor, temp_filepath = tempfile.mkstemp(
            dir=setting_temporary_directory.value
        )
        # mkstemp() hands back an open OS-level descriptor; the file is
        # only accessed by path from here on, so release it immediately
        os.close(destination_descriptor)

        try:
            copyfile(file_object, temp_filepath)

            # -f / -l select the first and last page to convert; the
            # trailing '-' sends the text to standard output
            command = [
                self.pdftotext_path, '-f', str(page_number), '-l',
                str(page_number), temp_filepath, '-'
            ]

            proc = subprocess.Popen(
                command, close_fds=True, stderr=subprocess.PIPE,
                stdout=subprocess.PIPE
            )
            # communicate() drains both pipes while waiting; wait() alone
            # can block forever once a pipe buffer fills up
            output, error_output = proc.communicate()
        finally:
            # The temporary copy is not needed once pdftotext has finished
            os.remove(temp_filepath)

        if proc.returncode != 0:
            logger.error(error_output)
            raise ParserError

        # A lone form feed character means the page produced no text
        if output == b'\x0c':
            logger.debug('Parser didn\'t return any output')
            return ''

        return output
|
||||
|
||||
|
||||
class PDFMinerParser(Parser):
    """
    Parser for PDF files using the PDFMiner library for Python
    """

    def execute(self, file_object, page_number):
        """
        Return the text PDFMiner extracts from page page_number of
        file_object. page_number is 1-based; PDFMiner page indexes are
        0-based, hence the subtraction below.
        """
        logger.debug('Parsing PDF page: %d', page_number)

        with BytesIO() as string_buffer:
            rsrcmgr = PDFResourceManager()
            device = TextConverter(rsrcmgr, outfp=string_buffer, laparams=LAParams())
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                pages = PDFPage.get_pages(
                    file_object, maxpages=1, pagenos=(page_number - 1,)
                )
                # The next() builtin works on Python 2 and 3, unlike the
                # Python 2 only .next() iterator method
                interpreter.process_page(next(pages))
            finally:
                # Close the converter even when page processing fails
                device.close()

            logger.debug('Finished parsing PDF: %d', page_number)

            # Read the value before the buffer is closed by the with block
            return string_buffer.getvalue()
|
||||
|
||||
# Register both PDF parsers; they are stored in the order given here
Parser.register(
    mimetypes=('application/pdf',),
    parser_classes=(PopplerParser, PDFMinerParser)
)
|
||||
@@ -9,10 +9,6 @@ setting_tesseract_path = namespace.add_setting(
|
||||
global_name='OCR_TESSERACT_PATH', default='/usr/bin/tesseract',
|
||||
help_text=_('File path to tesseract program.'), is_path=True
|
||||
)
|
||||
setting_unpaper_path = namespace.add_setting(
|
||||
global_name='OCR_UNPAPER_PATH', default='/usr/bin/unpaper',
|
||||
help_text=_('File path to unpaper program.'), is_path=True
|
||||
)
|
||||
setting_pdftotext_path = namespace.add_setting(
|
||||
global_name='OCR_PDFTOTEXT_PATH', default='/usr/bin/pdftotext',
|
||||
help_text=_(
|
||||
|
||||
51
mayan/apps/ocr/test_parsers.py
Normal file
51
mayan/apps/ocr/test_parsers.py
Normal file
@@ -0,0 +1,51 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.core.files.base import File
|
||||
from django.test import TestCase
|
||||
|
||||
from documents.models import DocumentType
|
||||
from documents.settings import setting_language_choices
|
||||
from documents.test_models import (
|
||||
TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE, TEST_SMALL_DOCUMENT_PATH
|
||||
)
|
||||
|
||||
from .parsers import PDFMinerParser, PopplerParser
|
||||
|
||||
|
||||
class ParserTestCase(TestCase):
    """
    Run each PDF parser against the test document and check that the
    expected text ends up in the first page's content.
    """

    def setUp(self):
        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE
        )

        # Turn off automatic OCR for this document type; presumably so
        # that only the parser under test writes page content
        ocr_settings = self.document_type.ocr_settings
        ocr_settings.auto_ocr = False
        ocr_settings.save()

        # The test document is a binary file: open it in binary mode so
        # its bytes are read unmodified on every platform
        with open(TEST_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=File(file_object)
            )

    def tearDown(self):
        self.document.delete()
        self.document_type.delete()

    def test_pdfminer_parser(self):
        parser = PDFMinerParser()

        parser.process_document_version(self.document.latest_version)

        # assertIn reports both operands on failure, unlike assertTrue
        self.assertIn(
            'Mayan EDMS Documentation',
            self.document.pages.first().ocr_content.content
        )

    def test_poppler_parser(self):
        parser = PopplerParser()

        parser.process_document_version(self.document.latest_version)

        self.assertIn(
            'Mayan EDMS Documentation',
            self.document.pages.first().ocr_content.content
        )
|
||||
Reference in New Issue
Block a user