Remove the dependency on slate for calculating the page count of PDF files.
This commit is contained in:
@@ -10,13 +10,16 @@ try:
|
||||
except ImportError:
|
||||
from StringIO import StringIO
|
||||
|
||||
import slate
|
||||
from PIL import Image
|
||||
from pdfminer.pdfpage import PDFPage
|
||||
import sh
|
||||
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from common.utils import fs_cleanup
|
||||
|
||||
from ..classes import ConverterBase
|
||||
from ..exceptions import PageCountError
|
||||
from ..settings import setting_pdftoppm_path
|
||||
|
||||
try:
|
||||
@@ -78,12 +81,14 @@ class Python(ConverterBase):
|
||||
file_object = self.file_object
|
||||
|
||||
try:
|
||||
pages = slate.PDF(file_object)
|
||||
page_count = len(list(PDFPage.get_pages(file_object)))
|
||||
except Exception as exception:
|
||||
logger.error('Slate exception; %s', exception)
|
||||
raise
|
||||
error_message = _('Exception determining PDF page count; %s') % exception
|
||||
logger.error(error_message)
|
||||
raise PageCountError(error_message)
|
||||
else:
|
||||
return len(pages)
|
||||
logger.debug('Document contains %d pages', page_count)
|
||||
return page_count
|
||||
finally:
|
||||
file_object.seek(0)
|
||||
else:
|
||||
|
||||
@@ -29,3 +29,6 @@ class OfficeConversionError(ConvertError):
|
||||
|
||||
class InvalidOfficeFormat(ConvertError):
    """
    Conversion error related to the format of an office document.

    NOTE(review): the code raising this exception is not visible here;
    presumably it signals an unsupported or unrecognized office file
    format - confirm against the callers.
    """
|
||||
|
||||
class PageCountError(ConvertError):
    """
    Raised when the number of pages of a document cannot be determined
    """
|
||||
|
||||
@@ -1,56 +0,0 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import sh
|
||||
|
||||
from common.settings import setting_temporary_directory
|
||||
|
||||
from .exceptions import UnpaperError
|
||||
from .parsers import parse_document_page
|
||||
from .parsers.exceptions import ParserError, ParserUnknownFile
|
||||
from .settings import UNPAPER_PATH
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Resolve the unpaper executable once, at import time. UNPAPER ends up being
# either a pre-configured sh command (always called with the overwrite and
# no_multi_pages options) or None when the executable cannot be found.
try:
    UNPAPER = sh.Command(UNPAPER_PATH).bake(
        no_multi_pages=True, overwrite=True
    )
except sh.CommandNotFound:
    UNPAPER = None
    logger.debug('unpaper not found')
|
||||
|
||||
"""
|
||||
for document_page in document_version.pages.all():
|
||||
try:
|
||||
# Try to extract text by means of a parser
|
||||
parse_document_page(document_page)
|
||||
except (ParserError, ParserUnknownFile):
|
||||
# Fall back to doing visual OCR
|
||||
"""
|
||||
|
||||
|
||||
def execute_unpaper(input_filepath, output_filepath=None):
    """
    Run the unpaper program on a file

    unpaper is invoked through the module level ``UNPAPER`` command, which
    is built with the ``sh`` library. When no ``output_filepath`` is given,
    a temporary file is created inside the configured temporary directory
    and used as the destination.

    Returns the path of the destination file, or ``input_filepath``
    unchanged when the unpaper executable is not available.

    Raises UnpaperError, carrying the stderr output of unpaper, when the
    program exits with an error code.
    """
    if not UNPAPER:
        # unpaper is not installed; hand the input back untouched
        return input_filepath

    # The descriptor only exists when this function creates the temporary
    # file itself. Keeping it as None otherwise avoids closing an unbound
    # name when the caller supplies its own output path.
    fd = None
    if not output_filepath:
        fd, output_filepath = tempfile.mkstemp(
            dir=setting_temporary_directory.value
        )

    try:
        UNPAPER(input_filepath, output_filepath)
    except sh.ErrorReturnCode as exception:
        logger.error(exception)
        raise UnpaperError(exception.stderr)
    else:
        return output_filepath
    finally:
        if fd is not None:
            os.close(fd)
|
||||
@@ -1,157 +0,0 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import logging
|
||||
import os
|
||||
import slate
|
||||
import subprocess
|
||||
import tempfile
|
||||
|
||||
from common.settings import setting_temporary_directory
|
||||
from common.utils import copyfile
|
||||
|
||||
from ..settings import setting_pdftotext_path
|
||||
|
||||
from .exceptions import ParserError, ParserUnknownFile
|
||||
|
||||
|
||||
mimetype_registry = {}
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def register_parser(mimetypes, parsers):
    """
    Register parser instances for each of the given mimetypes

    Every class in ``parsers`` is instantiated once per mimetype and the
    resulting instance is appended, in order, to the list kept for that
    mimetype in ``mimetype_registry``.
    """
    for mimetype in mimetypes:
        for parser_class in parsers:
            try:
                instance = parser_class()
            except ParserError:
                # A parser that fails to initialize is left out of the
                # list for this mimetype
                continue

            mimetype_registry.setdefault(mimetype, []).append(instance)
|
||||
|
||||
|
||||
def parse_document_page(document_page, descriptor=None, mimetype=None):
    """
    Extract the text of a document page using the registered parsers

    The parsers registered for the mimetype are tried in order until one of
    them succeeds. When ``mimetype`` is not given, the mimetype of the
    page's document is used; ``text/*`` mimetypes that are not office file
    mimetypes are handled as ``text/plain``.

    Raises ParserError when every registered parser fails and
    ParserUnknownFile when a KeyError is raised while looking up or running
    the parsers (normally: no parser is registered for the mimetype).
    """
    document_mimetype = document_page.document.file_mimetype

    logger.debug('executing')
    logger.debug('document_page: %s', document_page)
    logger.debug('document mimetype: %s', document_mimetype)

    if not mimetype:
        mimetype = document_page.document.file_mimetype
        # NOTE(review): CONVERTER_OFFICE_FILE_MIMETYPES is not among the
        # imports visible in this module - confirm where it comes from.
        is_plain_text = (
            mimetype.startswith('text/') and
            mimetype not in CONVERTER_OFFICE_FILE_MIMETYPES
        )
        if is_plain_text:
            mimetype = 'text/plain'
            logger.debug('fallback to mimetype text/plain')
    logger.debug('used mimetype: %s', mimetype)

    try:
        for candidate in mimetype_registry[mimetype]:
            try:
                candidate.parse(document_page, descriptor)
            except ParserError:
                # This parser failed, move on to the next one in the list
                continue

            # The first parser that succeeds ends the search
            return

        raise ParserError('Parser list exhausted')
    except KeyError:
        raise ParserUnknownFile
|
||||
|
||||
|
||||
class Parser(object):
    """
    Parser base class

    Subclasses must override parse().
    """

    def parse(self, document_page, descriptor=None):
        """
        Extract the text of ``document_page``; must be overridden.

        Raises NotImplementedError naming the subclass that failed to
        provide its own implementation.
        """
        # Interpolate the class name into the message. Passing it as a
        # second argument to the exception would leave the '%s' placeholder
        # unformatted.
        raise NotImplementedError(
            'Your %s class has not defined a parse() method, which is '
            'required.' % self.__class__.__name__
        )
|
||||
|
||||
|
||||
class SlateParser(Parser):
    """
    Parser for PDF files using the slate library for Python
    """
    def parse(self, document_page, descriptor=None):
        """
        Store the text slate extracts for ``document_page`` in its content

        ``descriptor`` is the file object to read the PDF from; when it is
        not given, the page's document version is opened instead. The
        descriptor is closed by this method in both cases.

        Raises ParserError when slate returns only a form feed character
        for the page, meaning that no text was extracted.
        """
        logger.debug('Starting SlateParser')

        if not descriptor:
            descriptor = document_page.document_version.open()

        try:
            pdf_pages = slate.PDF(descriptor)
        finally:
            # Release the descriptor even when slate fails to read the file
            descriptor.close()

        # page_number is 1 based while the slate page list is 0 based
        page_text = pdf_pages[document_page.page_number - 1]

        if page_text == b'\x0c':
            logger.debug('The Slate parser didn\'t return any output')
            raise ParserError('No output')

        document_page.content = page_text
        document_page.save()
|
||||
|
||||
|
||||
class PopplerParser(Parser):
    """
    PDF parser using the pdftotext executable from the poppler package
    """
    def __init__(self):
        """
        Locate the pdftotext executable

        The configured path is used when set, otherwise
        '/usr/bin/pdftotext'. Raises ParserError when the executable does
        not exist at that path.
        """
        self.pdftotext_path = (
            setting_pdftotext_path.value or '/usr/bin/pdftotext'
        )
        if not os.path.exists(self.pdftotext_path):
            raise ParserError('cannot find pdftotext executable')
        logger.debug('self.pdftotext_path: %s', self.pdftotext_path)

    def parse(self, document_page, descriptor=None):
        """
        Store the text pdftotext extracts for ``document_page`` in its
        content

        When ``descriptor`` is given, its content is copied to a temporary
        file which is handed to pdftotext and removed afterwards; otherwise
        the document is saved to the temporary directory by
        ``document_save_to_temp_dir``.

        Raises ParserError when pdftotext exits with a non zero code or
        when it returns only a form feed character (no text extracted).
        """
        logger.debug('parsing PDF with PopplerParser')
        pagenum = str(document_page.page_number)

        # Path of the temporary copy made by this method, if any, so that
        # it can be removed once pdftotext is done with it
        temp_filepath = None
        try:
            if descriptor:
                destination_descriptor, temp_filepath = tempfile.mkstemp(
                    dir=setting_temporary_directory.value
                )
                # Only the path is used from here on; close the descriptor
                # returned by mkstemp so that it is not leaked
                os.close(destination_descriptor)
                copyfile(descriptor, temp_filepath)
                document_file = temp_filepath
            else:
                document_file = document_page.document.document_save_to_temp_dir(
                    document_page.document.checksum
                )

            logger.debug('document_file: %s', document_file)
            logger.debug('parsing PDF page %s', pagenum)

            # Extract a single page (-f first page, -l last page) and write
            # the text to stdout ('-')
            command = [
                self.pdftotext_path, '-f', pagenum, '-l', pagenum,
                document_file, '-'
            ]

            proc = subprocess.Popen(
                command, close_fds=True, stderr=subprocess.PIPE,
                stdout=subprocess.PIPE
            )
            # communicate() drains both pipes while waiting; wait() alone
            # can block forever once the child fills a pipe buffer
            output, error_output = proc.communicate()
        finally:
            if temp_filepath is not None:
                try:
                    os.remove(temp_filepath)
                except OSError as exception:
                    # Best effort cleanup, a leftover temporary file must
                    # not mask the result of the parsing
                    logger.debug(
                        'unable to remove temporary file %s; %s',
                        temp_filepath, exception
                    )

        if proc.returncode != 0:
            logger.error(error_output)
            raise ParserError

        if output == b'\x0c':
            logger.debug('Parser didn\'t return any output')
            raise ParserError('No output')

        document_page.content = output
        document_page.save()
|
||||
|
||||
|
||||
register_parser(
|
||||
mimetypes=['application/pdf'], parsers=[PopplerParser, SlateParser]
|
||||
)
|
||||
@@ -1,10 +0,0 @@
|
||||
class ParserError(Exception):
    """
    Raised when a text parser fails to understand a file it has been passed
    or when the resulting parsed text is invalid
    """
|
||||
|
||||
|
||||
class ParserUnknownFile(Exception):
    """
    Raised when no parser is available for the mimetype of a file
    """
|
||||
@@ -23,7 +23,7 @@ djangorestframework==2.4.4
|
||||
|
||||
fusepy==2.0.2
|
||||
|
||||
pdfminer==20110227
|
||||
pdfminer==20140328
|
||||
pycountry==1.10
|
||||
pytesseract==0.1.6
|
||||
python-dateutil==2.4.2
|
||||
@@ -32,4 +32,3 @@ python-magic==0.4.6
|
||||
pytz==2015.4
|
||||
|
||||
sh==1.11
|
||||
slate==0.3
|
||||
|
||||
Reference in New Issue
Block a user