Remove dependency on slate to calculate the page count of PDF files.

This commit is contained in:
Roberto Rosario
2015-07-31 02:09:10 -04:00
parent dcd909f488
commit 1361ea9b42
6 changed files with 14 additions and 230 deletions

View File

@@ -10,13 +10,16 @@ try:
except ImportError: except ImportError:
from StringIO import StringIO from StringIO import StringIO
import slate
from PIL import Image from PIL import Image
from pdfminer.pdfpage import PDFPage
import sh import sh
from django.utils.translation import ugettext_lazy as _
from common.utils import fs_cleanup from common.utils import fs_cleanup
from ..classes import ConverterBase from ..classes import ConverterBase
from ..exceptions import PageCountError
from ..settings import setting_pdftoppm_path from ..settings import setting_pdftoppm_path
try: try:
@@ -78,12 +81,14 @@ class Python(ConverterBase):
file_object = self.file_object file_object = self.file_object
try: try:
pages = slate.PDF(file_object) page_count = len(list(PDFPage.get_pages(file_object)))
except Exception as exception: except Exception as exception:
logger.error('Slate exception; %s', exception) error_message = _('Exception determining PDF page count; %s') % exception
raise logger.error(error_message)
raise PageCountError(error_message)
else: else:
return len(pages) logger.debug('Document contains %d pages', page_count)
return page_count
finally: finally:
file_object.seek(0) file_object.seek(0)
else: else:

View File

@@ -29,3 +29,6 @@ class OfficeConversionError(ConvertError):
class InvalidOfficeFormat(ConvertError): class InvalidOfficeFormat(ConvertError):
pass pass
class PageCountError(ConvertError):
    """
    Raised when the page count of a file cannot be determined
    """

View File

@@ -1,56 +0,0 @@
from __future__ import unicode_literals
import logging
import os
import tempfile
import sh
from common.settings import setting_temporary_directory
from .exceptions import UnpaperError
from .parsers import parse_document_page
from .parsers.exceptions import ParserError, ParserUnknownFile
from .settings import UNPAPER_PATH
logger = logging.getLogger(__name__)
# Build the unpaper command object once, at import time, pre-configured with
# the overwrite and no_multi_pages options. UNPAPER is None when the
# executable cannot be found.
try:
    UNPAPER = sh.Command(UNPAPER_PATH).bake(
        overwrite=True, no_multi_pages=True
    )
except sh.CommandNotFound:
    UNPAPER = None
    logger.debug('unpaper not found')
"""
for document_page in document_version.pages.all():
try:
# Try to extract text by means of a parser
parse_document_page(document_page)
except (ParserError, ParserUnknownFile):
# Fall back to doing visual OCR
"""
def execute_unpaper(input_filepath, output_filepath=None):
    """
    Run the unpaper command on input_filepath and return the result path.

    When output_filepath is not given, a temporary file is created inside
    setting_temporary_directory and used as the destination. When the
    unpaper command is not available (UNPAPER is falsy) nothing is executed
    and input_filepath is returned unchanged.

    Raises UnpaperError, carrying the command's stderr, when unpaper exits
    with an error return code.
    """
    if not UNPAPER:
        return input_filepath

    if not output_filepath:
        fd, output_filepath = tempfile.mkstemp(
            dir=setting_temporary_directory.value
        )
        # Only the path is passed to unpaper, so release the descriptor
        # right here, in the only branch that creates one. Closing it
        # unconditionally after the command would fail with an unbound
        # local whenever the caller supplies output_filepath.
        os.close(fd)

    try:
        UNPAPER(input_filepath, output_filepath)
    except sh.ErrorReturnCode as exception:
        logger.error(exception)
        raise UnpaperError(exception.stderr)

    return output_filepath

View File

@@ -1,157 +0,0 @@
from __future__ import unicode_literals
import logging
import os
import slate
import subprocess
import tempfile
from common.settings import setting_temporary_directory
from common.utils import copyfile
from ..settings import setting_pdftotext_path
from .exceptions import ParserError, ParserUnknownFile
mimetype_registry = {}
logger = logging.getLogger(__name__)
def register_parser(mimetypes, parsers):
    """
    Instantiate every parser class and register the instance for each of
    the given mimetypes.

    A parser class whose constructor raises ParserError is left out of the
    registry for that mimetype.
    """
    for mimetype in mimetypes:
        for parser_class in parsers:
            try:
                instance = parser_class()
            except ParserError:
                # The parser could not be initialized; do not register it
                continue

            mimetype_registry.setdefault(mimetype, []).append(instance)
def parse_document_page(document_page, descriptor=None, mimetype=None):
    """
    Extract the text of document_page using the registered parsers.

    The parsers registered for the mimetype are tried in order and the
    function returns as soon as one of them succeeds. When no mimetype is
    passed, the document's file_mimetype is used and any text/* type not
    listed in CONVERTER_OFFICE_FILE_MIMETYPES is handled as text/plain.

    Raises ParserUnknownFile when no parser is registered for the mimetype
    and ParserError when every registered parser fails.
    """
    logger.debug('executing')
    logger.debug('document_page: %s', document_page)
    logger.debug('document mimetype: %s', document_page.document.file_mimetype)

    if not mimetype:
        mimetype = document_page.document.file_mimetype
        # NOTE(review): the text/* fallback is assumed to apply only to the
        # mimetype taken from the document -- confirm the intended nesting.
        # NOTE(review): CONVERTER_OFFICE_FILE_MIMETYPES is not among the
        # imports visible for this module -- confirm it is in scope.
        if mimetype.startswith('text/'):
            if mimetype not in CONVERTER_OFFICE_FILE_MIMETYPES:
                mimetype = 'text/plain'
                logger.debug('fallback to mimetype text/plain')
    logger.debug('used mimetype: %s', mimetype)

    # Guard only the registry lookup: a KeyError raised from inside a
    # parser must not be reported as an unknown file type.
    try:
        candidate_parsers = mimetype_registry[mimetype]
    except KeyError:
        raise ParserUnknownFile

    for parser in candidate_parsers:
        try:
            parser.parse(document_page, descriptor)
        except ParserError:
            # If parser raises error, try next parser in the list
            continue
        else:
            # If parser was successful there is no need to try
            # others in the list for this mimetype
            return

    raise ParserError('Parser list exhausted')
class Parser(object):
    """
    Parser base class

    Subclasses must override parse().
    """

    def parse(self, document_page, descriptor=None):
        """
        Parse document_page; every subclass has to provide its own version.

        Raises NotImplementedError naming the class that lacks the method.
        """
        # Interpolate the class name into the message. Passing it as a
        # second exception argument leaves the %s placeholder unformatted.
        raise NotImplementedError(
            'Your %s class has not defined a parse() method, which is required.' %
            self.__class__.__name__
        )
class SlateParser(Parser):
    """
    Parser for PDF files using the slate library for Python
    """

    def parse(self, document_page, descriptor=None):
        """
        Store the text slate extracts for document_page in its content.

        When no descriptor is passed, the page's document version is
        opened. The descriptor is closed once slate has read it, whether
        or not slate succeeds.

        Raises ParserError when the text of the page is just a form feed
        character.
        """
        logger.debug('Starting SlateParser')

        if not descriptor:
            descriptor = document_page.document_version.open()

        # Close the descriptor even if slate fails, so it is never leaked
        try:
            pdf_pages = slate.PDF(descriptor)
        finally:
            descriptor.close()

        # page_number is 1-based while the slate result is indexed from 0
        page_text = pdf_pages[document_page.page_number - 1]

        if page_text == b'\x0c':
            logger.debug('The Slate parser didn\'t return any output')
            raise ParserError('No output')

        document_page.content = page_text
        document_page.save()
class PopplerParser(Parser):
    """
    PDF parser using the pdftotext executable from the poppler package
    """

    def __init__(self):
        """
        Resolve the pdftotext path from setting_pdftotext_path, falling
        back to /usr/bin/pdftotext when the setting is empty.

        Raises ParserError when the resolved path does not exist.
        """
        self.pdftotext_path = setting_pdftotext_path.value or '/usr/bin/pdftotext'
        if not os.path.exists(self.pdftotext_path):
            raise ParserError('cannot find pdftotext executable')

        logger.debug('self.pdftotext_path: %s', self.pdftotext_path)

    def parse(self, document_page, descriptor=None):
        """
        Run pdftotext for the page of document_page and store its output
        in the page's content.

        When a descriptor is passed, its content is copied to a temporary
        file which is handed to pdftotext and removed afterwards; otherwise
        the document is saved to the temporary directory first.

        Raises ParserError when pdftotext exits with a non zero code or
        when its output is just a form feed character.
        """
        logger.debug('parsing PDF with PopplerParser')
        pagenum = str(document_page.page_number)

        temp_filepath = None
        try:
            if descriptor:
                temp_descriptor, temp_filepath = tempfile.mkstemp(
                    dir=setting_temporary_directory.value
                )
                # copyfile() is given the path, so the descriptor returned
                # by mkstemp is not needed; close it so it is not leaked
                os.close(temp_descriptor)
                copyfile(descriptor, temp_filepath)
                document_file = temp_filepath
            else:
                document_file = document_page.document.document_save_to_temp_dir(
                    document_page.document.checksum
                )

            logger.debug('document_file: %s', document_file)
            logger.debug('parsing PDF page %s', pagenum)

            # The page number is passed as both the -f and -l options and
            # '-' is given as the output argument
            command = [
                self.pdftotext_path, '-f', pagenum, '-l', pagenum,
                document_file, '-'
            ]

            proc = subprocess.Popen(
                command, close_fds=True, stderr=subprocess.PIPE,
                stdout=subprocess.PIPE
            )
            # communicate() drains both pipes while waiting; wait() alone
            # can deadlock once the child fills a pipe buffer
            output, error_output = proc.communicate()
        finally:
            # Remove only the temporary copy created by this method
            if temp_filepath:
                try:
                    os.remove(temp_filepath)
                except OSError as exception:
                    logger.debug(
                        'unable to remove temporary file; %s', exception
                    )

        if proc.returncode != 0:
            logger.error(error_output)
            raise ParserError

        if output == b'\x0c':
            logger.debug('Parser didn\'t return any output')
            raise ParserError('No output')

        document_page.content = output
        document_page.save()
register_parser(
mimetypes=['application/pdf'], parsers=[PopplerParser, SlateParser]
)

View File

@@ -1,10 +0,0 @@
class ParserError(Exception):
    """
    Raised when a text parser fails to understand a file it has been passed
    or when the resulting parsed text is invalid
    """
class ParserUnknownFile(Exception):
    """
    Raised when no parser is registered for the mimetype of a file
    """

View File

@@ -23,7 +23,7 @@ djangorestframework==2.4.4
fusepy==2.0.2 fusepy==2.0.2
pdfminer==20110227 pdfminer==20140328
pycountry==1.10 pycountry==1.10
pytesseract==0.1.6 pytesseract==0.1.6
python-dateutil==2.4.2 python-dateutil==2.4.2
@@ -32,4 +32,3 @@ python-magic==0.4.6
pytz==2015.4 pytz==2015.4
sh==1.11 sh==1.11
slate==0.3