PDF compatibility improvements. Support simple encrypted PDFs (those that can be decrypted with a blank password).

Improve PDF page count detection. Improve PDF orientation
detection. Merge the CONVERTER_LIBREOFFICE_PATH and
CONVERTER_PDFTOPPM_PATH config settings into
CONVERTER_GRAPHICS_BACKEND_CONFIG.
GitLab issues #384, #376 and #377.

Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
Roberto Rosario
2017-07-04 04:02:29 -04:00
parent 5629033578
commit 7343223f59
7 changed files with 113 additions and 38 deletions

View File

@@ -11,24 +11,40 @@ except ImportError:
from PIL import Image
import PyPDF2
from pdfminer.pdfpage import PDFPage
import sh
import yaml
from django.utils.encoding import force_text
from django.utils.translation import ugettext_lazy as _
from common.utils import fs_cleanup, mkstemp
from ..classes import ConverterBase
from ..exceptions import PageCountError
from ..settings import setting_pdftoppm_path
from ..settings import setting_graphics_backend_config
from .literals import DEFAULT_PDFTOPPM_PATH, DEFAULT_PDFINFO_PATH
# Parse the YAML backend configuration once and reuse it for every tool
# lookup below. ``safe_load`` is used instead of ``load``: the value is
# only expected to be a plain mapping of option names to paths, so there
# is no need to allow arbitrary Python object construction. An empty
# setting parses to ``None``; fall back to an empty mapping so that the
# defaults apply instead of failing at import time.
_graphics_backend_config = yaml.safe_load(
    setting_graphics_backend_config.value
) or {}

# ``pdftoppm`` is left as ``None`` when the executable cannot be found;
# otherwise it is pre-configured to always emit JPEG output.
try:
    pdftoppm = sh.Command(
        _graphics_backend_config.get('pdftoppm_path', DEFAULT_PDFTOPPM_PATH)
    )
except sh.CommandNotFound:
    pdftoppm = None
else:
    pdftoppm = pdftoppm.bake('-jpeg')

# ``pdfinfo`` is used as a fallback to count the pages of PDF files whose
# encryption PyPDF2 cannot handle. ``None`` when the executable is missing.
try:
    pdfinfo = sh.Command(
        _graphics_backend_config.get('pdfinfo_path', DEFAULT_PDFINFO_PATH)
    )
except sh.CommandNotFound:
    pdfinfo = None
Image.init()
logger = logging.getLogger(__name__)
@@ -69,17 +85,34 @@ class Python(ConverterBase):
fs_cleanup(input_filepath)
def detect_orientation(self, page_number):
    """
    Return the rotation, in degrees, recorded for a page of the document.

    Only PDF files (``self.mime_type == 'application/pdf'``) are
    inspected: the ``/Rotate`` entry of the page is read using PyPDF2.
    When the PDF is encrypted, decryption with a blank password is
    attempted and the entry is read again. Failures are logged and the
    default rotation of 0 degrees is returned instead of raising.

    ``page_number`` is 1-based; it is converted to the 0-based index
    that PyPDF2 expects. For PDF files ``self.file_object`` is rewound
    to its start before returning.
    """
    # Default rotation: 0 degrees
    result = 0

    # Only PDF files carry rotation information that can be read here.
    if self.mime_type != 'application/pdf':
        return result

    pdf = PyPDF2.PdfFileReader(self.file_object)
    try:
        # A page without a /Rotate entry is not rotated.
        result = pdf.getPage(page_number - 1).get('/Rotate', 0)
    except Exception as exception:
        if force_text(exception) == 'File has not been decrypted':
            # File is encrypted, try to decrypt using a blank
            # password and then read the rotation again; without the
            # second read the decryption would have no effect on the
            # returned value.
            try:
                self.file_object.seek(0)
                pdf = PyPDF2.PdfFileReader(self.file_object)
                pdf.decrypt(password=b'')
                result = pdf.getPage(page_number - 1).get('/Rotate', 0)
            except Exception as decrypt_exception:
                logger.error(
                    'Unable to detect PDF orientation; %s',
                    decrypt_exception
                )
        else:
            logger.error(
                'Unable to detect PDF orientation; %s', exception
            )
    finally:
        # Leave the stream ready for the next consumer.
        self.file_object.seek(0)

    return result
def get_page_count(self):
super(Python, self).get_page_count()
@@ -87,20 +120,57 @@ class Python(ConverterBase):
page_count = 1
if self.mime_type == 'application/pdf' or self.soffice_file:
# If file is a PDF open it with slate to determine the page count
if self.soffice_file:
file_object = IteratorIO(self.soffice_file).file_buffer
else:
file_object = self.file_object
try:
page_count = len(list(PDFPage.get_pages(file_object)))
# Try PyPDF to determine the page number
pdf_reader = PyPDF2.PdfFileReader(
stream=file_object, strict=False
)
page_count = pdf_reader.getNumPages()
except Exception as exception:
error_message = _(
'Exception determining PDF page count; %s'
) % exception
logger.error(error_message)
raise PageCountError(error_message)
if force_text(exception) == 'File has not been decrypted':
# File is encrypted, try to decrypt using a blank
# password.
file_object.seek(0)
pdf_reader = PyPDF2.PdfFileReader(
stream=file_object, strict=False
)
try:
pdf_reader.decrypt(password=b'')
page_count = pdf_reader.getNumPages()
except Exception as exception:
file_object.seek(0)
if force_text(exception) == 'only algorithm code 1 and 2 are supported':
# PDF uses an unsupported encryption
# Try poppler-util's pdfinfo
process = pdfinfo('-', _in=file_object)
page_count = int(
filter(
lambda line: line.startswith('Pages:'),
process.stdout.split(b'\n')
)[0].replace('Pages:', '')
)
file_object.seek(0)
logger.debug(
'Document contains %d pages', page_count
)
return page_count
else:
error_message = _(
'Exception determining PDF page count; %s'
) % exception
logger.error(error_message)
raise PageCountError(error_message)
else:
error_message = _(
'Exception determining PDF page count; %s'
) % exception
logger.error(error_message)
raise PageCountError(error_message)
else:
logger.debug('Document contains %d pages', page_count)
return page_count
@@ -111,7 +181,7 @@ class Python(ConverterBase):
image = Image.open(self.file_object)
except IOError as exception:
error_message = _(
'Exception determining PDF page count; %s'
'Exception determining page count using Pillow; %s'
) % exception
logger.error(error_message)
raise PageCountError(error_message)