Files
mayan-edms/mayan/apps/converter/backends/python.py
Roberto Rosario 9564db398f Backport configuration file improvements
Remove support for quoted entries. Support unquoted entries. Support
custom location for the config files.

Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
2019-07-09 15:40:20 -04:00

198 lines
6.8 KiB
Python

from __future__ import unicode_literals
import io
import logging
import shutil
from PIL import Image
import PyPDF2
import sh
from django.utils.encoding import force_text
from django.utils.translation import ugettext_lazy as _
from mayan.apps.storage.utils import NamedTemporaryFile
from ..classes import ConverterBase
from ..exceptions import PageCountError
from ..settings import setting_graphics_backend_arguments
from ..literals import (
DEFAULT_PDFTOPPM_DPI, DEFAULT_PDFTOPPM_FORMAT, DEFAULT_PDFTOPPM_PATH,
DEFAULT_PDFINFO_PATH
)
def _backend_argument(name, default):
    """Return one entry of the graphics backend arguments setting."""
    return setting_graphics_backend_arguments.value.get(name, default)


def _load_command(path):
    """Return an ``sh.Command`` for ``path``, or None if it is not found."""
    try:
        return sh.Command(path)
    except sh.CommandNotFound:
        return None


# pdftoppm: when the executable is found, pre-bake the output format flag
# and the resolution so call sites only supply the per-call arguments.
pdftoppm_path = _backend_argument('pdftoppm_path', DEFAULT_PDFTOPPM_PATH)
pdftoppm = _load_command(pdftoppm_path)

if pdftoppm is not None:
    pdftoppm_format = '-{}'.format(
        _backend_argument('pdftoppm_format', DEFAULT_PDFTOPPM_FORMAT)
    )
    pdftoppm_dpi = format(
        _backend_argument('pdftoppm_dpi', DEFAULT_PDFTOPPM_DPI)
    )
    pdftoppm = pdftoppm.bake(pdftoppm_format, '-r', pdftoppm_dpi)

# pdfinfo: left as None when the executable cannot be found.
pdfinfo_path = _backend_argument('pdfinfo_path', DEFAULT_PDFINFO_PATH)
pdfinfo = _load_command(pdfinfo_path)

logger = logging.getLogger(__name__)
class Python(ConverterBase):
    """
    Converter backend that uses Pillow and PyPDF2, plus the ``pdftoppm``
    and ``pdfinfo`` commands when they were found at import time.
    """

    def convert(self, *args, **kwargs):
        """
        Render the current page of a PDF to a Pillow image using pdftoppm.

        The parent ``convert`` is always called first. Rendering only
        happens when the MIME type is ``application/pdf`` and the pdftoppm
        command is available; in every other case the method falls through
        and returns None.

        Returns:
            A Pillow image opened from the pdftoppm output, or None.
        """
        super(Python, self).convert(*args, **kwargs)

        if self.mime_type == 'application/pdf' and pdftoppm:
            new_file_object = NamedTemporaryFile()
            # Everything after the temporary file is created runs inside
            # the try block so the file is closed even if the copy fails.
            try:
                self.file_object.seek(0)
                shutil.copyfileobj(
                    fsrc=self.file_object, fdst=new_file_object
                )
                self.file_object.seek(0)
                # Presumably also flushes the buffered copy so pdftoppm
                # sees the full content on disk -- TODO confirm.
                new_file_object.seek(0)

                image_buffer = io.BytesIO()
                # First and last page are the same value, so a single page
                # is rendered. The + 1 suggests self.page_number is
                # zero-based -- verify against the base class.
                pdftoppm(
                    new_file_object.name, f=self.page_number + 1,
                    l=self.page_number + 1, _out=image_buffer
                )
                image_buffer.seek(0)
                return Image.open(image_buffer)
            finally:
                new_file_object.close()

    @staticmethod
    def _get_page_rotation(pdf, page_number):
        """Return the '/Rotate' entry of a page, defaulting to 0."""
        rotation = pdf.getPage(page_number - 1).get('/Rotate', 0)
        if isinstance(rotation, PyPDF2.generic.IndirectObject):
            rotation = rotation.getObject()
        return rotation

    def detect_orientation(self, page_number):
        """
        Return the rotation, in degrees, declared by a PDF page.

        Non PDF files always yield 0. Failures are logged and also yield
        0. When the PDF reports that it has not been decrypted, a blank
        password is tried and the rotation is read again.

        Args:
            page_number: Page to inspect; 1 is subtracted before it is
                passed to PyPDF2.

        Returns:
            The value of the page '/Rotate' entry, or 0.
        """
        # Default rotation: 0 degrees
        result = 0

        if self.mime_type != 'application/pdf':
            return result

        pdf = PyPDF2.PdfFileReader(self.file_object)
        try:
            # Assigned only on success so a partial failure cannot leave
            # an unresolved indirect object as the result.
            result = self._get_page_rotation(
                pdf=pdf, page_number=page_number
            )
        except Exception as exception:
            if force_text(exception) == 'File has not been decrypted':
                # File is encrypted, try to decrypt using a blank
                # password and read the rotation again.
                self.file_object.seek(0)
                pdf = PyPDF2.PdfFileReader(self.file_object)
                try:
                    pdf.decrypt(password=b'')
                    result = self._get_page_rotation(
                        pdf=pdf, page_number=page_number
                    )
                except Exception as decrypt_exception:
                    logger.error(
                        'Unable to detect PDF orientation; %s',
                        decrypt_exception
                    )
            else:
                logger.error(
                    'Unable to detect PDF orientation; %s', exception
                )
        finally:
            self.file_object.seek(0)

        return result

    def get_page_count(self):
        """
        Return the number of pages of the document.

        PDF files, and any document for which ``self.soffice_file`` is
        set, are counted with PyPDF2, falling back to a blank password
        decryption and then to pdfinfo. Everything else is counted by
        stepping through the frames of the image with Pillow.

        Returns:
            The page count as an integer.

        Raises:
            PageCountError: If the page count cannot be determined.
        """
        super(Python, self).get_page_count()

        if self.mime_type == 'application/pdf' or self.soffice_file:
            return self._get_pdf_page_count(
                file_object=self.soffice_file or self.file_object
            )

        return self._get_image_page_count()

    @staticmethod
    def _build_pdf_page_count_error(reason):
        """Log and return (not raise) a PageCountError for a PDF failure."""
        error_message = _(
            'Exception determining PDF page count; %s'
        ) % reason
        logger.error(error_message)
        return PageCountError(error_message)

    def _get_pdf_page_count(self, file_object):
        """Count PDF pages with PyPDF2, always rewinding the file after."""
        try:
            try:
                # Try PyPDF to determine the page number
                pdf_reader = PyPDF2.PdfFileReader(
                    stream=file_object, strict=False
                )
                page_count = pdf_reader.getNumPages()
            except Exception as exception:
                if force_text(exception) != 'File has not been decrypted':
                    raise self._build_pdf_page_count_error(reason=exception)

                page_count = self._get_encrypted_pdf_page_count(
                    file_object=file_object
                )
        finally:
            file_object.seek(0)

        logger.debug('Document contains %d pages', page_count)
        return page_count

    def _get_encrypted_pdf_page_count(self, file_object):
        """Count pages after a blank password decrypt, else use pdfinfo."""
        file_object.seek(0)
        pdf_reader = PyPDF2.PdfFileReader(stream=file_object, strict=False)
        try:
            pdf_reader.decrypt(password=b'')
            # Returned explicitly so a successful decryption yields the
            # count to the caller.
            return pdf_reader.getNumPages()
        except Exception as exception:
            file_object.seek(0)
            unsupported = 'only algorithm code 1 and 2 are supported'
            if force_text(exception) != unsupported:
                raise self._build_pdf_page_count_error(reason=exception)

            # PDF uses an unsupported encryption
            # Try poppler-util's pdfinfo
            return self._get_pdfinfo_page_count(
                file_object=file_object, exception=exception
            )

    def _get_pdfinfo_page_count(self, file_object, exception):
        """Read the 'Pages:' line of the pdfinfo output for the stream."""
        if pdfinfo is None:
            # The pdfinfo executable was not found at import time, report
            # the error that led here instead of calling None.
            raise self._build_pdf_page_count_error(reason=exception)

        process = pdfinfo('-', _in=file_object)
        # A generator with next() works whether filter() returns a list
        # or an iterator.
        pages_line = next(
            (
                line for line in force_text(process.stdout).split('\n')
                if line.startswith('Pages:')
            ), None
        )
        if pages_line is None:
            raise self._build_pdf_page_count_error(reason=exception)

        return int(pages_line.replace('Pages:', ''))

    def _get_image_page_count(self):
        """Count the frames of an image by seeking until Pillow runs out."""
        try:
            image = Image.open(self.file_object)
        except IOError as exception:
            error_message = _(
                'Exception determining page count using Pillow; %s'
            ) % exception
            logger.error(error_message)
            raise PageCountError(error_message)
        finally:
            self.file_object.seek(0)

        page_count = 1
        try:
            while True:
                image.seek(image.tell() + 1)
                page_count += 1
        except EOFError:
            # end of sequence
            pass
        finally:
            # Leave the file rewound, as the PDF code path does.
            self.file_object.seek(0)

        return page_count