Files
mayan-edms/mayan/apps/converter/backends/python.py
Roberto Rosario 8e69178e07 Project: Switch to full app paths
Instead of inserting the path of the apps into the Python path,
the apps are now referenced by their full import path.

This avoids app name clashes with external or native Python libraries.
Example: Mayan statistics app vs. Python new statistics library.

Every app reference is now prepended with 'mayan.apps'.

Existing config.yml files need to be updated manually.

Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
2018-12-05 02:04:20 -04:00

211 lines
7.0 KiB
Python

from __future__ import unicode_literals
import io
import logging
import os
from PIL import Image
import PyPDF2
import sh
from django.utils.encoding import force_text
from django.utils.translation import ugettext_lazy as _
from mayan.apps.common.utils import fs_cleanup, mkstemp
from ..classes import ConverterBase
from ..exceptions import PageCountError
from ..settings import setting_graphics_backend_arguments
from ..literals import (
DEFAULT_PDFTOPPM_DPI, DEFAULT_PDFTOPPM_FORMAT, DEFAULT_PDFTOPPM_PATH,
DEFAULT_PDFINFO_PATH
)
def _resolve_command(argument_name, default_path):
    """
    Return an ``sh.Command`` for the executable whose path is stored under
    ``argument_name`` in the graphics backend arguments setting, using
    ``default_path`` when the setting has no such key.

    Returns None when ``sh`` reports that the executable cannot be found.
    """
    try:
        return sh.Command(
            setting_graphics_backend_arguments.value.get(
                argument_name, default_path
            )
        )
    except sh.CommandNotFound:
        return None


# Optional executable used by the converter to render PDF pages.
pdftoppm = _resolve_command('pdftoppm_path', DEFAULT_PDFTOPPM_PATH)

if pdftoppm is not None:
    # Pre-bind the output format flag and the resolution so that callers
    # only have to supply the input file and the page range.
    pdftoppm_format = '-{}'.format(
        setting_graphics_backend_arguments.value.get(
            'pdftoppm_format', DEFAULT_PDFTOPPM_FORMAT
        )
    )
    pdftoppm_dpi = format(
        setting_graphics_backend_arguments.value.get(
            'pdftoppm_dpi', DEFAULT_PDFTOPPM_DPI
        )
    )
    pdftoppm = pdftoppm.bake(pdftoppm_format, '-r', pdftoppm_dpi)

# Optional executable used as a last resort to count the pages of a PDF.
pdfinfo = _resolve_command('pdfinfo_path', DEFAULT_PDFINFO_PATH)

Image.init()
logger = logging.getLogger(__name__)
class IteratorIO(object):
    """
    Drain an iterator of binary chunks into an in-memory buffer.

    The buffer is exposed as the ``file_buffer`` attribute and is rewound
    to its start so it can be read like a freshly opened binary file.
    """

    def __init__(self, iterator):
        buffer = io.BytesIO()
        # writelines() writes every chunk in order, without separators.
        buffer.writelines(iterator)
        buffer.seek(0)
        self.file_buffer = buffer
class Python(ConverterBase):
    """
    Converter backend based on Pillow and PyPDF2.

    The ``pdftoppm`` and ``pdfinfo`` executables are used when the module
    was able to locate them: the former to render PDF pages, the latter as
    a fallback to count the pages of PDF files PyPDF2 cannot decrypt.
    """

    def convert(self, *args, **kwargs):
        """
        Run the base class conversion and, for PDF files when ``pdftoppm``
        is available, render the current page.

        Returns a Pillow image of the page selected by ``self.page_number``
        when the file is a PDF and ``pdftoppm`` is available; returns None
        in every other case.
        NOTE(review): presumably the base class keeps its own result for
        the non PDF case -- confirm against ConverterBase.convert().
        """
        super(Python, self).convert(*args, **kwargs)

        if self.mime_type == 'application/pdf' and pdftoppm:
            file_descriptor, input_filepath = mkstemp()
            try:
                # pdftoppm is given a path, so the content is copied to a
                # temporary file. A file object is used instead of a bare
                # os.write() because os.write() may write fewer bytes than
                # requested; the with block also closes the descriptor
                # when the copy fails.
                with os.fdopen(file_descriptor, 'wb') as temporary_file:
                    self.file_object.seek(0)
                    temporary_file.write(self.file_object.read())
                    self.file_object.seek(0)

                image_buffer = io.BytesIO()
                # First and last page are the same: render a single page.
                # The + 1 suggests self.page_number is zero-based while
                # pdftoppm counts from 1 -- TODO confirm.
                pdftoppm(
                    input_filepath, f=self.page_number + 1,
                    l=self.page_number + 1, _out=image_buffer
                )
                image_buffer.seek(0)
                return Image.open(image_buffer)
            finally:
                # Remove the temporary file on success and on failure.
                fs_cleanup(input_filepath)

    @staticmethod
    def _get_page_rotation(pdf, page_number):
        """
        Return the ``/Rotate`` entry (0 when absent) of the one-based page
        ``page_number`` of an open PyPDF2 reader.
        """
        rotation = pdf.getPage(page_number - 1).get('/Rotate', 0)
        if isinstance(rotation, PyPDF2.generic.IndirectObject):
            # The entry is a reference to another object; resolve it.
            rotation = rotation.getObject()
        return rotation

    def detect_orientation(self, page_number):
        """
        Return the rotation recorded for a page of a PDF file.

        ``page_number`` is one-based. Returns 0 for files that are not
        PDFs, for pages without a ``/Rotate`` entry and when the rotation
        cannot be read, in which case the error is logged. The file object
        is rewound before returning.
        """
        # Default rotation: 0 degrees
        result = 0

        # Use different ways depending on the file type
        if self.mime_type == 'application/pdf':
            pdf = PyPDF2.PdfFileReader(self.file_object)
            try:
                try:
                    result = self._get_page_rotation(pdf, page_number)
                except Exception as first_error:
                    if force_text(first_error) != (
                        'File has not been decrypted'
                    ):
                        raise

                    # File is encrypted, try to decrypt using a blank
                    # password and read the rotation again. Previously the
                    # value was never re-read after decrypting.
                    self.file_object.seek(0)
                    pdf = PyPDF2.PdfFileReader(self.file_object)
                    pdf.decrypt(password=b'')
                    result = self._get_page_rotation(pdf, page_number)
            except Exception as exception:
                # Best effort: keep the default rotation.
                logger.error(
                    'Unable to detect PDF orientation; %s', exception
                )
            finally:
                self.file_object.seek(0)

        return result

    @staticmethod
    def _page_count_error(detail):
        """
        Log a PDF page count failure and return the ``PageCountError`` to
        raise. ``detail`` is the exception or text explaining the failure.
        """
        error_message = _(
            'Exception determining PDF page count; %s'
        ) % detail
        logger.error(error_message)
        return PageCountError(error_message)

    def _get_pdfinfo_page_count(self, file_object):
        """
        Return the page count reported by the ``pdfinfo`` executable for
        the PDF content of ``file_object``, which is fed via standard input.

        Raises ``PageCountError`` when the executable is not available,
        fails, or prints no usable ``Pages:`` line.
        """
        if pdfinfo is None:
            raise self._page_count_error(
                _('the pdfinfo executable was not found')
            )

        try:
            process = pdfinfo('-', _in=file_object)
            # A plain loop is used instead of indexing the result of
            # filter(), which is not subscriptable in Python 3.
            for line in force_text(process.stdout).split('\n'):
                if line.startswith('Pages:'):
                    return int(line.replace('Pages:', ''))
        except Exception as exception:
            raise self._page_count_error(exception)

        raise self._page_count_error(
            _('pdfinfo did not report a page count')
        )

    def _get_pdf_page_count(self, file_object):
        """
        Return the number of pages of the PDF content of ``file_object``.

        PyPDF2 is tried first, then PyPDF2 with a blank password for
        encrypted files, then ``pdfinfo`` when the encryption is not
        supported by PyPDF2. Raises ``PageCountError`` on failure.
        """
        try:
            # Try PyPDF to determine the page number
            pdf_reader = PyPDF2.PdfFileReader(
                stream=file_object, strict=False
            )
            return pdf_reader.getNumPages()
        except Exception as exception:
            if force_text(exception) != 'File has not been decrypted':
                raise self._page_count_error(exception)

        # File is encrypted, try to decrypt using a blank password.
        file_object.seek(0)
        try:
            pdf_reader = PyPDF2.PdfFileReader(
                stream=file_object, strict=False
            )
            pdf_reader.decrypt(password=b'')
            return pdf_reader.getNumPages()
        except Exception as exception:
            if force_text(exception) != (
                'only algorithm code 1 and 2 are supported'
            ):
                raise self._page_count_error(exception)

        # PDF uses an unsupported encryption
        # Try poppler-util's pdfinfo
        file_object.seek(0)
        return self._get_pdfinfo_page_count(file_object)

    def _get_image_page_count(self):
        """
        Return the number of frames Pillow finds in ``self.file_object``.

        Raises ``PageCountError`` when Pillow cannot open the file.
        """
        try:
            image = Image.open(self.file_object)
        except IOError as exception:
            error_message = _(
                'Exception determining page count using Pillow; %s'
            ) % exception
            logger.error(error_message)
            raise PageCountError(error_message)
        finally:
            self.file_object.seek(0)

        page_count = 1
        try:
            # Advance one frame at a time until Pillow runs out of frames.
            while True:
                image.seek(image.tell() + 1)
                page_count += 1
        except EOFError:
            # end of sequence
            pass
        finally:
            # Leave the file rewound, as the PDF code path does.
            self.file_object.seek(0)

        return page_count

    def get_page_count(self):
        """
        Return the number of pages of the file.

        PDF files, and files for which ``self.soffice_file`` is set, are
        counted as PDFs; everything else is counted with Pillow. The file
        object that was read is rewound before returning.
        NOTE(review): ``self.soffice_file`` is presumably prepared by the
        base class call below -- confirm against ConverterBase.

        Raises ``PageCountError`` when the count cannot be determined.
        """
        super(Python, self).get_page_count()

        if self.mime_type == 'application/pdf' or self.soffice_file:
            if self.soffice_file:
                # soffice_file is iterated in chunks; buffer it so that it
                # can be read and rewound like a file.
                file_object = IteratorIO(self.soffice_file).file_buffer
            else:
                file_object = self.file_object

            try:
                page_count = self._get_pdf_page_count(file_object)
            finally:
                file_object.seek(0)

            logger.debug('Document contains %d pages', page_count)
            return page_count

        return self._get_image_page_count()