Instead of inserting the path of the apps into the Python app, the apps are now referenced by their full import path. This solves app name clashes with external or native Python libraries. Example: Mayan statistics app vs. Python's new statistics library. Every app reference is now prepended with 'mayan.apps'. Existing config.yml files need to be updated manually. Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
211 lines
7.0 KiB
Python
211 lines
7.0 KiB
Python
from __future__ import unicode_literals
|
|
|
|
import io
|
|
import logging
|
|
import os
|
|
|
|
from PIL import Image
|
|
import PyPDF2
|
|
import sh
|
|
|
|
from django.utils.encoding import force_text
|
|
from django.utils.translation import ugettext_lazy as _
|
|
|
|
from mayan.apps.common.utils import fs_cleanup, mkstemp
|
|
|
|
from ..classes import ConverterBase
|
|
from ..exceptions import PageCountError
|
|
from ..settings import setting_graphics_backend_arguments
|
|
|
|
from ..literals import (
|
|
DEFAULT_PDFTOPPM_DPI, DEFAULT_PDFTOPPM_FORMAT, DEFAULT_PDFTOPPM_PATH,
|
|
DEFAULT_PDFINFO_PATH
|
|
)
|
|
|
|
# Resolve the pdftoppm executable. The path may be overridden through the
# graphics backend arguments setting; when the executable cannot be found
# the command is left as None.
try:
    pdftoppm = sh.Command(
        setting_graphics_backend_arguments.value.get(
            'pdftoppm_path', DEFAULT_PDFTOPPM_PATH
        )
    )
except sh.CommandNotFound:
    pdftoppm = None

if pdftoppm is not None:
    # Output format option: the configured format name prefixed by a dash.
    pdftoppm_format = '-{}'.format(
        setting_graphics_backend_arguments.value.get(
            'pdftoppm_format', DEFAULT_PDFTOPPM_FORMAT
        )
    )

    # The configured DPI value, converted to text so it can be passed as a
    # command line argument.
    pdftoppm_dpi = format(
        setting_graphics_backend_arguments.value.get(
            'pdftoppm_dpi', DEFAULT_PDFTOPPM_DPI
        )
    )

    # Pre-bind the format option and the '-r' value so that every later
    # call of the command includes them.
    pdftoppm = pdftoppm.bake(pdftoppm_format, '-r', pdftoppm_dpi)

# Resolve the pdfinfo executable the same way; None when it is not found.
try:
    pdfinfo = sh.Command(
        setting_graphics_backend_arguments.value.get(
            'pdfinfo_path', DEFAULT_PDFINFO_PATH
        )
    )
except sh.CommandNotFound:
    pdfinfo = None

Image.init()
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class IteratorIO(object):
    """
    Collect the byte chunks produced by an iterator into an in-memory
    buffer.

    The buffer is exposed as the ``file_buffer`` attribute and is rewound
    to position 0 once every chunk has been written.
    """

    def __init__(self, iterator):
        buffer = io.BytesIO()
        # Consume the whole iterator, appending each chunk in order.
        buffer.writelines(iterator)
        buffer.seek(0)
        self.file_buffer = buffer
|
|
|
|
|
|
class Python(ConverterBase):
    """
    Converter backend that uses Pillow and PyPDF2, plus the optional
    ``pdftoppm`` and ``pdfinfo`` commands when they were found at import
    time.
    """

    def convert(self, *args, **kwargs):
        """
        Render the current page of a PDF as a Pillow image using pdftoppm.

        The parent ``convert`` is always called first. When the MIME type is
        'application/pdf' and the pdftoppm command is available, the content
        of ``self.file_object`` is copied to a temporary file and pdftoppm is
        asked for the single page ``self.page_number + 1``.

        Returns the opened Pillow image, or None when the file is not a PDF
        or pdftoppm is not available.
        """
        super(Python, self).convert(*args, **kwargs)

        if self.mime_type == 'application/pdf' and pdftoppm:
            descriptor, input_filepath = mkstemp()

            try:
                # A buffered file object retries short writes and the
                # context manager closes the descriptor even if the copy
                # fails.
                with os.fdopen(descriptor, 'wb') as temporary_file:
                    self.file_object.seek(0)
                    temporary_file.write(self.file_object.read())
                    self.file_object.seek(0)

                image_buffer = io.BytesIO()
                pdftoppm(
                    input_filepath, f=self.page_number + 1,
                    l=self.page_number + 1, _out=image_buffer
                )
                image_buffer.seek(0)
                return Image.open(image_buffer)
            finally:
                # Remove the temporary copy on success and on failure.
                fs_cleanup(input_filepath)

    def detect_orientation(self, page_number):
        """
        Return the '/Rotate' value of a PDF page, or 0 when it is unknown.

        ``page_number`` is decremented by one before indexing the PDF pages.
        Files that are not PDFs always yield 0. Errors while reading the
        rotation are logged and the default of 0 is returned. Encrypted
        PDFs are retried after decrypting with a blank password.
        """
        # Default rotation: 0 degrees
        result = 0

        # Only PDF files carry rotation information handled here.
        if self.mime_type != 'application/pdf':
            return result

        pdf = PyPDF2.PdfFileReader(self.file_object)
        try:
            result = self._get_page_rotation(
                pdf=pdf, page_number=page_number
            )
        except Exception as exception:
            if force_text(exception) == 'File has not been decrypted':
                # File is encrypted, try to decrypt using a blank
                # password and read the rotation again.
                try:
                    self.file_object.seek(0)
                    pdf = PyPDF2.PdfFileReader(self.file_object)
                    pdf.decrypt(password=b'')
                    result = self._get_page_rotation(
                        pdf=pdf, page_number=page_number
                    )
                except Exception as decrypt_exception:
                    logger.error(
                        'Unable to detect PDF orientation; %s',
                        decrypt_exception
                    )
            else:
                logger.error(
                    'Unable to detect PDF orientation; %s', exception
                )
        finally:
            self.file_object.seek(0)

        return result

    def get_page_count(self):
        """
        Return the number of pages of the current file.

        PDF files, and files for which ``self.soffice_file`` is set, are
        counted with PyPDF2, falling back to a blank password decryption and
        then to the pdfinfo command. Any other file is opened with Pillow
        and its frames are counted.

        Raises PageCountError when the page count cannot be determined.
        """
        super(Python, self).get_page_count()

        if self.mime_type == 'application/pdf' or self.soffice_file:
            if self.soffice_file:
                file_object = IteratorIO(self.soffice_file).file_buffer
            else:
                file_object = self.file_object

            try:
                page_count = self._get_pdf_page_count(
                    file_object=file_object
                )
            finally:
                # Leave the stream rewound whatever the outcome.
                file_object.seek(0)

            logger.debug('Document contains %d pages', page_count)
            return page_count

        return self._get_image_page_count()

    @staticmethod
    def _get_page_rotation(pdf, page_number):
        # Read the '/Rotate' entry of a page, resolving indirect objects.
        rotation = pdf.getPage(page_number - 1).get('/Rotate', 0)
        if isinstance(rotation, PyPDF2.generic.IndirectObject):
            rotation = rotation.getObject()
        return rotation

    @staticmethod
    def _build_page_count_error(message, exception):
        # Format and log an error message, return the exception to raise.
        error_message = message % exception
        logger.error(error_message)
        return PageCountError(error_message)

    def _get_pdf_page_count(self, file_object):
        # Count PDF pages: PyPDF2, then blank password, then pdfinfo.
        try:
            # Try PyPDF to determine the page number
            pdf_reader = PyPDF2.PdfFileReader(
                stream=file_object, strict=False
            )
            return pdf_reader.getNumPages()
        except Exception as exception:
            if force_text(exception) != 'File has not been decrypted':
                raise self._build_page_count_error(
                    _('Exception determining PDF page count; %s'),
                    exception
                )

        # File is encrypted, try to decrypt using a blank password.
        file_object.seek(0)
        pdf_reader = PyPDF2.PdfFileReader(
            stream=file_object, strict=False
        )
        try:
            pdf_reader.decrypt(password=b'')
            return pdf_reader.getNumPages()
        except Exception as decrypt_exception:
            unsupported_encryption = (
                force_text(decrypt_exception) ==
                'only algorithm code 1 and 2 are supported'
            )
            if not unsupported_encryption:
                raise self._build_page_count_error(
                    _('Exception determining PDF page count; %s'),
                    decrypt_exception
                )

        # PDF uses an unsupported encryption
        # Try poppler-util's pdfinfo
        file_object.seek(0)
        return self._get_pdfinfo_page_count(file_object=file_object)

    def _get_pdfinfo_page_count(self, file_object):
        # Count PDF pages by parsing the 'Pages:' line printed by pdfinfo.
        if pdfinfo is None:
            raise self._build_page_count_error(
                _('Exception determining PDF page count; %s'),
                'pdfinfo command not found'
            )

        process = pdfinfo('-', _in=file_object)

        # Plain loop instead of indexing filter(): on Python 3 filter()
        # returns an iterator which cannot be subscripted.
        for line in force_text(process.stdout).split('\n'):
            if line.startswith('Pages:'):
                return int(line.replace('Pages:', ''))

        raise self._build_page_count_error(
            _('Exception determining PDF page count; %s'),
            'pdfinfo output does not include a page count'
        )

    def _get_image_page_count(self):
        # Count the frames of an image file using Pillow.
        try:
            image = Image.open(self.file_object)
        except IOError as exception:
            raise self._build_page_count_error(
                _('Exception determining page count using Pillow; %s'),
                exception
            )
        finally:
            self.file_object.seek(0)

        page_count = 1

        try:
            while True:
                image.seek(image.tell() + 1)
                page_count += 1
        except EOFError:
            # end of sequence
            pass

        return page_count
|