Instead of passing strings as arguments to backends, all settings must
be formatted according to YAML specifications. This is to remove the
need to add separate YAML parsing to each backend argument in each
app that needs it. Argument passing to backends is now fully
uniform.
Users need to update their config files.
Example:
DOCUMENTS_STORAGE_BACKEND_ARGUMENTS: '{location: /home/rosarior/development/mayan-edms/mayan/media/document_storage}'
must be changed to:
DOCUMENTS_STORAGE_BACKEND_ARGUMENTS:
  location: /home/rosarior/development/mayan-edms/mayan/media/document_storage
Example 2:
CONVERTER_GRAPHICS_BACKEND_CONFIG: ' { libreoffice_path: /usr/bin/libreoffice, pdftoppm_dpi:
300, pdftoppm_format: jpeg, pdftoppm_path: /usr/bin/pdftoppm, pdfinfo_path:
/usr/bin/pdfinfo, pillow_format: JPEG } '
must be changed to:
CONVERTER_GRAPHICS_BACKEND_CONFIG:
  libreoffice_path: /usr/bin/libreoffice
  pdftoppm_dpi: 300
  pdftoppm_format: jpeg
  pdftoppm_path: /usr/bin/pdftoppm
  pdfinfo_path: /usr/bin/pdfinfo
  pillow_format: JPEG
Example 3:
OCR_BACKEND_ARGUMENTS: ''
must be changed to:
OCR_BACKEND_ARGUMENTS: {}
Settings that need to be updated are:
- COMMON_SHARED_STORAGE_ARGUMENTS
- CONVERTER_GRAPHICS_BACKEND_CONFIG
- DOCUMENTS_CACHE_STORAGE_BACKEND_ARGUMENTS
- DOCUMENTS_STORAGE_BACKEND_ARGUMENTS
- OCR_BACKEND_ARGUMENTS
- SIGNATURES_STORAGE_BACKEND_ARGUMENTS
- SOURCES_STAGING_FILE_CACHE_STORAGE_BACKEND_ARGUMENTS
The following error will appear in the console if a setting has not yet
been updated to this new format::

    TypeError: type object argument after ** must be a mapping, not str
Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
211 lines
7.0 KiB
Python
211 lines
7.0 KiB
Python
from __future__ import unicode_literals
|
|
|
|
import io
|
|
import logging
|
|
import os
|
|
|
|
from PIL import Image
|
|
import PyPDF2
|
|
import sh
|
|
|
|
from django.utils.encoding import force_text
|
|
from django.utils.translation import ugettext_lazy as _
|
|
|
|
from common.utils import fs_cleanup, mkstemp
|
|
|
|
from ..classes import ConverterBase
|
|
from ..exceptions import PageCountError
|
|
from ..settings import setting_graphics_backend_config
|
|
|
|
from ..literals import (
|
|
DEFAULT_PDFTOPPM_DPI, DEFAULT_PDFTOPPM_FORMAT, DEFAULT_PDFTOPPM_PATH,
|
|
DEFAULT_PDFINFO_PATH
|
|
)
|
|
|
|
def _graphics_backend_option(name, default):
    """Look up one key of the graphics backend configuration mapping.

    ``default`` is returned when the mapping has no entry for ``name``.
    """
    return setting_graphics_backend_config.value.get(name, default)


# Wrap the configured ``pdftoppm`` executable. When the command cannot be
# found the name is bound to None; otherwise the output format flag and the
# resolution are baked in so that callers only supply per-call arguments.
try:
    pdftoppm = sh.Command(
        _graphics_backend_option('pdftoppm_path', DEFAULT_PDFTOPPM_PATH)
    )
except sh.CommandNotFound:
    pdftoppm = None
else:
    pdftoppm_format = '-{}'.format(
        _graphics_backend_option('pdftoppm_format', DEFAULT_PDFTOPPM_FORMAT)
    )
    pdftoppm_dpi = format(
        _graphics_backend_option('pdftoppm_dpi', DEFAULT_PDFTOPPM_DPI)
    )
    pdftoppm = pdftoppm.bake(pdftoppm_format, '-r', pdftoppm_dpi)

# Wrap the configured ``pdfinfo`` executable; None when it cannot be found.
try:
    pdfinfo = sh.Command(
        _graphics_backend_option('pdfinfo_path', DEFAULT_PDFINFO_PATH)
    )
except sh.CommandNotFound:
    pdfinfo = None

Image.init()
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class IteratorIO(object):
    """Drain an iterator of byte chunks into an in-memory binary buffer.

    The collected data is exposed as ``file_buffer``, an ``io.BytesIO``
    positioned at its first byte so it can be read like a freshly opened
    file.
    """

    def __init__(self, iterator):
        collected = io.BytesIO()
        # ``writelines`` consumes the iterable and appends every chunk in
        # the order it is produced.
        collected.writelines(iterator)
        collected.seek(0)
        self.file_buffer = collected
|
|
|
|
|
|
class Python(ConverterBase):
    """Converter backend built on Pillow, PyPDF2 and the poppler commands.

    PDF pages are rasterized with the module-level ``pdftoppm`` command.
    PDF page counts come from PyPDF2, falling back to the module-level
    ``pdfinfo`` command; other files are counted by stepping through the
    frames Pillow exposes.
    """

    def convert(self, *args, **kwargs):
        """Render the current page of a PDF document as a Pillow image.

        All arguments are forwarded to the parent class ``convert``.

        Returns the object produced by ``Image.open`` when the document is
        a PDF and ``pdftoppm`` is available; otherwise nothing is returned
        (``None``).
        """
        super(Python, self).convert(*args, **kwargs)

        if self.mime_type == 'application/pdf' and pdftoppm:
            temporary_fd, input_filepath = mkstemp()
            try:
                # The file object owns the descriptor: it is closed even
                # when the copy fails, and ``write`` stores the complete
                # payload instead of relying on a single ``os.write`` call.
                with os.fdopen(temporary_fd, 'wb') as temporary_file:
                    self.file_object.seek(0)
                    temporary_file.write(self.file_object.read())
                    self.file_object.seek(0)

                image_buffer = io.BytesIO()
                # First (f) and last (l) page are the same: only one page
                # is rendered.
                # NOTE(review): presumably self.page_number is zero-based
                # while pdftoppm counts from one - confirm.
                pdftoppm(
                    input_filepath, f=self.page_number + 1,
                    l=self.page_number + 1, _out=image_buffer
                )
                image_buffer.seek(0)
                return Image.open(image_buffer)
            finally:
                # Runs on every outcome, including a failed copy.
                fs_cleanup(input_filepath)

    @staticmethod
    def _get_page_rotation(pdf, page_number):
        """Return the resolved ``/Rotate`` entry of a page, 0 if absent."""
        rotation = pdf.getPage(page_number - 1).get('/Rotate', 0)
        if isinstance(rotation, PyPDF2.generic.IndirectObject):
            rotation = rotation.getObject()
        return rotation

    def detect_orientation(self, page_number):
        """Return the rotation stored for a page of a PDF document.

        ``page_number`` is decremented by one before being handed to
        PyPDF2. 0 is returned for documents that are not PDFs and whenever
        the rotation cannot be read; such failures are logged, not raised.
        """
        # Default rotation: 0 degrees
        result = 0

        # Use different ways depending on the file type
        if self.mime_type == 'application/pdf':
            pdf = PyPDF2.PdfFileReader(self.file_object)
            try:
                result = self._get_page_rotation(pdf, page_number)
            except Exception as exception:
                if force_text(exception) == 'File has not been decrypted':
                    # File is encrypted, try to decrypt using a blank
                    # password and read the rotation again. Without the
                    # second read the decryption would have no effect.
                    try:
                        self.file_object.seek(0)
                        pdf = PyPDF2.PdfFileReader(self.file_object)
                        pdf.decrypt(password=b'')
                        result = self._get_page_rotation(pdf, page_number)
                    except Exception as decrypt_exception:
                        logger.error(
                            'Unable to detect PDF orientation; %s',
                            decrypt_exception
                        )
                else:
                    logger.error(
                        'Unable to detect PDF orientation; %s', exception
                    )
            finally:
                self.file_object.seek(0)

        return result

    @staticmethod
    def _pdf_page_count_error(detail):
        """Log a PDF page count failure and return the error to raise."""
        error_message = _(
            'Exception determining PDF page count; %s'
        ) % detail
        logger.error(error_message)
        return PageCountError(error_message)

    def _get_pdfinfo_page_count(self, file_object):
        """Count the pages of a PDF stream with the ``pdfinfo`` command."""
        if pdfinfo is None:
            # The executable was not found at import time; calling None
            # would raise an unrelated TypeError.
            raise self._pdf_page_count_error('pdfinfo command not found')

        process = pdfinfo('-', _in=file_object)
        # An explicit loop instead of ``filter(...)[0]``: on Python 3
        # ``filter`` returns an iterator, which cannot be indexed.
        for line in force_text(process.stdout).split('\n'):
            if line.startswith('Pages:'):
                return int(line.replace('Pages:', ''))

        raise self._pdf_page_count_error('pdfinfo did not report a page count')

    def _get_pdf_page_count(self, file_object):
        """Count the pages of a PDF stream, decrypting it when required.

        PyPDF2 is tried first, then PyPDF2 after decrypting with a blank
        password, and finally ``pdfinfo`` when PyPDF2 reports that it does
        not support the encryption algorithm. Other failures are raised as
        PageCountError.
        """
        try:
            # Try PyPDF to determine the page number
            return PyPDF2.PdfFileReader(
                stream=file_object, strict=False
            ).getNumPages()
        except Exception as exception:
            if force_text(exception) != 'File has not been decrypted':
                raise self._pdf_page_count_error(exception)

        # File is encrypted, try to decrypt using a blank password.
        file_object.seek(0)
        pdf_reader = PyPDF2.PdfFileReader(stream=file_object, strict=False)
        try:
            pdf_reader.decrypt(password=b'')
            return pdf_reader.getNumPages()
        except Exception as exception:
            file_object.seek(0)
            if force_text(exception) != (
                'only algorithm code 1 and 2 are supported'
            ):
                raise self._pdf_page_count_error(exception)

        # PDF uses an unsupported encryption
        # Try poppler-util's pdfinfo
        return self._get_pdfinfo_page_count(file_object)

    def get_page_count(self):
        """Return the number of pages of the document.

        PDF documents, and documents for which ``self.soffice_file`` is
        set, are counted as PDFs. Anything else is opened with Pillow and
        counted by seeking frame by frame until Pillow signals the end.

        Raises PageCountError when the count cannot be determined.
        """
        super(Python, self).get_page_count()

        if self.mime_type == 'application/pdf' or self.soffice_file:
            if self.soffice_file:
                file_object = IteratorIO(self.soffice_file).file_buffer
            else:
                file_object = self.file_object

            try:
                page_count = self._get_pdf_page_count(file_object)
            finally:
                # Leave the stream rewound on success and on failure.
                file_object.seek(0)

            logger.debug('Document contains %d pages', page_count)
            return page_count

        try:
            image = Image.open(self.file_object)
        except IOError as exception:
            error_message = _(
                'Exception determining page count using Pillow; %s'
            ) % exception
            logger.error(error_message)
            raise PageCountError(error_message)
        finally:
            self.file_object.seek(0)

        page_count = 1
        try:
            while True:
                image.seek(image.tell() + 1)
                page_count += 1
        except EOFError:
            # end of sequence
            pass

        return page_count
|