PDF compatibility improvements. Support simple encrypted PDF.
Improve PDF page count detection. Improve PDF orientation detection. Merge the CONVERTER_LIBREOFFICE_PATH and CONVERTER_PDFTOPPM_PATH config settings into CONVERTER_GRAPHICS_BACKEND_CONFIG. GitLab issues #384, #376, #377. Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
@@ -34,6 +34,10 @@ Other Changes
|
||||
"Tools" and "Setup" sub menus, were moved from the "Profile" menu to the
|
||||
"System" menu. The "Profile" menu has been renamed to "User".
|
||||
- Usability improvements on small displays.
|
||||
- Removal of the CONVERTER_LIBREOFFICE_PATH and CONVERTER_PDFTOPPM_PATH
|
||||
settings. These settings have been consolidated into
|
||||
CONVERTER_GRAPHICS_BACKEND_CONFIG.
|
||||
- PDF compatibility improvements.
|
||||
|
||||
|
||||
Removals
|
||||
|
||||
@@ -2,7 +2,6 @@ from __future__ import unicode_literals
|
||||
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
|
||||
DELETE_STALE_UPLOADS_INTERVAL = 60 * 10 # 10 minutes
|
||||
MAYAN_PYPI_NAME = 'mayan-edms'
|
||||
PYPI_URL = 'https://pypi.python.org/pypi'
|
||||
|
||||
4
mayan/apps/converter/backends/literals.py
Normal file
4
mayan/apps/converter/backends/literals.py
Normal file
@@ -0,0 +1,4 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
DEFAULT_PDFTOPPM_PATH = '/usr/bin/pdftoppm'
|
||||
DEFAULT_PDFINFO_PATH = '/usr/bin/pdfinfo'
|
||||
@@ -11,24 +11,40 @@ except ImportError:
|
||||
|
||||
from PIL import Image
|
||||
import PyPDF2
|
||||
from pdfminer.pdfpage import PDFPage
|
||||
import sh
|
||||
import yaml
|
||||
|
||||
from django.utils.encoding import force_text
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from common.utils import fs_cleanup, mkstemp
|
||||
|
||||
from ..classes import ConverterBase
|
||||
from ..exceptions import PageCountError
|
||||
from ..settings import setting_pdftoppm_path
|
||||
from ..settings import setting_graphics_backend_config
|
||||
|
||||
from .literals import DEFAULT_PDFTOPPM_PATH, DEFAULT_PDFINFO_PATH
|
||||
|
||||
try:
|
||||
pdftoppm = sh.Command(setting_pdftoppm_path.value)
|
||||
pdftoppm = sh.Command(
|
||||
yaml.load(setting_graphics_backend_config.value).get(
|
||||
'pdftoppm_path', DEFAULT_PDFTOPPM_PATH
|
||||
)
|
||||
)
|
||||
except sh.CommandNotFound:
|
||||
pdftoppm = None
|
||||
else:
|
||||
pdftoppm = pdftoppm.bake('-jpeg')
|
||||
|
||||
try:
|
||||
pdfinfo = sh.Command(
|
||||
yaml.load(setting_graphics_backend_config.value).get(
|
||||
'pdfinfo_path', DEFAULT_PDFINFO_PATH
|
||||
)
|
||||
)
|
||||
except sh.CommandNotFound:
|
||||
pdfinfo = None
|
||||
|
||||
Image.init()
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -69,17 +85,34 @@ class Python(ConverterBase):
|
||||
fs_cleanup(input_filepath)
|
||||
|
||||
def detect_orientation(self, page_number):
|
||||
# Default rotation: 0 degrees
|
||||
result = 0
|
||||
|
||||
# Use different ways depending on the file type
|
||||
if self.mime_type == 'application/pdf':
|
||||
pdf = PyPDF2.PdfFileReader(self.file_object)
|
||||
result = pdf.getPage(page_number - 1).get('/Rotate')
|
||||
try:
|
||||
result = pdf.getPage(page_number - 1).get('/Rotate')
|
||||
except Exception as exception:
|
||||
self.file_object.seek(0)
|
||||
pdf = PyPDF2.PdfFileReader(self.file_object)
|
||||
if force_text(exception) == 'File has not been decrypted':
|
||||
# File is encrypted, try to decrypt using a blank
|
||||
# password.
|
||||
try:
|
||||
pdf.decrypt(password=b'')
|
||||
except Exception as exception:
|
||||
logger.error(
|
||||
'Unable to detect PDF orientation; %s', exception
|
||||
)
|
||||
else:
|
||||
logger.error(
|
||||
'Unable to detect PDF orientation; %s', exception
|
||||
)
|
||||
finally:
|
||||
self.file_object.seek(0)
|
||||
|
||||
self.file_object.seek(0)
|
||||
|
||||
return result
|
||||
|
||||
# Default rotation: 0 degrees
|
||||
return 0
|
||||
return result
|
||||
|
||||
def get_page_count(self):
|
||||
super(Python, self).get_page_count()
|
||||
@@ -87,20 +120,57 @@ class Python(ConverterBase):
|
||||
page_count = 1
|
||||
|
||||
if self.mime_type == 'application/pdf' or self.soffice_file:
|
||||
# If file is a PDF open it with slate to determine the page count
|
||||
if self.soffice_file:
|
||||
file_object = IteratorIO(self.soffice_file).file_buffer
|
||||
else:
|
||||
file_object = self.file_object
|
||||
|
||||
try:
|
||||
page_count = len(list(PDFPage.get_pages(file_object)))
|
||||
# Try PyPDF to determine the page number
|
||||
pdf_reader = PyPDF2.PdfFileReader(
|
||||
stream=file_object, strict=False
|
||||
)
|
||||
page_count = pdf_reader.getNumPages()
|
||||
except Exception as exception:
|
||||
error_message = _(
|
||||
'Exception determining PDF page count; %s'
|
||||
) % exception
|
||||
logger.error(error_message)
|
||||
raise PageCountError(error_message)
|
||||
if force_text(exception) == 'File has not been decrypted':
|
||||
# File is encrypted, try to decrypt using a blank
|
||||
# password.
|
||||
file_object.seek(0)
|
||||
pdf_reader = PyPDF2.PdfFileReader(
|
||||
stream=file_object, strict=False
|
||||
)
|
||||
try:
|
||||
pdf_reader.decrypt(password=b'')
|
||||
page_count = pdf_reader.getNumPages()
|
||||
except Exception as exception:
|
||||
file_object.seek(0)
|
||||
if force_text(exception) == 'only algorithm code 1 and 2 are supported':
|
||||
# PDF uses an unsupported encryption
|
||||
# Try poppler-util's pdfinfo
|
||||
process = pdfinfo('-', _in=file_object)
|
||||
page_count = int(
|
||||
filter(
|
||||
lambda line: line.startswith('Pages:'),
|
||||
process.stdout.split(b'\n')
|
||||
)[0].replace('Pages:', '')
|
||||
)
|
||||
file_object.seek(0)
|
||||
logger.debug(
|
||||
'Document contains %d pages', page_count
|
||||
)
|
||||
return page_count
|
||||
else:
|
||||
error_message = _(
|
||||
'Exception determining PDF page count; %s'
|
||||
) % exception
|
||||
logger.error(error_message)
|
||||
raise PageCountError(error_message)
|
||||
else:
|
||||
error_message = _(
|
||||
'Exception determining PDF page count; %s'
|
||||
) % exception
|
||||
logger.error(error_message)
|
||||
raise PageCountError(error_message)
|
||||
else:
|
||||
logger.debug('Document contains %d pages', page_count)
|
||||
return page_count
|
||||
@@ -111,7 +181,7 @@ class Python(ConverterBase):
|
||||
image = Image.open(self.file_object)
|
||||
except IOError as exception:
|
||||
error_message = _(
|
||||
'Exception determining PDF page count; %s'
|
||||
'Exception determining page count using Pillow; %s'
|
||||
) % exception
|
||||
logger.error(error_message)
|
||||
raise PageCountError(error_message)
|
||||
|
||||
@@ -11,6 +11,7 @@ except ImportError:
|
||||
|
||||
from PIL import Image, ImageFilter
|
||||
import sh
|
||||
import yaml
|
||||
|
||||
from django.utils.translation import string_concat, ugettext_lazy as _
|
||||
|
||||
@@ -19,15 +20,19 @@ from common.utils import fs_cleanup, mkstemp
|
||||
from mimetype.api import get_mimetype
|
||||
|
||||
from .exceptions import InvalidOfficeFormat, OfficeConversionError
|
||||
from .literals import DEFAULT_PAGE_NUMBER, DEFAULT_FILE_FORMAT
|
||||
from .settings import setting_libreoffice_path
|
||||
from .literals import (
|
||||
DEFAULT_LIBREOFFICE_PATH, DEFAULT_PAGE_NUMBER, DEFAULT_FILE_FORMAT
|
||||
)
|
||||
from .settings import setting_graphics_backend_config
|
||||
|
||||
CHUNK_SIZE = 1024
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
LIBREOFFICE = sh.Command(
|
||||
setting_libreoffice_path.value
|
||||
yaml.load(setting_graphics_backend_config.value).get(
|
||||
'libreoffice_path', DEFAULT_LIBREOFFICE_PATH
|
||||
)
|
||||
).bake('--headless', '--convert-to', 'pdf')
|
||||
except sh.CommandNotFound:
|
||||
LIBREOFFICE = None
|
||||
@@ -114,11 +119,9 @@ class ConverterBase(object):
|
||||
Executes LibreOffice as a subprocess
|
||||
"""
|
||||
|
||||
if not os.path.exists(setting_libreoffice_path.value):
|
||||
if not LIBREOFFICE:
|
||||
raise OfficeConversionError(
|
||||
_(
|
||||
'LibreOffice not installed or not found at path: %s'
|
||||
) % setting_libreoffice_path.value
|
||||
_('LibreOffice not installed or not found.')
|
||||
)
|
||||
|
||||
new_file_object, input_filepath = mkstemp()
|
||||
@@ -471,7 +474,3 @@ BaseTransformation.register(TransformationRotate180)
|
||||
BaseTransformation.register(TransformationRotate270)
|
||||
BaseTransformation.register(TransformationUnsharpMask)
|
||||
BaseTransformation.register(TransformationZoom)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -4,5 +4,6 @@ DEFAULT_ZOOM_LEVEL = 100
|
||||
DEFAULT_ROTATION = 0
|
||||
DEFAULT_PAGE_NUMBER = 1
|
||||
DEFAULT_FILE_FORMAT = 'JPEG'
|
||||
DEFAULT_LIBREOFFICE_PATH = '/usr/bin/libreoffice'
|
||||
|
||||
DIMENSION_SEPARATOR = 'x'
|
||||
|
||||
@@ -10,12 +10,10 @@ setting_graphics_backend = namespace.add_setting(
|
||||
help_text=_('Graphics conversion backend to use.'),
|
||||
global_name='CONVERTER_GRAPHICS_BACKEND',
|
||||
)
|
||||
setting_libreoffice_path = namespace.add_setting(
|
||||
default='/usr/bin/libreoffice',
|
||||
global_name='CONVERTER_LIBREOFFICE_PATH',
|
||||
help_text=_('Path to the libreoffice program.'), is_path=True
|
||||
)
|
||||
setting_pdftoppm_path = namespace.add_setting(
|
||||
default='/usr/bin/pdftoppm', global_name='CONVERTER_PDFTOPPM_PATH',
|
||||
help_text=_('Path to the Popple program pdftoppm.'), is_path=True
|
||||
setting_graphics_backend_config = namespace.add_setting(
|
||||
default='{libreoffice_path: /usr/bin/libreoffice, '
|
||||
'pdftoppm_path: /usr/bin/pdftoppm, pdfinfo_path: /usr/bin/pdfinfo}',
|
||||
help_text=_(
|
||||
'Configuration options for the graphics conversion backend.'
|
||||
), global_name='CONVERTER_GRAPHICS_BACKEND_CONFIG',
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user