Added PDF file support to the python converter backend via ghostscript
This commit is contained in:
@@ -12,6 +12,15 @@ from django.contrib.contenttypes.models import ContentType
|
|||||||
from django.contrib.auth.models import User
|
from django.contrib.auth.models import User
|
||||||
|
|
||||||
|
|
||||||
|
# Prefer libmagic (content sniffing) when the bundled python_magic wrapper
# can be imported; otherwise fall back to the standard library's
# extension-based mimetypes module.
try:
    from python_magic import magic
    USE_PYTHON_MAGIC = True
except Exception:
    # Any failure while loading the wrapper selects the fallback.
    # "except Exception" (rather than a bare "except:") keeps this
    # best-effort behavior without also swallowing KeyboardInterrupt
    # or SystemExit.
    import mimetypes
    mimetypes.init()
    USE_PYTHON_MAGIC = False
|
||||||
|
|
||||||
|
|
||||||
def urlquote(link=None, get=None):
|
def urlquote(link=None, get=None):
|
||||||
u'''
|
u'''
|
||||||
This method does both: urlquote() and urlencode()
|
This method does both: urlquote() and urlencode()
|
||||||
@@ -337,3 +346,31 @@ def return_diff(old_obj, new_obj, attrib_list=None):
|
|||||||
}
|
}
|
||||||
|
|
||||||
return diff_dict
|
return diff_dict
|
||||||
|
|
||||||
|
|
||||||
|
def get_mimetype(filepath):
    """
    Determine a file's mimetype by calling the system's libmagic
    library via python-magic or fallback to use python's mimetypes
    library

    Returns a ``(mimetype, encoding)`` tuple.  With python-magic both
    values stay empty unicode strings when ``filepath`` does not exist;
    with the mimetypes fallback either value may be ``None`` when the
    file name extension is not recognized.
    """
    file_mimetype = u''
    file_mime_encoding = u''

    if USE_PYTHON_MAGIC:
        if os.path.exists(filepath):
            # Open in binary mode: the content being sniffed may be binary
            # (e.g. a PDF) and must reach libmagic unmodified.
            # open() is called outside the try block so that a failure to
            # open the file propagates as-is instead of being masked by a
            # NameError on an unbound file object in the cleanup code.
            source = open(filepath, 'rb')
            try:
                # Read once and reuse the buffer for both detections.
                content = source.read()
            finally:
                source.close()

            mime = magic.Magic(mime=True)
            file_mimetype = mime.from_buffer(content)
            mime_encoding = magic.Magic(mime_encoding=True)
            file_mime_encoding = mime_encoding.from_buffer(content)
    else:
        # Only the file name is inspected here; the file itself is not
        # opened and does not need to exist.
        filename = os.path.basename(filepath)
        file_mimetype, file_mime_encoding = mimetypes.guess_type(filename)

    return file_mimetype, file_mime_encoding
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ from common import TEMPORARY_DIRECTORY
|
|||||||
from documents.utils import document_save_to_temp_dir
|
from documents.utils import document_save_to_temp_dir
|
||||||
|
|
||||||
from converter.conf.settings import UNOCONV_PATH
|
from converter.conf.settings import UNOCONV_PATH
|
||||||
from converter.exceptions import UnpaperError, OfficeConversionError
|
from converter.exceptions import OfficeConversionError
|
||||||
from converter.literals import DEFAULT_PAGE_NUMBER, \
|
from converter.literals import DEFAULT_PAGE_NUMBER, \
|
||||||
QUALITY_DEFAULT, DEFAULT_ZOOM_LEVEL, \
|
QUALITY_DEFAULT, DEFAULT_ZOOM_LEVEL, \
|
||||||
DEFAULT_ROTATION, DEFAULT_FILE_FORMAT, QUALITY_HIGH
|
DEFAULT_ROTATION, DEFAULT_FILE_FORMAT, QUALITY_HIGH
|
||||||
@@ -17,6 +17,7 @@ from converter.literals import TRANSFORMATION_RESIZE, \
|
|||||||
TRANSFORMATION_ROTATE, TRANSFORMATION_DENSITY, \
|
TRANSFORMATION_ROTATE, TRANSFORMATION_DENSITY, \
|
||||||
TRANSFORMATION_ZOOM
|
TRANSFORMATION_ZOOM
|
||||||
from converter.literals import DIMENSION_SEPARATOR
|
from converter.literals import DIMENSION_SEPARATOR
|
||||||
|
from converter.utils import cleanup
|
||||||
|
|
||||||
HASH_FUNCTION = lambda x: hashlib.sha256(x).hexdigest()
|
HASH_FUNCTION = lambda x: hashlib.sha256(x).hexdigest()
|
||||||
|
|
||||||
@@ -24,15 +25,6 @@ CONVERTER_OFFICE_FILE_EXTENSIONS = [
|
|||||||
u'ods', u'docx', u'doc'
|
u'ods', u'docx', u'doc'
|
||||||
]
|
]
|
||||||
|
|
||||||
def cleanup(filename):
|
|
||||||
"""
|
|
||||||
Tries to remove the given filename. Ignores non-existent files
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
os.remove(filename)
|
|
||||||
except OSError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
def execute_unoconv(input_filepath, arguments=''):
|
def execute_unoconv(input_filepath, arguments=''):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -1,8 +1,14 @@
|
|||||||
|
import tempfile
|
||||||
|
import os
|
||||||
|
|
||||||
import slate
|
import slate
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
import ghostscript
|
||||||
|
|
||||||
from django.utils.translation import ugettext_lazy as _
|
from django.utils.translation import ugettext_lazy as _
|
||||||
|
|
||||||
|
from common.utils import get_mimetype
|
||||||
|
|
||||||
from converter.literals import QUALITY_DEFAULT, QUALITY_SETTINGS
|
from converter.literals import QUALITY_DEFAULT, QUALITY_SETTINGS
|
||||||
from converter.exceptions import ConvertError, UnknownFormat, IdentifyError
|
from converter.exceptions import ConvertError, UnknownFormat, IdentifyError
|
||||||
from converter.backends import ConverterBase
|
from converter.backends import ConverterBase
|
||||||
@@ -10,7 +16,7 @@ from converter.literals import TRANSFORMATION_RESIZE, \
|
|||||||
TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM
|
TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM
|
||||||
from converter.literals import QUALITY_DEFAULT, DEFAULT_PAGE_NUMBER, \
|
from converter.literals import QUALITY_DEFAULT, DEFAULT_PAGE_NUMBER, \
|
||||||
DEFAULT_FILE_FORMAT
|
DEFAULT_FILE_FORMAT
|
||||||
from converter.utils import get_mimetype
|
from converter.utils import cleanup
|
||||||
|
|
||||||
|
|
||||||
class ConverterClass(ConverterBase):
|
class ConverterClass(ConverterBase):
|
||||||
@@ -43,10 +49,44 @@ class ConverterClass(ConverterBase):
|
|||||||
return page_count
|
return page_count
|
||||||
|
|
||||||
def convert_file(self, input_filepath, output_filepath, transformations=None, quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT):
|
def convert_file(self, input_filepath, output_filepath, transformations=None, quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT):
|
||||||
|
tmpfile = None
|
||||||
|
mimetype, encoding = get_mimetype(input_filepath)
|
||||||
|
if mimetype == 'application/pdf':
|
||||||
|
# If file is a PDF open it with ghostscript and convert it to
|
||||||
|
# JPEG (the ghostscript output device selected below is "jpeg")
|
||||||
|
first_page_tmpl = '-dFirstPage=%d' % page
|
||||||
|
last_page_tmpl = '-dLastPage=%d' % page
|
||||||
|
fd, tmpfile = tempfile.mkstemp()
|
||||||
|
os.close(fd)
|
||||||
|
output_file_tmpl = '-sOutputFile=%s' % tmpfile
|
||||||
|
input_file_tmpl = '-f%s' % input_filepath
|
||||||
|
args = [
|
||||||
|
'gs', '-q', '-dQUIET', '-dSAFER', '-dBATCH',
|
||||||
|
'-dNOPAUSE', '-dNOPROMPT',
|
||||||
|
first_page_tmpl, last_page_tmpl,
|
||||||
|
'-sDEVICE=jpeg', '-dJPEGQ=75',
|
||||||
|
'-r300', output_file_tmpl,
|
||||||
|
input_file_tmpl,
|
||||||
|
'-c "60000000 setvmthreshold"', # use 30MB
|
||||||
|
'-dNOGC', # No garbage collection
|
||||||
|
'-dMaxBitmap=500000000',
|
||||||
|
'-dAlignToPixels=0',
|
||||||
|
'-dGridFitTT=0',
|
||||||
|
'-dTextAlphaBits=4',
|
||||||
|
'-dGraphicsAlphaBits=4',
|
||||||
|
]
|
||||||
|
|
||||||
|
ghostscript.Ghostscript(*args)
|
||||||
|
page = 1 # Don't execute the following while loop
|
||||||
|
input_filepath = tmpfile
|
||||||
|
|
||||||
try:
|
try:
|
||||||
im = Image.open(input_filepath)
|
im = Image.open(input_filepath)
|
||||||
except Exception: # Python Imaging Library doesn't recognize it as an image
|
except Exception: # Python Imaging Library doesn't recognize it as an image
|
||||||
raise UnknownFormat
|
raise UnknownFormat
|
||||||
|
finally:
|
||||||
|
if tmpfile:
|
||||||
|
cleanup(tmpfile)
|
||||||
|
|
||||||
current_page = 0
|
current_page = 0
|
||||||
try:
|
try:
|
||||||
@@ -58,12 +98,12 @@ class ConverterClass(ConverterBase):
|
|||||||
pass # end of sequence
|
pass # end of sequence
|
||||||
|
|
||||||
if transformations:
|
if transformations:
|
||||||
|
aspect = 1.0 * im.size[0] / im.size[1]
|
||||||
for transformation in transformations:
|
for transformation in transformations:
|
||||||
aspect = 1.0 * im.size[1] / im.size[0]
|
|
||||||
if transformation['transformation'] == TRANSFORMATION_RESIZE:
|
if transformation['transformation'] == TRANSFORMATION_RESIZE:
|
||||||
width = int(transformation['arguments']['width'])
|
width = int(transformation['arguments']['width'])
|
||||||
height = int(transformation['arguments'].get('height', 1.0 * width * aspect))
|
height = int(transformation['arguments'].get('height', 1.0 * width * aspect))
|
||||||
im = im.resize((width, height), Image.ANTIALIAS)
|
im = self.resize(im, (width, height))
|
||||||
elif transformation['transformation'] == TRANSFORMATION_ZOOM:
|
elif transformation['transformation'] == TRANSFORMATION_ZOOM:
|
||||||
decimal_value = float(transformation['arguments']['percent']) / 100
|
decimal_value = float(transformation['arguments']['percent']) / 100
|
||||||
im = im.transform((im.size[0] * decimal_value, im.size[1] * decimal_value), Image.EXTENT, (0, 0, im.size[0], im.size[1]))
|
im = im.transform((im.size[0] * decimal_value, im.size[1] * decimal_value), Image.EXTENT, (0, 0, im.size[0], im.size[1]))
|
||||||
@@ -73,6 +113,7 @@ class ConverterClass(ConverterBase):
|
|||||||
|
|
||||||
if im.mode not in ('L', 'RGB'):
|
if im.mode not in ('L', 'RGB'):
|
||||||
im = im.convert('RGB')
|
im = im.convert('RGB')
|
||||||
|
|
||||||
im.save(output_filepath, format=file_format)
|
im.save(output_filepath, format=file_format)
|
||||||
|
|
||||||
def get_format_list(self):
|
def get_format_list(self):
|
||||||
@@ -91,3 +132,41 @@ class ConverterClass(ConverterBase):
|
|||||||
TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, \
|
TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, \
|
||||||
TRANSFORMATION_ZOOM
|
TRANSFORMATION_ZOOM
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# From: http://united-coders.com/christian-harms/image-resizing-tips-general-and-for-python
|
||||||
|
def resize(self, img, box, fit=False, out=None):
    '''Downsample the image.
    @param img: Image - an Image-object; it is modified in place by the
        thumbnail() calls below
    @param box: tuple(x, y) - the bounding box of the result image
    @param fit: boolean - crop the image to fill the box
    @param out: file-like-object - save the image into the output stream
    @return: the resized Image when no ``out`` is given, otherwise None
        (the result is written to ``out`` as JPEG instead)
    '''
    # Pre-shrink by a power of two with the fast NEAREST filter, but only
    # while BOTH dimensions stay more than twice as large as the target
    # box, so the high quality pass below always has pixels to work with.
    # Floor division keeps the sizes handed to thumbnail() integral.
    factor = 1
    while (img.size[0] // factor > 2 * box[0] and
            img.size[1] // factor > 2 * box[1]):
        factor *= 2
    if factor > 1:
        img.thumbnail(
            (img.size[0] // factor, img.size[1] // factor), Image.NEAREST
        )

    # Calculate the cropping box and get the cropped part
    if fit:
        x1 = y1 = 0
        x2, y2 = img.size
        wRatio = 1.0 * x2 / box[0]
        hRatio = 1.0 * y2 / box[1]
        if hRatio > wRatio:
            # Image is too tall for the box: trim top and bottom equally
            center = y2 / 2.0
            half_span = box[1] * wRatio / 2
            y1 = int(round(center - half_span))
            y2 = int(round(center + half_span))
        else:
            # Image is too wide for the box: trim left and right equally
            center = x2 / 2.0
            half_span = box[0] * hRatio / 2
            x1 = int(round(center - half_span))
            x2 = int(round(center + half_span))
        # Crop coordinates are pixel positions, so pass integers
        img = img.crop((x1, y1, x2, y2))

    # Resize the image with best quality algorithm ANTI-ALIAS
    img.thumbnail(box, Image.ANTIALIAS)

    if out:
        # Save it into a file-like object
        img.save(out, "JPEG", quality=75)
    else:
        return img
|
||||||
|
|||||||
@@ -13,13 +13,6 @@ class UnknownFormat(ConvertError):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class UnpaperError(ConvertError):
|
|
||||||
"""
|
|
||||||
Raised by unpaper
|
|
||||||
"""
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class IdentifyError(ConvertError):
|
class IdentifyError(ConvertError):
|
||||||
"""
|
"""
|
||||||
Raised by identify
|
Raised by identify
|
||||||
|
|||||||
@@ -2,14 +2,6 @@ import os
|
|||||||
|
|
||||||
from django.core.exceptions import ImproperlyConfigured
|
from django.core.exceptions import ImproperlyConfigured
|
||||||
from django.utils.importlib import import_module
|
from django.utils.importlib import import_module
|
||||||
|
|
||||||
try:
|
|
||||||
from python_magic import magic
|
|
||||||
USE_PYTHON_MAGIC = True
|
|
||||||
except:
|
|
||||||
import mimetypes
|
|
||||||
mimetypes.init()
|
|
||||||
USE_PYTHON_MAGIC = False
|
|
||||||
|
|
||||||
|
|
||||||
#http://stackoverflow.com/questions/123198/how-do-i-copy-a-file-in-python
|
#http://stackoverflow.com/questions/123198/how-do-i-copy-a-file-in-python
|
||||||
@@ -82,30 +74,11 @@ def load_backend():
|
|||||||
raise # If there's some other error, this must be an error in Mayan itself.
|
raise # If there's some other error, this must be an error in Mayan itself.
|
||||||
|
|
||||||
|
|
||||||
def get_mimetype(filepath):
|
def cleanup(filename):
|
||||||
"""
|
"""
|
||||||
Determine a file's mimetype by calling the system's libmagic
|
Tries to remove the given filename. Ignores non-existent files
|
||||||
library via python-magic or fallback to use python's mimetypes
|
|
||||||
library
|
|
||||||
"""
|
"""
|
||||||
file_mimetype = u''
|
try:
|
||||||
file_mime_encoding = u''
|
os.remove(filename)
|
||||||
|
except OSError:
|
||||||
if USE_PYTHON_MAGIC:
|
pass
|
||||||
if os.path.exists(filepath):
|
|
||||||
try:
|
|
||||||
source = open(filepath, 'r')
|
|
||||||
mime = magic.Magic(mime=True)
|
|
||||||
file_mimetype = mime.from_buffer(source.read())
|
|
||||||
source.seek(0)
|
|
||||||
mime_encoding = magic.Magic(mime_encoding=True)
|
|
||||||
file_mime_encoding = mime_encoding.from_buffer(source.read())
|
|
||||||
finally:
|
|
||||||
if source:
|
|
||||||
source.close()
|
|
||||||
else:
|
|
||||||
path, filename = os.path.split(filepath)
|
|
||||||
file_mimetype, file_mime_encoding = mimetypes.guess_type(filename)
|
|
||||||
|
|
||||||
return file_mimetype, file_mime_encoding
|
|
||||||
|
|
||||||
|
|||||||
@@ -4,3 +4,10 @@ class AlreadyQueued(Exception):
|
|||||||
|
|
||||||
class TesseractError(Exception):
|
class TesseractError(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class UnpaperError(Exception):
|
||||||
|
"""
|
||||||
|
Raised by unpaper
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|||||||
@@ -20,5 +20,6 @@ QUEUEDOCUMENT_STATE_CHOICES = (
|
|||||||
(QUEUEDOCUMENT_STATE_ERROR, _(u'error')),
|
(QUEUEDOCUMENT_STATE_ERROR, _(u'error')),
|
||||||
)
|
)
|
||||||
|
|
||||||
DEFAULT_OCR_FILE_FORMAT = u'tif'
|
DEFAULT_OCR_FILE_FORMAT = u'tiff'
|
||||||
UNPAPER_FILE_FORMAT = u'pnm'
|
DEFAULT_OCR_FILE_EXTENSION = u'tif'
|
||||||
|
UNPAPER_FILE_FORMAT = u'ppm'
|
||||||
|
|||||||
Reference in New Issue
Block a user