Added PDF file support to the python converter backend via ghostscript

This commit is contained in:
Roberto Rosario
2011-07-19 20:55:08 -04:00
parent 57fed7608a
commit 8a017e2af0
7 changed files with 137 additions and 55 deletions

View File

@@ -12,6 +12,15 @@ from django.contrib.contenttypes.models import ContentType
from django.contrib.auth.models import User from django.contrib.auth.models import User
# python-magic (libmagic bindings) is optional.  When it cannot be loaded
# fall back to the standard library's filename based ``mimetypes`` module.
# Only import/library-loading failures trigger the fallback; anything else
# (KeyboardInterrupt, SystemExit, a SyntaxError in the package, ...) is
# allowed to propagate instead of being silently swallowed.
try:
    from python_magic import magic
    USE_PYTHON_MAGIC = True
except (ImportError, OSError):
    import mimetypes
    mimetypes.init()
    USE_PYTHON_MAGIC = False
def urlquote(link=None, get=None): def urlquote(link=None, get=None):
u''' u'''
This method does both: urlquote() and urlencode() This method does both: urlquote() and urlencode()
@@ -337,3 +346,31 @@ def return_diff(old_obj, new_obj, attrib_list=None):
} }
return diff_dict return diff_dict
def get_mimetype(filepath):
    """
    Determine a file's mimetype by calling the system's libmagic
    library via python-magic or fallback to use python's mimetypes
    library

    Returns a (mimetype, encoding) tuple.  When python-magic is in use
    and filepath does not exist, both values are empty unicode strings.
    In the mimetypes fallback the values are whatever
    mimetypes.guess_type() returns for the file name.
    """
    file_mimetype = u''
    file_mime_encoding = u''

    if USE_PYTHON_MAGIC:
        if os.path.exists(filepath):
            # Open outside the try block: if open() itself fails there is
            # no handle to close and the real error propagates, instead of
            # a NameError raised by the cleanup code.  Binary mode hands
            # the raw bytes to libmagic without newline translation.
            source = open(filepath, 'rb')
            try:
                # Read once and reuse the buffer for both detections
                data = source.read()
            finally:
                source.close()

            file_mimetype = magic.Magic(mime=True).from_buffer(data)
            file_mime_encoding = magic.Magic(mime_encoding=True).from_buffer(data)
    else:
        # Only the file name is needed for the extension based guess
        filename = os.path.basename(filepath)
        file_mimetype, file_mime_encoding = mimetypes.guess_type(filename)

    return file_mimetype, file_mime_encoding

View File

@@ -6,7 +6,7 @@ from common import TEMPORARY_DIRECTORY
from documents.utils import document_save_to_temp_dir from documents.utils import document_save_to_temp_dir
from converter.conf.settings import UNOCONV_PATH from converter.conf.settings import UNOCONV_PATH
from converter.exceptions import UnpaperError, OfficeConversionError from converter.exceptions import OfficeConversionError
from converter.literals import DEFAULT_PAGE_NUMBER, \ from converter.literals import DEFAULT_PAGE_NUMBER, \
QUALITY_DEFAULT, DEFAULT_ZOOM_LEVEL, \ QUALITY_DEFAULT, DEFAULT_ZOOM_LEVEL, \
DEFAULT_ROTATION, DEFAULT_FILE_FORMAT, QUALITY_HIGH DEFAULT_ROTATION, DEFAULT_FILE_FORMAT, QUALITY_HIGH
@@ -17,6 +17,7 @@ from converter.literals import TRANSFORMATION_RESIZE, \
TRANSFORMATION_ROTATE, TRANSFORMATION_DENSITY, \ TRANSFORMATION_ROTATE, TRANSFORMATION_DENSITY, \
TRANSFORMATION_ZOOM TRANSFORMATION_ZOOM
from converter.literals import DIMENSION_SEPARATOR from converter.literals import DIMENSION_SEPARATOR
from converter.utils import cleanup
HASH_FUNCTION = lambda x: hashlib.sha256(x).hexdigest() HASH_FUNCTION = lambda x: hashlib.sha256(x).hexdigest()
@@ -24,15 +25,6 @@ CONVERTER_OFFICE_FILE_EXTENSIONS = [
u'ods', u'docx', u'doc' u'ods', u'docx', u'doc'
] ]
def cleanup(filename):
    """
    Best-effort removal of the given filename; an OSError raised by the
    removal (for instance when the file does not exist) is swallowed
    """
    try:
        os.unlink(filename)
    except OSError:
        return None
def execute_unoconv(input_filepath, arguments=''): def execute_unoconv(input_filepath, arguments=''):
""" """

View File

@@ -1,8 +1,14 @@
import tempfile
import os
import slate import slate
from PIL import Image from PIL import Image
import ghostscript
from django.utils.translation import ugettext_lazy as _ from django.utils.translation import ugettext_lazy as _
from common.utils import get_mimetype
from converter.literals import QUALITY_DEFAULT, QUALITY_SETTINGS from converter.literals import QUALITY_DEFAULT, QUALITY_SETTINGS
from converter.exceptions import ConvertError, UnknownFormat, IdentifyError from converter.exceptions import ConvertError, UnknownFormat, IdentifyError
from converter.backends import ConverterBase from converter.backends import ConverterBase
@@ -10,7 +16,7 @@ from converter.literals import TRANSFORMATION_RESIZE, \
TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM
from converter.literals import QUALITY_DEFAULT, DEFAULT_PAGE_NUMBER, \ from converter.literals import QUALITY_DEFAULT, DEFAULT_PAGE_NUMBER, \
DEFAULT_FILE_FORMAT DEFAULT_FILE_FORMAT
from converter.utils import get_mimetype from converter.utils import cleanup
class ConverterClass(ConverterBase): class ConverterClass(ConverterBase):
@@ -43,10 +49,44 @@ class ConverterClass(ConverterBase):
return page_count return page_count
def convert_file(self, input_filepath, output_filepath, transformations=None, quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT): def convert_file(self, input_filepath, output_filepath, transformations=None, quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT):
tmpfile = None
mimetype, encoding = get_mimetype(input_filepath)
if mimetype == 'application/pdf':
# If file is a PDF open it with ghostscript and convert it to
# TIFF
first_page_tmpl = '-dFirstPage=%d' % page
last_page_tmpl = '-dLastPage=%d' % page
fd, tmpfile = tempfile.mkstemp()
os.close(fd)
output_file_tmpl = '-sOutputFile=%s' % tmpfile
input_file_tmpl = '-f%s' % input_filepath
args = [
'gs', '-q', '-dQUIET', '-dSAFER', '-dBATCH',
'-dNOPAUSE', '-dNOPROMPT',
first_page_tmpl, last_page_tmpl,
'-sDEVICE=jpeg', '-dJPEGQ=75',
'-r300', output_file_tmpl,
input_file_tmpl,
'-c "60000000 setvmthreshold"', # use 30MB
'-dNOGC', # No garbage collection
'-dMaxBitmap=500000000',
'-dAlignToPixels=0',
'-dGridFitTT=0',
'-dTextAlphaBits=4',
'-dGraphicsAlphaBits=4',
]
ghostscript.Ghostscript(*args)
page = 1 # Don't execute the following while loop
input_filepath = tmpfile
try: try:
im = Image.open(input_filepath) im = Image.open(input_filepath)
except Exception: # Python Imaging Library doesn't recognize it as an image except Exception: # Python Imaging Library doesn't recognize it as an image
raise UnknownFormat raise UnknownFormat
finally:
if tmpfile:
cleanup(tmpfile)
current_page = 0 current_page = 0
try: try:
@@ -58,12 +98,12 @@ class ConverterClass(ConverterBase):
pass # end of sequence pass # end of sequence
if transformations: if transformations:
aspect = 1.0 * im.size[0] / im.size[1]
for transformation in transformations: for transformation in transformations:
aspect = 1.0 * im.size[1] / im.size[0]
if transformation['transformation'] == TRANSFORMATION_RESIZE: if transformation['transformation'] == TRANSFORMATION_RESIZE:
width = int(transformation['arguments']['width']) width = int(transformation['arguments']['width'])
height = int(transformation['arguments'].get('height', 1.0 * width * aspect)) height = int(transformation['arguments'].get('height', 1.0 * width * aspect))
im = im.resize((width, height), Image.ANTIALIAS) im = self.resize(im, (width, height))
elif transformation['transformation'] == TRANSFORMATION_ZOOM: elif transformation['transformation'] == TRANSFORMATION_ZOOM:
decimal_value = float(transformation['arguments']['percent']) / 100 decimal_value = float(transformation['arguments']['percent']) / 100
im = im.transform((im.size[0] * decimal_value, im.size[1] * decimal_value), Image.EXTENT, (0, 0, im.size[0], im.size[1])) im = im.transform((im.size[0] * decimal_value, im.size[1] * decimal_value), Image.EXTENT, (0, 0, im.size[0], im.size[1]))
@@ -73,6 +113,7 @@ class ConverterClass(ConverterBase):
if im.mode not in ('L', 'RGB'): if im.mode not in ('L', 'RGB'):
im = im.convert('RGB') im = im.convert('RGB')
im.save(output_filepath, format=file_format) im.save(output_filepath, format=file_format)
def get_format_list(self): def get_format_list(self):
@@ -91,3 +132,41 @@ class ConverterClass(ConverterBase):
TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, \ TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, \
TRANSFORMATION_ZOOM TRANSFORMATION_ZOOM
] ]
# From: http://united-coders.com/christian-harms/image-resizing-tips-general-and-for-python
def resize(self, img, box, fit=False, out=None):
    '''Downsample the image.

    @param img: Image - an Image-object (thumbnail() modifies it in place)
    @param box: tuple(x, y) - the bounding box of the result image
    @param fit: boolean - crop the image to fill the box
    @param out: file-like-object - save the image into the output stream
    @return: the resized Image when out is not given, otherwise None
    '''
    # Pre-shrink by a factor of 2, 4, 8... with the fast NEAREST filter,
    # but only while the image stays more than twice the target size on
    # BOTH axes, so the final high quality pass still has detail to work
    # with.  (The height test used to carry a stray "*2", which let the
    # fast pass shrink the height to less than the box height.)
    # Floor division keeps the integer arithmetic explicit.
    factor = 1
    while img.size[0] // factor > 2 * box[0] and img.size[1] // factor > 2 * box[1]:
        factor *= 2
    if factor > 1:
        img.thumbnail((img.size[0] // factor, img.size[1] // factor), Image.NEAREST)

    # Calculate the cropping box and get the cropped part
    if fit:
        x1 = y1 = 0
        x2, y2 = img.size
        wRatio = 1.0 * x2 / box[0]
        hRatio = 1.0 * y2 / box[1]
        if hRatio > wRatio:
            # Image is too tall for the box: keep a vertically centered band
            center = y2 // 2
            half = box[1] * wRatio / 2
            y1, y2 = center - half, center + half
        else:
            # Image is too wide for the box: keep a horizontally centered band
            center = x2 // 2
            half = box[0] * hRatio / 2
            x1, x2 = center - half, center + half
        # The band edges are floats; crop with whole pixel coordinates
        img = img.crop(tuple(int(round(value)) for value in (x1, y1, x2, y2)))

    # Resize the image with best quality algorithm ANTI-ALIAS
    img.thumbnail(box, Image.ANTIALIAS)
    if out:
        # Save it into a file-like object
        img.save(out, "JPEG", quality=75)
    else:
        return img

View File

@@ -13,13 +13,6 @@ class UnknownFormat(ConvertError):
pass pass
class UnpaperError(ConvertError):
    """
    Error raised by unpaper
    """
class IdentifyError(ConvertError): class IdentifyError(ConvertError):
""" """
Raised by identify Raised by identify

View File

@@ -2,14 +2,6 @@ import os
from django.core.exceptions import ImproperlyConfigured from django.core.exceptions import ImproperlyConfigured
from django.utils.importlib import import_module from django.utils.importlib import import_module
try:
from python_magic import magic
USE_PYTHON_MAGIC = True
except:
import mimetypes
mimetypes.init()
USE_PYTHON_MAGIC = False
#http://stackoverflow.com/questions/123198/how-do-i-copy-a-file-in-python #http://stackoverflow.com/questions/123198/how-do-i-copy-a-file-in-python
@@ -82,30 +74,11 @@ def load_backend():
raise # If there's some other error, this must be an error in Mayan itself. raise # If there's some other error, this must be an error in Mayan itself.
def get_mimetype(filepath): def cleanup(filename):
""" """
Determine a file's mimetype by calling the system's libmagic Tries to remove the given filename. Ignores non-existent files
library via python-magic or fallback to use python's mimetypes
library
""" """
file_mimetype = u'' try:
file_mime_encoding = u'' os.remove(filename)
except OSError:
if USE_PYTHON_MAGIC: pass
if os.path.exists(filepath):
try:
source = open(filepath, 'r')
mime = magic.Magic(mime=True)
file_mimetype = mime.from_buffer(source.read())
source.seek(0)
mime_encoding = magic.Magic(mime_encoding=True)
file_mime_encoding = mime_encoding.from_buffer(source.read())
finally:
if source:
source.close()
else:
path, filename = os.path.split(filepath)
file_mimetype, file_mime_encoding = mimetypes.guess_type(filename)
return file_mimetype, file_mime_encoding

View File

@@ -4,3 +4,10 @@ class AlreadyQueued(Exception):
class TesseractError(Exception): class TesseractError(Exception):
pass pass
class UnpaperError(Exception):
    """
    Error raised by unpaper
    """

View File

@@ -20,5 +20,6 @@ QUEUEDOCUMENT_STATE_CHOICES = (
(QUEUEDOCUMENT_STATE_ERROR, _(u'error')), (QUEUEDOCUMENT_STATE_ERROR, _(u'error')),
) )
DEFAULT_OCR_FILE_FORMAT = u'tif' DEFAULT_OCR_FILE_FORMAT = u'tiff'
UNPAPER_FILE_FORMAT = u'pnm' DEFAULT_OCR_FILE_EXTENSION = u'tif'
UNPAPER_FILE_FORMAT = u'ppm'