Added PDF file support to the python converter backend via ghostscript

2011-07-19 20:55:08 -04:00
parent 57fed7608a
commit 8a017e2af0
7 changed files with 137 additions and 55 deletions
--- a/apps/common/utils.py
+++ b/apps/common/utils.py
@@ -12,6 +12,15 @@ from django.contrib.contenttypes.models import ContentType
 from django.contrib.auth.models import User
 try:
    from python_magic import magic
    USE_PYTHON_MAGIC = True
 except (ImportError, OSError):
    # python-magic is optional.  ImportError covers a missing package and
    # OSError a libmagic shared library that cannot be loaded.  A bare
    # except here would also swallow KeyboardInterrupt and SystemExit.
    import mimetypes
    mimetypes.init()
    USE_PYTHON_MAGIC = False
 def urlquote(link=None, get=None):
    u'''
    This method does both: urlquote() and urlencode()
@@ -337,3 +346,31 @@ def return_diff(old_obj, new_obj, attrib_list=None):
            }
    return diff_dict
 def get_mimetype(filepath):
    """
    Determine a file's mimetype and encoding.

    When USE_PYTHON_MAGIC is set the file content is inspected by the
    system's libmagic library via python-magic; otherwise python's
    mimetypes library is used, which guesses from the filename alone.

    Returns a (mimetype, encoding) tuple.  On the python-magic path both
    values stay empty unicode strings if filepath does not exist.  On
    the mimetypes path the result of mimetypes.guess_type() is returned
    unchanged, so either value can be None when no guess is possible.
    """
    if not USE_PYTHON_MAGIC:
        return mimetypes.guess_type(os.path.basename(filepath))

    file_mimetype = u''
    file_mime_encoding = u''

    if os.path.exists(filepath):
        # Open before entering the try block: if open() itself fails, its
        # own error propagates instead of the cleanup code tripping over
        # a name that was never bound.  Binary mode hands the bytes to
        # libmagic untouched.
        source = open(filepath, 'rb')
        try:
            # Read once; both detectors inspect the same buffer.
            data = source.read()
        finally:
            source.close()

        file_mimetype = magic.Magic(mime=True).from_buffer(data)
        file_mime_encoding = magic.Magic(mime_encoding=True).from_buffer(data)

    return file_mimetype, file_mime_encoding
--- a/apps/converter/api.py
+++ b/apps/converter/api.py
@@ -6,7 +6,7 @@ from common import TEMPORARY_DIRECTORY
 from documents.utils import document_save_to_temp_dir
 from converter.conf.settings import UNOCONV_PATH
-from converter.exceptions import UnpaperError, OfficeConversionError
+from converter.exceptions import OfficeConversionError
 from converter.literals import DEFAULT_PAGE_NUMBER, \
    QUALITY_DEFAULT, DEFAULT_ZOOM_LEVEL, \
    DEFAULT_ROTATION, DEFAULT_FILE_FORMAT, QUALITY_HIGH
@@ -17,6 +17,7 @@ from converter.literals import TRANSFORMATION_RESIZE, \
    TRANSFORMATION_ROTATE, TRANSFORMATION_DENSITY, \
    TRANSFORMATION_ZOOM
 from converter.literals import DIMENSION_SEPARATOR    
 from converter.utils import cleanup
 def HASH_FUNCTION(x):
    """
    Return the SHA-256 digest of x as a hexadecimal string.
    """
    return hashlib.sha256(x).hexdigest()
@@ -24,15 +25,6 @@ CONVERTER_OFFICE_FILE_EXTENSIONS = [
    u'ods', u'docx', u'doc'
 ]
 def cleanup(filename):
    """
    Best-effort removal of the file at the given filename.

    Any OSError raised by the removal (for instance because the file
    does not exist) is discarded; nothing is returned.
    """
    try:
        os.unlink(filename)
    except OSError:
        return
 def execute_unoconv(input_filepath, arguments=''):
    """
--- a/apps/converter/backends/python/base.py
+++ b/apps/converter/backends/python/base.py
@@ -1,8 +1,14 @@
 import tempfile
 import os
 import slate
 from PIL import Image
 import ghostscript
 from django.utils.translation import ugettext_lazy as _
 from common.utils import get_mimetype
 from converter.literals import QUALITY_DEFAULT, QUALITY_SETTINGS
 from converter.exceptions import ConvertError, UnknownFormat, IdentifyError
 from converter.backends import ConverterBase
@@ -10,7 +16,7 @@ from converter.literals import TRANSFORMATION_RESIZE, \
    TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM
 from converter.literals import QUALITY_DEFAULT, DEFAULT_PAGE_NUMBER, \
    DEFAULT_FILE_FORMAT
-from converter.utils import get_mimetype
+from converter.utils import cleanup
 class ConverterClass(ConverterBase):
@@ -43,10 +49,44 @@ class ConverterClass(ConverterBase):
        return page_count
    def convert_file(self, input_filepath, output_filepath, transformations=None, quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT):
        tmpfile = None
        mimetype, encoding = get_mimetype(input_filepath)
        if mimetype == 'application/pdf':
            # If file is a PDF open it with ghostscript and convert it to
            # TIFF
            first_page_tmpl = '-dFirstPage=%d' % page
            last_page_tmpl = '-dLastPage=%d' % page
            fd, tmpfile = tempfile.mkstemp()
            os.close(fd)
            output_file_tmpl = '-sOutputFile=%s' % tmpfile
            input_file_tmpl = '-f%s' % input_filepath
            args = [
                'gs', '-q', '-dQUIET', '-dSAFER', '-dBATCH',
                '-dNOPAUSE', '-dNOPROMPT', 
                first_page_tmpl, last_page_tmpl,
                '-sDEVICE=jpeg', '-dJPEGQ=75',
                '-r300', output_file_tmpl,
                input_file_tmpl,
                '-c "60000000 setvmthreshold"',  # use 30MB
                '-dNOGC',  # No garbage collection
                '-dMaxBitmap=500000000',
                '-dAlignToPixels=0',
                '-dGridFitTT=0',
                '-dTextAlphaBits=4',
                '-dGraphicsAlphaBits=4',                
            ] 
            ghostscript.Ghostscript(*args)
            page = 1 # Don't execute the following while loop
            input_filepath = tmpfile    
        try:
            im = Image.open(input_filepath)
        except Exception: # Python Imaging Library doesn't recognize it as an image
            raise UnknownFormat
        finally:
            if tmpfile:
                cleanup(tmpfile)
        current_page = 0
        try:
@@ -58,12 +98,12 @@ class ConverterClass(ConverterBase):
            pass # end of sequence        
        if transformations:
            aspect = 1.0 * im.size[0] / im.size[1]
            for transformation in transformations:
                aspect = 1.0 * im.size[1] / im.size[0]
                if transformation['transformation'] == TRANSFORMATION_RESIZE:
                    width = int(transformation['arguments']['width'])
                    height = int(transformation['arguments'].get('height', 1.0 * width * aspect))
-                    im = im.resize((width, height), Image.ANTIALIAS)
+                    im = self.resize(im, (width, height))
                elif transformation['transformation'] == TRANSFORMATION_ZOOM:
                    decimal_value = float(transformation['arguments']['percent']) / 100
                    im = im.transform((im.size[0] * decimal_value, im.size[1] * decimal_value), Image.EXTENT, (0, 0, im.size[0], im.size[1])) 
@@ -73,6 +113,7 @@ class ConverterClass(ConverterBase):
        if im.mode not in ('L', 'RGB'):
            im = im.convert('RGB')
        im.save(output_filepath, format=file_format)
    def get_format_list(self):
@@ -91,3 +132,41 @@ class ConverterClass(ConverterBase):
            TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, \
            TRANSFORMATION_ZOOM
        ]
    # From: http://united-coders.com/christian-harms/image-resizing-tips-general-and-for-python
    def resize(self, img, box, fit=False, out=None):
        '''Downsample the image.

        Works in three steps: a fast NEAREST pre-shrink by a power of
        two, an optional centered crop to the aspect ratio of the box,
        and a final ANTIALIAS pass.  Both shrink steps call
        Image.thumbnail() on the image object they are given, so when
        no crop happens the caller's image is modified in place.

        @param img: Image -  an Image-object
        @param box: tuple(x, y) - the bounding box of the result image
        @param fit: boolean - crop the image to fill the box
        @param out: file-like-object - save the image into the output stream
        @return: the resized Image when out is not given; None when the
                 result was saved to out as JPEG instead
        '''
        # Pre-shrink with factor 2, 4, 8... and the fast algorithm.
        # Halve only while BOTH dimensions remain above twice the box.
        # A stray "*2" made the height test half as strict as the width
        # test, so the image could drop below the box height before the
        # crop and the quality pass, giving a too-small result with fit.
        factor = 1
        while (img.size[0] // factor > 2 * box[0] and
                img.size[1] // factor > 2 * box[1]):
            factor *= 2
        if factor > 1:
            # Floor division keeps the requested size integral.
            img.thumbnail(
                (img.size[0] // factor, img.size[1] // factor),
                Image.NEAREST
            )

        # Calculate the cropping box and get the cropped part
        if fit:
            x1 = y1 = 0
            x2, y2 = img.size
            w_ratio = 1.0 * x2 / box[0]
            h_ratio = 1.0 * y2 / box[1]
            if h_ratio > w_ratio:
                # Too tall for the box: keep the width, trim top/bottom
                # symmetrically around the vertical center.
                center = y2 // 2
                half_span = box[1] * w_ratio / 2
                y1 = center - half_span
                y2 = center + half_span
            else:
                # Too wide for the box: keep the height, trim left/right
                # symmetrically around the horizontal center.
                center = x2 // 2
                half_span = box[0] * h_ratio / 2
                x1 = center - half_span
                x2 = center + half_span
            img = img.crop((x1, y1, x2, y2))

        # Resize the image with best quality algorithm ANTI-ALIAS
        img.thumbnail(box, Image.ANTIALIAS)
        if out:
            # Save it into a file-like object
            img.save(out, "JPEG", quality=75)
        else:
            return img
--- a/apps/converter/exceptions.py
+++ b/apps/converter/exceptions.py
@@ -13,13 +13,6 @@ class UnknownFormat(ConvertError):
    pass
 class UnpaperError(ConvertError):
    """
    ConvertError subclass raised by unpaper
    """
 class IdentifyError(ConvertError):
    """
    Raised by identify
--- a/apps/converter/utils.py
+++ b/apps/converter/utils.py
@@ -2,14 +2,6 @@ import os
 from django.core.exceptions import ImproperlyConfigured
 from django.utils.importlib import import_module
 try:
    from python_magic import magic
    USE_PYTHON_MAGIC = True
 except (ImportError, OSError):
    # python-magic is optional.  ImportError covers a missing package and
    # OSError a libmagic shared library that cannot be loaded.  A bare
    # except here would also swallow KeyboardInterrupt and SystemExit.
    import mimetypes
    mimetypes.init()
    USE_PYTHON_MAGIC = False
 #http://stackoverflow.com/questions/123198/how-do-i-copy-a-file-in-python
@@ -82,30 +74,11 @@ def load_backend():
                raise # If there's some other error, this must be an error in Mayan itself.
-def get_mimetype(filepath):
+def cleanup(filename):
    """
-    Determine a file's mimetype by calling the system's libmagic
+    Tries to remove the given filename. Ignores non-existent files
    library via python-magic or fallback to use python's mimetypes
    library
    """
-    file_mimetype = u''
+    try:
-    file_mime_encoding = u''
+        os.remove(filename)
-    
+    except OSError:
-    if USE_PYTHON_MAGIC:
+        pass
        if os.path.exists(filepath):
            try:
                source = open(filepath, 'r')
                mime = magic.Magic(mime=True)
                file_mimetype = mime.from_buffer(source.read())
                source.seek(0)
                mime_encoding = magic.Magic(mime_encoding=True)
                file_mime_encoding = mime_encoding.from_buffer(source.read())
            finally:
                if source:
                    source.close()
    else:
        path, filename = os.path.split(filepath)
        file_mimetype, file_mime_encoding = mimetypes.guess_type(filename)
    return file_mimetype, file_mime_encoding
--- a/apps/ocr/exceptions.py
+++ b/apps/ocr/exceptions.py
@@ -4,3 +4,10 @@ class AlreadyQueued(Exception):
 class TesseractError(Exception):
    """
    Presumably raised when tesseract fails; the raise sites are not
    visible from here, so confirm against the callers
    """
 class UnpaperError(Exception):
    """
    Exception raised by unpaper
    """
--- a/apps/ocr/literals.py
+++ b/apps/ocr/literals.py
@@ -20,5 +20,6 @@ QUEUEDOCUMENT_STATE_CHOICES = (
    (QUEUEDOCUMENT_STATE_ERROR, _(u'error')),
 )
-DEFAULT_OCR_FILE_FORMAT = u'tif'
+DEFAULT_OCR_FILE_FORMAT = u'tiff'
-UNPAPER_FILE_FORMAT = u'pnm'
+DEFAULT_OCR_FILE_EXTENSION = u'tif'
 UNPAPER_FILE_FORMAT = u'ppm'