Issue #23, Experiment with pdftoppm

2014-11-18 00:35:56 -04:00
parent 8ff61778f6
commit ed11e242ca
1 changed files with 23 additions and 36 deletions
--- a/mayan/apps/converter/backends/python.py
+++ b/mayan/apps/converter/backends/python.py
@@ -1,16 +1,24 @@
 from __future__ import absolute_import

+import logging
 import os
 import tempfile

+try:
+    from cStringIO import StringIO
+except ImportError:
+    from StringIO import StringIO
+
 import slate
 from PIL import Image
+import sh

 try:
-    import ghostscript
-    USE_GHOSTSCRIPT = True
-except RuntimeError:
-    USE_GHOSTSCRIPT = False
+    pdftoppm = sh.Command('/usr/bin/pdftoppm')
+except sh.CommandNotFound:
+    pdftoppm = None
+else:
+    pdftoppm = pdftoppm.bake('-png')

 from common.utils import fs_cleanup
 from mimetype.api import get_mimetype
@@ -22,6 +30,7 @@ from ..literals import (DEFAULT_FILE_FORMAT, DEFAULT_PAGE_NUMBER,
                        TRANSFORMATION_ZOOM)

 Image.init()
+logger = logging.getLogger(__name__)


 class Python(ConverterBase):
@@ -46,7 +55,7 @@ class Python(ConverterBase):
            raise UnknownFileFormat

        try:
-            while 1:
+            while True:
                im.seek(im.tell() + 1)
                page_count += 1
                # do something to im
@@ -61,38 +70,16 @@ class Python(ConverterBase):
        if not mimetype:
            mimetype, encoding = get_mimetype(open(input_filepath, 'rb'), input_filepath, mimetype_only=True)

-        if mimetype == 'application/pdf' and USE_GHOSTSCRIPT:
-            # If file is a PDF open it with ghostscript and convert it to
-            # TIFF
-            first_page_tmpl = '-dFirstPage=%d' % page
-            last_page_tmpl = '-dLastPage=%d' % page
-            fd, tmpfile = tempfile.mkstemp()
-            os.close(fd)
-            output_file_tmpl = '-sOutputFile=%s' % tmpfile
-            input_file_tmpl = '-f%s' % input_filepath
-            args = [
-                'gs', '-q', '-dQUIET', '-dSAFER', '-dBATCH',
-                '-dNOPAUSE', '-dNOPROMPT',
-                first_page_tmpl, last_page_tmpl,
-                '-sDEVICE=jpeg', '-dJPEGQ=95',
-                '-r150', output_file_tmpl,
-                input_file_tmpl,
-                '-c "60000000 setvmthreshold"',  # use 30MB
-                '-dNOGC',  # No garbage collection
-                '-dMaxBitmap=500000000',
-                '-dAlignToPixels=0',
-                '-dGridFitTT=0',
-                '-dTextAlphaBits=4',
-                '-dGraphicsAlphaBits=4',
-            ]
-
-            ghostscript.Ghostscript(*args)
-            page = 1  # Don't execute the following while loop
-            input_filepath = tmpfile
-
        try:
-            im = Image.open(input_filepath)
-        except Exception:
+            if mimetype == 'application/pdf' and pdftoppm:
+                image_buffer = StringIO()
+                pdftoppm(input_filepath, f=page, l=page, _out=image_buffer)
+                image_buffer.seek(0)
+                im = Image.open(image_buffer)
+            else:
+                im = Image.open(input_filepath)
+        except Exception as exception:
+            logger.error('Error converting image; %s', exception)
            # Python Imaging Library doesn't recognize it as an image
            raise UnknownFileFormat
        finally: