try:
    from python_magic import magic
    USE_PYTHON_MAGIC = True
except Exception:
    # Any failure to load python-magic (package missing, libmagic not
    # loadable, ...) selects the extension based fallback.  Unlike a bare
    # ``except:`` this still lets KeyboardInterrupt/SystemExit propagate.
    import mimetypes
    mimetypes.init()
    USE_PYTHON_MAGIC = False


def get_mimetype(filepath):
    """
    Determine a file's mimetype by calling the system's libmagic
    library via python-magic or fallback to use python's mimetypes
    library

    Returns a ``(mimetype, encoding)`` tuple.

    When python-magic is in use the file content is inspected; if
    ``filepath`` does not exist both values are empty strings.  When the
    ``mimetypes`` fallback is in use only the file name is looked at (the
    file is never opened) and either value may be ``None`` if it cannot
    be guessed.
    """
    file_mimetype = u''
    file_mime_encoding = u''

    if USE_PYTHON_MAGIC:
        if os.path.exists(filepath):
            # Binary mode: libmagic needs the raw bytes, text mode may
            # translate or try to decode them.  The ``with`` block closes
            # the file even on errors and, unlike the previous
            # ``finally: if source:`` construct, cannot fail with a
            # NameError when ``open()`` itself raises.
            with open(filepath, 'rb') as source:
                content = source.read()

            # The content is read once and the same buffer is reused for
            # both the mimetype and the encoding detection.
            file_mimetype = magic.Magic(mime=True).from_buffer(content)
            file_mime_encoding = magic.Magic(
                mime_encoding=True
            ).from_buffer(content)
    else:
        # Only the last path component matters for extension guessing
        filename = os.path.basename(filepath)
        file_mimetype, file_mime_encoding = mimetypes.guess_type(filename)

    return file_mimetype, file_mime_encoding
TRANSFORMATION_ROTATE, TRANSFORMATION_DENSITY, \ TRANSFORMATION_ZOOM from converter.literals import DIMENSION_SEPARATOR +from converter.utils import cleanup HASH_FUNCTION = lambda x: hashlib.sha256(x).hexdigest() @@ -24,15 +25,6 @@ CONVERTER_OFFICE_FILE_EXTENSIONS = [ u'ods', u'docx', u'doc' ] -def cleanup(filename): - """ - Tries to remove the given filename. Ignores non-existent files - """ - try: - os.remove(filename) - except OSError: - pass - def execute_unoconv(input_filepath, arguments=''): """ diff --git a/apps/converter/backends/python/base.py b/apps/converter/backends/python/base.py index 25448346ff..4535bd1369 100644 --- a/apps/converter/backends/python/base.py +++ b/apps/converter/backends/python/base.py @@ -1,8 +1,14 @@ +import tempfile +import os + import slate from PIL import Image +import ghostscript from django.utils.translation import ugettext_lazy as _ +from common.utils import get_mimetype + from converter.literals import QUALITY_DEFAULT, QUALITY_SETTINGS from converter.exceptions import ConvertError, UnknownFormat, IdentifyError from converter.backends import ConverterBase @@ -10,7 +16,7 @@ from converter.literals import TRANSFORMATION_RESIZE, \ TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM from converter.literals import QUALITY_DEFAULT, DEFAULT_PAGE_NUMBER, \ DEFAULT_FILE_FORMAT -from converter.utils import get_mimetype +from converter.utils import cleanup class ConverterClass(ConverterBase): @@ -43,10 +49,44 @@ class ConverterClass(ConverterBase): return page_count def convert_file(self, input_filepath, output_filepath, transformations=None, quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT): + tmpfile = None + mimetype, encoding = get_mimetype(input_filepath) + if mimetype == 'application/pdf': + # If file is a PDF open it with ghostscript and convert it to + # TIFF + first_page_tmpl = '-dFirstPage=%d' % page + last_page_tmpl = '-dLastPage=%d' % page + fd, tmpfile = tempfile.mkstemp() + os.close(fd) + 
# Adapted from: http://united-coders.com/christian-harms/image-resizing-tips-general-and-for-python
def resize(self, img, box, fit=False, out=None):
    '''Downsample an image so that it fits inside a bounding box.

    The image is first shrunk by a power of two with the fast NEAREST
    filter, optionally cropped around its center to the proportions of
    the box, and finally reduced with the ANTIALIAS filter.

    @param img: Image - an Image-object; thumbnail() is called on it, so
        unless the crop step replaces it the object passed in is the one
        that ends up modified
    @param box: tuple(x, y) - the bounding box of the result image
    @param fit: boolean - crop the image to fill the box
    @param out: file-like-object - save the image into the output stream
    @return: the resulting Image when ``out`` is not given, otherwise
        None (the result is written to ``out`` as JPEG, quality 75)
    '''
    # Coarse pass: keep doubling the shrink factor while the image is
    # still more than twice as large as requested
    shrink = 1
    while img.size[0]/shrink > 2*box[0] and img.size[1]*2/shrink > 2*box[1]:
        shrink *= 2
    if shrink > 1:
        coarse_size = (img.size[0]/shrink, img.size[1]/shrink)
        img.thumbnail(coarse_size, Image.NEAREST)

    # Optional centered crop so the remaining area has the box proportions
    if fit:
        left = top = 0
        right, bottom = img.size
        width_ratio = 1.0 * right/box[0]
        height_ratio = 1.0 * bottom/box[1]
        if height_ratio > width_ratio:
            # Too tall for the box: trim the top and the bottom
            middle = bottom/2
            half_span = box[1]*width_ratio/2
            top = middle-half_span
            bottom = middle+half_span
        else:
            # Too wide for the box: trim the left and the right
            middle = right/2
            half_span = box[0]*height_ratio/2
            left = middle-half_span
            right = middle+half_span
        img = img.crop((left, top, right, bottom))

    # Quality pass with the anti-aliasing filter
    img.thumbnail(box, Image.ANTIALIAS)

    if not out:
        return img

    # An output stream was supplied: write the result there instead of
    # handing it back to the caller
    img.save(out, "JPEG", quality=75)
def cleanup(filename):
    """
    Best effort removal of the file named by ``filename``.

    Every OSError raised by the removal (for instance because the file
    does not exist) is silently discarded.  Always returns None.
    """
    try:
        os.unlink(filename)
    except OSError:
        # Nothing the caller could do about it; stay quiet
        return None