From 415f0c8daa5bb95eb39d1149b2258e443c3261b0 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Wed, 13 Jul 2011 22:53:33 -0400 Subject: [PATCH 01/14] Refactored the converter backend system --- apps/converter/__init__.py | 4 - apps/converter/api.py | 70 ++++----------- apps/converter/backends/__init__.py | 43 ++++++++++ apps/converter/backends/graphicsmagick.py | 71 ---------------- .../backends/graphicsmagick/__init__.py | 0 .../converter/backends/graphicsmagick/base.py | 85 +++++++++++++++++++ apps/converter/backends/imagemagick.py | 68 --------------- .../backends/imagemagick/__init__.py | 0 apps/converter/backends/imagemagick/base.py | 82 ++++++++++++++++++ apps/converter/literals.py | 22 +++++ apps/converter/utils.py | 55 +++++++++++- apps/documents/conf/settings.py | 6 -- apps/documents/models.py | 22 +---- apps/documents/urls.py | 4 +- apps/sources/models.py | 6 +- apps/sources/staging.py | 17 ++-- 16 files changed, 317 insertions(+), 238 deletions(-) delete mode 100644 apps/converter/backends/graphicsmagick.py create mode 100644 apps/converter/backends/graphicsmagick/__init__.py create mode 100644 apps/converter/backends/graphicsmagick/base.py delete mode 100644 apps/converter/backends/imagemagick.py create mode 100644 apps/converter/backends/imagemagick/__init__.py create mode 100644 apps/converter/backends/imagemagick/base.py create mode 100644 apps/converter/literals.py diff --git a/apps/converter/__init__.py b/apps/converter/__init__.py index 331738373a..6ab5029f01 100644 --- a/apps/converter/__init__.py +++ b/apps/converter/__init__.py @@ -2,10 +2,6 @@ from django.utils.translation import ugettext_lazy as _ from navigation.api import register_sidebar_template -TRANFORMATION_CHOICES = { - u'rotate': u'-rotate %(degrees)d' -} - formats_list = {'text': _('file formats'), 'view': 'formats_list', 'famfam': 'pictures'} register_sidebar_template(['formats_list'], 'converter_file_formats_help.html') diff --git a/apps/converter/api.py 
b/apps/converter/api.py index d7595de8c3..9de0ed737e 100644 --- a/apps/converter/api.py +++ b/apps/converter/api.py @@ -3,62 +3,28 @@ import subprocess from django.utils.importlib import import_module from django.template.defaultfilters import slugify - -from converter.conf.settings import UNPAPER_PATH -from converter.conf.settings import OCR_OPTIONS -from converter.conf.settings import DEFAULT_OPTIONS -from converter.conf.settings import LOW_QUALITY_OPTIONS -from converter.conf.settings import HIGH_QUALITY_OPTIONS -from converter.conf.settings import PRINT_QUALITY_OPTIONS -from converter.conf.settings import GRAPHICS_BACKEND -from converter.conf.settings import UNOCONV_PATH - -from converter.exceptions import UnpaperError, OfficeConversionError +from django.core.exceptions import ImproperlyConfigured from common import TEMPORARY_DIRECTORY from documents.utils import document_save_to_temp_dir -DEFAULT_ZOOM_LEVEL = 100 -DEFAULT_ROTATION = 0 -DEFAULT_PAGE_INDEX_NUMBER = 0 -DEFAULT_FILE_FORMAT = u'jpg' -DEFAULT_OCR_FILE_FORMAT = u'tif' - -QUALITY_DEFAULT = u'quality_default' -QUALITY_LOW = u'quality_low' -QUALITY_HIGH = u'quality_high' -QUALITY_PRINT = u'quality_print' - -QUALITY_SETTINGS = { - QUALITY_DEFAULT: DEFAULT_OPTIONS, - QUALITY_LOW: LOW_QUALITY_OPTIONS, - QUALITY_HIGH: HIGH_QUALITY_OPTIONS, - QUALITY_PRINT: PRINT_QUALITY_OPTIONS -} +from converter.conf.settings import UNPAPER_PATH +from converter.conf.settings import OCR_OPTIONS +from converter.conf.settings import UNOCONV_PATH +from converter.exceptions import UnpaperError, OfficeConversionError +from converter.utils import load_backend +from converter.literals import DEFAULT_PAGE_INDEX_NUMBER, \ + DEFAULT_OCR_FILE_FORMAT, QUALITY_DEFAULT, DEFAULT_ZOOM_LEVEL, \ + DEFAULT_ROTATION, DEFAULT_FILE_FORMAT, QUALITY_PRINT CONVERTER_OFFICE_FILE_EXTENSIONS = [ u'ods', u'docx', u'doc' ] - -def _lazy_load(fn): - _cached = [] - - def _decorated(): - if not _cached: - _cached.append(fn()) - return _cached[0] - return 
_decorated - - -@_lazy_load -def _get_backend(): - return import_module(GRAPHICS_BACKEND) - try: - backend = _get_backend() -except ImportError: - raise ImportError(u'Missing or incorrect converter backend: %s' % GRAPHICS_BACKEND) + backend = load_backend().ConverterClass() +except ImproperlyConfigured: + raise ImproperlyConfigured(u'Missing or incorrect converter backend: %s' % GRAPHICS_BACKEND) def cleanup(filename): @@ -173,7 +139,7 @@ def convert(input_filepath, *args, **kwargs): if format == u'jpg': extra_options += u' -quality 85' try: - backend.execute_convert(input_filepath=input_arg, arguments=extra_options, output_filepath=u'%s:%s' % (file_format, output_filepath), quality=quality) + backend.convert_file(input_filepath=input_arg, arguments=extra_options, output_filepath=u'%s:%s' % (file_format, output_filepath), quality=quality) finally: if cleanup_files: cleanup(input_filepath) @@ -185,7 +151,7 @@ def convert(input_filepath, *args, **kwargs): def get_page_count(input_filepath): try: - return len(backend.execute_identify(unicode(input_filepath)).splitlines()) + return len(backend.identify_file(unicode(input_filepath)).splitlines()) except: #TODO: send to other page number identifying program return 1 @@ -195,7 +161,7 @@ def get_document_dimensions(document, *args, **kwargs): document_filepath = create_image_cache_filename(document.checksum, *args, **kwargs) if os.path.exists(document_filepath): options = [u'-format', u'%w %h'] - return [int(dimension) for dimension in backend.execute_identify(unicode(document_filepath), options).split()] + return [int(dimension) for dimension in backend.identify_file(unicode(document_filepath), options).split()] else: return [0, 0] @@ -219,13 +185,13 @@ def convert_document_for_ocr(document, page=DEFAULT_PAGE_INDEX_NUMBER, file_form transformation_string, warnings = document_page.get_transformation_string() #Apply default transformations - backend.execute_convert(input_filepath=input_arg, quality=QUALITY_HIGH, 
arguments=transformation_string, output_filepath=transformation_output_file) + backend.convert_file(input_filepath=input_arg, quality=QUALITY_HIGH, arguments=transformation_string, output_filepath=transformation_output_file) #Do OCR operations - backend.execute_convert(input_filepath=transformation_output_file, arguments=OCR_OPTIONS, output_filepath=unpaper_input_file) + backend.convert_file(input_filepath=transformation_output_file, arguments=OCR_OPTIONS, output_filepath=unpaper_input_file) # Process by unpaper execute_unpaper(input_filepath=unpaper_input_file, output_filepath=unpaper_output_file) # Convert to tif - backend.execute_convert(input_filepath=unpaper_output_file, output_filepath=convert_output_file) + backend.convert_file(input_filepath=unpaper_output_file, output_filepath=convert_output_file) finally: cleanup(transformation_output_file) cleanup(unpaper_input_file) diff --git a/apps/converter/backends/__init__.py b/apps/converter/backends/__init__.py index e69de29bb2..1d81dd8149 100644 --- a/apps/converter/backends/__init__.py +++ b/apps/converter/backends/__init__.py @@ -0,0 +1,43 @@ +class ConverterBase(object): + """ + Base class that all backend classes must inherit + """ + + def identify_file(self, input_filepath, *args, **kwargs): + raise NotImplementedError("Your %s class has not defined a identify_file() method, which is required." % self.__class__.__name__) + + def identify_document(self, document, *args, **kwargs): + raise NotImplementedError("Your %s class has not defined a identify_document() method, which is required." % self.__class__.__name__) + + def convert_file(self, input_filepath, *args, **kwargs): + raise NotImplementedError("Your %s class has not defined a convert_file() method, which is required." % self.__class__.__name__) + + def convert_document(self, document, *args, **kwargs): + raise NotImplementedError("Your %s class has not defined a convert_document() method, which is required." 
% self.__class__.__name__) + + def get_format_list(self): + raise NotImplementedError("Your %s class has not defined a get_format_list() method, which is required." % self.__class__.__name__) + + def get_available_transformations(self): + raise NotImplementedError("Your %s class has not defined a get_available_transformations() method, which is required." % self.__class__.__name__) + + def get_available_transformations_labels(self): + return ([(name, data['label']) for name, data in self.get_available_transformations().items()]) + + def get_transformation_string(self, transformation_list): + transformations = [] + warnings = [] + transformation_choices = self.get_available_transformations() + for transformation in transformation_list: + try: + if transformation['transformation'] in transformation_choices: + transformations.append( + transformation_choices[transformation['transformation']]['command_line'] % eval( + transformation['arguments'] + ) + ) + except Exception, e: + warnings.append(e) + + return u' '.join(transformations), warnings + diff --git a/apps/converter/backends/graphicsmagick.py b/apps/converter/backends/graphicsmagick.py deleted file mode 100644 index 360a24a58b..0000000000 --- a/apps/converter/backends/graphicsmagick.py +++ /dev/null @@ -1,71 +0,0 @@ -import subprocess -import re - -from converter.conf.settings import GM_PATH -from converter.conf.settings import GM_SETTINGS -from converter.api import QUALITY_DEFAULT, QUALITY_SETTINGS -from converter.exceptions import ConvertError, UnknownFormat, IdentifyError - -CONVERTER_ERROR_STRING_NO_DECODER = u'No decode delegate for this image format' -CONVERTER_ERROR_STARTS_WITH = u'starts with' - - -def execute_identify(input_filepath, arguments=None): - command = [] - command.append(unicode(GM_PATH)) - command.append(u'identify') - if arguments: - command.extend(arguments) - command.append(unicode(input_filepath)) - proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, 
stdout=subprocess.PIPE) - return_code = proc.wait() - if return_code != 0: - raise IdentifyError(proc.stderr.readline()) - return proc.stdout.read() - - -def execute_convert(input_filepath, output_filepath, quality=QUALITY_DEFAULT, arguments=None): - command = [] - command.append(unicode(GM_PATH)) - command.append(u'convert') - command.extend(unicode(QUALITY_SETTINGS[quality]).split()) - command.extend(unicode(GM_SETTINGS).split()) - command.append(unicode(input_filepath)) - if arguments: - command.extend(unicode(arguments).split()) - command.append(unicode(output_filepath)) - proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) - return_code = proc.wait() - if return_code != 0: - #Got an error from convert program - error_line = proc.stderr.readline() - if (CONVERTER_ERROR_STRING_NO_DECODER in error_line) or (CONVERTER_ERROR_STARTS_WITH in error_line): - #Try to determine from error message which class of error is it - raise UnknownFormat - else: - raise ConvertError(error_line) - - -def get_format_list(): - """ - Call GraphicsMagick to parse all of it's supported file formats, and - return a list of the names and descriptions - """ - format_regex = re.compile(' *([A-Z0-9]+)[*]? 
+([A-Z0-9]+) +([rw\-+]+) *(.*).*') - formats = [] - command = [] - command.append(unicode(GM_PATH)) - command.append(u'convert') - command.append(u'-list') - command.append(u'formats') - proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) - return_code = proc.wait() - if return_code != 0: - raise ConvertError(proc.stderr.readline()) - - for line in proc.stdout.readlines(): - fields = format_regex.findall(line) - if fields: - formats.append((fields[0][0], fields[0][3])) - - return formats diff --git a/apps/converter/backends/graphicsmagick/__init__.py b/apps/converter/backends/graphicsmagick/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/apps/converter/backends/graphicsmagick/base.py b/apps/converter/backends/graphicsmagick/base.py new file mode 100644 index 0000000000..5570650038 --- /dev/null +++ b/apps/converter/backends/graphicsmagick/base.py @@ -0,0 +1,85 @@ +import subprocess +import re + +from django.utils.translation import ugettext_lazy as _ + +from converter.conf.settings import GM_PATH +from converter.conf.settings import GM_SETTINGS +from converter.literals import QUALITY_DEFAULT, QUALITY_SETTINGS +from converter.exceptions import ConvertError, UnknownFormat, IdentifyError +from converter.backends import ConverterBase + +CONVERTER_ERROR_STRING_NO_DECODER = u'No decode delegate for this image format' +CONVERTER_ERROR_STARTS_WITH = u'starts with' + + +class ConverterClass(ConverterBase): + def identify_file(self, input_filepath, arguments=None): + command = [] + command.append(unicode(GM_PATH)) + command.append(u'identify') + if arguments: + command.extend(arguments) + command.append(unicode(input_filepath)) + proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) + return_code = proc.wait() + if return_code != 0: + raise IdentifyError(proc.stderr.readline()) + return proc.stdout.read() + + + def convert_file(self, input_filepath, 
output_filepath, quality=QUALITY_DEFAULT, arguments=None): + command = [] + command.append(unicode(GM_PATH)) + command.append(u'convert') + command.extend(unicode(QUALITY_SETTINGS[quality]).split()) + command.extend(unicode(GM_SETTINGS).split()) + command.append(unicode(input_filepath)) + if arguments: + command.extend(unicode(arguments).split()) + command.append(unicode(output_filepath)) + proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) + return_code = proc.wait() + if return_code != 0: + #Got an error from convert program + error_line = proc.stderr.readline() + if (CONVERTER_ERROR_STRING_NO_DECODER in error_line) or (CONVERTER_ERROR_STARTS_WITH in error_line): + #Try to determine from error message which class of error is it + raise UnknownFormat + else: + raise ConvertError(error_line) + + + def get_format_list(self): + """ + Call GraphicsMagick to parse all of it's supported file formats, and + return a list of the names and descriptions + """ + format_regex = re.compile(' *([A-Z0-9]+)[*]? 
+([A-Z0-9]+) +([rw\-+]+) *(.*).*') + formats = [] + command = [] + command.append(unicode(GM_PATH)) + command.append(u'convert') + command.append(u'-list') + command.append(u'formats') + proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) + return_code = proc.wait() + if return_code != 0: + raise ConvertError(proc.stderr.readline()) + + for line in proc.stdout.readlines(): + fields = format_regex.findall(line) + if fields: + formats.append((fields[0][0], fields[0][3])) + + return formats + + + def get_available_transformations(self): + return { + 'rotate': { + 'label': _(u'Rotate [degrees]'), + 'arguments': [{'name': 'degrees'}], + 'command_line': u'-rotate %(degrees)d' + } + } diff --git a/apps/converter/backends/imagemagick.py b/apps/converter/backends/imagemagick.py deleted file mode 100644 index 4542ebdeba..0000000000 --- a/apps/converter/backends/imagemagick.py +++ /dev/null @@ -1,68 +0,0 @@ -import subprocess -import re - -from converter.conf.settings import IM_IDENTIFY_PATH -from converter.conf.settings import IM_CONVERT_PATH -from converter.api import QUALITY_DEFAULT, QUALITY_SETTINGS -from converter.exceptions import ConvertError, UnknownFormat, \ - IdentifyError - -CONVERTER_ERROR_STRING_NO_DECODER = u'no decode delegate for this image format' - - -def execute_identify(input_filepath, arguments=None): - command = [] - command.append(unicode(IM_IDENTIFY_PATH)) - if arguments: - command.extend(arguments) - command.append(unicode(input_filepath)) - - proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) - return_code = proc.wait() - if return_code != 0: - raise IdentifyError(proc.stderr.readline()) - return proc.stdout.read() - - -def execute_convert(input_filepath, output_filepath, quality=QUALITY_DEFAULT, arguments=None): - command = [] - command.append(unicode(IM_CONVERT_PATH)) - command.extend(unicode(QUALITY_SETTINGS[quality]).split()) - 
command.append(unicode(input_filepath)) - if arguments: - command.extend(unicode(arguments).split()) - command.append(unicode(output_filepath)) - proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) - return_code = proc.wait() - if return_code != 0: - #Got an error from convert program - error_line = proc.stderr.readline() - if CONVERTER_ERROR_STRING_NO_DECODER in error_line: - #Try to determine from error message which class of error is it - raise UnknownFormat - else: - raise ConvertError(error_line) - - -def get_format_list(): - """ - Call ImageMagick to parse all of it's supported file formats, and - return a list of the names and descriptions - """ - format_regex = re.compile(' *([A-Z0-9]+)[*]? +([A-Z0-9]+) +([rw\-+]+) *(.*).*') - formats = [] - command = [] - command.append(unicode(IM_CONVERT_PATH)) - command.append(u'-list') - command.append(u'format') - proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) - return_code = proc.wait() - if return_code != 0: - raise ConvertError(proc.stderr.readline()) - - for line in proc.stdout.readlines(): - fields = format_regex.findall(line) - if fields: - formats.append((fields[0][0], fields[0][3])) - - return formats diff --git a/apps/converter/backends/imagemagick/__init__.py b/apps/converter/backends/imagemagick/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/apps/converter/backends/imagemagick/base.py b/apps/converter/backends/imagemagick/base.py new file mode 100644 index 0000000000..e2b8c40fdd --- /dev/null +++ b/apps/converter/backends/imagemagick/base.py @@ -0,0 +1,82 @@ +import subprocess +import re + +from django.utils.translation import ugettext_lazy as _ + +from converter.conf.settings import IM_IDENTIFY_PATH +from converter.conf.settings import IM_CONVERT_PATH +from converter.api import QUALITY_DEFAULT, QUALITY_SETTINGS +from converter.exceptions import ConvertError, UnknownFormat, \ + 
IdentifyError +from converter.backends import ConverterBase + +CONVERTER_ERROR_STRING_NO_DECODER = u'no decode delegate for this image format' + + +class ConverterClass(ConverterBase): + def identify_file(self, input_filepath, arguments=None): + command = [] + command.append(unicode(IM_IDENTIFY_PATH)) + if arguments: + command.extend(arguments) + command.append(unicode(input_filepath)) + + proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) + return_code = proc.wait() + if return_code != 0: + raise IdentifyError(proc.stderr.readline()) + return proc.stdout.read() + + + def convert_file(self, input_filepath, output_filepath, quality=QUALITY_DEFAULT, arguments=None): + command = [] + command.append(unicode(IM_CONVERT_PATH)) + command.extend(unicode(QUALITY_SETTINGS[quality]).split()) + command.append(unicode(input_filepath)) + if arguments: + command.extend(unicode(arguments).split()) + command.append(unicode(output_filepath)) + proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) + return_code = proc.wait() + if return_code != 0: + #Got an error from convert program + error_line = proc.stderr.readline() + if CONVERTER_ERROR_STRING_NO_DECODER in error_line: + #Try to determine from error message which class of error is it + raise UnknownFormat + else: + raise ConvertError(error_line) + + + def get_format_list(self): + """ + Call ImageMagick to parse all of it's supported file formats, and + return a list of the names and descriptions + """ + format_regex = re.compile(' *([A-Z0-9]+)[*]? 
+([A-Z0-9]+) +([rw\-+]+) *(.*).*') + formats = [] + command = [] + command.append(unicode(IM_CONVERT_PATH)) + command.append(u'-list') + command.append(u'format') + proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) + return_code = proc.wait() + if return_code != 0: + raise ConvertError(proc.stderr.readline()) + + for line in proc.stdout.readlines(): + fields = format_regex.findall(line) + if fields: + formats.append((fields[0][0], fields[0][3])) + + return formats + + + def get_available_transformations(self): + return { + 'rotate': { + 'label': _(u'Rotate [degrees]'), + 'arguments': [{'name': 'degrees'}], + 'command_line': u'-rotate %(degrees)d' + } + } diff --git a/apps/converter/literals.py b/apps/converter/literals.py new file mode 100644 index 0000000000..403400d229 --- /dev/null +++ b/apps/converter/literals.py @@ -0,0 +1,22 @@ +from converter.conf.settings import DEFAULT_OPTIONS +from converter.conf.settings import LOW_QUALITY_OPTIONS +from converter.conf.settings import HIGH_QUALITY_OPTIONS +from converter.conf.settings import PRINT_QUALITY_OPTIONS + +DEFAULT_ZOOM_LEVEL = 100 +DEFAULT_ROTATION = 0 +DEFAULT_PAGE_INDEX_NUMBER = 0 +DEFAULT_FILE_FORMAT = u'jpg' +DEFAULT_OCR_FILE_FORMAT = u'tif' + +QUALITY_DEFAULT = u'quality_default' +QUALITY_LOW = u'quality_low' +QUALITY_HIGH = u'quality_high' +QUALITY_PRINT = u'quality_print' + +QUALITY_SETTINGS = { + QUALITY_DEFAULT: DEFAULT_OPTIONS, + QUALITY_LOW: LOW_QUALITY_OPTIONS, + QUALITY_HIGH: HIGH_QUALITY_OPTIONS, + QUALITY_PRINT: PRINT_QUALITY_OPTIONS +} diff --git a/apps/converter/utils.py b/apps/converter/utils.py index c5a4e7e55b..5fc106a940 100644 --- a/apps/converter/utils.py +++ b/apps/converter/utils.py @@ -1,6 +1,10 @@ +import os + +from django.core.exceptions import ImproperlyConfigured +from django.utils.importlib import import_module + + #http://stackoverflow.com/questions/123198/how-do-i-copy-a-file-in-python - - def copyfile(source, dest, buffer_size=1024 
* 1024): """ Copy a file from source to dest. source and dest @@ -21,3 +25,50 @@ def copyfile(source, dest, buffer_size=1024 * 1024): source.close() dest.close() + + +def _lazy_load(fn): + _cached = [] + + def _decorated(): + if not _cached: + _cached.append(fn()) + return _cached[0] + return _decorated + + +@_lazy_load +def load_backend(): + from converter.conf.settings import GRAPHICS_BACKEND as backend_name + + try: + module = import_module('.base', 'converter.backends.%s' % backend_name) + import warnings + warnings.warn( + "Short names for CONVERTER_BACKEND are deprecated; prepend with 'converter.backends.'", + PendingDeprecationWarning + ) + return module + except ImportError, e: + # Look for a fully qualified converter backend name + try: + return import_module('.base', backend_name) + except ImportError, e_user: + # The converter backend wasn't found. Display a helpful error message + # listing all possible (built-in) converter backends. + backend_dir = os.path.join(os.path.dirname(__file__), 'backends') + try: + available_backends = [f for f in os.listdir(backend_dir) + if os.path.isdir(os.path.join(backend_dir, f)) + and not f.startswith('.')] + except EnvironmentError: + available_backends = [] + available_backends.sort() + if backend_name not in available_backends: + error_msg = ("%r isn't an available converter backend. \n" + + "Try using converter.backends.XXX, where XXX is one of:\n %s\n" + + "Error was: %s") % \ + (backend_name, ", ".join(map(repr, available_backends)), e_user) + raise ImproperlyConfigured(error_msg) + else: + raise # If there's some other error, this must be an error in Mayan itself. 
diff --git a/apps/documents/conf/settings.py b/apps/documents/conf/settings.py index 7a253f52de..4c7749624c 100644 --- a/apps/documents/conf/settings.py +++ b/apps/documents/conf/settings.py @@ -18,10 +18,6 @@ def default_uuid(): """unicode(uuid.uuid4())""" return unicode(uuid.uuid4()) -available_transformations = { - 'rotate': {'label': _(u'Rotate [degrees]'), 'arguments': [{'name': 'degrees'}]} -} - register_settings( namespace=u'documents', module=u'documents.conf.settings', @@ -31,8 +27,6 @@ register_settings( {'name': u'UUID_FUNCTION', 'global_name': u'DOCUMENTS_UUID_FUNCTION', 'default': default_uuid}, # Storage {'name': u'STORAGE_BACKEND', 'global_name': u'DOCUMENTS_STORAGE_BACKEND', 'default': FileBasedStorage}, - # Transformations - {'name': u'AVAILABLE_TRANSFORMATIONS', 'global_name': u'DOCUMENTS_AVAILABLE_TRANSFORMATIONS', 'default': available_transformations}, # Usage {'name': u'PREVIEW_SIZE', 'global_name': u'DOCUMENTS_PREVIEW_SIZE', 'default': u'640x480'}, {'name': u'PRINT_SIZE', 'global_name': u'DOCUMENTS_PRINT_SIZE', 'default': u'1400'}, diff --git a/apps/documents/models.py b/apps/documents/models.py index 96d988bfdb..e0df918fc1 100644 --- a/apps/documents/models.py +++ b/apps/documents/models.py @@ -12,16 +12,13 @@ from python_magic import magic from taggit.managers import TaggableManager from dynamic_search.api import register from converter.api import get_page_count -from converter import TRANFORMATION_CHOICES +from converter.api import backend from documents.conf.settings import CHECKSUM_FUNCTION from documents.conf.settings import UUID_FUNCTION from documents.conf.settings import STORAGE_BACKEND -from documents.conf.settings import AVAILABLE_TRANSFORMATIONS from documents.managers import RecentDocumentManager -available_transformations = ([(name, data['label']) for name, data in AVAILABLE_TRANSFORMATIONS.items()]) - def get_filename_from_uuid(instance, filename): """ @@ -263,20 +260,7 @@ class DocumentPage(models.Model): return 
('document_page_view', [self.pk]) def get_transformation_string(self): - transformation_list = [] - warnings = [] - for page_transformation in self.documentpagetransformation_set.all(): - try: - if page_transformation.transformation in TRANFORMATION_CHOICES: - transformation_list.append( - TRANFORMATION_CHOICES[page_transformation.transformation] % eval( - page_transformation.arguments - ) - ) - except Exception, e: - warnings.append(e) - - return u' '.join(transformation_list), warnings + return backend.get_transformation_string(self.documentpagetransformation_set.values('transformation', 'arguments')) class DocumentPageTransformation(models.Model): @@ -286,7 +270,7 @@ class DocumentPageTransformation(models.Model): """ document_page = models.ForeignKey(DocumentPage, verbose_name=_(u'document page')) order = models.PositiveIntegerField(default=0, blank=True, null=True, verbose_name=_(u'order'), db_index=True) - transformation = models.CharField(choices=available_transformations, max_length=128, verbose_name=_(u'transformation')) + transformation = models.CharField(choices=backend.get_available_transformations_labels(), max_length=128, verbose_name=_(u'transformation')) arguments = models.TextField(blank=True, null=True, verbose_name=_(u'arguments'), help_text=_(u'Use dictionaries to indentify arguments, example: {\'degrees\':90}')) def __unicode__(self): diff --git a/apps/documents/urls.py b/apps/documents/urls.py index 4dc99f37de..4a8dcd2d46 100644 --- a/apps/documents/urls.py +++ b/apps/documents/urls.py @@ -1,14 +1,12 @@ from django.conf.urls.defaults import patterns, url -from converter.api import QUALITY_HIGH, QUALITY_PRINT +from converter.literals import QUALITY_HIGH, QUALITY_PRINT from documents.conf.settings import PREVIEW_SIZE from documents.conf.settings import PRINT_SIZE from documents.conf.settings import THUMBNAIL_SIZE from documents.conf.settings import DISPLAY_SIZE from documents.conf.settings import MULTIPAGE_PREVIEW_SIZE -#from documents.literals 
import UPLOAD_SOURCE_LOCAL, \ -# UPLOAD_SOURCE_STAGING, UPLOAD_SOURCE_USER_STAGING urlpatterns = patterns('documents.views', url(r'^list/$', 'document_list', (), 'document_list'), diff --git a/apps/sources/models.py b/apps/sources/models.py index 70eaf2d4e4..ffd4211fe6 100644 --- a/apps/sources/models.py +++ b/apps/sources/models.py @@ -4,14 +4,12 @@ from django.contrib.contenttypes.models import ContentType from django.contrib.contenttypes import generic from documents.models import DocumentType -from documents.conf.settings import AVAILABLE_TRANSFORMATIONS from documents.managers import RecentDocumentManager from metadata.models import MetadataType +from converter.api import backend from sources.managers import SourceTransformationManager -available_transformations = ([(name, data['label']) for name, data in AVAILABLE_TRANSFORMATIONS.items()]) - SOURCE_UNCOMPRESS_CHOICE_Y = 'y' SOURCE_UNCOMPRESS_CHOICE_N = 'n' SOURCE_UNCOMPRESS_CHOICE_ASK = 'a' @@ -164,7 +162,7 @@ class SourceTransformation(models.Model): object_id = models.PositiveIntegerField() content_object = generic.GenericForeignKey('content_type', 'object_id') order = models.PositiveIntegerField(default=0, blank=True, null=True, verbose_name=_(u'order'), db_index=True) - transformation = models.CharField(choices=available_transformations, max_length=128, verbose_name=_(u'transformation')) + transformation = models.CharField(choices=backend.get_available_transformations_labels(), max_length=128, verbose_name=_(u'transformation')) arguments = models.TextField(blank=True, null=True, verbose_name=_(u'arguments'), help_text=_(u'Use dictionaries to indentify arguments, example: {\'degrees\':90}')) objects = SourceTransformationManager() diff --git a/apps/sources/staging.py b/apps/sources/staging.py index c6668455c9..a608f1b30f 100644 --- a/apps/sources/staging.py +++ b/apps/sources/staging.py @@ -8,7 +8,6 @@ from django.utils.translation import ugettext from django.contrib import messages from 
django.utils.translation import ugettext_lazy as _ -from converter import TRANFORMATION_CHOICES from converter.api import convert, cache_cleanup DEFAULT_STAGING_DIRECTORY = u'/tmp' @@ -136,13 +135,13 @@ class StagingFile(object): def get_transformation_string(transformations): transformation_list = [] errors = [] - for transformation in transformations: - try: - if transformation['name'] in TRANFORMATION_CHOICES: - output = TRANFORMATION_CHOICES[transformation['name']] % eval(transformation['arguments']) - transformation_list.append(output) - except Exception, e: - errors.append(e) + #for transformation in transformations: + # try: + # if transformation['name'] in TRANFORMATION_CHOICES: + # output = TRANFORMATION_CHOICES[transformation['name']] % eval(transformation['arguments']) + # transformation_list.append(output) + # except Exception, e: + # errors.append(e) - tranformation_string = ' '.join(transformation_list) + #tranformation_string = ' '.join(transformation_list) return tranformation_string, errors From 358216fac56028a949254f266be635ee2f7a2e65 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Thu, 14 Jul 2011 23:17:43 -0400 Subject: [PATCH 02/14] Updated README.md --- README.md | 24 ++++++++++++++++++------ apps/converter/backends/base.py | 0 2 files changed, 18 insertions(+), 6 deletions(-) delete mode 100644 apps/converter/backends/base.py diff --git a/README.md b/README.md index 2e9f754566..e1e21d1eae 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ Open source, Django based document manager with custom metadata indexing, file s [Website](http://bit.ly/mayan-edms) -Requirements +Basic requirements --- Python: @@ -15,6 +15,19 @@ Python: * django-filetransfers - File upload/download abstraction * celery- asynchronous task queue/job queue based on distributed message passing * django-celery - celery Django integration +* django-mptt - Utilities for implementing a modified pre-order traversal tree in django +* python-magic - A python wrapper for 
libmagic +* django-taggit - Simple tagging for django + +Execute pip install -r requirements/production.txt to install the python/django dependencies automatically. + +Executables: + +* tesseract-ocr - An OCR Engine that was developed at HP Labs between 1985 and 1995... and now at Google. +* unpaper - post-processing scanned and photocopied book pages + +Optional requirements +--- For the GridFS storage backend: @@ -22,13 +35,12 @@ For the GridFS storage backend: * GridFS - a storage specification for large objects in MongoDB * MongoDB - a scalable, open source, document-oriented database -Or execute pip install -r requirements/production.txt to install the dependencies automatically. +Libraries: -Executables: +* libmagic - MIME detection library, if not installed Mayan will fall back to using python's simpler mimetype built in library + +Mayan has the ability to switch between different image conversion backends, at the moment these two are supported: -* libmagic - MIME detection library -* tesseract-ocr - An OCR Engine that was developed at HP Labs between 1985 and 1995... and now at Google. -* unpaper - post-processing scanned and photocopied book pages * ImageMagick - Convert, Edit, Or Compose Bitmap Images * GraphicMagick - Robust collection of tools and libraries to read, write, and manipulate an image. 
diff --git a/apps/converter/backends/base.py b/apps/converter/backends/base.py deleted file mode 100644 index e69de29bb2..0000000000 From 743ae0fce0dc7b50ee3dfe954c0950f01c16b652 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Fri, 15 Jul 2011 06:16:14 -0400 Subject: [PATCH 03/14] Initial commit of the converter image transformation refactor --- apps/converter/__init__.py | 9 +++ apps/converter/api.py | 74 ++++++++++------- apps/converter/backends/__init__.py | 5 +- .../converter/backends/graphicsmagick/base.py | 57 ++++++++++--- apps/converter/backends/imagemagick/base.py | 26 ++++-- apps/converter/backends/python/__init__.py | 3 + apps/converter/backends/python/base.py | 80 +++++++++++++++++++ apps/converter/conf/settings.py | 2 +- apps/converter/literals.py | 44 +++++++++- apps/converter/views.py | 26 +----- apps/documents/managers.py | 8 ++ apps/documents/models.py | 12 +-- apps/documents/views.py | 17 ++-- apps/sources/managers.py | 3 + apps/sources/models.py | 9 ++- apps/sources/staging.py | 28 ++----- apps/sources/views.py | 14 +++- 17 files changed, 303 insertions(+), 114 deletions(-) create mode 100644 apps/converter/backends/python/__init__.py create mode 100644 apps/converter/backends/python/base.py diff --git a/apps/converter/__init__.py b/apps/converter/__init__.py index 6ab5029f01..ffaef00c09 100644 --- a/apps/converter/__init__.py +++ b/apps/converter/__init__.py @@ -1,7 +1,16 @@ from django.utils.translation import ugettext_lazy as _ +from django.core.exceptions import ImproperlyConfigured from navigation.api import register_sidebar_template +from converter.utils import load_backend +from converter.conf.settings import GRAPHICS_BACKEND + formats_list = {'text': _('file formats'), 'view': 'formats_list', 'famfam': 'pictures'} register_sidebar_template(['formats_list'], 'converter_file_formats_help.html') + +try: + backend = load_backend().ConverterClass() +except ImproperlyConfigured: + raise ImproperlyConfigured(u'Missing or incorrect converter 
backend: %s' % GRAPHICS_BACKEND) diff --git a/apps/converter/api.py b/apps/converter/api.py index 9de0ed737e..478783b299 100644 --- a/apps/converter/api.py +++ b/apps/converter/api.py @@ -3,7 +3,6 @@ import subprocess from django.utils.importlib import import_module from django.template.defaultfilters import slugify -from django.core.exceptions import ImproperlyConfigured from common import TEMPORARY_DIRECTORY from documents.utils import document_save_to_temp_dir @@ -12,21 +11,22 @@ from converter.conf.settings import UNPAPER_PATH from converter.conf.settings import OCR_OPTIONS from converter.conf.settings import UNOCONV_PATH from converter.exceptions import UnpaperError, OfficeConversionError -from converter.utils import load_backend -from converter.literals import DEFAULT_PAGE_INDEX_NUMBER, \ +from converter.literals import DEFAULT_PAGE_NUMBER, \ DEFAULT_OCR_FILE_FORMAT, QUALITY_DEFAULT, DEFAULT_ZOOM_LEVEL, \ DEFAULT_ROTATION, DEFAULT_FILE_FORMAT, QUALITY_PRINT +from converter import backend +from converter.literals import TRANSFORMATION_CHOICES +from converter.literals import TRANSFORMATION_RESIZE, \ + TRANSFORMATION_ROTATE, TRANSFORMATION_DENSITY, \ + TRANSFORMATION_ZOOM +from converter.literals import DIMENSION_SEPARATOR + + CONVERTER_OFFICE_FILE_EXTENSIONS = [ u'ods', u'docx', u'doc' ] -try: - backend = load_backend().ConverterClass() -except ImproperlyConfigured: - raise ImproperlyConfigured(u'Missing or incorrect converter backend: %s' % GRAPHICS_BACKEND) - - def cleanup(filename): """ Tries to remove the given filename. 
Ignores non-existent files @@ -107,18 +107,18 @@ def convert_document(document, *args, **kwargs): def convert(input_filepath, *args, **kwargs): size = kwargs.get('size') file_format = kwargs.get('file_format', DEFAULT_FILE_FORMAT) - extra_options = kwargs.get('extra_options', u'') zoom = kwargs.get('zoom', DEFAULT_ZOOM_LEVEL) rotation = kwargs.get('rotation', DEFAULT_ROTATION) - page = kwargs.get('page', DEFAULT_PAGE_INDEX_NUMBER) + page = kwargs.get('page', DEFAULT_PAGE_NUMBER) cleanup_files = kwargs.get('cleanup_files', True) quality = kwargs.get('quality', QUALITY_DEFAULT) + transformations = kwargs.get('transformations', []) unoconv_output = None output_filepath = create_image_cache_filename(input_filepath, *args, **kwargs) - if os.path.exists(output_filepath): - return output_filepath + #if os.path.exists(output_filepath): + # return output_filepath path, extension = os.path.splitext(input_filepath) if extension[1:].lower() in CONVERTER_OFFICE_FILE_EXTENSIONS: @@ -128,18 +128,33 @@ def convert(input_filepath, *args, **kwargs): input_filepath = result extra_options = u'' - input_arg = u'%s[%s]' % (input_filepath, page) - extra_options += u' -resize %s' % size + #TODO: not here in the backend + input_arg = u'%s[%s]' % (input_filepath, page-1) + transformations.append( + { + 'transformation': TRANSFORMATION_RESIZE, + 'arguments': dict(zip([u'width', u'height'], size.split(DIMENSION_SEPARATOR))) + } + ) + if zoom != 100: - extra_options += u' -resize %d%% ' % zoom + transformations.append( + { + 'transformation': TRANSFORMATION_ZOOM, + 'arguments': {'percent': zoom} + } + ) if rotation != 0 and rotation != 360: - extra_options += u' -rotate %d ' % rotation + transformations.append( + { + 'transformation': TRANSFORMATION_ROTATE, + 'arguments': {'degrees': rotation} + } + ) - if format == u'jpg': - extra_options += u' -quality 85' try: - backend.convert_file(input_filepath=input_arg, arguments=extra_options, output_filepath=u'%s:%s' % (file_format, output_filepath), 
quality=quality) + backend.convert_file(input_filepath=input_arg, output_filepath=u'%s:%s' % (file_format, output_filepath), quality=quality, transformations=transformations) finally: if cleanup_files: cleanup(input_filepath) @@ -150,11 +165,7 @@ def convert(input_filepath, *args, **kwargs): def get_page_count(input_filepath): - try: - return len(backend.identify_file(unicode(input_filepath)).splitlines()) - except: - #TODO: send to other page number identifying program - return 1 + return backend.get_page_count(input_filepath) def get_document_dimensions(document, *args, **kwargs): @@ -166,7 +177,7 @@ def get_document_dimensions(document, *args, **kwargs): return [0, 0] -def convert_document_for_ocr(document, page=DEFAULT_PAGE_INDEX_NUMBER, file_format=DEFAULT_OCR_FILE_FORMAT): +def convert_document_for_ocr(document, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_OCR_FILE_FORMAT): #Extract document file input_filepath = document_save_to_temp_dir(document, document.uuid) @@ -178,7 +189,7 @@ def convert_document_for_ocr(document, page=DEFAULT_PAGE_INDEX_NUMBER, file_form unpaper_output_file = u'%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep) convert_output_file = u'%s_ocr%s%s%s' % (temp_path, page, os.extsep, file_format) - input_arg = u'%s[%s]' % (input_filepath, page) + input_arg = u'%s[%s]' % (input_filepath, page-1) try: document_page = document.documentpage_set.get(page_number=page + 1) @@ -198,3 +209,12 @@ def convert_document_for_ocr(document, page=DEFAULT_PAGE_INDEX_NUMBER, file_form cleanup(unpaper_output_file) return convert_output_file + + +def get_available_transformations_choices(): + result = [] + for transformation in backend.get_available_transformations(): + transformation_template = u'%s %s' % (TRANSFORMATION_CHOICES[transformation]['label'], u','.join(['<%s>' % argument['name'] if argument['required'] else '[%s]' % argument['name'] for argument in TRANSFORMATION_CHOICES[transformation]['arguments']])) + result.append([transformation, 
transformation_template]) + + return result diff --git a/apps/converter/backends/__init__.py b/apps/converter/backends/__init__.py index 1d81dd8149..0b42ec89c2 100644 --- a/apps/converter/backends/__init__.py +++ b/apps/converter/backends/__init__.py @@ -21,9 +21,6 @@ class ConverterBase(object): def get_available_transformations(self): raise NotImplementedError("Your %s class has not defined a get_available_transformations() method, which is required." % self.__class__.__name__) - def get_available_transformations_labels(self): - return ([(name, data['label']) for name, data in self.get_available_transformations().items()]) - def get_transformation_string(self, transformation_list): transformations = [] warnings = [] @@ -41,3 +38,5 @@ class ConverterBase(object): return u' '.join(transformations), warnings + def get_page_count(self): + raise NotImplementedError("Your %s class has not defined a get_page_count() method, which is required." % self.__class__.__name__) diff --git a/apps/converter/backends/graphicsmagick/base.py b/apps/converter/backends/graphicsmagick/base.py index 5570650038..8cb0f3fb55 100644 --- a/apps/converter/backends/graphicsmagick/base.py +++ b/apps/converter/backends/graphicsmagick/base.py @@ -8,6 +8,10 @@ from converter.conf.settings import GM_SETTINGS from converter.literals import QUALITY_DEFAULT, QUALITY_SETTINGS from converter.exceptions import ConvertError, UnknownFormat, IdentifyError from converter.backends import ConverterBase +from converter.literals import TRANSFORMATION_RESIZE, \ + TRANSFORMATION_ROTATE, TRANSFORMATION_DENSITY, \ + TRANSFORMATION_ZOOM +from converter.literals import DIMENSION_SEPARATOR CONVERTER_ERROR_STRING_NO_DECODER = u'No decode delegate for this image format' CONVERTER_ERROR_STARTS_WITH = u'starts with' @@ -28,7 +32,29 @@ class ConverterClass(ConverterBase): return proc.stdout.read() - def convert_file(self, input_filepath, output_filepath, quality=QUALITY_DEFAULT, arguments=None): + def convert_file(self, 
input_filepath, output_filepath, transformations=None, quality=QUALITY_DEFAULT): + arguments = [] + if transformations: + for transformation in transformations: + if transformation['transformation'] == TRANSFORMATION_RESIZE: + dimensions = [] + dimensions.append(unicode(transformation['arguments']['width'])) + if 'height' in transformation['arguments']: + dimensions.append(unicode(transformation['arguments']['height'])) + arguments.append(u'-resize') + arguments.append(u'%s' % DIMENSION_SEPARATOR.join(dimensions)) + + elif transformation['transformation'] == TRANSFORMATION_ZOOM: + arguments.append(u'-resize') + arguments.append(u'%d%%' % transformation['arguments']['zoom']) + + elif transformation['transformation'] == TRANSFORMATION_ROTATE: + arguments.append(u'-rotate') + arguments.append(u'%s' % transformation['arguments']['degrees']) + + print 'arguments: %s' % arguments + #if format == u'jpg': + # extra_options += u' -quality 85' command = [] command.append(unicode(GM_PATH)) command.append(u'convert') @@ -36,8 +62,9 @@ class ConverterClass(ConverterBase): command.extend(unicode(GM_SETTINGS).split()) command.append(unicode(input_filepath)) if arguments: - command.extend(unicode(arguments).split()) + command.extend(arguments) command.append(unicode(output_filepath)) + print 'command: %s' % command proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) return_code = proc.wait() if return_code != 0: @@ -76,10 +103,22 @@ class ConverterClass(ConverterBase): def get_available_transformations(self): - return { - 'rotate': { - 'label': _(u'Rotate [degrees]'), - 'arguments': [{'name': 'degrees'}], - 'command_line': u'-rotate %(degrees)d' - } - } + return [ + TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, \ + TRANSFORMATION_DENSITY, TRANSFORMATION_ZOOM + ] + + + def get_page_count(self, input_filepath): + try: + return len(self.identify_file(unicode(input_filepath)).splitlines()) + except: + #TODO: send to other page number 
identifying program + return 1 + + + def _get_transformation_string(): + pass + #'command_line': u'-rotate %(degrees)d' + # } + #} diff --git a/apps/converter/backends/imagemagick/base.py b/apps/converter/backends/imagemagick/base.py index e2b8c40fdd..cd5b1ba53e 100644 --- a/apps/converter/backends/imagemagick/base.py +++ b/apps/converter/backends/imagemagick/base.py @@ -9,7 +9,10 @@ from converter.api import QUALITY_DEFAULT, QUALITY_SETTINGS from converter.exceptions import ConvertError, UnknownFormat, \ IdentifyError from converter.backends import ConverterBase - +from converter.literals import TRANSFORMATION_RESIZE, \ + TRANSFORMATION_ROTATE, TRANSFORMATION_DENSITY, \ + TRANSFORMATION_ZOOM + CONVERTER_ERROR_STRING_NO_DECODER = u'no decode delegate for this image format' @@ -29,6 +32,8 @@ class ConverterClass(ConverterBase): def convert_file(self, input_filepath, output_filepath, quality=QUALITY_DEFAULT, arguments=None): + #if format == u'jpg': + # extra_options += u' -quality 85' command = [] command.append(unicode(IM_CONVERT_PATH)) command.extend(unicode(QUALITY_SETTINGS[quality]).split()) @@ -73,10 +78,15 @@ class ConverterClass(ConverterBase): def get_available_transformations(self): - return { - 'rotate': { - 'label': _(u'Rotate [degrees]'), - 'arguments': [{'name': 'degrees'}], - 'command_line': u'-rotate %(degrees)d' - } - } + return [ + TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, \ + TRANSFORMATION_DENSITY, TRANSFORMATION_ZOOM + ] + + + def get_page_count(self, input_filepath): + try: + return len(self.identify_file(unicode(input_filepath)).splitlines()) + except: + #TODO: send to other page number identifying program + return 1 diff --git a/apps/converter/backends/python/__init__.py b/apps/converter/backends/python/__init__.py new file mode 100644 index 0000000000..dfeca950f1 --- /dev/null +++ b/apps/converter/backends/python/__init__.py @@ -0,0 +1,3 @@ +from PIL import Image + +Image.init() diff --git a/apps/converter/backends/python/base.py 
b/apps/converter/backends/python/base.py new file mode 100644 index 0000000000..4d776454e4 --- /dev/null +++ b/apps/converter/backends/python/base.py @@ -0,0 +1,80 @@ +from PIL import Image + +from django.utils.translation import ugettext_lazy as _ + +from converter.literals import QUALITY_DEFAULT, QUALITY_SETTINGS +from converter.exceptions import ConvertError, UnknownFormat, IdentifyError +from converter.backends import ConverterBase +from converter.literals import TRANSFORMATION_RESIZE, \ + TRANSFORMATION_ROTATE + +class ConverterClass(ConverterBase): + def identify_file(self, input_filepath, arguments=None): + pass + + + def get_page_count(self, input_filepath): + page_count = 1 + im = Image.open(input_filepath) + + try: + while 1: + im.seek(im.tell()+1) + page_count += 1 + # do something to im + except EOFError: + pass # end of sequence + + return page_count + + + def convert_file(self, input_filepath, output_filepath, quality=QUALITY_DEFAULT, arguments=None): + im = Image.open(input_filepath) + outfile, format = output_filepath.split(u':') + im.save(outfile, format) + ''' + command = [] + command.append(unicode(GM_PATH)) + command.append(u'convert') + command.extend(unicode(QUALITY_SETTINGS[quality]).split()) + command.extend(unicode(GM_SETTINGS).split()) + command.append(unicode(input_filepath)) + if arguments: + command.extend(unicode(arguments).split()) + command.append(unicode(output_filepath)) + proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) + return_code = proc.wait() + if return_code != 0: + #Got an error from convert program + error_line = proc.stderr.readline() + if (CONVERTER_ERROR_STRING_NO_DECODER in error_line) or (CONVERTER_ERROR_STARTS_WITH in error_line): + #Try to determine from error message which class of error is it + raise UnknownFormat + else: + raise ConvertError(error_line) + ''' + + def get_format_list(self): + """ + Introspect PIL's internal registry to obtain a list of the + 
supported file types + """ + formats = [] + for format_name in Image.ID: + formats.append((format_name, u'')) + + return formats + + + def get_available_transformations(self): + return [ + TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE + ] + + + def get_page_count(self, input_filepath): + try: + return len(self.identify_file(unicode(input_filepath)).splitlines()) + except: + #TODO: send to other page number identifying program + return 1 diff --git a/apps/converter/conf/settings.py b/apps/converter/conf/settings.py index f73c0f2b64..fcaa1ec9b0 100644 --- a/apps/converter/conf/settings.py +++ b/apps/converter/conf/settings.py @@ -12,7 +12,7 @@ register_settings( {'name': u'UNPAPER_PATH', 'global_name': u'CONVERTER_UNPAPER_PATH', 'default': u'/usr/bin/unpaper', 'description': _(u'File path to unpaper program.'), 'exists': True}, {'name': u'GM_PATH', 'global_name': u'CONVERTER_GM_PATH', 'default': u'/usr/bin/gm', 'description': _(u'File path to graphicsmagick\'s program.'), 'exists': True}, {'name': u'GM_SETTINGS', 'global_name': u'CONVERTER_GM_SETTINGS', 'default': u''}, - {'name': u'GRAPHICS_BACKEND', 'global_name': u'CONVERTER_GRAPHICS_BACKEND', 'default': u'converter.backends.imagemagick', 'description': _(u'Graphics conversion backend to use. Options are: converter.backends.imagemagick and converter.backends.graphicsmagick.')}, + {'name': u'GRAPHICS_BACKEND', 'global_name': u'CONVERTER_GRAPHICS_BACKEND', 'default': u'converter.backends.python', 'description': _(u'Graphics conversion backend to use. 
Options are: converter.backends.imagemagick, converter.backends.graphicsmagick and converter.backends.python.')}, {'name': u'UNOCONV_PATH', 'global_name': u'CONVERTER_UNOCONV_PATH', 'default': u'/usr/bin/unoconv', 'exists': True}, {'name': u'OCR_OPTIONS', 'global_name': u'CONVERTER_OCR_OPTIONS', 'default': u'-colorspace Gray -depth 8 -resample 200x200'}, {'name': u'DEFAULT_OPTIONS', 'global_name': u'CONVERTER_DEFAULT_OPTIONS', 'default': u''}, diff --git a/apps/converter/literals.py b/apps/converter/literals.py index 403400d229..7671fddf8a 100644 --- a/apps/converter/literals.py +++ b/apps/converter/literals.py @@ -1,3 +1,5 @@ +from django.utils.translation import ugettext_lazy as _ + from converter.conf.settings import DEFAULT_OPTIONS from converter.conf.settings import LOW_QUALITY_OPTIONS from converter.conf.settings import HIGH_QUALITY_OPTIONS @@ -5,7 +7,7 @@ from converter.conf.settings import PRINT_QUALITY_OPTIONS DEFAULT_ZOOM_LEVEL = 100 DEFAULT_ROTATION = 0 -DEFAULT_PAGE_INDEX_NUMBER = 0 +DEFAULT_PAGE_NUMBER = 1 DEFAULT_FILE_FORMAT = u'jpg' DEFAULT_OCR_FILE_FORMAT = u'tif' @@ -20,3 +22,43 @@ QUALITY_SETTINGS = { QUALITY_HIGH: HIGH_QUALITY_OPTIONS, QUALITY_PRINT: PRINT_QUALITY_OPTIONS } + +DIMENSION_SEPARATOR = u'x' + +TRANSFORMATION_RESIZE = u'resize' +TRANSFORMATION_ROTATE = u'rotate' +TRANSFORMATION_DENSITY = u'density' +TRANSFORMATION_ZOOM = u'zoom' + +TRANSFORMATION_CHOICES = { + TRANSFORMATION_RESIZE: { + 'label': _(u'Resize'), + 'description': _(u'Resize.'), + 'arguments': [ + {'name': 'width', 'label': _(u'width'), 'required': True}, + {'name': 'height', 'label': _(u'height'), 'required': False}, + ] + }, + TRANSFORMATION_ROTATE: { + 'label': _(u'Rotate'), + 'description': _(u'Rotate by n degress.'), + 'arguments': [ + {'name': 'degrees', 'label': _(u'degrees'), 'required': True} + ] + }, + TRANSFORMATION_DENSITY: { + 'label': _(u'Density'), + 'description': _(u'Change the resolution (ie: DPI) without resizing.'), + 'arguments': [ + {'name': 'width', 
'label': _(u'width'), 'required': True}, + {'name': 'height', 'label': _(u'height'), 'required': False}, + ] + }, + TRANSFORMATION_ZOOM: { + 'label': _(u'Zoom'), + 'description': _(u'Zoom by n percent.'), + 'arguments': [ + {'name': 'percent', 'label': _(u'percent'), 'required': True} + ] + }, +} diff --git a/apps/converter/views.py b/apps/converter/views.py index ad95783539..ef7173f908 100644 --- a/apps/converter/views.py +++ b/apps/converter/views.py @@ -1,38 +1,18 @@ from django.utils.translation import ugettext_lazy as _ from django.shortcuts import render_to_response from django.template import RequestContext -from django.utils.importlib import import_module + +from converter import backend from converter.conf.settings import GRAPHICS_BACKEND - -def _lazy_load(fn): - _cached = [] - - def _decorated(): - if not _cached: - _cached.append(fn()) - return _cached[0] - return _decorated - - -@_lazy_load -def _get_backend(): - return import_module(GRAPHICS_BACKEND) - -try: - backend = _get_backend() -except ImportError: - raise ImportError(u'Missing or incorrect converter backend: %s' % GRAPHICS_BACKEND) - - def formats_list(request): #check_permissions(request.user, [PERMISSION_DOCUMENT_VIEW]) context = { 'title': _(u'suported file formats'), 'hide_object': True, - 'object_list': backend.get_format_list(), + 'object_list': sorted(backend.get_format_list()), 'extra_columns': [ { 'name': _(u'name'), diff --git a/apps/documents/managers.py b/apps/documents/managers.py index 3b007a936e..d63b2d644f 100644 --- a/apps/documents/managers.py +++ b/apps/documents/managers.py @@ -13,3 +13,11 @@ class RecentDocumentManager(models.Manager): to_delete = self.model.objects.filter(user=user)[RECENT_COUNT:] for recent_to_delete in to_delete: recent_to_delete.delete() + + +class DocumentPageTransformationManager(models.Manager): + def get_for_document_page(self, document_page): + return self.model.objects.filter(document_page=document_page) + + def get_for_document_page_as_list(self, 
document_page): + return list([{'transformation': transformation['transformation'], 'arguments': eval(transformation['arguments'])} for transformation in self.get_for_document_page(document_page).values('transformation', 'arguments')]) diff --git a/apps/documents/models.py b/apps/documents/models.py index e0df918fc1..aa344dce48 100644 --- a/apps/documents/models.py +++ b/apps/documents/models.py @@ -12,12 +12,13 @@ from python_magic import magic from taggit.managers import TaggableManager from dynamic_search.api import register from converter.api import get_page_count -from converter.api import backend +from converter.api import get_available_transformations_choices from documents.conf.settings import CHECKSUM_FUNCTION from documents.conf.settings import UUID_FUNCTION from documents.conf.settings import STORAGE_BACKEND -from documents.managers import RecentDocumentManager +from documents.managers import RecentDocumentManager, \ + DocumentPageTransformationManager def get_filename_from_uuid(instance, filename): @@ -259,9 +260,6 @@ class DocumentPage(models.Model): def get_absolute_url(self): return ('document_page_view', [self.pk]) - def get_transformation_string(self): - return backend.get_transformation_string(self.documentpagetransformation_set.values('transformation', 'arguments')) - class DocumentPageTransformation(models.Model): """ @@ -270,9 +268,11 @@ class DocumentPageTransformation(models.Model): """ document_page = models.ForeignKey(DocumentPage, verbose_name=_(u'document page')) order = models.PositiveIntegerField(default=0, blank=True, null=True, verbose_name=_(u'order'), db_index=True) - transformation = models.CharField(choices=backend.get_available_transformations_labels(), max_length=128, verbose_name=_(u'transformation')) + transformation = models.CharField(choices=get_available_transformations_choices(), max_length=128, verbose_name=_(u'transformation')) arguments = models.TextField(blank=True, null=True, verbose_name=_(u'arguments'), 
help_text=_(u'Use dictionaries to indentify arguments, example: {\'degrees\':90}')) + objects = DocumentPageTransformationManager() + def __unicode__(self): return u'"%s" for %s' % (self.get_transformation_display(), unicode(self.document_page)) diff --git a/apps/documents/views.py b/apps/documents/views.py index 4a3247c576..dcd383006d 100644 --- a/apps/documents/views.py +++ b/apps/documents/views.py @@ -285,7 +285,7 @@ def document_edit(request, document_id): 'object': document, }, context_instance=RequestContext(request)) - +''' def calculate_converter_arguments(document, *args, **kwargs): size = kwargs.pop('size', PREVIEW_SIZE) quality = kwargs.pop('quality', QUALITY_DEFAULT) @@ -308,7 +308,7 @@ def calculate_converter_arguments(document, *args, **kwargs): } return arguments, warnings - +''' def get_document_image(request, document_id, size=PREVIEW_SIZE, quality=QUALITY_DEFAULT): check_permissions(request.user, [PERMISSION_DOCUMENT_VIEW]) @@ -327,14 +327,17 @@ def get_document_image(request, document_id, size=PREVIEW_SIZE, quality=QUALITY_ rotation = int(request.GET.get('rotation', 0)) % 360 - arguments, warnings = calculate_converter_arguments(document, size=size, file_format=DEFAULT_FILE_FORMAT, quality=quality, page=page, zoom=zoom, rotation=rotation) + #arguments, warnings = calculate_converter_arguments(document, size=size, file_format=DEFAULT_FILE_FORMAT, quality=quality, page=page, zoom=zoom, rotation=rotation) - if warnings and (request.user.is_staff or request.user.is_superuser): - for warning in warnings: - messages.warning(request, _(u'Page transformation error: %s') % warning) + #if warnings and (request.user.is_staff or request.user.is_superuser): + # for warning in warnings: + # messages.warning(request, _(u'Page transformation error: %s') % warning) + + transformations = DocumentPageTransformation.objects.get_for_document_page_as_list(document) try: - output_file = convert_document(document, **arguments) + #output_file = 
convert_document(document, **arguments) + output_file = convert_document(document, size=size, file_format=DEFAULT_FILE_FORMAT, quality=quality, page=page, zoom=zoom, rotation=rotation, transformations=transformations) except UnkownConvertError, e: if request.user.is_staff or request.user.is_superuser: messages.error(request, e) diff --git a/apps/sources/managers.py b/apps/sources/managers.py index aee45cf4c1..1fd2d38d21 100644 --- a/apps/sources/managers.py +++ b/apps/sources/managers.py @@ -6,3 +6,6 @@ class SourceTransformationManager(models.Manager): def get_for_object(self, obj): ct = ContentType.objects.get_for_model(obj) return self.model.objects.filter(content_type=ct).filter(object_id=obj.pk) + + def get_for_object_as_list(self, obj): + return list([{'transformation': transformation['transformation'], 'arguments': eval(transformation['arguments'])} for transformation in self.get_for_object(obj).values('transformation', 'arguments')]) diff --git a/apps/sources/models.py b/apps/sources/models.py index ffd4211fe6..795f269132 100644 --- a/apps/sources/models.py +++ b/apps/sources/models.py @@ -6,7 +6,8 @@ from django.contrib.contenttypes import generic from documents.models import DocumentType from documents.managers import RecentDocumentManager from metadata.models import MetadataType -from converter.api import backend +from converter.api import get_available_transformations_choices +from converter.literals import DIMENSION_SEPARATOR from sources.managers import SourceTransformationManager @@ -118,7 +119,7 @@ class StagingFolder(InteractiveBaseModel): if self.preview_height: dimensions.append(unicode(self.preview_height)) - return u'x'.join(dimensions) + return DIMENSION_SEPARATOR.join(dimensions) class Meta(InteractiveBaseModel.Meta): verbose_name = _(u'staging folder') @@ -162,8 +163,8 @@ class SourceTransformation(models.Model): object_id = models.PositiveIntegerField() content_object = generic.GenericForeignKey('content_type', 'object_id') order = 
models.PositiveIntegerField(default=0, blank=True, null=True, verbose_name=_(u'order'), db_index=True) - transformation = models.CharField(choices=backend.get_available_transformations_labels(), max_length=128, verbose_name=_(u'transformation')) - arguments = models.TextField(blank=True, null=True, verbose_name=_(u'arguments'), help_text=_(u'Use dictionaries to indentify arguments, example: {\'degrees\':90}')) + transformation = models.CharField(choices=get_available_transformations_choices(), max_length=128, verbose_name=_(u'transformation')) + arguments = models.TextField(blank=True, null=True, verbose_name=_(u'arguments'), help_text=_(u'Use dictionaries to indentify arguments, example: %s') % u'{\'degrees\':90}') objects = SourceTransformationManager() diff --git a/apps/sources/staging.py b/apps/sources/staging.py index a608f1b30f..ebb31a59d9 100644 --- a/apps/sources/staging.py +++ b/apps/sources/staging.py @@ -106,16 +106,15 @@ class StagingFile(object): def upload(self): """ Return a StagingFile encapsulated in a File class instance to - allow for easier upload a staging files + allow for easier upload of staging files """ try: return File(file(self.filepath, 'rb'), name=self.filename) except Exception, exc: raise Exception(ugettext(u'Unable to upload staging file: %s') % exc) - def delete(self, preview_size): - # tranformation_string, errors = get_transformation_string(DEFAULT_TRANSFORMATIONS) - cache_cleanup(self.filepath, size=preview_size)# , extra_options=tranformation_string) + def delete(self, preview_size, transformations): + cache_cleanup(self.filepath, size=preview_size, transformations=transformations) try: os.unlink(self.filepath) except OSError, exc: @@ -124,24 +123,7 @@ class StagingFile(object): else: raise OSError(ugettext(u'Unable to delete staging file: %s') % exc) - def preview(self, preview_size): + def preview(self, preview_size, transformations): errors = [] - # tranformation_string, errors = 
get_transformation_string(DEFAULT_TRANSFORMATIONS) - # output_file = convert(self.filepath, size=STAGING_FILES_PREVIEW_SIZE, extra_options=tranformation_string, cleanup_files=False) - output_file = convert(self.filepath, size=preview_size, cleanup_files=False) + output_file = convert(self.filepath, size=preview_size, cleanup_files=False, transformations=transformations) return output_file, errors - - -def get_transformation_string(transformations): - transformation_list = [] - errors = [] - #for transformation in transformations: - # try: - # if transformation['name'] in TRANFORMATION_CHOICES: - # output = TRANFORMATION_CHOICES[transformation['name']] % eval(transformation['arguments']) - # transformation_list.append(output) - # except Exception, e: - # errors.append(e) - - #tranformation_string = ' '.join(transformation_list) - return tranformation_string, errors diff --git a/apps/sources/views.py b/apps/sources/views.py index 6a48dc1e34..5748f7ffd8 100644 --- a/apps/sources/views.py +++ b/apps/sources/views.py @@ -285,7 +285,10 @@ def staging_file_preview(request, source_type, source_id, staging_file_id): staging_folder = get_object_or_404(StagingFolder, pk=source_id) StagingFile = create_staging_file_class(request, staging_folder.folder_path) try: - output_file, errors = StagingFile.get(staging_file_id).preview(staging_folder.get_preview_size()) + output_file, errors = StagingFile.get(staging_file_id).preview( + preview_size=staging_folder.get_preview_size(), + transformations=SourceTransformation.objects.get_for_object_as_list(staging_folder) + ) if errors and (request.user.is_staff or request.user.is_superuser): for error in errors: messages.warning(request, _(u'Staging file transformation error: %(error)s') % { @@ -318,7 +321,10 @@ def staging_file_delete(request, source_type, source_id, staging_file_id): if request.method == 'POST': try: - staging_file.delete(staging_folder.get_preview_size()) + staging_file.delete( + 
preview_size=staging_folder.get_preview_size(), + transformations=SourceTransformation.objects.get_for_object_as_list(staging_folder) + ) messages.success(request, _(u'Staging file delete successfully.')) except Exception, e: messages.error(request, e) @@ -509,6 +515,8 @@ def setup_source_transformation_edit(request, transformation_id): form = SourceTransformationForm(instance=source_transformation, data=request.POST) if form.is_valid(): try: + # Test the validity of the argument field + eval(form.cleaned_data['arguments']) form.save() messages.success(request, _(u'Source transformation edited successfully')) return HttpResponseRedirect(next) @@ -598,6 +606,8 @@ def setup_source_transformation_create(request, source_type, source_id): form = SourceTransformationForm_create(request.POST) if form.is_valid(): try: + # Test the validity of the argument field + eval(form.cleaned_data['arguments']) source_tranformation = form.save(commit=False) source_tranformation.content_object = source source_tranformation.save() From 389253385cbafef3f1fa78667768d5eb4143f7e4 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Fri, 15 Jul 2011 20:25:49 -0400 Subject: [PATCH 04/14] Source, document page and thumbnails working, new document transformations and OCR yet to convert --- apps/converter/api.py | 34 ++++------- .../converter/backends/graphicsmagick/base.py | 32 ++++++----- apps/converter/backends/imagemagick/base.py | 37 ++++++++++-- apps/documents/managers.py | 15 ++++- apps/documents/views.py | 57 ++++++------------- apps/sources/managers.py | 18 +++++- apps/sources/views.py | 43 ++++++++------ 7 files changed, 137 insertions(+), 99 deletions(-) diff --git a/apps/converter/api.py b/apps/converter/api.py index 478783b299..32d92b5664 100644 --- a/apps/converter/api.py +++ b/apps/converter/api.py @@ -1,5 +1,6 @@ import os import subprocess +import hashlib from django.utils.importlib import import_module from django.template.defaultfilters import slugify @@ -22,6 +23,7 @@ from 
converter.literals import TRANSFORMATION_RESIZE, \ TRANSFORMATION_ZOOM from converter.literals import DIMENSION_SEPARATOR +HASH_FUNCTION = lambda x: hashlib.sha256(x).hexdigest() CONVERTER_OFFICE_FILE_EXTENSIONS = [ u'ods', u'docx', u'doc' @@ -75,19 +77,11 @@ def cache_cleanup(input_filepath, *args, **kwargs): def create_image_cache_filename(input_filepath, *args, **kwargs): if input_filepath: - temp_filename, separator = os.path.splitext(os.path.basename(input_filepath)) - temp_path = os.path.join(TEMPORARY_DIRECTORY, temp_filename) - - final_filepath = [] - [final_filepath.append(str(arg)) for arg in args] - final_filepath.extend([u'%s_%s' % (key, value) for key, value in kwargs.items()]) - - temp_path += slugify(u'_'.join(final_filepath)) - - return temp_path + hash_value = HASH_FUNCTION(u''.join([input_filepath, unicode(args), unicode(kwargs)])) + return os.path.join(TEMPORARY_DIRECTORY, hash_value) else: return None - + def convert_office_document(input_filepath): if os.path.exists(UNOCONV_PATH): @@ -104,21 +98,21 @@ def convert_document(document, *args, **kwargs): return convert(document_save_to_temp_dir(document, document.checksum), *args, **kwargs) -def convert(input_filepath, *args, **kwargs): +def convert(input_filepath, cleanup_files=True, *args, **kwargs): size = kwargs.get('size') file_format = kwargs.get('file_format', DEFAULT_FILE_FORMAT) zoom = kwargs.get('zoom', DEFAULT_ZOOM_LEVEL) rotation = kwargs.get('rotation', DEFAULT_ROTATION) page = kwargs.get('page', DEFAULT_PAGE_NUMBER) - cleanup_files = kwargs.get('cleanup_files', True) quality = kwargs.get('quality', QUALITY_DEFAULT) transformations = kwargs.get('transformations', []) unoconv_output = None output_filepath = create_image_cache_filename(input_filepath, *args, **kwargs) - #if os.path.exists(output_filepath): - # return output_filepath + print 'output_filepath', output_filepath + if os.path.exists(output_filepath): + return output_filepath path, extension = os.path.splitext(input_filepath) 
if extension[1:].lower() in CONVERTER_OFFICE_FILE_EXTENSIONS: @@ -128,8 +122,6 @@ def convert(input_filepath, *args, **kwargs): input_filepath = result extra_options = u'' - #TODO: not here in the backend - input_arg = u'%s[%s]' % (input_filepath, page-1) transformations.append( { 'transformation': TRANSFORMATION_RESIZE, @@ -154,7 +146,7 @@ def convert(input_filepath, *args, **kwargs): ) try: - backend.convert_file(input_filepath=input_arg, output_filepath=u'%s:%s' % (file_format, output_filepath), quality=quality, transformations=transformations) + backend.convert_file(input_filepath=input_filepath, output_filepath=output_filepath, quality=quality, transformations=transformations, page=page, file_format=file_format) finally: if cleanup_files: cleanup(input_filepath) @@ -189,14 +181,12 @@ def convert_document_for_ocr(document, page=DEFAULT_PAGE_NUMBER, file_format=DEF unpaper_output_file = u'%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep) convert_output_file = u'%s_ocr%s%s%s' % (temp_path, page, os.extsep, file_format) - input_arg = u'%s[%s]' % (input_filepath, page-1) - try: - document_page = document.documentpage_set.get(page_number=page + 1) + document_page = document.documentpage_set.get(page_number=page) transformation_string, warnings = document_page.get_transformation_string() #Apply default transformations - backend.convert_file(input_filepath=input_arg, quality=QUALITY_HIGH, arguments=transformation_string, output_filepath=transformation_output_file) + backend.convert_file(input_filepath=input_filepath, page=page, quality=QUALITY_HIGH, arguments=transformation_string, output_filepath=transformation_output_file) #Do OCR operations backend.convert_file(input_filepath=transformation_output_file, arguments=OCR_OPTIONS, output_filepath=unpaper_input_file) # Process by unpaper diff --git a/apps/converter/backends/graphicsmagick/base.py b/apps/converter/backends/graphicsmagick/base.py index 8cb0f3fb55..c8b479a0aa 100644 --- 
a/apps/converter/backends/graphicsmagick/base.py +++ b/apps/converter/backends/graphicsmagick/base.py @@ -11,7 +11,8 @@ from converter.backends import ConverterBase from converter.literals import TRANSFORMATION_RESIZE, \ TRANSFORMATION_ROTATE, TRANSFORMATION_DENSITY, \ TRANSFORMATION_ZOOM -from converter.literals import DIMENSION_SEPARATOR +from converter.literals import DIMENSION_SEPARATOR, DEFAULT_PAGE_NUMBER, \ + DEFAULT_FILE_FORMAT CONVERTER_ERROR_STRING_NO_DECODER = u'No decode delegate for this image format' CONVERTER_ERROR_STARTS_WITH = u'starts with' @@ -32,10 +33,12 @@ class ConverterClass(ConverterBase): return proc.stdout.read() - def convert_file(self, input_filepath, output_filepath, transformations=None, quality=QUALITY_DEFAULT): + def convert_file(self, input_filepath, output_filepath, transformations=None, quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT): + print 'convert_file' arguments = [] if transformations: for transformation in transformations: + print 'transformation: %s' % transformation if transformation['transformation'] == TRANSFORMATION_RESIZE: dimensions = [] dimensions.append(unicode(transformation['arguments']['width'])) @@ -46,21 +49,31 @@ class ConverterClass(ConverterBase): elif transformation['transformation'] == TRANSFORMATION_ZOOM: arguments.append(u'-resize') - arguments.append(u'%d%%' % transformation['arguments']['zoom']) + arguments.append(u'%d%%' % transformation['arguments']['percent']) elif transformation['transformation'] == TRANSFORMATION_ROTATE: + print 'Do rotate' arguments.append(u'-rotate') arguments.append(u'%s' % transformation['arguments']['degrees']) print 'arguments: %s' % arguments - #if format == u'jpg': - # extra_options += u' -quality 85' + if format == u'jpg': + arguments.append(u'-quality') + arguments.append(u'85') + + + # Graphicsmagick page number is 0 base + input_arg = u'%s[%d]' % (input_filepath, page - 1) + + # Specify the file format next to the output filename + 
output_filepath = u'%s:%s' % (file_format, output_filepath) + command = [] command.append(unicode(GM_PATH)) command.append(u'convert') command.extend(unicode(QUALITY_SETTINGS[quality]).split()) command.extend(unicode(GM_SETTINGS).split()) - command.append(unicode(input_filepath)) + command.append(unicode(input_arg)) if arguments: command.extend(arguments) command.append(unicode(output_filepath)) @@ -115,10 +128,3 @@ class ConverterClass(ConverterBase): except: #TODO: send to other page number identifying program return 1 - - - def _get_transformation_string(): - pass - #'command_line': u'-rotate %(degrees)d' - # } - #} diff --git a/apps/converter/backends/imagemagick/base.py b/apps/converter/backends/imagemagick/base.py index cd5b1ba53e..11822db515 100644 --- a/apps/converter/backends/imagemagick/base.py +++ b/apps/converter/backends/imagemagick/base.py @@ -31,15 +31,42 @@ class ConverterClass(ConverterBase): return proc.stdout.read() - def convert_file(self, input_filepath, output_filepath, quality=QUALITY_DEFAULT, arguments=None): - #if format == u'jpg': - # extra_options += u' -quality 85' + def convert_file(self, input_filepath, output_filepath, quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT): + arguments = [] + if transformations: + for transformation in transformations: + if transformation['transformation'] == TRANSFORMATION_RESIZE: + dimensions = [] + dimensions.append(unicode(transformation['arguments']['width'])) + if 'height' in transformation['arguments']: + dimensions.append(unicode(transformation['arguments']['height'])) + arguments.append(u'-resize') + arguments.append(u'%s' % DIMENSION_SEPARATOR.join(dimensions)) + + elif transformation['transformation'] == TRANSFORMATION_ZOOM: + arguments.append(u'-resize') + arguments.append(u'%d%%' % transformation['arguments']['percent']) + + elif transformation['transformation'] == TRANSFORMATION_ROTATE: + arguments.append(u'-rotate') + arguments.append(u'%s' % 
transformation['arguments']['degrees']) + + if format == u'jpg': + arguments.append(u'-quality') + arguments.append(u'85') + + # Imagemagick page number is 0 base + input_arg = u'%s[%d]' % (input_filepath, page - 1) + + # Specify the file format next to the output filename + output_filepath = u'%s:%s' % (file_format, output_filepath) + command = [] command.append(unicode(IM_CONVERT_PATH)) command.extend(unicode(QUALITY_SETTINGS[quality]).split()) - command.append(unicode(input_filepath)) + command.append(unicode(input_arg)) if arguments: - command.extend(unicode(arguments).split()) + command.extend(arguments) command.append(unicode(output_filepath)) proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) return_code = proc.wait() diff --git a/apps/documents/managers.py b/apps/documents/managers.py index d63b2d644f..ef87c929fe 100644 --- a/apps/documents/managers.py +++ b/apps/documents/managers.py @@ -20,4 +20,17 @@ class DocumentPageTransformationManager(models.Manager): return self.model.objects.filter(document_page=document_page) def get_for_document_page_as_list(self, document_page): - return list([{'transformation': transformation['transformation'], 'arguments': eval(transformation['arguments'])} for transformation in self.get_for_document_page(document_page).values('transformation', 'arguments')]) + warnings = [] + transformations = [] + for transformation in self.get_for_document_page(document_page).values('transformation', 'arguments'): + try: + transformations.append( + { + 'transformation': transformation['transformation'], + 'arguments': eval(transformation['arguments'], {}) + } + ) + except Exception, e: + warnings.append(e) + + return transformations, warnings diff --git a/apps/documents/views.py b/apps/documents/views.py index dcd383006d..c727407b88 100644 --- a/apps/documents/views.py +++ b/apps/documents/views.py @@ -20,10 +20,11 @@ from common.widgets import two_state_template from common.literals import 
PAGE_SIZE_DIMENSIONS, \ PAGE_ORIENTATION_PORTRAIT, PAGE_ORIENTATION_LANDSCAPE from common.conf.settings import DEFAULT_PAPER_SIZE -from converter.api import convert_document, QUALITY_DEFAULT +from converter.api import convert_document from converter.exceptions import UnkownConvertError, UnknownFormat -from converter.api import DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION, \ - DEFAULT_FILE_FORMAT, QUALITY_PRINT +from converter.literals import DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION, \ + DEFAULT_FILE_FORMAT, QUALITY_PRINT, QUALITY_DEFAULT, \ + DEFAULT_PAGE_NUMBER from filetransfers.api import serve_file from grouping.utils import get_document_group_subtemplate from metadata.api import save_metadata_list, \ @@ -285,39 +286,15 @@ def document_edit(request, document_id): 'object': document, }, context_instance=RequestContext(request)) -''' -def calculate_converter_arguments(document, *args, **kwargs): - size = kwargs.pop('size', PREVIEW_SIZE) - quality = kwargs.pop('quality', QUALITY_DEFAULT) - page = kwargs.pop('page', 1) - file_format = kwargs.pop('file_format', DEFAULT_FILE_FORMAT) - zoom = kwargs.pop('zoom', DEFAULT_ZOOM_LEVEL) - rotation = kwargs.pop('rotation', DEFAULT_ROTATION) - - document_page = DocumentPage.objects.get(document=document, page_number=page) - transformation_string, warnings = document_page.get_transformation_string() - - arguments = { - 'size': size, - 'file_format': file_format, - 'quality': quality, - 'extra_options': transformation_string, - 'page': page - 1, - 'zoom': zoom, - 'rotation': rotation - } - - return arguments, warnings -''' def get_document_image(request, document_id, size=PREVIEW_SIZE, quality=QUALITY_DEFAULT): check_permissions(request.user, [PERMISSION_DOCUMENT_VIEW]) document = get_object_or_404(Document, pk=document_id) - page = int(request.GET.get('page', 1)) + page = int(request.GET.get('page', DEFAULT_PAGE_NUMBER)) - zoom = int(request.GET.get('zoom', 100)) + zoom = int(request.GET.get('zoom', DEFAULT_ZOOM_LEVEL)) if zoom < 
ZOOM_MIN_LEVEL: zoom = ZOOM_MIN_LEVEL @@ -325,18 +302,16 @@ def get_document_image(request, document_id, size=PREVIEW_SIZE, quality=QUALITY_ if zoom > ZOOM_MAX_LEVEL: zoom = ZOOM_MAX_LEVEL - rotation = int(request.GET.get('rotation', 0)) % 360 + rotation = int(request.GET.get('rotation', DEFAULT_ROTATION)) % 360 - #arguments, warnings = calculate_converter_arguments(document, size=size, file_format=DEFAULT_FILE_FORMAT, quality=quality, page=page, zoom=zoom, rotation=rotation) - - #if warnings and (request.user.is_staff or request.user.is_superuser): - # for warning in warnings: - # messages.warning(request, _(u'Page transformation error: %s') % warning) - - transformations = DocumentPageTransformation.objects.get_for_document_page_as_list(document) + document_page = get_object_or_404(document.documentpage_set, page_number=page) + transformations, warnings = DocumentPageTransformation.objects.get_for_document_page_as_list(document_page) + if warnings and (request.user.is_staff or request.user.is_superuser): + for warning in warnings: + messages.warning(request, _(u'Page transformation error: %s') % warning) + try: - #output_file = convert_document(document, **arguments) output_file = convert_document(document, size=size, file_format=DEFAULT_FILE_FORMAT, quality=quality, page=page, zoom=zoom, rotation=rotation, transformations=transformations) except UnkownConvertError, e: if request.user.is_staff or request.user.is_superuser: @@ -595,13 +570,13 @@ def document_page_view(request, document_page_id): document_page = get_object_or_404(DocumentPage, pk=document_page_id) - zoom = int(request.GET.get('zoom', 100)) - rotation = int(request.GET.get('rotation', 0)) + zoom = int(request.GET.get('zoom', DEFAULT_ZOOM_LEVEL)) + rotation = int(request.GET.get('rotation', DEFAULT_ROTATION)) document_page_form = DocumentPageForm(instance=document_page, zoom=zoom, rotation=rotation) base_title = _(u'details for: %s') % document_page - if zoom != 100: + if zoom != DEFAULT_ZOOM_LEVEL: 
zoom_text = u'(%d%%)' % zoom else: zoom_text = u'' diff --git a/apps/sources/managers.py b/apps/sources/managers.py index 1fd2d38d21..aacccb31e8 100644 --- a/apps/sources/managers.py +++ b/apps/sources/managers.py @@ -7,5 +7,21 @@ class SourceTransformationManager(models.Manager): ct = ContentType.objects.get_for_model(obj) return self.model.objects.filter(content_type=ct).filter(object_id=obj.pk) + #def get_for_object_as_list(self, obj): + # return list([{'transformation': transformation['transformation'], 'arguments': eval(transformation['arguments'])} for transformation in self.get_for_object(obj).values('transformation', 'arguments')]) + def get_for_object_as_list(self, obj): - return list([{'transformation': transformation['transformation'], 'arguments': eval(transformation['arguments'])} for transformation in self.get_for_object(obj).values('transformation', 'arguments')]) + warnings = [] + transformations = [] + for transformation in self.get_for_object(obj).values('transformation', 'arguments'): + try: + transformations.append( + { + 'transformation': transformation['transformation'], + 'arguments': eval(transformation['arguments'], {}) + } + ) + except Exception, e: + warnings.append(e) + + return transformations, warnings diff --git a/apps/sources/views.py b/apps/sources/views.py index 5748f7ffd8..2e2b42ed01 100644 --- a/apps/sources/views.py +++ b/apps/sources/views.py @@ -285,9 +285,11 @@ def staging_file_preview(request, source_type, source_id, staging_file_id): staging_folder = get_object_or_404(StagingFolder, pk=source_id) StagingFile = create_staging_file_class(request, staging_folder.folder_path) try: + transformations, errors=SourceTransformation.objects.get_for_object_as_list(staging_folder) + output_file, errors = StagingFile.get(staging_file_id).preview( preview_size=staging_folder.get_preview_size(), - transformations=SourceTransformation.objects.get_for_object_as_list(staging_folder) + transformations=transformations ) if errors and 
(request.user.is_staff or request.user.is_superuser): for error in errors: @@ -321,9 +323,10 @@ def staging_file_delete(request, source_type, source_id, staging_file_id): if request.method == 'POST': try: + transformations, errors=SourceTransformation.objects.get_for_object_as_list(staging_folder) staging_file.delete( preview_size=staging_folder.get_preview_size(), - transformations=SourceTransformation.objects.get_for_object_as_list(staging_folder) + transformations=transformations ) messages.success(request, _(u'Staging file delete successfully.')) except Exception, e: @@ -516,12 +519,16 @@ def setup_source_transformation_edit(request, transformation_id): if form.is_valid(): try: # Test the validity of the argument field - eval(form.cleaned_data['arguments']) - form.save() - messages.success(request, _(u'Source transformation edited successfully')) - return HttpResponseRedirect(next) - except Exception, e: - messages.error(request, _(u'Error editing source transformation; %s') % e) + eval(form.cleaned_data['arguments'], {}) + except: + messages.error(request, _(u'Source transformation argument error.')) + else: + try: + form.save() + messages.success(request, _(u'Source transformation edited successfully')) + return HttpResponseRedirect(next) + except Exception, e: + messages.error(request, _(u'Error editing source transformation; %s') % e) else: form = SourceTransformationForm(instance=source_transformation) @@ -607,14 +614,18 @@ def setup_source_transformation_create(request, source_type, source_id): if form.is_valid(): try: # Test the validity of the argument field - eval(form.cleaned_data['arguments']) - source_tranformation = form.save(commit=False) - source_tranformation.content_object = source - source_tranformation.save() - messages.success(request, _(u'Source transformation created successfully')) - return HttpResponseRedirect(redirect_view) - except Exception, e: - messages.error(request, _(u'Error creating source transformation; %s') % e) + 
eval(form.cleaned_data['arguments'], {}) + except: + messages.error(request, _(u'Source transformation argument error.')) + else: + try: + source_tranformation = form.save(commit=False) + source_tranformation.content_object = source + source_tranformation.save() + messages.success(request, _(u'Source transformation created successfully')) + return HttpResponseRedirect(redirect_view) + except Exception, e: + messages.error(request, _(u'Error creating source transformation; %s') % e) else: form = SourceTransformationForm_create() From 0fe032f7c9e0d33e4b5c397a908e83d38740da4f Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Sat, 16 Jul 2011 01:09:36 -0400 Subject: [PATCH 05/14] Finished fixing new document transformations --- apps/converter/api.py | 1 - .../converter/backends/graphicsmagick/base.py | 5 ---- apps/documents/models.py | 29 ++++++++++--------- apps/sources/managers.py | 3 -- apps/sources/staging.py | 1 - apps/sources/views.py | 20 ++++++++----- 6 files changed, 28 insertions(+), 31 deletions(-) diff --git a/apps/converter/api.py b/apps/converter/api.py index 32d92b5664..3d3e2e6aed 100644 --- a/apps/converter/api.py +++ b/apps/converter/api.py @@ -110,7 +110,6 @@ def convert(input_filepath, cleanup_files=True, *args, **kwargs): unoconv_output = None output_filepath = create_image_cache_filename(input_filepath, *args, **kwargs) - print 'output_filepath', output_filepath if os.path.exists(output_filepath): return output_filepath diff --git a/apps/converter/backends/graphicsmagick/base.py b/apps/converter/backends/graphicsmagick/base.py index c8b479a0aa..120b6fb4ee 100644 --- a/apps/converter/backends/graphicsmagick/base.py +++ b/apps/converter/backends/graphicsmagick/base.py @@ -34,11 +34,9 @@ class ConverterClass(ConverterBase): def convert_file(self, input_filepath, output_filepath, transformations=None, quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT): - print 'convert_file' arguments = [] if transformations: for 
transformation in transformations: - print 'transformation: %s' % transformation if transformation['transformation'] == TRANSFORMATION_RESIZE: dimensions = [] dimensions.append(unicode(transformation['arguments']['width'])) @@ -52,11 +50,9 @@ class ConverterClass(ConverterBase): arguments.append(u'%d%%' % transformation['arguments']['percent']) elif transformation['transformation'] == TRANSFORMATION_ROTATE: - print 'Do rotate' arguments.append(u'-rotate') arguments.append(u'%s' % transformation['arguments']['degrees']) - print 'arguments: %s' % arguments if format == u'jpg': arguments.append(u'-quality') arguments.append(u'85') @@ -77,7 +73,6 @@ class ConverterClass(ConverterBase): if arguments: command.extend(arguments) command.append(unicode(output_filepath)) - print 'command: %s' % command proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) return_code = proc.wait() if return_code != 0: diff --git a/apps/documents/models.py b/apps/documents/models.py index aa344dce48..c058bdb694 100644 --- a/apps/documents/models.py +++ b/apps/documents/models.py @@ -90,7 +90,7 @@ class Document(models.Model): mimetype, page count and transformation when originally created """ new_document = not self.pk - + transformations = kwargs.pop('transformations', None) super(Document, self).save(*args, **kwargs) if new_document: @@ -99,7 +99,8 @@ class Document(models.Model): self.update_mimetype(save=False) self.save() self.update_page_count(save=False) - self.apply_default_transformations() + if transformations: + self.apply_default_transformations(transformations) @models.permalink def get_absolute_url(self): @@ -200,21 +201,21 @@ class Document(models.Model): exists in storage """ return self.file.storage.exists(self.file.path) + - def apply_default_transformations(self): + def apply_default_transformations(self, transformations): #Only apply default transformations on new documents - if DEFAULT_TRANSFORMATIONS and reduce(lambda x, y: x + 
y, [page.documentpagetransformation_set.count() for page in self.documentpage_set.all()]) == 0: - for transformation in DEFAULT_TRANSFORMATIONS: - if 'name' in transformation: - for document_page in self.documentpage_set.all(): - page_transformation = DocumentPageTransformation( - document_page=document_page, - order=0, - transformation=transformation['name']) - if 'arguments' in transformation: - page_transformation.arguments = transformation['arguments'] + if reduce(lambda x, y: x + y, [page.documentpagetransformation_set.count() for page in self.documentpage_set.all()]) == 0: + for transformation in transformations: + for document_page in self.documentpage_set.all(): + page_transformation = DocumentPageTransformation( + document_page=document_page, + order=0, + transformation=transformation.get('transformation'), + arguments=transformation.get('arguments') + ) - page_transformation.save() + page_transformation.save() class DocumentTypeFilename(models.Model): diff --git a/apps/sources/managers.py b/apps/sources/managers.py index aacccb31e8..f45e06e340 100644 --- a/apps/sources/managers.py +++ b/apps/sources/managers.py @@ -7,9 +7,6 @@ class SourceTransformationManager(models.Manager): ct = ContentType.objects.get_for_model(obj) return self.model.objects.filter(content_type=ct).filter(object_id=obj.pk) - #def get_for_object_as_list(self, obj): - # return list([{'transformation': transformation['transformation'], 'arguments': eval(transformation['arguments'])} for transformation in self.get_for_object(obj).values('transformation', 'arguments')]) - def get_for_object_as_list(self, obj): warnings = [] transformations = [] diff --git a/apps/sources/staging.py b/apps/sources/staging.py index ebb31a59d9..4a80435a3a 100644 --- a/apps/sources/staging.py +++ b/apps/sources/staging.py @@ -11,7 +11,6 @@ from django.utils.translation import ugettext_lazy as _ from converter.api import convert, cache_cleanup DEFAULT_STAGING_DIRECTORY = u'/tmp' -#from documents.conf.settings 
import DEFAULT_TRANSFORMATIONS HASH_FUNCTION = lambda x: hashlib.sha256(x).hexdigest() #TODO: Do benchmarks diff --git a/apps/sources/views.py b/apps/sources/views.py index 2e2b42ed01..05766d78a9 100644 --- a/apps/sources/views.py +++ b/apps/sources/views.py @@ -129,9 +129,13 @@ def upload_interactive(request, source_type=None, source_id=None): expand = True else: expand = False - if (not expand) or (expand and not _handle_zip_file(request, request.FILES['file'], document_type)): + + transformations, errors = SourceTransformation.objects.get_for_object_as_list(web_form) + + if (not expand) or (expand and not _handle_zip_file(request, request.FILES['file'], document_type=document_type, transformations=transformations)): instance = form.save() instance.save() + instance.apply_default_transformations(transformations) if document_type: instance.document_type = document_type _handle_save_document(request, instance, form) @@ -174,16 +178,18 @@ def upload_interactive(request, source_type=None, source_id=None): expand = True else: expand = False - if (not expand) or (expand and not _handle_zip_file(request, staging_file.upload(), document_type)): + transformations, errors = SourceTransformation.objects.get_for_object_as_list(staging_folder) + if (not expand) or (expand and not _handle_zip_file(request, staging_file.upload(), document_type=document_type, transformations=transformations)): document = Document(file=staging_file.upload()) if document_type: document.document_type = document_type document.save() + document.apply_default_transformations(transformations) _handle_save_document(request, document, form) messages.success(request, _(u'Staging file: %s, uploaded successfully.') % staging_file.filename) if staging_folder.delete_after_upload: - staging_file.delete(staging_folder.get_preview_size()) + staging_file.delete(preview_size=staging_folder.get_preview_size(), transformations=transformations) messages.success(request, _(u'Staging file: %s, deleted successfully.') % 
staging_file.filename) except Exception, e: messages.error(request, e) @@ -260,7 +266,7 @@ def _handle_save_document(request, document, form=None): create_history(HISTORY_DOCUMENT_CREATED, document, {'user': request.user}) -def _handle_zip_file(request, uploaded_file, document_type=None): +def _handle_zip_file(request, uploaded_file, document_type=None, transformations=None): filename = getattr(uploaded_file, 'filename', getattr(uploaded_file, 'name', '')) if filename.lower().endswith('zip'): zfobj = zipfile.ZipFile(uploaded_file) @@ -318,8 +324,8 @@ def staging_file_delete(request, source_type, source_id, staging_file_id): StagingFile = create_staging_file_class(request, staging_folder.folder_path) staging_file = StagingFile.get(staging_file_id) - next = request.POST.get('next', request.GET.get('next', request.META.get('HTTP_REFERER', None))) - previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', None))) + next = request.POST.get('next', request.GET.get('next', request.META.get('HTTP_REFERER', '/'))) + previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', '/'))) if request.method == 'POST': try: @@ -330,7 +336,7 @@ def staging_file_delete(request, source_type, source_id, staging_file_id): ) messages.success(request, _(u'Staging file delete successfully.')) except Exception, e: - messages.error(request, e) + messages.error(request, _(u'Staging file delete error; %s.') % e) return HttpResponseRedirect(next) results = get_active_tab_links() From 29adcce2a33f205419767bf040ba47edd54c5cbb Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Sat, 16 Jul 2011 01:15:58 -0400 Subject: [PATCH 06/14] flake8 cleanups --- apps/converter/api.py | 6 +---- .../converter/backends/graphicsmagick/base.py | 23 +++++++------------ apps/ocr/views.py | 2 -- 3 files changed, 9 insertions(+), 22 deletions(-) diff --git a/apps/converter/api.py b/apps/converter/api.py index 3d3e2e6aed..f5d5203bae 
100644 --- a/apps/converter/api.py +++ b/apps/converter/api.py @@ -2,9 +2,6 @@ import os import subprocess import hashlib -from django.utils.importlib import import_module -from django.template.defaultfilters import slugify - from common import TEMPORARY_DIRECTORY from documents.utils import document_save_to_temp_dir @@ -14,7 +11,7 @@ from converter.conf.settings import UNOCONV_PATH from converter.exceptions import UnpaperError, OfficeConversionError from converter.literals import DEFAULT_PAGE_NUMBER, \ DEFAULT_OCR_FILE_FORMAT, QUALITY_DEFAULT, DEFAULT_ZOOM_LEVEL, \ - DEFAULT_ROTATION, DEFAULT_FILE_FORMAT, QUALITY_PRINT + DEFAULT_ROTATION, DEFAULT_FILE_FORMAT, QUALITY_HIGH from converter import backend from converter.literals import TRANSFORMATION_CHOICES @@ -119,7 +116,6 @@ def convert(input_filepath, cleanup_files=True, *args, **kwargs): if result: unoconv_output = result input_filepath = result - extra_options = u'' transformations.append( { diff --git a/apps/converter/backends/graphicsmagick/base.py b/apps/converter/backends/graphicsmagick/base.py index 120b6fb4ee..9ec5f6a951 100644 --- a/apps/converter/backends/graphicsmagick/base.py +++ b/apps/converter/backends/graphicsmagick/base.py @@ -1,8 +1,6 @@ import subprocess import re -from django.utils.translation import ugettext_lazy as _ - from converter.conf.settings import GM_PATH from converter.conf.settings import GM_SETTINGS from converter.literals import QUALITY_DEFAULT, QUALITY_SETTINGS @@ -32,7 +30,6 @@ class ConverterClass(ConverterBase): raise IdentifyError(proc.stderr.readline()) return proc.stdout.read() - def convert_file(self, input_filepath, output_filepath, transformations=None, quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT): arguments = [] if transformations: @@ -41,29 +38,28 @@ class ConverterClass(ConverterBase): dimensions = [] dimensions.append(unicode(transformation['arguments']['width'])) if 'height' in transformation['arguments']: - 
dimensions.append(unicode(transformation['arguments']['height'])) + dimensions.append(unicode(transformation['arguments']['height'])) arguments.append(u'-resize') arguments.append(u'%s' % DIMENSION_SEPARATOR.join(dimensions)) elif transformation['transformation'] == TRANSFORMATION_ZOOM: arguments.append(u'-resize') arguments.append(u'%d%%' % transformation['arguments']['percent']) - + elif transformation['transformation'] == TRANSFORMATION_ROTATE: arguments.append(u'-rotate') arguments.append(u'%s' % transformation['arguments']['degrees']) - + if format == u'jpg': arguments.append(u'-quality') arguments.append(u'85') - # Graphicsmagick page number is 0 base input_arg = u'%s[%d]' % (input_filepath, page - 1) - + # Specify the file format next to the output filename output_filepath = u'%s:%s' % (file_format, output_filepath) - + command = [] command.append(unicode(GM_PATH)) command.append(u'convert') @@ -84,13 +80,12 @@ class ConverterClass(ConverterBase): else: raise ConvertError(error_line) - def get_format_list(self): """ Call GraphicsMagick to parse all of it's supported file formats, and return a list of the names and descriptions """ - format_regex = re.compile(' *([A-Z0-9]+)[*]? +([A-Z0-9]+) +([rw\-+]+) *(.*).*') + format_regex = re.compile(' *([A-Z0-9]+)[*]? 
+([A-Z0-9]+) +([rw\-+]+) *(.*).*') formats = [] command = [] command.append(unicode(GM_PATH)) @@ -101,14 +96,13 @@ class ConverterClass(ConverterBase): return_code = proc.wait() if return_code != 0: raise ConvertError(proc.stderr.readline()) - + for line in proc.stdout.readlines(): fields = format_regex.findall(line) if fields: formats.append((fields[0][0], fields[0][3])) - - return formats + return formats def get_available_transformations(self): return [ @@ -116,7 +110,6 @@ class ConverterClass(ConverterBase): TRANSFORMATION_DENSITY, TRANSFORMATION_ZOOM ] - def get_page_count(self, input_filepath): try: return len(self.identify_file(unicode(input_filepath)).splitlines()) diff --git a/apps/ocr/views.py b/apps/ocr/views.py index eb4f3a945d..1dbff3e278 100644 --- a/apps/ocr/views.py +++ b/apps/ocr/views.py @@ -6,9 +6,7 @@ from django.shortcuts import render_to_response, get_object_or_404 from django.template import RequestContext from django.contrib import messages from django.views.generic.list_detail import object_list -from django.core.urlresolvers import reverse from django.utils.translation import ugettext_lazy as _ -from django.conf import settings from celery.task.control import inspect from permissions.api import check_permissions From 7289cafdfd709047c9bc3206687963466cca2d83 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Sat, 16 Jul 2011 04:03:17 -0400 Subject: [PATCH 07/14] Added python only converter backend supporting resizing, zooming and rotation --- apps/converter/backends/__init__.py | 26 +------ .../converter/backends/graphicsmagick/base.py | 5 +- apps/converter/backends/imagemagick/base.py | 7 +- apps/converter/backends/python/base.py | 78 +++++++++---------- apps/converter/literals.py | 2 +- 5 files changed, 45 insertions(+), 73 deletions(-) diff --git a/apps/converter/backends/__init__.py b/apps/converter/backends/__init__.py index 0b42ec89c2..a98881632b 100644 --- a/apps/converter/backends/__init__.py +++ 
b/apps/converter/backends/__init__.py @@ -2,13 +2,6 @@ class ConverterBase(object): """ Base class that all backend classes must inherit """ - - def identify_file(self, input_filepath, *args, **kwargs): - raise NotImplementedError("Your %s class has not defined a identify_file() method, which is required." % self.__class__.__name__) - - def identify_document(self, document, *args, **kwargs): - raise NotImplementedError("Your %s class has not defined a identify_document() method, which is required." % self.__class__.__name__) - def convert_file(self, input_filepath, *args, **kwargs): raise NotImplementedError("Your %s class has not defined a convert_file() method, which is required." % self.__class__.__name__) @@ -20,23 +13,6 @@ class ConverterBase(object): def get_available_transformations(self): raise NotImplementedError("Your %s class has not defined a get_available_transformations() method, which is required." % self.__class__.__name__) - - def get_transformation_string(self, transformation_list): - transformations = [] - warnings = [] - transformation_choices = self.get_available_transformations() - for transformation in transformation_list: - try: - if transformation['transformation'] in transformation_choices: - transformations.append( - transformation_choices[transformation['transformation']]['command_line'] % eval( - transformation['arguments'] - ) - ) - except Exception, e: - warnings.append(e) - - return u' '.join(transformations), warnings - + def get_page_count(self): raise NotImplementedError("Your %s class has not defined a get_page_count() method, which is required." 
% self.__class__.__name__) diff --git a/apps/converter/backends/graphicsmagick/base.py b/apps/converter/backends/graphicsmagick/base.py index 9ec5f6a951..4d3910391b 100644 --- a/apps/converter/backends/graphicsmagick/base.py +++ b/apps/converter/backends/graphicsmagick/base.py @@ -4,7 +4,8 @@ import re from converter.conf.settings import GM_PATH from converter.conf.settings import GM_SETTINGS from converter.literals import QUALITY_DEFAULT, QUALITY_SETTINGS -from converter.exceptions import ConvertError, UnknownFormat, IdentifyError +from converter.exceptions import ConvertError, UnknownFormat, \ + IdentifyError from converter.backends import ConverterBase from converter.literals import TRANSFORMATION_RESIZE, \ TRANSFORMATION_ROTATE, TRANSFORMATION_DENSITY, \ @@ -50,7 +51,7 @@ class ConverterClass(ConverterBase): arguments.append(u'-rotate') arguments.append(u'%s' % transformation['arguments']['degrees']) - if format == u'jpg': + if format == u'jpeg': arguments.append(u'-quality') arguments.append(u'85') diff --git a/apps/converter/backends/imagemagick/base.py b/apps/converter/backends/imagemagick/base.py index 11822db515..d448232760 100644 --- a/apps/converter/backends/imagemagick/base.py +++ b/apps/converter/backends/imagemagick/base.py @@ -1,8 +1,6 @@ import subprocess import re -from django.utils.translation import ugettext_lazy as _ - from converter.conf.settings import IM_IDENTIFY_PATH from converter.conf.settings import IM_CONVERT_PATH from converter.api import QUALITY_DEFAULT, QUALITY_SETTINGS @@ -12,6 +10,8 @@ from converter.backends import ConverterBase from converter.literals import TRANSFORMATION_RESIZE, \ TRANSFORMATION_ROTATE, TRANSFORMATION_DENSITY, \ TRANSFORMATION_ZOOM +from converter.literals import DIMENSION_SEPARATOR, DEFAULT_PAGE_NUMBER, \ + DEFAULT_FILE_FORMAT CONVERTER_ERROR_STRING_NO_DECODER = u'no decode delegate for this image format' @@ -30,7 +30,6 @@ class ConverterClass(ConverterBase): raise IdentifyError(proc.stderr.readline()) return 
proc.stdout.read() - def convert_file(self, input_filepath, output_filepath, quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT): arguments = [] if transformations: @@ -51,7 +50,7 @@ class ConverterClass(ConverterBase): arguments.append(u'-rotate') arguments.append(u'%s' % transformation['arguments']['degrees']) - if format == u'jpg': + if format == u'jpeg': arguments.append(u'-quality') arguments.append(u'85') diff --git a/apps/converter/backends/python/base.py b/apps/converter/backends/python/base.py index 4d776454e4..616e997d3f 100644 --- a/apps/converter/backends/python/base.py +++ b/apps/converter/backends/python/base.py @@ -6,13 +6,11 @@ from converter.literals import QUALITY_DEFAULT, QUALITY_SETTINGS from converter.exceptions import ConvertError, UnknownFormat, IdentifyError from converter.backends import ConverterBase from converter.literals import TRANSFORMATION_RESIZE, \ - TRANSFORMATION_ROTATE + TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM +from converter.literals import QUALITY_DEFAULT, DEFAULT_PAGE_NUMBER, \ + DEFAULT_FILE_FORMAT class ConverterClass(ConverterBase): - def identify_file(self, input_filepath, arguments=None): - pass - - def get_page_count(self, input_filepath): page_count = 1 im = Image.open(input_filepath) @@ -26,33 +24,39 @@ class ConverterClass(ConverterBase): pass # end of sequence return page_count + + def convert_file(self, input_filepath, output_filepath, transformations=None, quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT): + try: + im = Image.open(input_filepath) + except Exception: # Python Imaging Library doesn't recognize it as an image + raise UnknownFormat - - def convert_file(self, input_filepath, output_filepath, quality=QUALITY_DEFAULT, arguments=None): - im = Image.open(input_filepath) - outfile, format = output_filepath.split(u':') - im.save(outfile, format) - ''' - command = [] - command.append(unicode(GM_PATH)) - command.append(u'convert') - 
command.extend(unicode(QUALITY_SETTINGS[quality]).split()) - command.extend(unicode(GM_SETTINGS).split()) - command.append(unicode(input_filepath)) - if arguments: - command.extend(unicode(arguments).split()) - command.append(unicode(output_filepath)) - proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) - return_code = proc.wait() - if return_code != 0: - #Got an error from convert program - error_line = proc.stderr.readline() - if (CONVERTER_ERROR_STRING_NO_DECODER in error_line) or (CONVERTER_ERROR_STARTS_WITH in error_line): - #Try to determine from error message which class of error is it - raise UnknownFormat - else: - raise ConvertError(error_line) - ''' + current_page = 0 + try: + while current_page == page - 1: + im.seek(im.tell() + 1) + current_page += 1 + # do something to im + except EOFError: + pass # end of sequence + + if transformations: + for transformation in transformations: + aspect = 1.0 * im.size[1] / im.size[0] + if transformation['transformation'] == TRANSFORMATION_RESIZE: + width = int(transformation['arguments']['width']) + height = int(transformation['arguments'].get('height', 1.0 * width * aspect)) + im = im.resize((width, height), Image.ANTIALIAS) + elif transformation['transformation'] == TRANSFORMATION_ZOOM: + decimal_value = float(transformation['arguments']['percent']) / 100 + im = im.transform((im.size[0] * decimal_value, im.size[1] * decimal_value), Image.EXTENT, (0, 0, im.size[0], im.size[1])) + elif transformation['transformation'] == TRANSFORMATION_ROTATE: + # PIL counter degress counter-clockwise, reverse them + im = im.rotate(360 - transformation['arguments']['degrees']) + + if im.mode not in ('L', 'RGB'): + im = im.convert('RGB') + im.save(output_filepath, format=file_format) def get_format_list(self): """ @@ -65,16 +69,8 @@ class ConverterClass(ConverterBase): return formats - def get_available_transformations(self): return [ - TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE + 
TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, \ + TRANSFORMATION_ZOOM ] - - - def get_page_count(self, input_filepath): - try: - return len(self.identify_file(unicode(input_filepath)).splitlines()) - except: - #TODO: send to other page number identifying program - return 1 diff --git a/apps/converter/literals.py b/apps/converter/literals.py index 7671fddf8a..cb5af4c06c 100644 --- a/apps/converter/literals.py +++ b/apps/converter/literals.py @@ -8,7 +8,7 @@ from converter.conf.settings import PRINT_QUALITY_OPTIONS DEFAULT_ZOOM_LEVEL = 100 DEFAULT_ROTATION = 0 DEFAULT_PAGE_NUMBER = 1 -DEFAULT_FILE_FORMAT = u'jpg' +DEFAULT_FILE_FORMAT = u'jpeg' DEFAULT_OCR_FILE_FORMAT = u'tif' QUALITY_DEFAULT = u'quality_default' From 970ca0be057b66aa60658b4615e0779269d34a3d Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Sat, 16 Jul 2011 04:08:17 -0400 Subject: [PATCH 08/14] Fixed Imagemagick converter backend --- apps/converter/backends/imagemagick/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/converter/backends/imagemagick/base.py b/apps/converter/backends/imagemagick/base.py index d448232760..c9977fb3b4 100644 --- a/apps/converter/backends/imagemagick/base.py +++ b/apps/converter/backends/imagemagick/base.py @@ -3,7 +3,7 @@ import re from converter.conf.settings import IM_IDENTIFY_PATH from converter.conf.settings import IM_CONVERT_PATH -from converter.api import QUALITY_DEFAULT, QUALITY_SETTINGS +from converter.literals import QUALITY_DEFAULT, QUALITY_SETTINGS from converter.exceptions import ConvertError, UnknownFormat, \ IdentifyError from converter.backends import ConverterBase @@ -30,7 +30,7 @@ class ConverterClass(ConverterBase): raise IdentifyError(proc.stderr.readline()) return proc.stdout.read() - def convert_file(self, input_filepath, output_filepath, quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT): + def convert_file(self, input_filepath, output_filepath, transformations=None, 
quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT): arguments = [] if transformations: for transformation in transformations: From 5829bbde4ddc3534bef7ab9d3dd76c8250c5be66 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Sun, 17 Jul 2011 01:32:46 -0400 Subject: [PATCH 09/14] Added per OCR queue transformation models and CRUD views to replace the CONVERTER_OCR_OPTIONS with the new refactored converter transformations systems --- apps/converter/api.py | 4 +- apps/documents/models.py | 3 + apps/documents/views.py | 2 +- apps/ocr/__init__.py | 17 +++-- apps/ocr/api.py | 7 +- apps/ocr/forms.py | 21 ++++++ apps/ocr/manager.py | 18 ----- apps/ocr/managers.py | 41 +++++++++++ apps/ocr/models.py | 28 +++++++- apps/ocr/urls.py | 26 ++++--- apps/ocr/views.py | 152 +++++++++++++++++++++++++++++++++++++-- apps/sources/managers.py | 2 +- apps/sources/urls.py | 12 ++-- apps/sources/views.py | 6 +- 14 files changed, 284 insertions(+), 55 deletions(-) create mode 100644 apps/ocr/forms.py delete mode 100644 apps/ocr/manager.py create mode 100644 apps/ocr/managers.py diff --git a/apps/converter/api.py b/apps/converter/api.py index f5d5203bae..71a188a36d 100644 --- a/apps/converter/api.py +++ b/apps/converter/api.py @@ -178,10 +178,10 @@ def convert_document_for_ocr(document, page=DEFAULT_PAGE_NUMBER, file_format=DEF try: document_page = document.documentpage_set.get(page_number=page) - transformation_string, warnings = document_page.get_transformation_string() + transformations, warnings = document_page.get_transformation_list() #Apply default transformations - backend.convert_file(input_filepath=input_filepath, page=page, quality=QUALITY_HIGH, arguments=transformation_string, output_filepath=transformation_output_file) + backend.convert_file(input_filepath=input_filepath, page=page, quality=QUALITY_HIGH, transformations=transformations, output_filepath=transformation_output_file) #Do OCR operations 
backend.convert_file(input_filepath=transformation_output_file, arguments=OCR_OPTIONS, output_filepath=unpaper_input_file) # Process by unpaper diff --git a/apps/documents/models.py b/apps/documents/models.py index c058bdb694..b3eadb08e7 100644 --- a/apps/documents/models.py +++ b/apps/documents/models.py @@ -257,6 +257,9 @@ class DocumentPage(models.Model): verbose_name = _(u'document page') verbose_name_plural = _(u'document pages') + def get_transformation_list(self): + return DocumentPageTransformation.objects.get_for_document_page_as_list(self) + @models.permalink def get_absolute_url(self): return ('document_page_view', [self.pk]) diff --git a/apps/documents/views.py b/apps/documents/views.py index c727407b88..ee9be82b3e 100644 --- a/apps/documents/views.py +++ b/apps/documents/views.py @@ -305,7 +305,7 @@ def get_document_image(request, document_id, size=PREVIEW_SIZE, quality=QUALITY_ rotation = int(request.GET.get('rotation', DEFAULT_ROTATION)) % 360 document_page = get_object_or_404(document.documentpage_set, page_number=page) - transformations, warnings = DocumentPageTransformation.objects.get_for_document_page_as_list(document_page) + transformations, warnings = document_page.get_transformation_list() if warnings and (request.user.is_staff or request.user.is_superuser): for warning in warnings: diff --git a/apps/ocr/__init__.py b/apps/ocr/__init__.py index 55aa3d5b60..00efe7b276 100644 --- a/apps/ocr/__init__.py +++ b/apps/ocr/__init__.py @@ -9,7 +9,7 @@ from documents.models import Document from main.api import register_tool from ocr.conf.settings import AUTOMATIC_OCR -from ocr.models import DocumentQueue +from ocr.models import DocumentQueue, QueueTransformation #Permissions PERMISSION_OCR_DOCUMENT = {'namespace': 'ocr', 'name': 'ocr_document', 'label': _(u'Submit document for OCR')} @@ -30,20 +30,27 @@ re_queue_multiple_document = {'text': _('re-queue'), 'view': 're_queue_multiple_ queue_document_delete = {'text': _(u'delete'), 'view': 
'queue_document_delete', 'args': 'object.id', 'famfam': 'hourglass_delete', 'permissions': [PERMISSION_OCR_DOCUMENT_DELETE]} queue_document_multiple_delete = {'text': _(u'delete'), 'view': 'queue_document_multiple_delete', 'famfam': 'hourglass_delete', 'permissions': [PERMISSION_OCR_DOCUMENT_DELETE]} -document_queue_disable = {'text': _(u'stop queue'), 'view': 'document_queue_disable', 'args': 'object.id', 'famfam': 'control_stop_blue', 'permissions': [PERMISSION_OCR_QUEUE_ENABLE_DISABLE]} -document_queue_enable = {'text': _(u'activate queue'), 'view': 'document_queue_enable', 'args': 'object.id', 'famfam': 'control_play_blue', 'permissions': [PERMISSION_OCR_QUEUE_ENABLE_DISABLE]} +document_queue_disable = {'text': _(u'stop queue'), 'view': 'document_queue_disable', 'args': 'queue.id', 'famfam': 'control_stop_blue', 'permissions': [PERMISSION_OCR_QUEUE_ENABLE_DISABLE]} +document_queue_enable = {'text': _(u'activate queue'), 'view': 'document_queue_enable', 'args': 'queue.id', 'famfam': 'control_play_blue', 'permissions': [PERMISSION_OCR_QUEUE_ENABLE_DISABLE]} all_document_ocr_cleanup = {'text': _(u'clean up pages content'), 'view': 'all_document_ocr_cleanup', 'famfam': 'text_strikethrough', 'permissions': [PERMISSION_OCR_CLEAN_ALL_PAGES], 'description': _(u'Runs a language filter to remove common OCR mistakes from document pages content.')} queue_document_list = {'text': _(u'queue document list'), 'view': 'queue_document_list', 'famfam': 'hourglass', 'permissions': [PERMISSION_OCR_DOCUMENT]} node_active_list = {'text': _(u'active tasks'), 'view': 'node_active_list', 'famfam': 'server_chart', 'permissions': [PERMISSION_OCR_DOCUMENT]} +setup_queue_transformation_list = {'text': _(u'transformations'), 'view': 'setup_queue_transformation_list', 'args': 'queue.pk', 'famfam': 'shape_move_front'} +setup_queue_transformation_create = {'text': _(u'add transformation'), 'view': 'setup_queue_transformation_create', 'args': 'queue.pk', 'famfam': 'shape_square_add'} 
+setup_queue_transformation_edit = {'text': _(u'edit'), 'view': 'setup_queue_transformation_edit', 'args': 'transformation.pk', 'famfam': 'shape_square_edit'} +setup_queue_transformation_delete = {'text': _(u'delete'), 'view': 'setup_queue_transformation_delete', 'args': 'transformation.pk', 'famfam': 'shape_square_delete'} + register_links(Document, [submit_document]) -register_links(DocumentQueue, [document_queue_disable, document_queue_enable]) +register_links(DocumentQueue, [document_queue_disable, document_queue_enable, setup_queue_transformation_list]) +register_links(QueueTransformation, [setup_queue_transformation_edit, setup_queue_transformation_delete]) register_multi_item_links(['queue_document_list'], [re_queue_multiple_document, queue_document_multiple_delete]) -register_links(['queue_document_list', 'node_active_list'], [queue_document_list, node_active_list], menu_name='secondary_menu') +register_links(['setup_queue_transformation_create', 'setup_queue_transformation_edit', 'setup_queue_transformation_delete', 'document_queue_disable', 'document_queue_enable', 'queue_document_list', 'node_active_list', 'setup_queue_transformation_list'], [queue_document_list, node_active_list], menu_name='secondary_menu') +register_links(['setup_queue_transformation_edit', 'setup_queue_transformation_delete', 'setup_queue_transformation_list', 'setup_queue_transformation_create'], [setup_queue_transformation_create], menu_name='sidebar') register_tool(all_document_ocr_cleanup, namespace='ocr', title=_(u'OCR')) diff --git a/apps/ocr/api.py b/apps/ocr/api.py index bc9f775f76..88e9c20356 100644 --- a/apps/ocr/api.py +++ b/apps/ocr/api.py @@ -79,7 +79,7 @@ def do_document_ocr(document): trying to extract text from PDF using pdftotext then by calling tesseract """ - for page_index, document_page in enumerate(document.documentpage_set.all()): + for document_page in document.documentpage_set.all(): desc, filepath = tempfile.mkstemp() imagefile = None source = u'' @@ -91,7 
+91,7 @@ def do_document_ocr(document): cleanup(pdf_filename) if os.stat(filepath).st_size == 0: #PDF page had no text, run tesseract on the page - imagefile = convert_document_for_ocr(document, page=page_index) + imagefile = convert_document_for_ocr(document, page=document_page.page_number) run_tesseract(imagefile, filepath, TESSERACT_LANGUAGE) ocr_output = os.extsep.join([filepath, u'txt']) source = _(u'Text from OCR') @@ -99,12 +99,11 @@ def do_document_ocr(document): ocr_output = filepath source = _(u'Text extracted from PDF') else: - imagefile = convert_document_for_ocr(document, page=page_index) + imagefile = convert_document_for_ocr(document, page=document_page.page_number) run_tesseract(imagefile, filepath, TESSERACT_LANGUAGE) ocr_output = os.extsep.join([filepath, u'txt']) source = _(u'Text from OCR') f = codecs.open(ocr_output, 'r', 'utf-8') - document_page = document.documentpage_set.get(page_number=page_index + 1) document_page.content = ocr_cleanup(f.read().strip()) document_page.page_label = source document_page.save() diff --git a/apps/ocr/forms.py b/apps/ocr/forms.py new file mode 100644 index 0000000000..c88e7a8351 --- /dev/null +++ b/apps/ocr/forms.py @@ -0,0 +1,21 @@ +from django import forms +from django.utils.translation import ugettext_lazy as _ +from django.utils.translation import ugettext + +from ocr.models import QueueTransformation + + +class QueueTransformationForm(forms.ModelForm): + class Meta: + model = QueueTransformation + + def __init__(self, *args, **kwargs): + super(QueueTransformationForm, self).__init__(*args, **kwargs) + self.fields['content_type'].widget = forms.HiddenInput() + self.fields['object_id'].widget = forms.HiddenInput() + + +class QueueTransformationForm_create(forms.ModelForm): + class Meta: + model = QueueTransformation + exclude = ('content_type', 'object_id') diff --git a/apps/ocr/manager.py b/apps/ocr/manager.py deleted file mode 100644 index a1fdb80b8c..0000000000 --- a/apps/ocr/manager.py +++ /dev/null @@ 
-1,18 +0,0 @@ -from django.db import models - -from ocr.exceptions import AlreadyQueued - - -class DocumentQueueManager(models.Manager): - """ - Module manager class to handle adding documents to an OCR document - queue - """ - def queue_document(self, document, queue_name='default'): - document_queue = self.model.objects.get(name=queue_name) - if document_queue.queuedocument_set.filter(document=document): - raise AlreadyQueued - - document_queue.queuedocument_set.create(document=document, delay=True) - - return document_queue diff --git a/apps/ocr/managers.py b/apps/ocr/managers.py new file mode 100644 index 0000000000..a3ed9621ad --- /dev/null +++ b/apps/ocr/managers.py @@ -0,0 +1,41 @@ +from django.db import models +from django.contrib.contenttypes.models import ContentType + +from ocr.exceptions import AlreadyQueued + + +class DocumentQueueManager(models.Manager): + """ + Module manager class to handle adding documents to an OCR document + queue + """ + def queue_document(self, document, queue_name='default'): + document_queue = self.model.objects.get(name=queue_name) + if document_queue.queuedocument_set.filter(document=document): + raise AlreadyQueued + + document_queue.queuedocument_set.create(document=document, delay=True) + + return document_queue + + +class QueueTransformationManager(models.Manager): + def get_for_object(self, obj): + ct = ContentType.objects.get_for_model(obj) + return self.model.objects.filter(content_type=ct).filter(object_id=obj.pk) + + def get_for_object_as_list(self, obj): + warnings = [] + transformations = [] + for transformation in self.get_for_object(obj).values('transformation', 'arguments'): + try: + transformations.append( + { + 'transformation': transformation['transformation'], + 'arguments': eval(transformation['arguments'], {}) + } + ) + except Exception, e: + warnings.append(e) + + return transformations, warnings diff --git a/apps/ocr/models.py b/apps/ocr/models.py index f9567e2b0a..bfcbf74aec 100644 --- 
a/apps/ocr/models.py +++ b/apps/ocr/models.py @@ -2,13 +2,16 @@ from django.db import models from django.utils.translation import ugettext_lazy as _ from django.utils.translation import ugettext from django.core.exceptions import ObjectDoesNotExist +from django.contrib.contenttypes.models import ContentType +from django.contrib.contenttypes import generic from documents.models import Document +from converter.api import get_available_transformations_choices from ocr.literals import DOCUMENTQUEUE_STATE_STOPPED, \ DOCUMENTQUEUE_STATE_CHOICES, QUEUEDOCUMENT_STATE_PENDING, \ QUEUEDOCUMENT_STATE_CHOICES -from ocr.manager import DocumentQueueManager +from ocr.managers import DocumentQueueManager, QueueTransformationManager class DocumentQueue(models.Model): @@ -51,3 +54,26 @@ class QueueDocument(models.Model): return unicode(self.document) except ObjectDoesNotExist: return ugettext(u'Missing document.') + + +class QueueTransformation(models.Model): + """ + Model that stores the transformation and transformation arguments + for a given document queue + """ + content_type = models.ForeignKey(ContentType) + object_id = models.PositiveIntegerField() + content_object = generic.GenericForeignKey('content_type', 'object_id') + order = models.PositiveIntegerField(default=0, blank=True, null=True, verbose_name=_(u'order'), db_index=True) + transformation = models.CharField(choices=get_available_transformations_choices(), max_length=128, verbose_name=_(u'transformation')) + arguments = models.TextField(blank=True, null=True, verbose_name=_(u'arguments'), help_text=_(u'Use dictionaries to indentify arguments, example: %s') % u'{\'degrees\':90}') + + objects = QueueTransformationManager() + + def __unicode__(self): + return self.get_transformation_display() + + class Meta: + ordering = ('order',) + verbose_name = _(u'document queue transformation') + verbose_name_plural = _(u'document queue transformations') diff --git a/apps/ocr/urls.py b/apps/ocr/urls.py index 
6bddd3d7fe..cb090cf065 100644 --- a/apps/ocr/urls.py +++ b/apps/ocr/urls.py @@ -1,16 +1,22 @@ from django.conf.urls.defaults import patterns, url urlpatterns = patterns('ocr.views', - url(r'^(?P\d+)/submit/$', 'submit_document', (), 'submit_document'), - url(r'^ocr/queue/document/list/$', 'queue_document_list', (), 'queue_document_list'), - url(r'^ocr/queue/document/(?P\d+)/delete/$', 'queue_document_delete', (), 'queue_document_delete'), - url(r'^ocr/queue/document/multiple/delete/$', 'queue_document_multiple_delete', (), 'queue_document_multiple_delete'), - url(r'^ocr/queue/document/(?P\d+)/re-queue/$', 're_queue_document', (), 're_queue_document'), - url(r'^ocr/queue/document/multiple/re-queue/$', 're_queue_multiple_document', (), 're_queue_multiple_document'), + url(r'^document/(?P\d+)/submit/$', 'submit_document', (), 'submit_document'), + url(r'^queue/document/list/$', 'queue_document_list', (), 'queue_document_list'), + url(r'^queue/document/(?P\d+)/delete/$', 'queue_document_delete', (), 'queue_document_delete'), + url(r'^queue/document/multiple/delete/$', 'queue_document_multiple_delete', (), 'queue_document_multiple_delete'), + url(r'^queue/document/(?P\d+)/re-queue/$', 're_queue_document', (), 're_queue_document'), + url(r'^queue/document/multiple/re-queue/$', 're_queue_multiple_document', (), 're_queue_multiple_document'), - url(r'^ocr/queue/(?P\d+)/enable/$', 'document_queue_enable', (), 'document_queue_enable'), - url(r'^ocr/queue/(?P\d+)/disable/$', 'document_queue_disable', (), 'document_queue_disable'), + url(r'^queue/(?P\d+)/enable/$', 'document_queue_enable', (), 'document_queue_enable'), + url(r'^queue/(?P\d+)/disable/$', 'document_queue_disable', (), 'document_queue_disable'), - url(r'^ocr/document/all/clean_up/$', 'all_document_ocr_cleanup', (), 'all_document_ocr_cleanup'), - url(r'^ocr/node/active/list/$', 'node_active_list', (), 'node_active_list'), + url(r'^document/all/clean_up/$', 'all_document_ocr_cleanup', (), 
'all_document_ocr_cleanup'), + url(r'^node/active/list/$', 'node_active_list', (), 'node_active_list'), + + url(r'^queue/(?P\d+)/transformation/list/$', 'setup_queue_transformation_list', (), 'setup_queue_transformation_list'), + url(r'^queue/(?P\w+)/transformation/create/$', 'setup_queue_transformation_create', (), 'setup_queue_transformation_create'), + url(r'^queue/transformation/(?P\w+)/edit/$', 'setup_queue_transformation_edit', (), 'setup_queue_transformation_edit'), + url(r'^queue/transformation/(?P\w+)/delete/$', 'setup_queue_transformation_delete', (), 'setup_queue_transformation_delete'), + ) diff --git a/apps/ocr/views.py b/apps/ocr/views.py index 1dbff3e278..6ee6a96393 100644 --- a/apps/ocr/views.py +++ b/apps/ocr/views.py @@ -7,6 +7,7 @@ from django.template import RequestContext from django.contrib import messages from django.views.generic.list_detail import object_list from django.utils.translation import ugettext_lazy as _ +from django.core.urlresolvers import reverse from celery.task.control import inspect from permissions.api import check_permissions @@ -16,12 +17,13 @@ from documents.widgets import document_link, document_thumbnail from ocr import PERMISSION_OCR_DOCUMENT, PERMISSION_OCR_DOCUMENT_DELETE, \ PERMISSION_OCR_QUEUE_ENABLE_DISABLE, PERMISSION_OCR_CLEAN_ALL_PAGES -from ocr.models import DocumentQueue, QueueDocument +from ocr.models import DocumentQueue, QueueDocument, QueueTransformation from ocr.literals import QUEUEDOCUMENT_STATE_PENDING, \ QUEUEDOCUMENT_STATE_PROCESSING, DOCUMENTQUEUE_STATE_STOPPED, \ DOCUMENTQUEUE_STATE_ACTIVE from ocr.exceptions import AlreadyQueued from ocr.api import clean_pages +from ocr.forms import QueueTransformationForm, QueueTransformationForm_create def queue_document_list(request, queue_name='default'): @@ -36,8 +38,10 @@ def queue_document_list(request, queue_name='default'): extra_context={ 'title': _(u'documents in queue: %s') % document_queue, 'hide_object': True, - 'object': document_queue, + 'queue': 
document_queue, 'object_name': _(u'document queue'), + 'navigation_object_name': 'queue', + 'list_object_variable_name': 'queue_document', 'extra_columns': [ {'name': 'document', 'attribute': lambda x: document_link(x.document) if hasattr(x, 'document') else _(u'Missing document.')}, {'name': _(u'thumbnail'), 'attribute': lambda x: document_thumbnail(x.document)}, @@ -210,7 +214,8 @@ def document_queue_disable(request, document_queue_id): return HttpResponseRedirect(next) return render_to_response('generic_confirm.html', { - 'object': document_queue, + 'queue': document_queue, + 'navigation_object_name': 'queue', 'title': _(u'Are you sure you wish to disable document queue: %s') % document_queue, 'next': next, 'previous': previous, @@ -236,7 +241,8 @@ def document_queue_enable(request, document_queue_id): return HttpResponseRedirect(next) return render_to_response('generic_confirm.html', { - 'object': document_queue, + 'queue': document_queue, + 'navigation_object_name': 'queue', 'title': _(u'Are you sure you wish to activate document queue: %s') % document_queue, 'next': next, 'previous': previous, @@ -315,3 +321,141 @@ def node_active_list(request): {'name': _(u'related object'), 'attribute': lambda x: display_link(x['related_object']) if x['related_object'] else u''} ], }, context_instance=RequestContext(request)) + + +def setup_queue_transformation_list(request, document_queue_id): + #check_permissions(request.user, [PERMISSION_SOURCES_SETUP_EDIT]) + + document_queue = get_object_or_404(DocumentQueue, pk=document_queue_id) + + context = { + 'object_list': QueueTransformation.objects.get_for_object(document_queue), + 'title': _(u'transformations for: %s') % document_queue, + #'object_name': _(u'document queue'), + #'object': document_queue, + 'queue': document_queue, + 'object_name': _(u'document queue'), + 'navigation_object_name': 'queue', + 'list_object_variable_name': 'transformation', + 'extra_columns': [ + {'name': _(u'order'), 'attribute': 'order'}, + 
{'name': _(u'transformation'), 'attribute': lambda x: x.get_transformation_display()}, + {'name': _(u'arguments'), 'attribute': 'arguments'} + ], + 'hide_link': True, + 'hide_object': True, + } + + return render_to_response('generic_list.html', context, + context_instance=RequestContext(request)) + + +def setup_queue_transformation_edit(request, transformation_id): + #check_permissions(request.user, [PERMISSION_SOURCES_SETUP_EDIT]) + + transformation = get_object_or_404(QueueTransformation, pk=transformation_id) + redirect_view = reverse('setup_queue_transformation_list', args=[transformation.content_object.pk]) + next = request.POST.get('next', request.GET.get('next', request.META.get('HTTP_REFERER', redirect_view))) + + if request.method == 'POST': + form = QueueTransformationForm(instance=transformation, data=request.POST) + if form.is_valid(): + try: + # Test the validity of the argument field + eval(form.cleaned_data['arguments'], {}) + except: + messages.error(request, _(u'Queue transformation argument error.')) + else: + try: + form.save() + messages.success(request, _(u'Queue transformation edited successfully')) + return HttpResponseRedirect(next) + except Exception, e: + messages.error(request, _(u'Error editing queue transformation; %s') % e) + else: + form = QueueTransformationForm(instance=transformation) + + return render_to_response('generic_form.html', { + 'title': _(u'Edit transformation: %s') % transformation, + 'form': form, + 'queue': transformation.content_object, + 'transformation': transformation, + 'navigation_object_list': [ + {'object': 'queue', 'name': _(u'document queue')}, + {'object': 'transformation', 'name': _(u'transformation')} + ], + 'next': next, + }, + context_instance=RequestContext(request)) + + +def setup_queue_transformation_delete(request, transformation_id): + #check_permissions(request.user, [PERMISSION_SOURCES_SETUP_EDIT]) + + transformation = get_object_or_404(QueueTransformation, pk=transformation_id) + redirect_view = 
reverse('setup_queue_transformation_list', args=[transformation.content_object.pk]) + previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', redirect_view))) + + if request.method == 'POST': + try: + transformation.delete() + messages.success(request, _(u'Queue transformation deleted successfully.')) + except Exception, e: + messages.error(request, _(u'Error deleting queue transformation; %(error)s') % { + 'error': e} + ) + return HttpResponseRedirect(redirect_view) + + return render_to_response('generic_confirm.html', { + 'delete_view': True, + 'transformation': transformation, + 'queue': transformation.content_object, + 'navigation_object_list': [ + {'object': 'queue', 'name': _(u'document queue')}, + {'object': 'transformation', 'name': _(u'transformation')} + ], + 'title': _(u'Are you sure you wish to delete queue transformation "%(transformation)s"') % { + 'transformation': transformation.get_transformation_display(), + }, + 'previous': previous, + 'form_icon': u'shape_square_delete.png', + }, + context_instance=RequestContext(request)) + + +def setup_queue_transformation_create(request, document_queue_id): + #check_permissions(request.user, [PERMISSION_SOURCES_SETUP_EDIT]) + + document_queue = get_object_or_404(DocumentQueue, pk=document_queue_id) + + redirect_view = reverse('setup_queue_transformation_list', args=[document_queue.pk]) + previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', redirect_view))) + + if request.method == 'POST': + form = QueueTransformationForm_create(request.POST) + if form.is_valid(): + try: + # Test the validity of the argument field + eval(form.cleaned_data['arguments'], {}) + except: + messages.error(request, _(u'Queue transformation argument error.')) + else: + try: + queue_tranformation = form.save(commit=False) + queue_tranformation.content_object = document_queue + queue_tranformation.save() + messages.success(request, _(u'Queue 
transformation created successfully')) + return HttpResponseRedirect(redirect_view) + except Exception, e: + messages.error(request, _(u'Error creating queue transformation; %s') % e) + else: + form = QueueTransformationForm_create() + + return render_to_response('generic_form.html', { + 'form': form, + 'queue': document_queue, + 'object_name': _(u'document queue'), + 'navigation_object_name': 'queue', + 'title': _(u'Create new transformation for queue: %s') % document_queue, + }, context_instance=RequestContext(request)) + diff --git a/apps/sources/managers.py b/apps/sources/managers.py index f45e06e340..e27a6468a9 100644 --- a/apps/sources/managers.py +++ b/apps/sources/managers.py @@ -21,4 +21,4 @@ class SourceTransformationManager(models.Manager): except Exception, e: warnings.append(e) - return transformations, warnings + return transformations, warnings diff --git a/apps/sources/urls.py b/apps/sources/urls.py index 354ec22b0a..5d6a015f92 100644 --- a/apps/sources/urls.py +++ b/apps/sources/urls.py @@ -16,12 +16,12 @@ urlpatterns = patterns('sources.views', url(r'^setup/interactive/staging_folder/list/$', 'setup_source_list', {'source_type': SOURCE_CHOICE_STAGING}, 'setup_staging_folder_list'), url(r'^setup/interactive/(?P\w+)/list/$', 'setup_source_list', (), 'setup_source_list'), - url(r'^setup/interactive/(?P\w+)/(?P\w+)/edit/$', 'setup_source_edit', (), 'setup_source_edit'), - url(r'^setup/interactive/(?P\w+)/(?P\w+)/delete/$', 'setup_source_delete', (), 'setup_source_delete'), + url(r'^setup/interactive/(?P\w+)/(?P\d+)/edit/$', 'setup_source_edit', (), 'setup_source_edit'), + url(r'^setup/interactive/(?P\w+)/(?P\d+)/delete/$', 'setup_source_delete', (), 'setup_source_delete'), url(r'^setup/interactive/(?P\w+)/create/$', 'setup_source_create', (), 'setup_source_create'), - url(r'^setup/interactive/(?P\w+)/(?P\w+)/transformation/list/$', 'setup_source_transformation_list', (), 'setup_source_transformation_list'), - 
url(r'^setup/interactive/(?P\w+)/(?P\w+)/transformation/create/$', 'setup_source_transformation_create', (), 'setup_source_transformation_create'), - url(r'^setup/interactive/source/transformation/(?P\w+)/edit/$', 'setup_source_transformation_edit', (), 'setup_source_transformation_edit'), - url(r'^setup/interactive/source/transformation/(?P\w+)/delete/$', 'setup_source_transformation_delete', (), 'setup_source_transformation_delete'), + url(r'^setup/interactive/(?P\w+)/(?P\d+)/transformation/list/$', 'setup_source_transformation_list', (), 'setup_source_transformation_list'), + url(r'^setup/interactive/(?P\w+)/(?P\d+)/transformation/create/$', 'setup_source_transformation_create', (), 'setup_source_transformation_create'), + url(r'^setup/interactive/source/transformation/(?P\d+)/edit/$', 'setup_source_transformation_edit', (), 'setup_source_transformation_edit'), + url(r'^setup/interactive/source/transformation/(?P\d+)/delete/$', 'setup_source_transformation_delete', (), 'setup_source_transformation_delete'), ) diff --git a/apps/sources/views.py b/apps/sources/views.py index 05766d78a9..8732d66913 100644 --- a/apps/sources/views.py +++ b/apps/sources/views.py @@ -562,9 +562,9 @@ def setup_source_transformation_delete(request, transformation_id): if request.method == 'POST': try: source_transformation.delete() - messages.success(request, _(u'Transformation deleted successfully.')) + messages.success(request, _(u'Source transformation deleted successfully.')) except Exception, e: - messages.error(request, _(u'Error deleting transformation; %(error)s') % { + messages.error(request, _(u'Error deleting source transformation; %(error)s') % { 'error': e} ) return HttpResponseRedirect(redirect_view) @@ -577,7 +577,7 @@ def setup_source_transformation_delete(request, transformation_id): {'object': 'source', 'name': _(u'source')}, {'object': 'transformation', 'name': _(u'transformation')} ], - 'title': _(u'Are you sure you wish to delete transformation 
"%(transformation)s"') % { + 'title': _(u'Are you sure you wish to delete source transformation "%(transformation)s"') % { 'transformation': source_transformation.get_transformation_display(), }, 'previous': previous, From 6c7ac588c79736bdd37a0c03dc0899e6fc193399 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Mon, 18 Jul 2011 04:00:21 -0400 Subject: [PATCH 10/14] Added mimetype detection utility --- apps/converter/utils.py | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/apps/converter/utils.py b/apps/converter/utils.py index 5fc106a940..4653b6dc9d 100644 --- a/apps/converter/utils.py +++ b/apps/converter/utils.py @@ -3,7 +3,15 @@ import os from django.core.exceptions import ImproperlyConfigured from django.utils.importlib import import_module - +try: + from python_magic import magic + USE_PYTHON_MAGIC = True +except: + import mimetypes + mimetypes.init() + USE_PYTHON_MAGIC = False + + #http://stackoverflow.com/questions/123198/how-do-i-copy-a-file-in-python def copyfile(source, dest, buffer_size=1024 * 1024): """ @@ -72,3 +80,32 @@ def load_backend(): raise ImproperlyConfigured(error_msg) else: raise # If there's some other error, this must be an error in Mayan itself. 
+ + +def get_mimetype(filepath): + """ + Determine a file's mimetype by calling the system's libmagic + library via python-magic or fallback to use python's mimetypes + library + """ + file_mimetype = u'' + file_mime_encoding = u'' + + if USE_PYTHON_MAGIC: + if os.path.exists(filepath): + try: + source = open(filepath, 'r') + mime = magic.Magic(mime=True) + file_mimetype = mime.from_buffer(source.read()) + source.seek(0) + mime_encoding = magic.Magic(mime_encoding=True) + file_mime_encoding = mime_encoding.from_buffer(source.read()) + finally: + if source: + source.close() + else: + path, filename = os.path.split(filepath) + file_mimetype, file_mime_encoding = mimetypes.guess_type(filename) + + return file_mimetype, file_mime_encoding + From 49e4c040d85ae9dfa500bd786228f9aa832f6c90 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Mon, 18 Jul 2011 04:01:11 -0400 Subject: [PATCH 11/14] Removed DENSITY from the supported converter backend transformations --- apps/converter/backends/graphicsmagick/base.py | 2 +- apps/converter/backends/imagemagick/base.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/converter/backends/graphicsmagick/base.py b/apps/converter/backends/graphicsmagick/base.py index 4d3910391b..54ebbaaa95 100644 --- a/apps/converter/backends/graphicsmagick/base.py +++ b/apps/converter/backends/graphicsmagick/base.py @@ -108,7 +108,7 @@ class ConverterClass(ConverterBase): def get_available_transformations(self): return [ TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, \ - TRANSFORMATION_DENSITY, TRANSFORMATION_ZOOM + TRANSFORMATION_ZOOM ] def get_page_count(self, input_filepath): diff --git a/apps/converter/backends/imagemagick/base.py b/apps/converter/backends/imagemagick/base.py index c9977fb3b4..4f924316ed 100644 --- a/apps/converter/backends/imagemagick/base.py +++ b/apps/converter/backends/imagemagick/base.py @@ -106,7 +106,7 @@ class ConverterClass(ConverterBase): def get_available_transformations(self): return [ 
TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, \ - TRANSFORMATION_DENSITY, TRANSFORMATION_ZOOM + TRANSFORMATION_ZOOM ] From ac43e294b3eae721055393a76150b24ed8429ea7 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Mon, 18 Jul 2011 04:04:22 -0400 Subject: [PATCH 12/14] Added PDF page count support to the python converter backend --- apps/converter/backends/python/base.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/apps/converter/backends/python/base.py b/apps/converter/backends/python/base.py index 616e997d3f..25448346ff 100644 --- a/apps/converter/backends/python/base.py +++ b/apps/converter/backends/python/base.py @@ -1,3 +1,4 @@ +import slate from PIL import Image from django.utils.translation import ugettext_lazy as _ @@ -9,12 +10,28 @@ from converter.literals import TRANSFORMATION_RESIZE, \ TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM from converter.literals import QUALITY_DEFAULT, DEFAULT_PAGE_NUMBER, \ DEFAULT_FILE_FORMAT +from converter.utils import get_mimetype + class ConverterClass(ConverterBase): def get_page_count(self, input_filepath): page_count = 1 - im = Image.open(input_filepath) - + + mimetype, encoding = get_mimetype(input_filepath) + if mimetype == 'application/pdf': + # If file is a PDF open it with slate to determine the page + # count + with open(input_filepath) as fd: + pages = slate.PDF(fd) + return len(pages) + + try: + im = Image.open(input_filepath) + except IOError: #cannot identify image file + # Return a page count of 1, to atleast allow the document + # to be created + return 1 + try: while 1: im.seek(im.tell()+1) From 5bfd607b31d72a9d8db59111d042f722fe354208 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Mon, 18 Jul 2011 04:06:19 -0400 Subject: [PATCH 13/14] Removed pdftotext from the requirements, move unpaper calling to the OCR app --- apps/converter/api.py | 49 ----------- apps/converter/conf/settings.py | 3 +- apps/ocr/api.py | 142 +++++++++++++++++++------------- 
apps/ocr/conf/settings.py | 5 +- apps/ocr/exceptions.py | 4 - 5 files changed, 89 insertions(+), 114 deletions(-) diff --git a/apps/converter/api.py b/apps/converter/api.py index 71a188a36d..665a980c27 100644 --- a/apps/converter/api.py +++ b/apps/converter/api.py @@ -5,8 +5,6 @@ import hashlib from common import TEMPORARY_DIRECTORY from documents.utils import document_save_to_temp_dir -from converter.conf.settings import UNPAPER_PATH -from converter.conf.settings import OCR_OPTIONS from converter.conf.settings import UNOCONV_PATH from converter.exceptions import UnpaperError, OfficeConversionError from converter.literals import DEFAULT_PAGE_NUMBER, \ @@ -36,21 +34,6 @@ def cleanup(filename): pass -def execute_unpaper(input_filepath, output_filepath): - """ - Executes the program unpaper using subprocess's Popen - """ - command = [] - command.append(UNPAPER_PATH) - command.append(u'--overwrite') - command.append(input_filepath) - command.append(output_filepath) - proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE) - return_code = proc.wait() - if return_code != 0: - raise UnpaperError(proc.stderr.readline()) - - def execute_unoconv(input_filepath, arguments=''): """ Executes the program unoconv using subprocess's Popen @@ -164,38 +147,6 @@ def get_document_dimensions(document, *args, **kwargs): return [0, 0] -def convert_document_for_ocr(document, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_OCR_FILE_FORMAT): - #Extract document file - input_filepath = document_save_to_temp_dir(document, document.uuid) - - #Convert for OCR - temp_filename, separator = os.path.splitext(os.path.basename(input_filepath)) - temp_path = os.path.join(TEMPORARY_DIRECTORY, temp_filename) - transformation_output_file = u'%s_trans%s%s%s' % (temp_path, page, os.extsep, file_format) - unpaper_input_file = u'%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep) - unpaper_output_file = u'%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep) - convert_output_file = 
u'%s_ocr%s%s%s' % (temp_path, page, os.extsep, file_format) - - try: - document_page = document.documentpage_set.get(page_number=page) - transformations, warnings = document_page.get_transformation_list() - - #Apply default transformations - backend.convert_file(input_filepath=input_filepath, page=page, quality=QUALITY_HIGH, transformations=transformations, output_filepath=transformation_output_file) - #Do OCR operations - backend.convert_file(input_filepath=transformation_output_file, arguments=OCR_OPTIONS, output_filepath=unpaper_input_file) - # Process by unpaper - execute_unpaper(input_filepath=unpaper_input_file, output_filepath=unpaper_output_file) - # Convert to tif - backend.convert_file(input_filepath=unpaper_output_file, output_filepath=convert_output_file) - finally: - cleanup(transformation_output_file) - cleanup(unpaper_input_file) - cleanup(unpaper_output_file) - - return convert_output_file - - def get_available_transformations_choices(): result = [] for transformation in backend.get_available_transformations(): diff --git a/apps/converter/conf/settings.py b/apps/converter/conf/settings.py index fcaa1ec9b0..95aee33b92 100644 --- a/apps/converter/conf/settings.py +++ b/apps/converter/conf/settings.py @@ -9,12 +9,11 @@ register_settings( settings=[ {'name': u'IM_CONVERT_PATH', 'global_name': u'CONVERTER_IM_CONVERT_PATH', 'default': u'/usr/bin/convert', 'description': _(u'File path to imagemagick\'s convert program.'), 'exists': True}, {'name': u'IM_IDENTIFY_PATH', 'global_name': u'CONVERTER_IM_IDENTIFY_PATH', 'default': u'/usr/bin/identify', 'description': _(u'File path to imagemagick\'s identify program.'), 'exists': True}, - {'name': u'UNPAPER_PATH', 'global_name': u'CONVERTER_UNPAPER_PATH', 'default': u'/usr/bin/unpaper', 'description': _(u'File path to unpaper program.'), 'exists': True}, {'name': u'GM_PATH', 'global_name': u'CONVERTER_GM_PATH', 'default': u'/usr/bin/gm', 'description': _(u'File path to graphicsmagick\'s program.'), 'exists': 
True}, {'name': u'GM_SETTINGS', 'global_name': u'CONVERTER_GM_SETTINGS', 'default': u''}, {'name': u'GRAPHICS_BACKEND', 'global_name': u'CONVERTER_GRAPHICS_BACKEND', 'default': u'converter.backends.python', 'description': _(u'Graphics conversion backend to use. Options are: converter.backends.imagemagick, converter.backends.graphicsmagick and converter.backends.python.')}, {'name': u'UNOCONV_PATH', 'global_name': u'CONVERTER_UNOCONV_PATH', 'default': u'/usr/bin/unoconv', 'exists': True}, - {'name': u'OCR_OPTIONS', 'global_name': u'CONVERTER_OCR_OPTIONS', 'default': u'-colorspace Gray -depth 8 -resample 200x200'}, + #{'name': u'OCR_OPTIONS', 'global_name': u'CONVERTER_OCR_OPTIONS', 'default': u'-colorspace Gray -depth 8 -resample 200x200'}, {'name': u'DEFAULT_OPTIONS', 'global_name': u'CONVERTER_DEFAULT_OPTIONS', 'default': u''}, {'name': u'LOW_QUALITY_OPTIONS', 'global_name': u'CONVERTER_LOW_QUALITY_OPTIONS', 'default': u''}, {'name': u'HIGH_QUALITY_OPTIONS', 'global_name': u'CONVERTER_HIGH_QUALITY_OPTIONS', 'default': u'-density 400'}, diff --git a/apps/ocr/api.py b/apps/ocr/api.py index 88e9c20356..ec89a669c9 100644 --- a/apps/ocr/api.py +++ b/apps/ocr/api.py @@ -9,13 +9,15 @@ import sys from django.utils.translation import ugettext as _ from django.utils.importlib import import_module -from converter.api import convert_document_for_ocr +from converter.api import convert from documents.models import DocumentPage from ocr.conf.settings import TESSERACT_PATH from ocr.conf.settings import TESSERACT_LANGUAGE -from ocr.conf.settings import PDFTOTEXT_PATH -from ocr.exceptions import TesseractError, PdftotextError +from ocr.exceptions import TesseractError +from ocr.conf.settings import UNPAPER_PATH +from ocr.parsers import parse_document_page +from ocr.parsers.exceptions import ParserError, ParserUnknownFile def get_language_backend(): @@ -30,7 +32,7 @@ def get_language_backend(): return None return module -backend = get_language_backend() +language_backend = 
get_language_backend() def cleanup(filename): @@ -58,62 +60,38 @@ def run_tesseract(input_filename, output_filename_base, lang=None): raise TesseractError(error_text) -def run_pdftotext(input_filename, output_filename, page_number=None): - """ - Execute the command line binary of pdftotext - """ - command = [unicode(PDFTOTEXT_PATH)] - if page_number: - command.extend([u'-nopgbrk', u'-f', unicode(page_number), u'-l', unicode(page_number)]) - command.extend([unicode(input_filename), unicode(output_filename)]) - proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) - return_code = proc.wait() - if return_code != 0: - error_text = proc.stderr.read() - raise PdftotextError(error_text) - - def do_document_ocr(document): """ - Do OCR on all the pages of the given document object, first - trying to extract text from PDF using pdftotext then by calling - tesseract + first try to extract text from document pages using the registered + parser if the parser fails or if there is no parser registered for + the document mimetype do a visual OCR by calling tesseract """ for document_page in document.documentpage_set.all(): - desc, filepath = tempfile.mkstemp() - imagefile = None - source = u'' try: - if document.file_mimetype == u'application/pdf': - pdf_filename = os.extsep.join([filepath, u'pdf']) - document.save_to_file(pdf_filename) - run_pdftotext(pdf_filename, filepath, document_page.page_number) - cleanup(pdf_filename) - if os.stat(filepath).st_size == 0: - #PDF page had no text, run tesseract on the page - imagefile = convert_document_for_ocr(document, page=document_page.page_number) - run_tesseract(imagefile, filepath, TESSERACT_LANGUAGE) - ocr_output = os.extsep.join([filepath, u'txt']) - source = _(u'Text from OCR') - else: - ocr_output = filepath - source = _(u'Text extracted from PDF') - else: - imagefile = convert_document_for_ocr(document, page=document_page.page_number) - run_tesseract(imagefile, filepath, 
TESSERACT_LANGUAGE) - ocr_output = os.extsep.join([filepath, u'txt']) - source = _(u'Text from OCR') - f = codecs.open(ocr_output, 'r', 'utf-8') - document_page.content = ocr_cleanup(f.read().strip()) - document_page.page_label = source - document_page.save() - f.close() - cleanup(ocr_output) - finally: - os.close(desc) - cleanup(filepath) - if imagefile: - cleanup(imagefile) + # Try to extract text by means of a parser + parse_document_page(document_page) + except (ParserError, ParserUnknownFile): + # Fall back to doing visual OCR + pass + #desc, filepath = tempfile.mkstemp() + #imagefile = None + #source = u'' + #imagefile = convert_document_for_ocr(document, page=document_page.page_number) + #run_tesseract(imagefile, filepath, TESSERACT_LANGUAGE) + #ocr_output = os.extsep.join([filepath, u'txt']) + #source = _(u'Text from OCR') + #f = codecs.open(ocr_output, 'r', 'utf-8') + #document_page.content = ocr_cleanup(f.read().strip()) + #document_page.page_label = source + #document_page.save() + #f.close() + #cleanup(ocr_output) + #finally: + # pass + #os.close(desc) + #cleanup(filepath) + #if imagefile: + # cleanup(imagefile) def ocr_cleanup(text): @@ -126,8 +104,8 @@ def ocr_cleanup(text): for line in text.splitlines(): line = line.strip() for word in line.split(): - if backend: - result = backend.check_word(word) + if language_backend: + result = language_backend.check_word(word) else: result = word if result: @@ -146,3 +124,53 @@ def clean_pages(): if page.content: page.content = ocr_cleanup(page.content) page.save() + + +def execute_unpaper(input_filepath, output_filepath): + """ + Executes the program unpaper using subprocess's Popen + """ + command = [] + command.append(UNPAPER_PATH) + command.append(u'--overwrite') + command.append(input_filepath) + command.append(output_filepath) + proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE) + return_code = proc.wait() + if return_code != 0: + raise UnpaperError(proc.stderr.readline()) + +''' +def 
convert_document_for_ocr(document, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_OCR_FILE_FORMAT): + #Extract document file + input_filepath = document_save_to_temp_dir(document, document.uuid) + + #Convert for OCR + temp_filename, separator = os.path.splitext(os.path.basename(input_filepath)) + temp_path = os.path.join(TEMPORARY_DIRECTORY, temp_filename) + transformation_output_file = u'%s_trans%s%s%s' % (temp_path, page, os.extsep, file_format) + unpaper_input_file = u'%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep) + unpaper_output_file = u'%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep) + convert_output_file = u'%s_ocr%s%s%s' % (temp_path, page, os.extsep, file_format) + + try: + document_page = document.documentpage_set.get(page_number=page) + transformations, warnings = document_page.get_transformation_list() + + #Apply default transformations + backend.convert_file(input_filepath=input_filepath, page=page, quality=QUALITY_HIGH, transformations=transformations, output_filepath=transformation_output_file) + #Do OCR operations + backend.convert_file(input_filepath=transformation_output_file, arguments=OCR_OPTIONS, output_filepath=unpaper_input_file) + # Process by unpaper + execute_unpaper(input_filepath=unpaper_input_file, output_filepath=unpaper_output_file) + # Convert to tif + backend.convert_file(input_filepath=unpaper_output_file, output_filepath=convert_output_file) + finally: + cleanup(transformation_output_file) + cleanup(unpaper_input_file) + cleanup(unpaper_output_file) + + return convert_output_file +''' + + diff --git a/apps/ocr/conf/settings.py b/apps/ocr/conf/settings.py index e9024b7152..52785f46ac 100644 --- a/apps/ocr/conf/settings.py +++ b/apps/ocr/conf/settings.py @@ -13,8 +13,9 @@ register_settings( {'name': u'REPLICATION_DELAY', 'global_name': u'OCR_REPLICATION_DELAY', 'default': 10, 'description': _(u'Amount of seconds to delay OCR of documents to allow for the node\'s storage replication overhead.')}, {'name': 
u'NODE_CONCURRENT_EXECUTION', 'global_name': u'OCR_NODE_CONCURRENT_EXECUTION', 'default': 1, 'description': _(u'Maximum amount of concurrent document OCRs a node can perform.')}, {'name': u'AUTOMATIC_OCR', 'global_name': u'OCR_AUTOMATIC_OCR', 'default': False, 'description': _(u'Automatically queue newly created documents for OCR.')}, - {'name': u'PDFTOTEXT_PATH', 'global_name': u'OCR_PDFTOTEXT_PATH', 'default': u'/usr/bin/pdftotext', 'exists': True}, {'name': u'QUEUE_PROCESSING_INTERVAL', 'global_name': u'OCR_QUEUE_PROCESSING_INTERVAL', 'default': 10}, - {'name': u'CACHE_URI', 'global_name': u'OCR_CACHE_URI', 'default': None, 'description': _(u'URI in the form: "memcached://127.0.0.1:11211/" to specify a cache backend to use for locking. Multiple hosts can be specified separated by a semicolon.')} + {'name': u'CACHE_URI', 'global_name': u'OCR_CACHE_URI', 'default': None, 'description': _(u'URI in the form: "memcached://127.0.0.1:11211/" to specify a cache backend to use for locking. Multiple hosts can be specified separated by a semicolon.')}, + {'name': u'UNPAPER_PATH', 'global_name': u'OCR_UNPAPER_PATH', 'default': u'/usr/bin/unpaper', 'description': _(u'File path to unpaper program.'), 'exists': True}, + {'name': u'PARSERS_PDFTOTEXT_PATH', 'global_name': u'OCR_PARSERS_PDFTOTEXT_PATH', 'default': u'/usr/bin/pdftotext', 'exists': True}, ] ) diff --git a/apps/ocr/exceptions.py b/apps/ocr/exceptions.py index 4bfa8f725a..b1ec8c3fe3 100644 --- a/apps/ocr/exceptions.py +++ b/apps/ocr/exceptions.py @@ -4,7 +4,3 @@ class AlreadyQueued(Exception): class TesseractError(Exception): pass - - -class PdftotextError(Exception): - pass From d566dfbb1d1d4cc3947f0f6a3d9b442deb8de03f Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Mon, 18 Jul 2011 04:06:59 -0400 Subject: [PATCH 14/14] Added the first text parser backend (PDF) and updated the requirements files and README --- README.md | 2 ++ apps/ocr/parsers/__init__.py | 40 ++++++++++++++++++++++++++++++++++ 
apps/ocr/parsers/exceptions.py | 10 +++++++++ requirements/development.txt | 2 ++ requirements/production.txt | 2 ++ 5 files changed, 56 insertions(+) create mode 100644 apps/ocr/parsers/__init__.py create mode 100644 apps/ocr/parsers/exceptions.py diff --git a/README.md b/README.md index e1e21d1eae..662d33ab2a 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,8 @@ Python: * django-mptt - Utilities for implementing a modified pre-order traversal tree in django * python-magic - A python wrapper for libmagic * django-taggit - Simple tagging for django +* slate - The simplest way to extract text from PDFs in Python + Execute pip install -r requirements/production.txt to install the python/django dependencies automatically. diff --git a/apps/ocr/parsers/__init__.py b/apps/ocr/parsers/__init__.py new file mode 100644 index 0000000000..815e868747 --- /dev/null +++ b/apps/ocr/parsers/__init__.py @@ -0,0 +1,40 @@ +import codecs +import os +import subprocess +import tempfile +import sys + +import slate + +from django.utils.translation import ugettext as _ + +from ocr.parsers.exceptions import ParserError, ParserUnknownFile + +mimetype_registry = {} + + +def register_parser(mimetype, function): + mimetype_registry[mimetype] = {'function': function} + + +def pdf_parser(document_page): + fd = document_page.document.open() + pdf_pages = slate.PDF(fd) + fd.close() + + if pdf_pages[document_page.page_number - 1] == '\x0c': + raise ParserError + + document_page.content = pdf_pages[document_page.page_number - 1] + document_page.page_label = _(u'Text extracted from PDF') + document_page.save() + + +def parse_document_page(document_page): + try: + mimetype_registry[document_page.document.file_mimetype]['function'](document_page) + except KeyError: + raise ParserUnknownFile + + +register_parser('application/pdf', pdf_parser) diff --git a/apps/ocr/parsers/exceptions.py b/apps/ocr/parsers/exceptions.py new file mode 100644 index 0000000000..e06875f222 --- /dev/null +++ 
b/apps/ocr/parsers/exceptions.py @@ -0,0 +1,10 @@ +class ParserError(Exception): + """ + Raised when a text parser fails to understand a file it has been passed + or the resulting parsed text is invalid + """ + pass + + +class ParserUnknownFile(Exception): + pass diff --git a/requirements/development.txt b/requirements/development.txt index 00def8c63a..3acf630b4e 100644 --- a/requirements/development.txt +++ b/requirements/development.txt @@ -9,3 +9,5 @@ django-celery==2.2.2 django-sentry==1.6.0 django-taggit==0.9.3 -e git://github.com/django-mptt/django-mptt.git@0af02a95877041b2fd6d458bd95413dc1666c321#egg=django-mptt +slate==0.3 +PIL==1.1.7 diff --git a/requirements/production.txt b/requirements/production.txt index 1f1d3a0881..02219abaee 100644 --- a/requirements/production.txt +++ b/requirements/production.txt @@ -6,3 +6,5 @@ django-celery==2.2.2 django-sentry==1.6.0 django-taggit==0.9.3 -e git://github.com/django-mptt/django-mptt.git@0af02a95877041b2fd6d458bd95413dc1666c321#egg=django-mptt +slate==0.3 +PIL==1.1.7