From 6b67cff5d7b576ca99268bcb1df01a9a8144f9d4 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Fri, 8 Apr 2011 03:29:48 -0400 Subject: [PATCH] Changed the way document page count is parsed from the graphics backend, fixing issue #7 --- apps/converter/__init__.py | 2 +- apps/converter/api.py | 38 +++++++-------- apps/converter/backends/graphicsmagick.py | 22 ++++----- apps/converter/backends/imagemagick.py | 21 ++++---- apps/documents/models.py | 58 +++++++++++++---------- apps/ocr/api.py | 8 ++-- 6 files changed, 76 insertions(+), 73 deletions(-) diff --git a/apps/converter/__init__.py b/apps/converter/__init__.py index 0a9f1cf87e..0445dffa0e 100644 --- a/apps/converter/__init__.py +++ b/apps/converter/__init__.py @@ -1,3 +1,3 @@ TRANFORMATION_CHOICES = { - 'rotate': '-rotate %(degrees)d' + u'rotate': u'-rotate %(degrees)d' } diff --git a/apps/converter/api.py b/apps/converter/api.py index 5657ba8dcf..0e4adb4abc 100644 --- a/apps/converter/api.py +++ b/apps/converter/api.py @@ -19,9 +19,9 @@ from common import TEMPORARY_DIRECTORY from converter import TRANFORMATION_CHOICES from documents.utils import document_save_to_temp_dir -QUALITY_DEFAULT = 'quality_default' -QUALITY_LOW = 'quality_low' -QUALITY_HIGH = 'quality_high' +QUALITY_DEFAULT = u'quality_default' +QUALITY_LOW = u'quality_low' +QUALITY_HIGH = u'quality_high' QUALITY_SETTINGS = {QUALITY_DEFAULT: DEFAULT_OPTIONS, QUALITY_LOW: LOW_QUALITY_OPTIONS, QUALITY_HIGH: HIGH_QUALITY_OPTIONS} @@ -44,7 +44,7 @@ def _get_backend(): try: backend = _get_backend() except ImportError: - raise ImportError('Missing or incorrect converter backend: %s' % GRAPHICS_BACKEND) + raise ImportError(u'Missing or incorrect converter backend: %s' % GRAPHICS_BACKEND) def cleanup(filename): @@ -58,7 +58,7 @@ def cleanup(filename): def execute_unpaper(input_filepath, output_filepath): command = [] command.append(UNPAPER_PATH) - command.append('--overwrite') + command.append(u'--overwrite') command.append(input_filepath) command.append(output_filepath) proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE) @@ -78,7 +78,7 @@ def execute_unoconv(input_filepath, output_filepath, arguments=''): """ -def cache_cleanup(input_filepath, size, quality=QUALITY_DEFAULT, page=0, format='jpg', extra_options=''): +def cache_cleanup(input_filepath, size, quality=QUALITY_DEFAULT, page=0, format=u'jpg', extra_options=u''): filepath = create_image_cache_filename(input_filepath, size=size, page=page, format=format, quality=quality, extra_options=extra_options) try: os.remove(filepath) @@ -93,16 +93,16 @@ def create_image_cache_filename(input_filepath, *args, **kwargs): final_filepath = [] [final_filepath.append(str(arg)) for arg in args] - final_filepath.extend(['%s_%s' % (key, value) for key, value in kwargs.items()]) + final_filepath.extend([u'%s_%s' % (key, value) for key, value in kwargs.items()]) - temp_path += slugify('_'.join(final_filepath)) + temp_path += slugify(u'_'.join(final_filepath)) return temp_path else: return None -def in_image_cache(input_filepath, size, page=0, format='jpg', quality=QUALITY_DEFAULT, extra_options=''): +def in_image_cache(input_filepath, size, page=0, format=u'jpg', quality=QUALITY_DEFAULT, extra_options=u''): output_filepath = create_image_cache_filename(input_filepath, size=size, page=page, format=format, quality=quality, extra_options=extra_options) if os.path.exists(output_filepath): return output_filepath @@ -110,7 +110,7 @@ def in_image_cache(input_filepath, size, page=0, format='jpg', quality=QUALITY_D return None -def convert(input_filepath, size, quality=QUALITY_DEFAULT, cache=True, page=0, format='jpg', extra_options='', mimetype=None, extension=None, cleanup_files=True): +def convert(input_filepath, size, quality=QUALITY_DEFAULT, cache=True, page=0, format=u'jpg', extra_options=u'', mimetype=None, extension=None, cleanup_files=True): unoconv_output = None output_filepath = create_image_cache_filename(input_filepath, size=size, page=page, format=format, quality=quality, extra_options=extra_options) if os.path.exists(output_filepath): @@ -127,9 +127,9 @@ def convert(input_filepath, size, quality=QUALITY_DEFAULT, cache=True, page=0, f input_filepath = unoconv_output ''' try: - input_arg = '%s[%s]' % (input_filepath, page) - extra_options += ' -resize %s' % size - backend.execute_convert(input_filepath=input_arg, arguments=extra_options, output_filepath='%s:%s' % (format, output_filepath), quality=quality) + input_arg = u'%s[%s]' % (input_filepath, page) + extra_options += u' -resize %s' % size + backend.execute_convert(input_filepath=input_arg, arguments=extra_options, output_filepath=u'%s:%s' % (format, output_filepath), quality=quality) finally: if cleanup_files: cleanup(input_filepath) @@ -141,7 +141,7 @@ def convert(input_filepath, size, quality=QUALITY_DEFAULT, cache=True, page=0, f def get_page_count(input_filepath): try: - return int(backend.execute_identify(input_filepath, '-format %n')) + return len(backend.execute_identify(unicode(input_filepath)).splitlines()) except Exception, e: #TODO: send to other page number identifying program return 1 @@ -154,12 +154,12 @@ def convert_document_for_ocr(document, page=0, format='tif'): #Convert for OCR temp_filename, separator = os.path.splitext(os.path.basename(input_filepath)) temp_path = os.path.join(TEMPORARY_DIRECTORY, temp_filename) - transformation_output_file = '%s_trans%s%s%s' % (temp_path, page, os.extsep, format) - unpaper_input_file = '%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep) - unpaper_output_file = '%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep) - convert_output_file = '%s_ocr%s%s%s' % (temp_path, page, os.extsep, format) + transformation_output_file = u'%s_trans%s%s%s' % (temp_path, page, os.extsep, format) + unpaper_input_file = u'%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep) + unpaper_output_file = u'%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep) + convert_output_file = u'%s_ocr%s%s%s' % (temp_path, page, os.extsep, format) - input_arg = '%s[%s]' % (input_filepath, page) + input_arg = u'%s[%s]' % (input_filepath, page) transformation_list = [] try: diff --git a/apps/converter/backends/graphicsmagick.py b/apps/converter/backends/graphicsmagick.py index 3e09c84bba..8923e4cc89 100644 --- a/apps/converter/backends/graphicsmagick.py +++ b/apps/converter/backends/graphicsmagick.py @@ -1,20 +1,18 @@ -import shlex import subprocess from converter.conf.settings import GM_PATH from converter.api import QUALITY_DEFAULT, QUALITY_SETTINGS from converter.exceptions import ConvertError, UnknownFormat, IdentifyError -CONVERTER_ERROR_STRING_NO_DECODER = 'No decode delegate for this image format' +CONVERTER_ERROR_STRING_NO_DECODER = u'No decode delegate for this image format' -def execute_identify(input_filepath, arguments): +def execute_identify(input_filepath, arguments=u''): command = [] - command.append(GM_PATH) + command.append(unicode(GM_PATH)) command.append(u'identify') - command.extend(shlex.split(str(arguments))) - command.append(input_filepath) - + command.extend(unicode(arguments).split()) + command.append(unicode(input_filepath)) proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) return_code = proc.wait() if return_code != 0: @@ -24,13 +22,13 @@ def execute_identify(input_filepath, arguments): def execute_convert(input_filepath, output_filepath, quality=QUALITY_DEFAULT, arguments=None): command = [] - command.append(GM_PATH) + command.append(unicode(GM_PATH)) command.append(u'convert') - command.extend(shlex.split(str(QUALITY_SETTINGS[quality]))) - command.append(input_filepath) + command.extend(unicode(QUALITY_SETTINGS[quality]).split()) + command.append(unicode(input_filepath)) if arguments: - command.extend(shlex.split(str(arguments))) - command.append(output_filepath) + command.extend(unicode(arguments).split()) + command.append(unicode(output_filepath)) proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) return_code = proc.wait() if return_code != 0: diff --git a/apps/converter/backends/imagemagick.py b/apps/converter/backends/imagemagick.py index 5aa92fc8f4..c60f6bc341 100644 --- a/apps/converter/backends/imagemagick.py +++ b/apps/converter/backends/imagemagick.py @@ -1,4 +1,3 @@ -import shlex import subprocess from converter.conf.settings import IM_IDENTIFY_PATH @@ -7,14 +6,14 @@ from converter.api import QUALITY_DEFAULT, QUALITY_SETTINGS from converter.exceptions import ConvertError, UnknownFormat, \ IdentifyError -CONVERTER_ERROR_STRING_NO_DECODER = 'no decode delegate for this image format' +CONVERTER_ERROR_STRING_NO_DECODER = u'no decode delegate for this image format' -def execute_identify(input_filepath, arguments): +def execute_identify(input_filepath, arguments=u''): command = [] - command.append(IM_IDENTIFY_PATH) - command.extend(shlex.split(str(arguments))) - command.append(input_filepath) + command.append(unicode(IM_IDENTIFY_PATH)) + command.extend(unicode(arguments).split()) + command.append(unicode(input_filepath)) proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) return_code = proc.wait() @@ -25,12 +24,12 @@ def execute_identify(input_filepath, arguments): def execute_convert(input_filepath, output_filepath, quality=QUALITY_DEFAULT, arguments=None): command = [] - command.append(IM_CONVERT_PATH) - command.extend(shlex.split(str(QUALITY_SETTINGS[quality]))) - command.append(input_filepath) + command.append(unicode(IM_CONVERT_PATH)) + command.extend(unicode(QUALITY_SETTINGS[quality]).split()) + command.append(unicode(input_filepath)) if arguments: - command.extend(shlex.split(str(arguments))) - command.append(output_filepath) + command.extend(unicode(arguments).split()) + command.append(unicode(output_filepath)) proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) return_code = proc.wait() if return_code != 0: diff --git a/apps/documents/models.py b/apps/documents/models.py index 3ff17ec043..078c94d8ca 100644 --- a/apps/documents/models.py +++ b/apps/documents/models.py @@ -117,21 +117,27 @@ class Document(models.Model): def update_page_count(self, save=True): handle, filepath = tempfile.mkstemp() self.save_to_file(filepath) - total_pages = get_page_count(filepath) - - for page_number in range(total_pages): - DocumentPage.objects.get_or_create( - document=self, page_number=page_number + 1) - + detected_pages = get_page_count(filepath) os.close(handle) try: os.remove(filepath) except OSError: pass + current_pages = DocumentPage.objects.filter(document=self).order_by('page_number',) + if current_pages.count() > detected_pages: + for page in current_pages[detected_pages:]: + page.delete() + + for page_number in range(detected_pages): + DocumentPage.objects.get_or_create( + document=self, page_number=page_number + 1) + if save: self.save() + return detected_pages + def save_to_file(self, filepath, buffer_size=1024 * 1024): input_descriptor = self.open() output_descriptor = open(filepath, 'wb') @@ -287,7 +293,7 @@ class DocumentPage(models.Model): page_number = models.PositiveIntegerField(default=1, editable=False, verbose_name=_(u'page number')) def __unicode__(self): - return '%s - %d - %s' % (unicode(self.document), self.page_number, self.page_label) + return u'%s - %d - %s' % (unicode(self.document), self.page_number, self.page_label) class Meta: verbose_name = _(u'document page') @@ -309,8 +315,8 @@ class MetadataGroup(models.Model): verbose_name_plural = _(u'metadata document groups') -INCLUSION_AND = '&' -INCLUSION_OR = '|' +INCLUSION_AND = u'&' +INCLUSION_OR = u'|' INCLUSION_CHOICES = ( (INCLUSION_AND, _(u'and')), @@ -318,21 +324,21 @@ INCLUSION_CHOICES = ( ) OPERATOR_CHOICES = ( - ('exact', _(u'is equal')), - ('iexact', _(u'is equal (case insensitive)')), - ('contains', _(u'contains')), - ('icontains', _(u'contains (case insensitive)')), - ('in', _(u'is in')), - ('gt', _(u'is greater than')), - ('gte', _(u'is greater than or equal')), - ('lt', _(u'is less than')), - ('lte', _(u'is less than or equal')), - ('startswith', _(u'starts with')), - ('istartswith', _(u'starts with (case insensitive)')), - ('endswith', _(u'ends with')), - ('iendswith', _(u'ends with (case insensitive)')), - ('regex', _(u'is in regular expression')), - ('iregex', _(u'is in regular expression (case insensitive)')), + (u'exact', _(u'is equal')), + (u'iexact', _(u'is equal (case insensitive)')), + (u'contains', _(u'contains')), + (u'icontains', _(u'contains (case insensitive)')), + (u'in', _(u'is in')), + (u'gt', _(u'is greater than')), + (u'gte', _(u'is greater than or equal')), + (u'lt', _(u'is less than')), + (u'lte', _(u'is less than or equal')), + (u'startswith', _(u'starts with')), + (u'istartswith', _(u'starts with (case insensitive)')), + (u'endswith', _(u'ends with')), + (u'iendswith', _(u'ends with (case insensitive)')), + (u'regex', _(u'is in regular expression')), + (u'iregex', _(u'is in regular expression (case insensitive)')), ) @@ -347,7 +353,7 @@ class MetadataGroupItem(models.Model): enabled = models.BooleanField(default=True, verbose_name=_(u'enabled')) def __unicode__(self): - return '[%s] %s %s %s %s %s' % ('x' if self.enabled else ' ', self.get_inclusion_display(), self.metadata_type, _(u'not') if self.negated else '', self.get_operator_display(), self.expression) + return u'[%s] %s %s %s %s %s' % (u'x' if self.enabled else u' ', self.get_inclusion_display(), self.metadata_type, _(u'not') if self.negated else u'', self.get_operator_display(), self.expression) class Meta: verbose_name = _(u'metadata group item') @@ -364,7 +370,7 @@ class DocumentPageTransformation(models.Model): arguments = models.TextField(blank=True, null=True, verbose_name=_(u'arguments'), help_text=_(u'Use dictionaries to indentify arguments, example: {\'degrees\':90}')) def __unicode__(self): - return '%s - %s' % (unicode(self.document_page), self.get_transformation_display()) + return u'%s - %s' % (unicode(self.document_page), self.get_transformation_display()) class Meta: ordering = ('order',) diff --git a/apps/ocr/api.py b/apps/ocr/api.py index 120c39ecee..baaa83497a 100644 --- a/apps/ocr/api.py +++ b/apps/ocr/api.py @@ -20,7 +20,7 @@ def get_language_backend(): try: module = import_module(u'.'.join([u'ocr', u'lang', TESSERACT_LANGUAGE])) except ImportError: - sys.stderr.write('\nError: No OCR app language backend for language: %s\n\n' % TESSERACT_LANGUAGE) + sys.stderr.write(u'\nError: No OCR app language backend for language: %s\n\n' % TESSERACT_LANGUAGE) return None return module @@ -40,9 +40,9 @@ def cleanup(filename): def run_tesseract(input_filename, output_filename_base, lang=None): - command = [TESSERACT_PATH, input_filename, output_filename_base] + command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(output_filename_base)] if lang is not None: - command += ['-l', lang] + command += [u'-l', lang] proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) return_code = proc.wait() @@ -85,7 +85,7 @@ def ocr_cleanup(text): result = word if result: output.append(result) - output.append('\n') + output.append(u'\n') return u' '.join(output)