Changed the way the document page count is obtained from the graphics backend (it is now the number of lines in the backend's identify output), fixing issue #7

2011-04-08 03:29:48 -04:00
parent 71a3c218f4
commit 6b67cff5d7
6 changed files with 76 additions and 73 deletions
--- a/apps/converter/init.py
+++ b/apps/converter/init.py
@@ -1,3 +1,3 @@
 TRANFORMATION_CHOICES = {
-    'rotate': '-rotate %(degrees)d'
+    u'rotate': u'-rotate %(degrees)d'
 }
--- a/apps/converter/api.py
+++ b/apps/converter/api.py
@@ -19,9 +19,9 @@ from common import TEMPORARY_DIRECTORY
 from converter import TRANFORMATION_CHOICES
 from documents.utils import document_save_to_temp_dir

-QUALITY_DEFAULT = 'quality_default'
-QUALITY_LOW = 'quality_low'
-QUALITY_HIGH = 'quality_high'
+QUALITY_DEFAULT = u'quality_default'
+QUALITY_LOW = u'quality_low'
+QUALITY_HIGH = u'quality_high'

 QUALITY_SETTINGS = {QUALITY_DEFAULT: DEFAULT_OPTIONS,
    QUALITY_LOW: LOW_QUALITY_OPTIONS, QUALITY_HIGH: HIGH_QUALITY_OPTIONS}
@@ -44,7 +44,7 @@ def _get_backend():
 try:
    backend = _get_backend()
 except ImportError:
-    raise ImportError('Missing or incorrect converter backend: %s' % GRAPHICS_BACKEND)
+    raise ImportError(u'Missing or incorrect converter backend: %s' % GRAPHICS_BACKEND)


 def cleanup(filename):
@@ -58,7 +58,7 @@ def cleanup(filename):
 def execute_unpaper(input_filepath, output_filepath):
    command = []
    command.append(UNPAPER_PATH)
-    command.append('--overwrite')
+    command.append(u'--overwrite')
    command.append(input_filepath)
    command.append(output_filepath)
    proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE)
@@ -78,7 +78,7 @@ def execute_unoconv(input_filepath, output_filepath, arguments=''):
 """


-def cache_cleanup(input_filepath, size, quality=QUALITY_DEFAULT, page=0, format='jpg', extra_options=''):
+def cache_cleanup(input_filepath, size, quality=QUALITY_DEFAULT, page=0, format=u'jpg', extra_options=u''):
    filepath = create_image_cache_filename(input_filepath, size=size, page=page, format=format, quality=quality, extra_options=extra_options)
    try:
        os.remove(filepath)
@@ -93,16 +93,16 @@ def create_image_cache_filename(input_filepath, *args, **kwargs):

        final_filepath = []
        [final_filepath.append(str(arg)) for arg in args]
-        final_filepath.extend(['%s_%s' % (key, value) for key, value in kwargs.items()])
+        final_filepath.extend([u'%s_%s' % (key, value) for key, value in kwargs.items()])

-        temp_path += slugify('_'.join(final_filepath))
+        temp_path += slugify(u'_'.join(final_filepath))

        return temp_path
    else:
        return None


-def in_image_cache(input_filepath, size, page=0, format='jpg', quality=QUALITY_DEFAULT, extra_options=''):
+def in_image_cache(input_filepath, size, page=0, format=u'jpg', quality=QUALITY_DEFAULT, extra_options=u''):
    output_filepath = create_image_cache_filename(input_filepath, size=size, page=page, format=format, quality=quality, extra_options=extra_options)
    if os.path.exists(output_filepath):
        return output_filepath
@@ -110,7 +110,7 @@ def in_image_cache(input_filepath, size, page=0, format='jpg', quality=QUALITY_D
        return None


-def convert(input_filepath, size, quality=QUALITY_DEFAULT, cache=True, page=0, format='jpg', extra_options='', mimetype=None, extension=None, cleanup_files=True):
+def convert(input_filepath, size, quality=QUALITY_DEFAULT, cache=True, page=0, format=u'jpg', extra_options=u'', mimetype=None, extension=None, cleanup_files=True):
    unoconv_output = None
    output_filepath = create_image_cache_filename(input_filepath, size=size, page=page, format=format, quality=quality, extra_options=extra_options)
    if os.path.exists(output_filepath):
@@ -127,9 +127,9 @@ def convert(input_filepath, size, quality=QUALITY_DEFAULT, cache=True, page=0, f
            input_filepath = unoconv_output
    '''
    try:
-        input_arg = '%s[%s]' % (input_filepath, page)
-        extra_options += ' -resize %s' % size
-        backend.execute_convert(input_filepath=input_arg, arguments=extra_options, output_filepath='%s:%s' % (format, output_filepath), quality=quality)
+        input_arg = u'%s[%s]' % (input_filepath, page)
+        extra_options += u' -resize %s' % size
+        backend.execute_convert(input_filepath=input_arg, arguments=extra_options, output_filepath=u'%s:%s' % (format, output_filepath), quality=quality)
    finally:
        if cleanup_files:
            cleanup(input_filepath)
@@ -141,7 +141,7 @@ def convert(input_filepath, size, quality=QUALITY_DEFAULT, cache=True, page=0, f

 def get_page_count(input_filepath):
    try:
-        return int(backend.execute_identify(input_filepath, '-format %n'))
+        return len(backend.execute_identify(unicode(input_filepath)).splitlines())
    except Exception, e:
        #TODO: send to other page number identifying program
        return 1
@@ -154,12 +154,12 @@ def convert_document_for_ocr(document, page=0, format='tif'):
    #Convert for OCR
    temp_filename, separator = os.path.splitext(os.path.basename(input_filepath))
    temp_path = os.path.join(TEMPORARY_DIRECTORY, temp_filename)
-    transformation_output_file = '%s_trans%s%s%s' % (temp_path, page, os.extsep, format)
-    unpaper_input_file = '%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep)
-    unpaper_output_file = '%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep)
-    convert_output_file = '%s_ocr%s%s%s' % (temp_path, page, os.extsep, format)
+    transformation_output_file = u'%s_trans%s%s%s' % (temp_path, page, os.extsep, format)
+    unpaper_input_file = u'%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep)
+    unpaper_output_file = u'%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep)
+    convert_output_file = u'%s_ocr%s%s%s' % (temp_path, page, os.extsep, format)

-    input_arg = '%s[%s]' % (input_filepath, page)
+    input_arg = u'%s[%s]' % (input_filepath, page)

    transformation_list = []
    try:
--- a/apps/converter/backends/graphicsmagick.py
+++ b/apps/converter/backends/graphicsmagick.py
@@ -1,20 +1,18 @@
-import shlex
 import subprocess

 from converter.conf.settings import GM_PATH
 from converter.api import QUALITY_DEFAULT, QUALITY_SETTINGS
 from converter.exceptions import ConvertError, UnknownFormat, IdentifyError

-CONVERTER_ERROR_STRING_NO_DECODER = 'No decode delegate for this image format'
+CONVERTER_ERROR_STRING_NO_DECODER = u'No decode delegate for this image format'


-def execute_identify(input_filepath, arguments):
+def execute_identify(input_filepath, arguments=u''):
    command = []
-    command.append(GM_PATH)
+    command.append(unicode(GM_PATH))
    command.append(u'identify')
-    command.extend(shlex.split(str(arguments)))
-    command.append(input_filepath)
-
+    command.extend(unicode(arguments).split())
+    command.append(unicode(input_filepath))
    proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    return_code = proc.wait()
    if return_code != 0:
@@ -24,13 +22,13 @@ def execute_identify(input_filepath, arguments):

 def execute_convert(input_filepath, output_filepath, quality=QUALITY_DEFAULT, arguments=None):
    command = []
-    command.append(GM_PATH)
+    command.append(unicode(GM_PATH))
    command.append(u'convert')
-    command.extend(shlex.split(str(QUALITY_SETTINGS[quality])))
-    command.append(input_filepath)
+    command.extend(unicode(QUALITY_SETTINGS[quality]).split())
+    command.append(unicode(input_filepath))
    if arguments:
-        command.extend(shlex.split(str(arguments)))
-    command.append(output_filepath)
+        command.extend(unicode(arguments).split())
+    command.append(unicode(output_filepath))
    proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    return_code = proc.wait()
    if return_code != 0:
--- a/apps/converter/backends/imagemagick.py
+++ b/apps/converter/backends/imagemagick.py
@@ -1,4 +1,3 @@
-import shlex
 import subprocess

 from converter.conf.settings import IM_IDENTIFY_PATH
@@ -7,14 +6,14 @@ from converter.api import QUALITY_DEFAULT, QUALITY_SETTINGS
 from converter.exceptions import ConvertError, UnknownFormat, \
    IdentifyError

-CONVERTER_ERROR_STRING_NO_DECODER = 'no decode delegate for this image format'
+CONVERTER_ERROR_STRING_NO_DECODER = u'no decode delegate for this image format'


-def execute_identify(input_filepath, arguments):
+def execute_identify(input_filepath, arguments=u''):
    command = []
-    command.append(IM_IDENTIFY_PATH)
-    command.extend(shlex.split(str(arguments)))
-    command.append(input_filepath)
+    command.append(unicode(IM_IDENTIFY_PATH))
+    command.extend(unicode(arguments).split())
+    command.append(unicode(input_filepath))

    proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    return_code = proc.wait()
@@ -25,12 +24,12 @@ def execute_identify(input_filepath, arguments):

 def execute_convert(input_filepath, output_filepath, quality=QUALITY_DEFAULT, arguments=None):
    command = []
-    command.append(IM_CONVERT_PATH)
-    command.extend(shlex.split(str(QUALITY_SETTINGS[quality])))
-    command.append(input_filepath)
+    command.append(unicode(IM_CONVERT_PATH))
+    command.extend(unicode(QUALITY_SETTINGS[quality]).split())
+    command.append(unicode(input_filepath))
    if arguments:
-        command.extend(shlex.split(str(arguments)))
-    command.append(output_filepath)
+        command.extend(unicode(arguments).split())
+    command.append(unicode(output_filepath))
    proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    return_code = proc.wait()
    if return_code != 0:
--- a/apps/documents/models.py
+++ b/apps/documents/models.py
@@ -117,21 +117,27 @@ class Document(models.Model):
    def update_page_count(self, save=True):
        handle, filepath = tempfile.mkstemp()
        self.save_to_file(filepath)
-        total_pages = get_page_count(filepath)
-
-        for page_number in range(total_pages):
-            DocumentPage.objects.get_or_create(
-                document=self, page_number=page_number + 1)
-
+        detected_pages = get_page_count(filepath)
        os.close(handle)
        try:
            os.remove(filepath)
        except OSError:
            pass

+        current_pages = DocumentPage.objects.filter(document=self).order_by('page_number',)
+        if current_pages.count() > detected_pages:
+            for page in current_pages[detected_pages:]:
+                page.delete()
+
+        for page_number in range(detected_pages):
+            DocumentPage.objects.get_or_create(
+                document=self, page_number=page_number + 1)
+
        if save:
            self.save()

+        return detected_pages
+
    def save_to_file(self, filepath, buffer_size=1024 * 1024):
        input_descriptor = self.open()
        output_descriptor = open(filepath, 'wb')
@@ -287,7 +293,7 @@ class DocumentPage(models.Model):
    page_number = models.PositiveIntegerField(default=1, editable=False, verbose_name=_(u'page number'))

    def __unicode__(self):
-        return '%s - %d - %s' % (unicode(self.document), self.page_number, self.page_label)
+        return u'%s - %d - %s' % (unicode(self.document), self.page_number, self.page_label)

    class Meta:
        verbose_name = _(u'document page')
@@ -309,8 +315,8 @@ class MetadataGroup(models.Model):
        verbose_name_plural = _(u'metadata document groups')


-INCLUSION_AND = '&'
-INCLUSION_OR = '|'
+INCLUSION_AND = u'&'
+INCLUSION_OR = u'|'

 INCLUSION_CHOICES = (
    (INCLUSION_AND, _(u'and')),
@@ -318,21 +324,21 @@ INCLUSION_CHOICES = (
 )

 OPERATOR_CHOICES = (
-    ('exact', _(u'is equal')),
-    ('iexact', _(u'is equal (case insensitive)')),
-    ('contains', _(u'contains')),
-    ('icontains', _(u'contains (case insensitive)')),
-    ('in', _(u'is in')),
-    ('gt', _(u'is greater than')),
-    ('gte', _(u'is greater than or equal')),
-    ('lt', _(u'is less than')),
-    ('lte', _(u'is less than or equal')),
-    ('startswith', _(u'starts with')),
-    ('istartswith', _(u'starts with (case insensitive)')),
-    ('endswith', _(u'ends with')),
-    ('iendswith', _(u'ends with (case insensitive)')),
-    ('regex', _(u'is in regular expression')),
-    ('iregex', _(u'is in regular expression (case insensitive)')),
+    (u'exact', _(u'is equal')),
+    (u'iexact', _(u'is equal (case insensitive)')),
+    (u'contains', _(u'contains')),
+    (u'icontains', _(u'contains (case insensitive)')),
+    (u'in', _(u'is in')),
+    (u'gt', _(u'is greater than')),
+    (u'gte', _(u'is greater than or equal')),
+    (u'lt', _(u'is less than')),
+    (u'lte', _(u'is less than or equal')),
+    (u'startswith', _(u'starts with')),
+    (u'istartswith', _(u'starts with (case insensitive)')),
+    (u'endswith', _(u'ends with')),
+    (u'iendswith', _(u'ends with (case insensitive)')),
+    (u'regex', _(u'is in regular expression')),
+    (u'iregex', _(u'is in regular expression (case insensitive)')),
 )


@@ -347,7 +353,7 @@ class MetadataGroupItem(models.Model):
    enabled = models.BooleanField(default=True, verbose_name=_(u'enabled'))

    def __unicode__(self):
-        return '[%s] %s %s %s %s %s' % ('x' if self.enabled else ' ', self.get_inclusion_display(), self.metadata_type, _(u'not') if self.negated else '', self.get_operator_display(), self.expression)
+        return u'[%s] %s %s %s %s %s' % (u'x' if self.enabled else u' ', self.get_inclusion_display(), self.metadata_type, _(u'not') if self.negated else u'', self.get_operator_display(), self.expression)

    class Meta:
        verbose_name = _(u'metadata group item')
@@ -364,7 +370,7 @@ class DocumentPageTransformation(models.Model):
    arguments = models.TextField(blank=True, null=True, verbose_name=_(u'arguments'), help_text=_(u'Use dictionaries to indentify arguments, example: {\'degrees\':90}'))

    def __unicode__(self):
-        return '%s - %s' % (unicode(self.document_page), self.get_transformation_display())
+        return u'%s - %s' % (unicode(self.document_page), self.get_transformation_display())

    class Meta:
        ordering = ('order',)
--- a/apps/ocr/api.py
+++ b/apps/ocr/api.py
@@ -20,7 +20,7 @@ def get_language_backend():
    try:
        module = import_module(u'.'.join([u'ocr', u'lang', TESSERACT_LANGUAGE]))
    except ImportError:
-        sys.stderr.write('\nError: No OCR app language backend for language: %s\n\n' % TESSERACT_LANGUAGE)
+        sys.stderr.write(u'\nError: No OCR app language backend for language: %s\n\n' % TESSERACT_LANGUAGE)
        return None
    return module

@@ -40,9 +40,9 @@ def cleanup(filename):


 def run_tesseract(input_filename, output_filename_base, lang=None):
-    command = [TESSERACT_PATH, input_filename, output_filename_base]
+    command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(output_filename_base)]
    if lang is not None:
-        command += ['-l', lang]
+        command += [u'-l', lang]

    proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    return_code = proc.wait()
@@ -85,7 +85,7 @@ def ocr_cleanup(text):
                result = word
            if result:
                output.append(result)
-        output.append('\n')
+        output.append(u'\n')

    return u' '.join(output)