From 6b67cff5d7b576ca99268bcb1df01a9a8144f9d4 Mon Sep 17 00:00:00 2001
From: Roberto Rosario <Roberto.Rosario.Gonzalez@gmail.com>
Date: Fri, 8 Apr 2011 03:29:48 -0400
Subject: [PATCH] Changed the way document page count is parsed from the
 graphics backend, fixing issue #7

---
 apps/converter/__init__.py                |  2 +-
 apps/converter/api.py                     | 38 +++++++--------
 apps/converter/backends/graphicsmagick.py | 22 ++++-----
 apps/converter/backends/imagemagick.py    | 21 ++++----
 apps/documents/models.py                  | 58 +++++++++++++----------
 apps/ocr/api.py                           |  8 ++--
 6 files changed, 76 insertions(+), 73 deletions(-)

diff --git a/apps/converter/__init__.py b/apps/converter/__init__.py
index 0a9f1cf87e..0445dffa0e 100644
--- a/apps/converter/__init__.py
+++ b/apps/converter/__init__.py
@@ -1,3 +1,3 @@
 TRANFORMATION_CHOICES = {
-    'rotate': '-rotate %(degrees)d'
+    u'rotate': u'-rotate %(degrees)d'
 }
diff --git a/apps/converter/api.py b/apps/converter/api.py
index 5657ba8dcf..0e4adb4abc 100644
--- a/apps/converter/api.py
+++ b/apps/converter/api.py
@@ -19,9 +19,9 @@ from common import TEMPORARY_DIRECTORY
 from converter import TRANFORMATION_CHOICES
 from documents.utils import document_save_to_temp_dir
 
-QUALITY_DEFAULT = 'quality_default'
-QUALITY_LOW = 'quality_low'
-QUALITY_HIGH = 'quality_high'
+QUALITY_DEFAULT = u'quality_default'
+QUALITY_LOW = u'quality_low'
+QUALITY_HIGH = u'quality_high'
 
 QUALITY_SETTINGS = {QUALITY_DEFAULT: DEFAULT_OPTIONS,
     QUALITY_LOW: LOW_QUALITY_OPTIONS, QUALITY_HIGH: HIGH_QUALITY_OPTIONS}
@@ -44,7 +44,7 @@ def _get_backend():
 try:
     backend = _get_backend()
 except ImportError:
-    raise ImportError('Missing or incorrect converter backend: %s' % GRAPHICS_BACKEND)
+    raise ImportError(u'Missing or incorrect converter backend: %s' % GRAPHICS_BACKEND)
 
 
 def cleanup(filename):
@@ -58,7 +58,7 @@ def cleanup(filename):
 def execute_unpaper(input_filepath, output_filepath):
     command = []
     command.append(UNPAPER_PATH)
-    command.append('--overwrite')
+    command.append(u'--overwrite')
     command.append(input_filepath)
     command.append(output_filepath)
     proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE)
@@ -78,7 +78,7 @@ def execute_unoconv(input_filepath, output_filepath, arguments=''):
 """
 
 
-def cache_cleanup(input_filepath, size, quality=QUALITY_DEFAULT, page=0, format='jpg', extra_options=''):
+def cache_cleanup(input_filepath, size, quality=QUALITY_DEFAULT, page=0, format=u'jpg', extra_options=u''):
     filepath = create_image_cache_filename(input_filepath, size=size, page=page, format=format, quality=quality, extra_options=extra_options)
     try:
         os.remove(filepath)
@@ -93,16 +93,16 @@ def create_image_cache_filename(input_filepath, *args, **kwargs):
 
         final_filepath = []
         [final_filepath.append(str(arg)) for arg in args]
-        final_filepath.extend(['%s_%s' % (key, value) for key, value in kwargs.items()])
+        final_filepath.extend([u'%s_%s' % (key, value) for key, value in kwargs.items()])
 
-        temp_path += slugify('_'.join(final_filepath))
+        temp_path += slugify(u'_'.join(final_filepath))
 
         return temp_path
     else:
         return None
 
 
-def in_image_cache(input_filepath, size, page=0, format='jpg', quality=QUALITY_DEFAULT, extra_options=''):
+def in_image_cache(input_filepath, size, page=0, format=u'jpg', quality=QUALITY_DEFAULT, extra_options=u''):
     output_filepath = create_image_cache_filename(input_filepath, size=size, page=page, format=format, quality=quality, extra_options=extra_options)
     if os.path.exists(output_filepath):
         return output_filepath
@@ -110,7 +110,7 @@ def in_image_cache(input_filepath, size, page=0, format='jpg', quality=QUALITY_D
         return None
 
 
-def convert(input_filepath, size, quality=QUALITY_DEFAULT, cache=True, page=0, format='jpg', extra_options='', mimetype=None, extension=None, cleanup_files=True):
+def convert(input_filepath, size, quality=QUALITY_DEFAULT, cache=True, page=0, format=u'jpg', extra_options=u'', mimetype=None, extension=None, cleanup_files=True):
     unoconv_output = None
     output_filepath = create_image_cache_filename(input_filepath, size=size, page=page, format=format, quality=quality, extra_options=extra_options)
     if os.path.exists(output_filepath):
@@ -127,9 +127,9 @@ def convert(input_filepath, size, quality=QUALITY_DEFAULT, cache=True, page=0, f
             input_filepath = unoconv_output
     '''
     try:
-        input_arg = '%s[%s]' % (input_filepath, page)
-        extra_options += ' -resize %s' % size
-        backend.execute_convert(input_filepath=input_arg, arguments=extra_options, output_filepath='%s:%s' % (format, output_filepath), quality=quality)
+        input_arg = u'%s[%s]' % (input_filepath, page)
+        extra_options += u' -resize %s' % size
+        backend.execute_convert(input_filepath=input_arg, arguments=extra_options, output_filepath=u'%s:%s' % (format, output_filepath), quality=quality)
     finally:
         if cleanup_files:
             cleanup(input_filepath)
@@ -141,7 +141,7 @@ def convert(input_filepath, size, quality=QUALITY_DEFAULT, cache=True, page=0, f
 
 def get_page_count(input_filepath):
     try:
-        return int(backend.execute_identify(input_filepath, '-format %n'))
+        return len(backend.execute_identify(unicode(input_filepath)).splitlines())
     except Exception, e:
         #TODO: send to other page number identifying program
         return 1
@@ -154,12 +154,12 @@ def convert_document_for_ocr(document, page=0, format='tif'):
     #Convert for OCR
     temp_filename, separator = os.path.splitext(os.path.basename(input_filepath))
     temp_path = os.path.join(TEMPORARY_DIRECTORY, temp_filename)
-    transformation_output_file = '%s_trans%s%s%s' % (temp_path, page, os.extsep, format)
-    unpaper_input_file = '%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep)
-    unpaper_output_file = '%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep)
-    convert_output_file = '%s_ocr%s%s%s' % (temp_path, page, os.extsep, format)
+    transformation_output_file = u'%s_trans%s%s%s' % (temp_path, page, os.extsep, format)
+    unpaper_input_file = u'%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep)
+    unpaper_output_file = u'%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep)
+    convert_output_file = u'%s_ocr%s%s%s' % (temp_path, page, os.extsep, format)
 
-    input_arg = '%s[%s]' % (input_filepath, page)
+    input_arg = u'%s[%s]' % (input_filepath, page)
 
     transformation_list = []
     try:
diff --git a/apps/converter/backends/graphicsmagick.py b/apps/converter/backends/graphicsmagick.py
index 3e09c84bba..8923e4cc89 100644
--- a/apps/converter/backends/graphicsmagick.py
+++ b/apps/converter/backends/graphicsmagick.py
@@ -1,20 +1,18 @@
-import shlex
 import subprocess
 
 from converter.conf.settings import GM_PATH
 from converter.api import QUALITY_DEFAULT, QUALITY_SETTINGS
 from converter.exceptions import ConvertError, UnknownFormat, IdentifyError
 
-CONVERTER_ERROR_STRING_NO_DECODER = 'No decode delegate for this image format'
+CONVERTER_ERROR_STRING_NO_DECODER = u'No decode delegate for this image format'
 
 
-def execute_identify(input_filepath, arguments):
+def execute_identify(input_filepath, arguments=u''):
     command = []
-    command.append(GM_PATH)
+    command.append(unicode(GM_PATH))
     command.append(u'identify')
-    command.extend(shlex.split(str(arguments)))
-    command.append(input_filepath)
-
+    command.extend(unicode(arguments).split())
+    command.append(unicode(input_filepath))
     proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
     return_code = proc.wait()
     if return_code != 0:
@@ -24,13 +22,13 @@ def execute_identify(input_filepath, arguments):
 
 def execute_convert(input_filepath, output_filepath, quality=QUALITY_DEFAULT, arguments=None):
     command = []
-    command.append(GM_PATH)
+    command.append(unicode(GM_PATH))
     command.append(u'convert')
-    command.extend(shlex.split(str(QUALITY_SETTINGS[quality])))
-    command.append(input_filepath)
+    command.extend(unicode(QUALITY_SETTINGS[quality]).split())
+    command.append(unicode(input_filepath))
     if arguments:
-        command.extend(shlex.split(str(arguments)))
-    command.append(output_filepath)
+        command.extend(unicode(arguments).split())
+    command.append(unicode(output_filepath))
     proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
     return_code = proc.wait()
     if return_code != 0:
diff --git a/apps/converter/backends/imagemagick.py b/apps/converter/backends/imagemagick.py
index 5aa92fc8f4..c60f6bc341 100644
--- a/apps/converter/backends/imagemagick.py
+++ b/apps/converter/backends/imagemagick.py
@@ -1,4 +1,3 @@
-import shlex
 import subprocess
 
 from converter.conf.settings import IM_IDENTIFY_PATH
@@ -7,14 +6,14 @@ from converter.api import QUALITY_DEFAULT, QUALITY_SETTINGS
 from converter.exceptions import ConvertError, UnknownFormat, \
     IdentifyError
 
-CONVERTER_ERROR_STRING_NO_DECODER = 'no decode delegate for this image format'
+CONVERTER_ERROR_STRING_NO_DECODER = u'no decode delegate for this image format'
 
 
-def execute_identify(input_filepath, arguments):
+def execute_identify(input_filepath, arguments=u''):
     command = []
-    command.append(IM_IDENTIFY_PATH)
-    command.extend(shlex.split(str(arguments)))
-    command.append(input_filepath)
+    command.append(unicode(IM_IDENTIFY_PATH))
+    command.extend(unicode(arguments).split())
+    command.append(unicode(input_filepath))
 
     proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
     return_code = proc.wait()
@@ -25,12 +24,12 @@ def execute_identify(input_filepath, arguments):
 
 def execute_convert(input_filepath, output_filepath, quality=QUALITY_DEFAULT, arguments=None):
     command = []
-    command.append(IM_CONVERT_PATH)
-    command.extend(shlex.split(str(QUALITY_SETTINGS[quality])))
-    command.append(input_filepath)
+    command.append(unicode(IM_CONVERT_PATH))
+    command.extend(unicode(QUALITY_SETTINGS[quality]).split())
+    command.append(unicode(input_filepath))
     if arguments:
-        command.extend(shlex.split(str(arguments)))
-    command.append(output_filepath)
+        command.extend(unicode(arguments).split())
+    command.append(unicode(output_filepath))
     proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
     return_code = proc.wait()
     if return_code != 0:
diff --git a/apps/documents/models.py b/apps/documents/models.py
index 3ff17ec043..078c94d8ca 100644
--- a/apps/documents/models.py
+++ b/apps/documents/models.py
@@ -117,21 +117,27 @@ class Document(models.Model):
     def update_page_count(self, save=True):
         handle, filepath = tempfile.mkstemp()
         self.save_to_file(filepath)
-        total_pages = get_page_count(filepath)
-
-        for page_number in range(total_pages):
-            DocumentPage.objects.get_or_create(
-                document=self, page_number=page_number + 1)
-
+        detected_pages = get_page_count(filepath)
         os.close(handle)
         try:
             os.remove(filepath)
         except OSError:
             pass
 
+        current_pages = DocumentPage.objects.filter(document=self).order_by('page_number',)
+        if current_pages.count() > detected_pages:
+            for page in current_pages[detected_pages:]:
+                page.delete()
+
+        for page_number in range(detected_pages):
+            DocumentPage.objects.get_or_create(
+                document=self, page_number=page_number + 1)
+
         if save:
             self.save()
 
+        return detected_pages
+
     def save_to_file(self, filepath, buffer_size=1024 * 1024):
         input_descriptor = self.open()
         output_descriptor = open(filepath, 'wb')
@@ -287,7 +293,7 @@ class DocumentPage(models.Model):
     page_number = models.PositiveIntegerField(default=1, editable=False, verbose_name=_(u'page number'))
 
     def __unicode__(self):
-        return '%s - %d - %s' % (unicode(self.document), self.page_number, self.page_label)
+        return u'%s - %d - %s' % (unicode(self.document), self.page_number, self.page_label)
 
     class Meta:
         verbose_name = _(u'document page')
@@ -309,8 +315,8 @@ class MetadataGroup(models.Model):
         verbose_name_plural = _(u'metadata document groups')
 
 
-INCLUSION_AND = '&'
-INCLUSION_OR = '|'
+INCLUSION_AND = u'&'
+INCLUSION_OR = u'|'
 
 INCLUSION_CHOICES = (
     (INCLUSION_AND, _(u'and')),
@@ -318,21 +324,21 @@ INCLUSION_CHOICES = (
 )
 
 OPERATOR_CHOICES = (
-    ('exact', _(u'is equal')),
-    ('iexact', _(u'is equal (case insensitive)')),
-    ('contains', _(u'contains')),
-    ('icontains', _(u'contains (case insensitive)')),
-    ('in', _(u'is in')),
-    ('gt', _(u'is greater than')),
-    ('gte', _(u'is greater than or equal')),
-    ('lt', _(u'is less than')),
-    ('lte', _(u'is less than or equal')),
-    ('startswith', _(u'starts with')),
-    ('istartswith', _(u'starts with (case insensitive)')),
-    ('endswith', _(u'ends with')),
-    ('iendswith', _(u'ends with (case insensitive)')),
-    ('regex', _(u'is in regular expression')),
-    ('iregex', _(u'is in regular expression (case insensitive)')),
+    (u'exact', _(u'is equal')),
+    (u'iexact', _(u'is equal (case insensitive)')),
+    (u'contains', _(u'contains')),
+    (u'icontains', _(u'contains (case insensitive)')),
+    (u'in', _(u'is in')),
+    (u'gt', _(u'is greater than')),
+    (u'gte', _(u'is greater than or equal')),
+    (u'lt', _(u'is less than')),
+    (u'lte', _(u'is less than or equal')),
+    (u'startswith', _(u'starts with')),
+    (u'istartswith', _(u'starts with (case insensitive)')),
+    (u'endswith', _(u'ends with')),
+    (u'iendswith', _(u'ends with (case insensitive)')),
+    (u'regex', _(u'is in regular expression')),
+    (u'iregex', _(u'is in regular expression (case insensitive)')),
 )
 
 
@@ -347,7 +353,7 @@ class MetadataGroupItem(models.Model):
     enabled = models.BooleanField(default=True, verbose_name=_(u'enabled'))
 
     def __unicode__(self):
-        return '[%s] %s %s %s %s %s' % ('x' if self.enabled else ' ', self.get_inclusion_display(), self.metadata_type, _(u'not') if self.negated else '', self.get_operator_display(), self.expression)
+        return u'[%s] %s %s %s %s %s' % (u'x' if self.enabled else u' ', self.get_inclusion_display(), self.metadata_type, _(u'not') if self.negated else u'', self.get_operator_display(), self.expression)
 
     class Meta:
         verbose_name = _(u'metadata group item')
@@ -364,7 +370,7 @@ class DocumentPageTransformation(models.Model):
     arguments = models.TextField(blank=True, null=True, verbose_name=_(u'arguments'), help_text=_(u'Use dictionaries to indentify arguments, example: {\'degrees\':90}'))
 
     def __unicode__(self):
-        return '%s - %s' % (unicode(self.document_page), self.get_transformation_display())
+        return u'%s - %s' % (unicode(self.document_page), self.get_transformation_display())
 
     class Meta:
         ordering = ('order',)
diff --git a/apps/ocr/api.py b/apps/ocr/api.py
index 120c39ecee..baaa83497a 100644
--- a/apps/ocr/api.py
+++ b/apps/ocr/api.py
@@ -20,7 +20,7 @@ def get_language_backend():
     try:
         module = import_module(u'.'.join([u'ocr', u'lang', TESSERACT_LANGUAGE]))
     except ImportError:
-        sys.stderr.write('\nError: No OCR app language backend for language: %s\n\n' % TESSERACT_LANGUAGE)
+        sys.stderr.write(u'\nError: No OCR app language backend for language: %s\n\n' % TESSERACT_LANGUAGE)
         return None
     return module
 
@@ -40,9 +40,9 @@ def cleanup(filename):
 
 
 def run_tesseract(input_filename, output_filename_base, lang=None):
-    command = [TESSERACT_PATH, input_filename, output_filename_base]
+    command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(output_filename_base)]
     if lang is not None:
-        command += ['-l', lang]
+        command += [u'-l', lang]
 
     proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
     return_code = proc.wait()
@@ -85,7 +85,7 @@ def ocr_cleanup(text):
                 result = word
             if result:
                 output.append(result)
-        output.append('\n')
+        output.append(u'\n')
 
     return u' '.join(output)