Changed the way document page count is parsed from the graphics backend, fixing issue #7

This commit is contained in:
Roberto Rosario
2011-04-08 03:29:48 -04:00
parent 71a3c218f4
commit 6b67cff5d7
6 changed files with 76 additions and 73 deletions

View File

@@ -1,3 +1,3 @@
TRANFORMATION_CHOICES = {
'rotate': '-rotate %(degrees)d'
u'rotate': u'-rotate %(degrees)d'
}

View File

@@ -19,9 +19,9 @@ from common import TEMPORARY_DIRECTORY
from converter import TRANFORMATION_CHOICES
from documents.utils import document_save_to_temp_dir
QUALITY_DEFAULT = 'quality_default'
QUALITY_LOW = 'quality_low'
QUALITY_HIGH = 'quality_high'
QUALITY_DEFAULT = u'quality_default'
QUALITY_LOW = u'quality_low'
QUALITY_HIGH = u'quality_high'
QUALITY_SETTINGS = {QUALITY_DEFAULT: DEFAULT_OPTIONS,
QUALITY_LOW: LOW_QUALITY_OPTIONS, QUALITY_HIGH: HIGH_QUALITY_OPTIONS}
@@ -44,7 +44,7 @@ def _get_backend():
try:
backend = _get_backend()
except ImportError:
raise ImportError('Missing or incorrect converter backend: %s' % GRAPHICS_BACKEND)
raise ImportError(u'Missing or incorrect converter backend: %s' % GRAPHICS_BACKEND)
def cleanup(filename):
@@ -58,7 +58,7 @@ def cleanup(filename):
def execute_unpaper(input_filepath, output_filepath):
command = []
command.append(UNPAPER_PATH)
command.append('--overwrite')
command.append(u'--overwrite')
command.append(input_filepath)
command.append(output_filepath)
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE)
@@ -78,7 +78,7 @@ def execute_unoconv(input_filepath, output_filepath, arguments=''):
"""
def cache_cleanup(input_filepath, size, quality=QUALITY_DEFAULT, page=0, format='jpg', extra_options=''):
def cache_cleanup(input_filepath, size, quality=QUALITY_DEFAULT, page=0, format=u'jpg', extra_options=u''):
filepath = create_image_cache_filename(input_filepath, size=size, page=page, format=format, quality=quality, extra_options=extra_options)
try:
os.remove(filepath)
@@ -93,16 +93,16 @@ def create_image_cache_filename(input_filepath, *args, **kwargs):
final_filepath = []
[final_filepath.append(str(arg)) for arg in args]
final_filepath.extend(['%s_%s' % (key, value) for key, value in kwargs.items()])
final_filepath.extend([u'%s_%s' % (key, value) for key, value in kwargs.items()])
temp_path += slugify('_'.join(final_filepath))
temp_path += slugify(u'_'.join(final_filepath))
return temp_path
else:
return None
def in_image_cache(input_filepath, size, page=0, format='jpg', quality=QUALITY_DEFAULT, extra_options=''):
def in_image_cache(input_filepath, size, page=0, format=u'jpg', quality=QUALITY_DEFAULT, extra_options=u''):
output_filepath = create_image_cache_filename(input_filepath, size=size, page=page, format=format, quality=quality, extra_options=extra_options)
if os.path.exists(output_filepath):
return output_filepath
@@ -110,7 +110,7 @@ def in_image_cache(input_filepath, size, page=0, format='jpg', quality=QUALITY_D
return None
def convert(input_filepath, size, quality=QUALITY_DEFAULT, cache=True, page=0, format='jpg', extra_options='', mimetype=None, extension=None, cleanup_files=True):
def convert(input_filepath, size, quality=QUALITY_DEFAULT, cache=True, page=0, format=u'jpg', extra_options=u'', mimetype=None, extension=None, cleanup_files=True):
unoconv_output = None
output_filepath = create_image_cache_filename(input_filepath, size=size, page=page, format=format, quality=quality, extra_options=extra_options)
if os.path.exists(output_filepath):
@@ -127,9 +127,9 @@ def convert(input_filepath, size, quality=QUALITY_DEFAULT, cache=True, page=0, f
input_filepath = unoconv_output
'''
try:
input_arg = '%s[%s]' % (input_filepath, page)
extra_options += ' -resize %s' % size
backend.execute_convert(input_filepath=input_arg, arguments=extra_options, output_filepath='%s:%s' % (format, output_filepath), quality=quality)
input_arg = u'%s[%s]' % (input_filepath, page)
extra_options += u' -resize %s' % size
backend.execute_convert(input_filepath=input_arg, arguments=extra_options, output_filepath=u'%s:%s' % (format, output_filepath), quality=quality)
finally:
if cleanup_files:
cleanup(input_filepath)
@@ -141,7 +141,7 @@ def convert(input_filepath, size, quality=QUALITY_DEFAULT, cache=True, page=0, f
def get_page_count(input_filepath):
try:
return int(backend.execute_identify(input_filepath, '-format %n'))
return len(backend.execute_identify(unicode(input_filepath)).splitlines())
except Exception, e:
#TODO: send to other page number identifying program
return 1
@@ -154,12 +154,12 @@ def convert_document_for_ocr(document, page=0, format='tif'):
#Convert for OCR
temp_filename, separator = os.path.splitext(os.path.basename(input_filepath))
temp_path = os.path.join(TEMPORARY_DIRECTORY, temp_filename)
transformation_output_file = '%s_trans%s%s%s' % (temp_path, page, os.extsep, format)
unpaper_input_file = '%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep)
unpaper_output_file = '%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep)
convert_output_file = '%s_ocr%s%s%s' % (temp_path, page, os.extsep, format)
transformation_output_file = u'%s_trans%s%s%s' % (temp_path, page, os.extsep, format)
unpaper_input_file = u'%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep)
unpaper_output_file = u'%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep)
convert_output_file = u'%s_ocr%s%s%s' % (temp_path, page, os.extsep, format)
input_arg = '%s[%s]' % (input_filepath, page)
input_arg = u'%s[%s]' % (input_filepath, page)
transformation_list = []
try:

View File

@@ -1,20 +1,18 @@
import shlex
import subprocess
from converter.conf.settings import GM_PATH
from converter.api import QUALITY_DEFAULT, QUALITY_SETTINGS
from converter.exceptions import ConvertError, UnknownFormat, IdentifyError
CONVERTER_ERROR_STRING_NO_DECODER = 'No decode delegate for this image format'
CONVERTER_ERROR_STRING_NO_DECODER = u'No decode delegate for this image format'
def execute_identify(input_filepath, arguments):
def execute_identify(input_filepath, arguments=u''):
command = []
command.append(GM_PATH)
command.append(unicode(GM_PATH))
command.append(u'identify')
command.extend(shlex.split(str(arguments)))
command.append(input_filepath)
command.extend(unicode(arguments).split())
command.append(unicode(input_filepath))
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
return_code = proc.wait()
if return_code != 0:
@@ -24,13 +22,13 @@ def execute_identify(input_filepath, arguments):
def execute_convert(input_filepath, output_filepath, quality=QUALITY_DEFAULT, arguments=None):
command = []
command.append(GM_PATH)
command.append(unicode(GM_PATH))
command.append(u'convert')
command.extend(shlex.split(str(QUALITY_SETTINGS[quality])))
command.append(input_filepath)
command.extend(unicode(QUALITY_SETTINGS[quality]).split())
command.append(unicode(input_filepath))
if arguments:
command.extend(shlex.split(str(arguments)))
command.append(output_filepath)
command.extend(unicode(arguments).split())
command.append(unicode(output_filepath))
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
return_code = proc.wait()
if return_code != 0:

View File

@@ -1,4 +1,3 @@
import shlex
import subprocess
from converter.conf.settings import IM_IDENTIFY_PATH
@@ -7,14 +6,14 @@ from converter.api import QUALITY_DEFAULT, QUALITY_SETTINGS
from converter.exceptions import ConvertError, UnknownFormat, \
IdentifyError
CONVERTER_ERROR_STRING_NO_DECODER = 'no decode delegate for this image format'
CONVERTER_ERROR_STRING_NO_DECODER = u'no decode delegate for this image format'
def execute_identify(input_filepath, arguments):
def execute_identify(input_filepath, arguments=u''):
command = []
command.append(IM_IDENTIFY_PATH)
command.extend(shlex.split(str(arguments)))
command.append(input_filepath)
command.append(unicode(IM_IDENTIFY_PATH))
command.extend(unicode(arguments).split())
command.append(unicode(input_filepath))
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
return_code = proc.wait()
@@ -25,12 +24,12 @@ def execute_identify(input_filepath, arguments):
def execute_convert(input_filepath, output_filepath, quality=QUALITY_DEFAULT, arguments=None):
command = []
command.append(IM_CONVERT_PATH)
command.extend(shlex.split(str(QUALITY_SETTINGS[quality])))
command.append(input_filepath)
command.append(unicode(IM_CONVERT_PATH))
command.extend(unicode(QUALITY_SETTINGS[quality]).split())
command.append(unicode(input_filepath))
if arguments:
command.extend(shlex.split(str(arguments)))
command.append(output_filepath)
command.extend(unicode(arguments).split())
command.append(unicode(output_filepath))
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
return_code = proc.wait()
if return_code != 0:

View File

@@ -117,21 +117,27 @@ class Document(models.Model):
def update_page_count(self, save=True):
handle, filepath = tempfile.mkstemp()
self.save_to_file(filepath)
total_pages = get_page_count(filepath)
for page_number in range(total_pages):
DocumentPage.objects.get_or_create(
document=self, page_number=page_number + 1)
detected_pages = get_page_count(filepath)
os.close(handle)
try:
os.remove(filepath)
except OSError:
pass
current_pages = DocumentPage.objects.filter(document=self).order_by('page_number',)
if current_pages.count() > detected_pages:
for page in current_pages[detected_pages:]:
page.delete()
for page_number in range(detected_pages):
DocumentPage.objects.get_or_create(
document=self, page_number=page_number + 1)
if save:
self.save()
return detected_pages
def save_to_file(self, filepath, buffer_size=1024 * 1024):
input_descriptor = self.open()
output_descriptor = open(filepath, 'wb')
@@ -287,7 +293,7 @@ class DocumentPage(models.Model):
page_number = models.PositiveIntegerField(default=1, editable=False, verbose_name=_(u'page number'))
def __unicode__(self):
return '%s - %d - %s' % (unicode(self.document), self.page_number, self.page_label)
return u'%s - %d - %s' % (unicode(self.document), self.page_number, self.page_label)
class Meta:
verbose_name = _(u'document page')
@@ -309,8 +315,8 @@ class MetadataGroup(models.Model):
verbose_name_plural = _(u'metadata document groups')
INCLUSION_AND = '&'
INCLUSION_OR = '|'
INCLUSION_AND = u'&'
INCLUSION_OR = u'|'
INCLUSION_CHOICES = (
(INCLUSION_AND, _(u'and')),
@@ -318,21 +324,21 @@ INCLUSION_CHOICES = (
)
OPERATOR_CHOICES = (
('exact', _(u'is equal')),
('iexact', _(u'is equal (case insensitive)')),
('contains', _(u'contains')),
('icontains', _(u'contains (case insensitive)')),
('in', _(u'is in')),
('gt', _(u'is greater than')),
('gte', _(u'is greater than or equal')),
('lt', _(u'is less than')),
('lte', _(u'is less than or equal')),
('startswith', _(u'starts with')),
('istartswith', _(u'starts with (case insensitive)')),
('endswith', _(u'ends with')),
('iendswith', _(u'ends with (case insensitive)')),
('regex', _(u'is in regular expression')),
('iregex', _(u'is in regular expression (case insensitive)')),
(u'exact', _(u'is equal')),
(u'iexact', _(u'is equal (case insensitive)')),
(u'contains', _(u'contains')),
(u'icontains', _(u'contains (case insensitive)')),
(u'in', _(u'is in')),
(u'gt', _(u'is greater than')),
(u'gte', _(u'is greater than or equal')),
(u'lt', _(u'is less than')),
(u'lte', _(u'is less than or equal')),
(u'startswith', _(u'starts with')),
(u'istartswith', _(u'starts with (case insensitive)')),
(u'endswith', _(u'ends with')),
(u'iendswith', _(u'ends with (case insensitive)')),
(u'regex', _(u'is in regular expression')),
(u'iregex', _(u'is in regular expression (case insensitive)')),
)
@@ -347,7 +353,7 @@ class MetadataGroupItem(models.Model):
enabled = models.BooleanField(default=True, verbose_name=_(u'enabled'))
def __unicode__(self):
return '[%s] %s %s %s %s %s' % ('x' if self.enabled else ' ', self.get_inclusion_display(), self.metadata_type, _(u'not') if self.negated else '', self.get_operator_display(), self.expression)
return u'[%s] %s %s %s %s %s' % (u'x' if self.enabled else u' ', self.get_inclusion_display(), self.metadata_type, _(u'not') if self.negated else u'', self.get_operator_display(), self.expression)
class Meta:
verbose_name = _(u'metadata group item')
@@ -364,7 +370,7 @@ class DocumentPageTransformation(models.Model):
arguments = models.TextField(blank=True, null=True, verbose_name=_(u'arguments'), help_text=_(u'Use dictionaries to indentify arguments, example: {\'degrees\':90}'))
def __unicode__(self):
return '%s - %s' % (unicode(self.document_page), self.get_transformation_display())
return u'%s - %s' % (unicode(self.document_page), self.get_transformation_display())
class Meta:
ordering = ('order',)

View File

@@ -20,7 +20,7 @@ def get_language_backend():
try:
module = import_module(u'.'.join([u'ocr', u'lang', TESSERACT_LANGUAGE]))
except ImportError:
sys.stderr.write('\nError: No OCR app language backend for language: %s\n\n' % TESSERACT_LANGUAGE)
sys.stderr.write(u'\nError: No OCR app language backend for language: %s\n\n' % TESSERACT_LANGUAGE)
return None
return module
@@ -40,9 +40,9 @@ def cleanup(filename):
def run_tesseract(input_filename, output_filename_base, lang=None):
command = [TESSERACT_PATH, input_filename, output_filename_base]
command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(output_filename_base)]
if lang is not None:
command += ['-l', lang]
command += [u'-l', lang]
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
return_code = proc.wait()
@@ -85,7 +85,7 @@ def ocr_cleanup(text):
result = word
if result:
output.append(result)
output.append('\n')
output.append(u'\n')
return u' '.join(output)