Changed the way document page count is parsed from the graphics backend, fixing issue #7

This commit is contained in:
Roberto Rosario
2011-04-08 03:29:48 -04:00
parent 71a3c218f4
commit 6b67cff5d7
6 changed files with 76 additions and 73 deletions

View File

@@ -1,3 +1,3 @@
TRANFORMATION_CHOICES = { TRANFORMATION_CHOICES = {
'rotate': '-rotate %(degrees)d' u'rotate': u'-rotate %(degrees)d'
} }

View File

@@ -19,9 +19,9 @@ from common import TEMPORARY_DIRECTORY
from converter import TRANFORMATION_CHOICES from converter import TRANFORMATION_CHOICES
from documents.utils import document_save_to_temp_dir from documents.utils import document_save_to_temp_dir
QUALITY_DEFAULT = 'quality_default' QUALITY_DEFAULT = u'quality_default'
QUALITY_LOW = 'quality_low' QUALITY_LOW = u'quality_low'
QUALITY_HIGH = 'quality_high' QUALITY_HIGH = u'quality_high'
QUALITY_SETTINGS = {QUALITY_DEFAULT: DEFAULT_OPTIONS, QUALITY_SETTINGS = {QUALITY_DEFAULT: DEFAULT_OPTIONS,
QUALITY_LOW: LOW_QUALITY_OPTIONS, QUALITY_HIGH: HIGH_QUALITY_OPTIONS} QUALITY_LOW: LOW_QUALITY_OPTIONS, QUALITY_HIGH: HIGH_QUALITY_OPTIONS}
@@ -44,7 +44,7 @@ def _get_backend():
try: try:
backend = _get_backend() backend = _get_backend()
except ImportError: except ImportError:
raise ImportError('Missing or incorrect converter backend: %s' % GRAPHICS_BACKEND) raise ImportError(u'Missing or incorrect converter backend: %s' % GRAPHICS_BACKEND)
def cleanup(filename): def cleanup(filename):
@@ -58,7 +58,7 @@ def cleanup(filename):
def execute_unpaper(input_filepath, output_filepath): def execute_unpaper(input_filepath, output_filepath):
command = [] command = []
command.append(UNPAPER_PATH) command.append(UNPAPER_PATH)
command.append('--overwrite') command.append(u'--overwrite')
command.append(input_filepath) command.append(input_filepath)
command.append(output_filepath) command.append(output_filepath)
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE) proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE)
@@ -78,7 +78,7 @@ def execute_unoconv(input_filepath, output_filepath, arguments=''):
""" """
def cache_cleanup(input_filepath, size, quality=QUALITY_DEFAULT, page=0, format='jpg', extra_options=''): def cache_cleanup(input_filepath, size, quality=QUALITY_DEFAULT, page=0, format=u'jpg', extra_options=u''):
filepath = create_image_cache_filename(input_filepath, size=size, page=page, format=format, quality=quality, extra_options=extra_options) filepath = create_image_cache_filename(input_filepath, size=size, page=page, format=format, quality=quality, extra_options=extra_options)
try: try:
os.remove(filepath) os.remove(filepath)
@@ -93,16 +93,16 @@ def create_image_cache_filename(input_filepath, *args, **kwargs):
final_filepath = [] final_filepath = []
[final_filepath.append(str(arg)) for arg in args] [final_filepath.append(str(arg)) for arg in args]
final_filepath.extend(['%s_%s' % (key, value) for key, value in kwargs.items()]) final_filepath.extend([u'%s_%s' % (key, value) for key, value in kwargs.items()])
temp_path += slugify('_'.join(final_filepath)) temp_path += slugify(u'_'.join(final_filepath))
return temp_path return temp_path
else: else:
return None return None
def in_image_cache(input_filepath, size, page=0, format='jpg', quality=QUALITY_DEFAULT, extra_options=''): def in_image_cache(input_filepath, size, page=0, format=u'jpg', quality=QUALITY_DEFAULT, extra_options=u''):
output_filepath = create_image_cache_filename(input_filepath, size=size, page=page, format=format, quality=quality, extra_options=extra_options) output_filepath = create_image_cache_filename(input_filepath, size=size, page=page, format=format, quality=quality, extra_options=extra_options)
if os.path.exists(output_filepath): if os.path.exists(output_filepath):
return output_filepath return output_filepath
@@ -110,7 +110,7 @@ def in_image_cache(input_filepath, size, page=0, format='jpg', quality=QUALITY_D
return None return None
def convert(input_filepath, size, quality=QUALITY_DEFAULT, cache=True, page=0, format='jpg', extra_options='', mimetype=None, extension=None, cleanup_files=True): def convert(input_filepath, size, quality=QUALITY_DEFAULT, cache=True, page=0, format=u'jpg', extra_options=u'', mimetype=None, extension=None, cleanup_files=True):
unoconv_output = None unoconv_output = None
output_filepath = create_image_cache_filename(input_filepath, size=size, page=page, format=format, quality=quality, extra_options=extra_options) output_filepath = create_image_cache_filename(input_filepath, size=size, page=page, format=format, quality=quality, extra_options=extra_options)
if os.path.exists(output_filepath): if os.path.exists(output_filepath):
@@ -127,9 +127,9 @@ def convert(input_filepath, size, quality=QUALITY_DEFAULT, cache=True, page=0, f
input_filepath = unoconv_output input_filepath = unoconv_output
''' '''
try: try:
input_arg = '%s[%s]' % (input_filepath, page) input_arg = u'%s[%s]' % (input_filepath, page)
extra_options += ' -resize %s' % size extra_options += u' -resize %s' % size
backend.execute_convert(input_filepath=input_arg, arguments=extra_options, output_filepath='%s:%s' % (format, output_filepath), quality=quality) backend.execute_convert(input_filepath=input_arg, arguments=extra_options, output_filepath=u'%s:%s' % (format, output_filepath), quality=quality)
finally: finally:
if cleanup_files: if cleanup_files:
cleanup(input_filepath) cleanup(input_filepath)
@@ -141,7 +141,7 @@ def convert(input_filepath, size, quality=QUALITY_DEFAULT, cache=True, page=0, f
def get_page_count(input_filepath): def get_page_count(input_filepath):
try: try:
return int(backend.execute_identify(input_filepath, '-format %n')) return len(backend.execute_identify(unicode(input_filepath)).splitlines())
except Exception, e: except Exception, e:
#TODO: send to other page number identifying program #TODO: send to other page number identifying program
return 1 return 1
@@ -154,12 +154,12 @@ def convert_document_for_ocr(document, page=0, format='tif'):
#Convert for OCR #Convert for OCR
temp_filename, separator = os.path.splitext(os.path.basename(input_filepath)) temp_filename, separator = os.path.splitext(os.path.basename(input_filepath))
temp_path = os.path.join(TEMPORARY_DIRECTORY, temp_filename) temp_path = os.path.join(TEMPORARY_DIRECTORY, temp_filename)
transformation_output_file = '%s_trans%s%s%s' % (temp_path, page, os.extsep, format) transformation_output_file = u'%s_trans%s%s%s' % (temp_path, page, os.extsep, format)
unpaper_input_file = '%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep) unpaper_input_file = u'%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep)
unpaper_output_file = '%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep) unpaper_output_file = u'%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep)
convert_output_file = '%s_ocr%s%s%s' % (temp_path, page, os.extsep, format) convert_output_file = u'%s_ocr%s%s%s' % (temp_path, page, os.extsep, format)
input_arg = '%s[%s]' % (input_filepath, page) input_arg = u'%s[%s]' % (input_filepath, page)
transformation_list = [] transformation_list = []
try: try:

View File

@@ -1,20 +1,18 @@
import shlex
import subprocess import subprocess
from converter.conf.settings import GM_PATH from converter.conf.settings import GM_PATH
from converter.api import QUALITY_DEFAULT, QUALITY_SETTINGS from converter.api import QUALITY_DEFAULT, QUALITY_SETTINGS
from converter.exceptions import ConvertError, UnknownFormat, IdentifyError from converter.exceptions import ConvertError, UnknownFormat, IdentifyError
CONVERTER_ERROR_STRING_NO_DECODER = 'No decode delegate for this image format' CONVERTER_ERROR_STRING_NO_DECODER = u'No decode delegate for this image format'
def execute_identify(input_filepath, arguments): def execute_identify(input_filepath, arguments=u''):
command = [] command = []
command.append(GM_PATH) command.append(unicode(GM_PATH))
command.append(u'identify') command.append(u'identify')
command.extend(shlex.split(str(arguments))) command.extend(unicode(arguments).split())
command.append(input_filepath) command.append(unicode(input_filepath))
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
return_code = proc.wait() return_code = proc.wait()
if return_code != 0: if return_code != 0:
@@ -24,13 +22,13 @@ def execute_identify(input_filepath, arguments):
def execute_convert(input_filepath, output_filepath, quality=QUALITY_DEFAULT, arguments=None): def execute_convert(input_filepath, output_filepath, quality=QUALITY_DEFAULT, arguments=None):
command = [] command = []
command.append(GM_PATH) command.append(unicode(GM_PATH))
command.append(u'convert') command.append(u'convert')
command.extend(shlex.split(str(QUALITY_SETTINGS[quality]))) command.extend(unicode(QUALITY_SETTINGS[quality]).split())
command.append(input_filepath) command.append(unicode(input_filepath))
if arguments: if arguments:
command.extend(shlex.split(str(arguments))) command.extend(unicode(arguments).split())
command.append(output_filepath) command.append(unicode(output_filepath))
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
return_code = proc.wait() return_code = proc.wait()
if return_code != 0: if return_code != 0:

View File

@@ -1,4 +1,3 @@
import shlex
import subprocess import subprocess
from converter.conf.settings import IM_IDENTIFY_PATH from converter.conf.settings import IM_IDENTIFY_PATH
@@ -7,14 +6,14 @@ from converter.api import QUALITY_DEFAULT, QUALITY_SETTINGS
from converter.exceptions import ConvertError, UnknownFormat, \ from converter.exceptions import ConvertError, UnknownFormat, \
IdentifyError IdentifyError
CONVERTER_ERROR_STRING_NO_DECODER = 'no decode delegate for this image format' CONVERTER_ERROR_STRING_NO_DECODER = u'no decode delegate for this image format'
def execute_identify(input_filepath, arguments): def execute_identify(input_filepath, arguments=u''):
command = [] command = []
command.append(IM_IDENTIFY_PATH) command.append(unicode(IM_IDENTIFY_PATH))
command.extend(shlex.split(str(arguments))) command.extend(unicode(arguments).split())
command.append(input_filepath) command.append(unicode(input_filepath))
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
return_code = proc.wait() return_code = proc.wait()
@@ -25,12 +24,12 @@ def execute_identify(input_filepath, arguments):
def execute_convert(input_filepath, output_filepath, quality=QUALITY_DEFAULT, arguments=None): def execute_convert(input_filepath, output_filepath, quality=QUALITY_DEFAULT, arguments=None):
command = [] command = []
command.append(IM_CONVERT_PATH) command.append(unicode(IM_CONVERT_PATH))
command.extend(shlex.split(str(QUALITY_SETTINGS[quality]))) command.extend(unicode(QUALITY_SETTINGS[quality]).split())
command.append(input_filepath) command.append(unicode(input_filepath))
if arguments: if arguments:
command.extend(shlex.split(str(arguments))) command.extend(unicode(arguments).split())
command.append(output_filepath) command.append(unicode(output_filepath))
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
return_code = proc.wait() return_code = proc.wait()
if return_code != 0: if return_code != 0:

View File

@@ -117,21 +117,27 @@ class Document(models.Model):
def update_page_count(self, save=True): def update_page_count(self, save=True):
handle, filepath = tempfile.mkstemp() handle, filepath = tempfile.mkstemp()
self.save_to_file(filepath) self.save_to_file(filepath)
total_pages = get_page_count(filepath) detected_pages = get_page_count(filepath)
for page_number in range(total_pages):
DocumentPage.objects.get_or_create(
document=self, page_number=page_number + 1)
os.close(handle) os.close(handle)
try: try:
os.remove(filepath) os.remove(filepath)
except OSError: except OSError:
pass pass
current_pages = DocumentPage.objects.filter(document=self).order_by('page_number',)
if current_pages.count() > detected_pages:
for page in current_pages[detected_pages:]:
page.delete()
for page_number in range(detected_pages):
DocumentPage.objects.get_or_create(
document=self, page_number=page_number + 1)
if save: if save:
self.save() self.save()
return detected_pages
def save_to_file(self, filepath, buffer_size=1024 * 1024): def save_to_file(self, filepath, buffer_size=1024 * 1024):
input_descriptor = self.open() input_descriptor = self.open()
output_descriptor = open(filepath, 'wb') output_descriptor = open(filepath, 'wb')
@@ -287,7 +293,7 @@ class DocumentPage(models.Model):
page_number = models.PositiveIntegerField(default=1, editable=False, verbose_name=_(u'page number')) page_number = models.PositiveIntegerField(default=1, editable=False, verbose_name=_(u'page number'))
def __unicode__(self): def __unicode__(self):
return '%s - %d - %s' % (unicode(self.document), self.page_number, self.page_label) return u'%s - %d - %s' % (unicode(self.document), self.page_number, self.page_label)
class Meta: class Meta:
verbose_name = _(u'document page') verbose_name = _(u'document page')
@@ -309,8 +315,8 @@ class MetadataGroup(models.Model):
verbose_name_plural = _(u'metadata document groups') verbose_name_plural = _(u'metadata document groups')
INCLUSION_AND = '&' INCLUSION_AND = u'&'
INCLUSION_OR = '|' INCLUSION_OR = u'|'
INCLUSION_CHOICES = ( INCLUSION_CHOICES = (
(INCLUSION_AND, _(u'and')), (INCLUSION_AND, _(u'and')),
@@ -318,21 +324,21 @@ INCLUSION_CHOICES = (
) )
OPERATOR_CHOICES = ( OPERATOR_CHOICES = (
('exact', _(u'is equal')), (u'exact', _(u'is equal')),
('iexact', _(u'is equal (case insensitive)')), (u'iexact', _(u'is equal (case insensitive)')),
('contains', _(u'contains')), (u'contains', _(u'contains')),
('icontains', _(u'contains (case insensitive)')), (u'icontains', _(u'contains (case insensitive)')),
('in', _(u'is in')), (u'in', _(u'is in')),
('gt', _(u'is greater than')), (u'gt', _(u'is greater than')),
('gte', _(u'is greater than or equal')), (u'gte', _(u'is greater than or equal')),
('lt', _(u'is less than')), (u'lt', _(u'is less than')),
('lte', _(u'is less than or equal')), (u'lte', _(u'is less than or equal')),
('startswith', _(u'starts with')), (u'startswith', _(u'starts with')),
('istartswith', _(u'starts with (case insensitive)')), (u'istartswith', _(u'starts with (case insensitive)')),
('endswith', _(u'ends with')), (u'endswith', _(u'ends with')),
('iendswith', _(u'ends with (case insensitive)')), (u'iendswith', _(u'ends with (case insensitive)')),
('regex', _(u'is in regular expression')), (u'regex', _(u'is in regular expression')),
('iregex', _(u'is in regular expression (case insensitive)')), (u'iregex', _(u'is in regular expression (case insensitive)')),
) )
@@ -347,7 +353,7 @@ class MetadataGroupItem(models.Model):
enabled = models.BooleanField(default=True, verbose_name=_(u'enabled')) enabled = models.BooleanField(default=True, verbose_name=_(u'enabled'))
def __unicode__(self): def __unicode__(self):
return '[%s] %s %s %s %s %s' % ('x' if self.enabled else ' ', self.get_inclusion_display(), self.metadata_type, _(u'not') if self.negated else '', self.get_operator_display(), self.expression) return u'[%s] %s %s %s %s %s' % (u'x' if self.enabled else u' ', self.get_inclusion_display(), self.metadata_type, _(u'not') if self.negated else u'', self.get_operator_display(), self.expression)
class Meta: class Meta:
verbose_name = _(u'metadata group item') verbose_name = _(u'metadata group item')
@@ -364,7 +370,7 @@ class DocumentPageTransformation(models.Model):
arguments = models.TextField(blank=True, null=True, verbose_name=_(u'arguments'), help_text=_(u'Use dictionaries to indentify arguments, example: {\'degrees\':90}')) arguments = models.TextField(blank=True, null=True, verbose_name=_(u'arguments'), help_text=_(u'Use dictionaries to indentify arguments, example: {\'degrees\':90}'))
def __unicode__(self): def __unicode__(self):
return '%s - %s' % (unicode(self.document_page), self.get_transformation_display()) return u'%s - %s' % (unicode(self.document_page), self.get_transformation_display())
class Meta: class Meta:
ordering = ('order',) ordering = ('order',)

View File

@@ -20,7 +20,7 @@ def get_language_backend():
try: try:
module = import_module(u'.'.join([u'ocr', u'lang', TESSERACT_LANGUAGE])) module = import_module(u'.'.join([u'ocr', u'lang', TESSERACT_LANGUAGE]))
except ImportError: except ImportError:
sys.stderr.write('\nError: No OCR app language backend for language: %s\n\n' % TESSERACT_LANGUAGE) sys.stderr.write(u'\nError: No OCR app language backend for language: %s\n\n' % TESSERACT_LANGUAGE)
return None return None
return module return module
@@ -40,9 +40,9 @@ def cleanup(filename):
def run_tesseract(input_filename, output_filename_base, lang=None): def run_tesseract(input_filename, output_filename_base, lang=None):
command = [TESSERACT_PATH, input_filename, output_filename_base] command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(output_filename_base)]
if lang is not None: if lang is not None:
command += ['-l', lang] command += [u'-l', lang]
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
return_code = proc.wait() return_code = proc.wait()
@@ -85,7 +85,7 @@ def ocr_cleanup(text):
result = word result = word
if result: if result:
output.append(result) output.append(result)
output.append('\n') output.append(u'\n')
return u' '.join(output) return u' '.join(output)