Changed the way document page count is parsed from the graphics backend, fixing issue #7
This commit is contained in:
@@ -1,3 +1,3 @@
|
||||
TRANFORMATION_CHOICES = {
|
||||
'rotate': '-rotate %(degrees)d'
|
||||
u'rotate': u'-rotate %(degrees)d'
|
||||
}
|
||||
|
||||
@@ -19,9 +19,9 @@ from common import TEMPORARY_DIRECTORY
|
||||
from converter import TRANFORMATION_CHOICES
|
||||
from documents.utils import document_save_to_temp_dir
|
||||
|
||||
QUALITY_DEFAULT = 'quality_default'
|
||||
QUALITY_LOW = 'quality_low'
|
||||
QUALITY_HIGH = 'quality_high'
|
||||
QUALITY_DEFAULT = u'quality_default'
|
||||
QUALITY_LOW = u'quality_low'
|
||||
QUALITY_HIGH = u'quality_high'
|
||||
|
||||
QUALITY_SETTINGS = {QUALITY_DEFAULT: DEFAULT_OPTIONS,
|
||||
QUALITY_LOW: LOW_QUALITY_OPTIONS, QUALITY_HIGH: HIGH_QUALITY_OPTIONS}
|
||||
@@ -44,7 +44,7 @@ def _get_backend():
|
||||
try:
|
||||
backend = _get_backend()
|
||||
except ImportError:
|
||||
raise ImportError('Missing or incorrect converter backend: %s' % GRAPHICS_BACKEND)
|
||||
raise ImportError(u'Missing or incorrect converter backend: %s' % GRAPHICS_BACKEND)
|
||||
|
||||
|
||||
def cleanup(filename):
|
||||
@@ -58,7 +58,7 @@ def cleanup(filename):
|
||||
def execute_unpaper(input_filepath, output_filepath):
|
||||
command = []
|
||||
command.append(UNPAPER_PATH)
|
||||
command.append('--overwrite')
|
||||
command.append(u'--overwrite')
|
||||
command.append(input_filepath)
|
||||
command.append(output_filepath)
|
||||
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE)
|
||||
@@ -78,7 +78,7 @@ def execute_unoconv(input_filepath, output_filepath, arguments=''):
|
||||
"""
|
||||
|
||||
|
||||
def cache_cleanup(input_filepath, size, quality=QUALITY_DEFAULT, page=0, format='jpg', extra_options=''):
|
||||
def cache_cleanup(input_filepath, size, quality=QUALITY_DEFAULT, page=0, format=u'jpg', extra_options=u''):
|
||||
filepath = create_image_cache_filename(input_filepath, size=size, page=page, format=format, quality=quality, extra_options=extra_options)
|
||||
try:
|
||||
os.remove(filepath)
|
||||
@@ -93,16 +93,16 @@ def create_image_cache_filename(input_filepath, *args, **kwargs):
|
||||
|
||||
final_filepath = []
|
||||
[final_filepath.append(str(arg)) for arg in args]
|
||||
final_filepath.extend(['%s_%s' % (key, value) for key, value in kwargs.items()])
|
||||
final_filepath.extend([u'%s_%s' % (key, value) for key, value in kwargs.items()])
|
||||
|
||||
temp_path += slugify('_'.join(final_filepath))
|
||||
temp_path += slugify(u'_'.join(final_filepath))
|
||||
|
||||
return temp_path
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
def in_image_cache(input_filepath, size, page=0, format='jpg', quality=QUALITY_DEFAULT, extra_options=''):
|
||||
def in_image_cache(input_filepath, size, page=0, format=u'jpg', quality=QUALITY_DEFAULT, extra_options=u''):
|
||||
output_filepath = create_image_cache_filename(input_filepath, size=size, page=page, format=format, quality=quality, extra_options=extra_options)
|
||||
if os.path.exists(output_filepath):
|
||||
return output_filepath
|
||||
@@ -110,7 +110,7 @@ def in_image_cache(input_filepath, size, page=0, format='jpg', quality=QUALITY_D
|
||||
return None
|
||||
|
||||
|
||||
def convert(input_filepath, size, quality=QUALITY_DEFAULT, cache=True, page=0, format='jpg', extra_options='', mimetype=None, extension=None, cleanup_files=True):
|
||||
def convert(input_filepath, size, quality=QUALITY_DEFAULT, cache=True, page=0, format=u'jpg', extra_options=u'', mimetype=None, extension=None, cleanup_files=True):
|
||||
unoconv_output = None
|
||||
output_filepath = create_image_cache_filename(input_filepath, size=size, page=page, format=format, quality=quality, extra_options=extra_options)
|
||||
if os.path.exists(output_filepath):
|
||||
@@ -127,9 +127,9 @@ def convert(input_filepath, size, quality=QUALITY_DEFAULT, cache=True, page=0, f
|
||||
input_filepath = unoconv_output
|
||||
'''
|
||||
try:
|
||||
input_arg = '%s[%s]' % (input_filepath, page)
|
||||
extra_options += ' -resize %s' % size
|
||||
backend.execute_convert(input_filepath=input_arg, arguments=extra_options, output_filepath='%s:%s' % (format, output_filepath), quality=quality)
|
||||
input_arg = u'%s[%s]' % (input_filepath, page)
|
||||
extra_options += u' -resize %s' % size
|
||||
backend.execute_convert(input_filepath=input_arg, arguments=extra_options, output_filepath=u'%s:%s' % (format, output_filepath), quality=quality)
|
||||
finally:
|
||||
if cleanup_files:
|
||||
cleanup(input_filepath)
|
||||
@@ -141,7 +141,7 @@ def convert(input_filepath, size, quality=QUALITY_DEFAULT, cache=True, page=0, f
|
||||
|
||||
def get_page_count(input_filepath):
|
||||
try:
|
||||
return int(backend.execute_identify(input_filepath, '-format %n'))
|
||||
return len(backend.execute_identify(unicode(input_filepath)).splitlines())
|
||||
except Exception, e:
|
||||
#TODO: send to other page number identifying program
|
||||
return 1
|
||||
@@ -154,12 +154,12 @@ def convert_document_for_ocr(document, page=0, format='tif'):
|
||||
#Convert for OCR
|
||||
temp_filename, separator = os.path.splitext(os.path.basename(input_filepath))
|
||||
temp_path = os.path.join(TEMPORARY_DIRECTORY, temp_filename)
|
||||
transformation_output_file = '%s_trans%s%s%s' % (temp_path, page, os.extsep, format)
|
||||
unpaper_input_file = '%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep)
|
||||
unpaper_output_file = '%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep)
|
||||
convert_output_file = '%s_ocr%s%s%s' % (temp_path, page, os.extsep, format)
|
||||
transformation_output_file = u'%s_trans%s%s%s' % (temp_path, page, os.extsep, format)
|
||||
unpaper_input_file = u'%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep)
|
||||
unpaper_output_file = u'%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep)
|
||||
convert_output_file = u'%s_ocr%s%s%s' % (temp_path, page, os.extsep, format)
|
||||
|
||||
input_arg = '%s[%s]' % (input_filepath, page)
|
||||
input_arg = u'%s[%s]' % (input_filepath, page)
|
||||
|
||||
transformation_list = []
|
||||
try:
|
||||
|
||||
@@ -1,20 +1,18 @@
|
||||
import shlex
|
||||
import subprocess
|
||||
|
||||
from converter.conf.settings import GM_PATH
|
||||
from converter.api import QUALITY_DEFAULT, QUALITY_SETTINGS
|
||||
from converter.exceptions import ConvertError, UnknownFormat, IdentifyError
|
||||
|
||||
CONVERTER_ERROR_STRING_NO_DECODER = 'No decode delegate for this image format'
|
||||
CONVERTER_ERROR_STRING_NO_DECODER = u'No decode delegate for this image format'
|
||||
|
||||
|
||||
def execute_identify(input_filepath, arguments):
|
||||
def execute_identify(input_filepath, arguments=u''):
|
||||
command = []
|
||||
command.append(GM_PATH)
|
||||
command.append(unicode(GM_PATH))
|
||||
command.append(u'identify')
|
||||
command.extend(shlex.split(str(arguments)))
|
||||
command.append(input_filepath)
|
||||
|
||||
command.extend(unicode(arguments).split())
|
||||
command.append(unicode(input_filepath))
|
||||
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
|
||||
return_code = proc.wait()
|
||||
if return_code != 0:
|
||||
@@ -24,13 +22,13 @@ def execute_identify(input_filepath, arguments):
|
||||
|
||||
def execute_convert(input_filepath, output_filepath, quality=QUALITY_DEFAULT, arguments=None):
|
||||
command = []
|
||||
command.append(GM_PATH)
|
||||
command.append(unicode(GM_PATH))
|
||||
command.append(u'convert')
|
||||
command.extend(shlex.split(str(QUALITY_SETTINGS[quality])))
|
||||
command.append(input_filepath)
|
||||
command.extend(unicode(QUALITY_SETTINGS[quality]).split())
|
||||
command.append(unicode(input_filepath))
|
||||
if arguments:
|
||||
command.extend(shlex.split(str(arguments)))
|
||||
command.append(output_filepath)
|
||||
command.extend(unicode(arguments).split())
|
||||
command.append(unicode(output_filepath))
|
||||
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
|
||||
return_code = proc.wait()
|
||||
if return_code != 0:
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import shlex
|
||||
import subprocess
|
||||
|
||||
from converter.conf.settings import IM_IDENTIFY_PATH
|
||||
@@ -7,14 +6,14 @@ from converter.api import QUALITY_DEFAULT, QUALITY_SETTINGS
|
||||
from converter.exceptions import ConvertError, UnknownFormat, \
|
||||
IdentifyError
|
||||
|
||||
CONVERTER_ERROR_STRING_NO_DECODER = 'no decode delegate for this image format'
|
||||
CONVERTER_ERROR_STRING_NO_DECODER = u'no decode delegate for this image format'
|
||||
|
||||
|
||||
def execute_identify(input_filepath, arguments):
|
||||
def execute_identify(input_filepath, arguments=u''):
|
||||
command = []
|
||||
command.append(IM_IDENTIFY_PATH)
|
||||
command.extend(shlex.split(str(arguments)))
|
||||
command.append(input_filepath)
|
||||
command.append(unicode(IM_IDENTIFY_PATH))
|
||||
command.extend(unicode(arguments).split())
|
||||
command.append(unicode(input_filepath))
|
||||
|
||||
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
|
||||
return_code = proc.wait()
|
||||
@@ -25,12 +24,12 @@ def execute_identify(input_filepath, arguments):
|
||||
|
||||
def execute_convert(input_filepath, output_filepath, quality=QUALITY_DEFAULT, arguments=None):
|
||||
command = []
|
||||
command.append(IM_CONVERT_PATH)
|
||||
command.extend(shlex.split(str(QUALITY_SETTINGS[quality])))
|
||||
command.append(input_filepath)
|
||||
command.append(unicode(IM_CONVERT_PATH))
|
||||
command.extend(unicode(QUALITY_SETTINGS[quality]).split())
|
||||
command.append(unicode(input_filepath))
|
||||
if arguments:
|
||||
command.extend(shlex.split(str(arguments)))
|
||||
command.append(output_filepath)
|
||||
command.extend(unicode(arguments).split())
|
||||
command.append(unicode(output_filepath))
|
||||
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
|
||||
return_code = proc.wait()
|
||||
if return_code != 0:
|
||||
|
||||
@@ -117,21 +117,27 @@ class Document(models.Model):
|
||||
def update_page_count(self, save=True):
|
||||
handle, filepath = tempfile.mkstemp()
|
||||
self.save_to_file(filepath)
|
||||
total_pages = get_page_count(filepath)
|
||||
|
||||
for page_number in range(total_pages):
|
||||
DocumentPage.objects.get_or_create(
|
||||
document=self, page_number=page_number + 1)
|
||||
|
||||
detected_pages = get_page_count(filepath)
|
||||
os.close(handle)
|
||||
try:
|
||||
os.remove(filepath)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
current_pages = DocumentPage.objects.filter(document=self).order_by('page_number',)
|
||||
if current_pages.count() > detected_pages:
|
||||
for page in current_pages[detected_pages:]:
|
||||
page.delete()
|
||||
|
||||
for page_number in range(detected_pages):
|
||||
DocumentPage.objects.get_or_create(
|
||||
document=self, page_number=page_number + 1)
|
||||
|
||||
if save:
|
||||
self.save()
|
||||
|
||||
return detected_pages
|
||||
|
||||
def save_to_file(self, filepath, buffer_size=1024 * 1024):
|
||||
input_descriptor = self.open()
|
||||
output_descriptor = open(filepath, 'wb')
|
||||
@@ -287,7 +293,7 @@ class DocumentPage(models.Model):
|
||||
page_number = models.PositiveIntegerField(default=1, editable=False, verbose_name=_(u'page number'))
|
||||
|
||||
def __unicode__(self):
|
||||
return '%s - %d - %s' % (unicode(self.document), self.page_number, self.page_label)
|
||||
return u'%s - %d - %s' % (unicode(self.document), self.page_number, self.page_label)
|
||||
|
||||
class Meta:
|
||||
verbose_name = _(u'document page')
|
||||
@@ -309,8 +315,8 @@ class MetadataGroup(models.Model):
|
||||
verbose_name_plural = _(u'metadata document groups')
|
||||
|
||||
|
||||
INCLUSION_AND = '&'
|
||||
INCLUSION_OR = '|'
|
||||
INCLUSION_AND = u'&'
|
||||
INCLUSION_OR = u'|'
|
||||
|
||||
INCLUSION_CHOICES = (
|
||||
(INCLUSION_AND, _(u'and')),
|
||||
@@ -318,21 +324,21 @@ INCLUSION_CHOICES = (
|
||||
)
|
||||
|
||||
OPERATOR_CHOICES = (
|
||||
('exact', _(u'is equal')),
|
||||
('iexact', _(u'is equal (case insensitive)')),
|
||||
('contains', _(u'contains')),
|
||||
('icontains', _(u'contains (case insensitive)')),
|
||||
('in', _(u'is in')),
|
||||
('gt', _(u'is greater than')),
|
||||
('gte', _(u'is greater than or equal')),
|
||||
('lt', _(u'is less than')),
|
||||
('lte', _(u'is less than or equal')),
|
||||
('startswith', _(u'starts with')),
|
||||
('istartswith', _(u'starts with (case insensitive)')),
|
||||
('endswith', _(u'ends with')),
|
||||
('iendswith', _(u'ends with (case insensitive)')),
|
||||
('regex', _(u'is in regular expression')),
|
||||
('iregex', _(u'is in regular expression (case insensitive)')),
|
||||
(u'exact', _(u'is equal')),
|
||||
(u'iexact', _(u'is equal (case insensitive)')),
|
||||
(u'contains', _(u'contains')),
|
||||
(u'icontains', _(u'contains (case insensitive)')),
|
||||
(u'in', _(u'is in')),
|
||||
(u'gt', _(u'is greater than')),
|
||||
(u'gte', _(u'is greater than or equal')),
|
||||
(u'lt', _(u'is less than')),
|
||||
(u'lte', _(u'is less than or equal')),
|
||||
(u'startswith', _(u'starts with')),
|
||||
(u'istartswith', _(u'starts with (case insensitive)')),
|
||||
(u'endswith', _(u'ends with')),
|
||||
(u'iendswith', _(u'ends with (case insensitive)')),
|
||||
(u'regex', _(u'is in regular expression')),
|
||||
(u'iregex', _(u'is in regular expression (case insensitive)')),
|
||||
)
|
||||
|
||||
|
||||
@@ -347,7 +353,7 @@ class MetadataGroupItem(models.Model):
|
||||
enabled = models.BooleanField(default=True, verbose_name=_(u'enabled'))
|
||||
|
||||
def __unicode__(self):
|
||||
return '[%s] %s %s %s %s %s' % ('x' if self.enabled else ' ', self.get_inclusion_display(), self.metadata_type, _(u'not') if self.negated else '', self.get_operator_display(), self.expression)
|
||||
return u'[%s] %s %s %s %s %s' % (u'x' if self.enabled else u' ', self.get_inclusion_display(), self.metadata_type, _(u'not') if self.negated else u'', self.get_operator_display(), self.expression)
|
||||
|
||||
class Meta:
|
||||
verbose_name = _(u'metadata group item')
|
||||
@@ -364,7 +370,7 @@ class DocumentPageTransformation(models.Model):
|
||||
arguments = models.TextField(blank=True, null=True, verbose_name=_(u'arguments'), help_text=_(u'Use dictionaries to indentify arguments, example: {\'degrees\':90}'))
|
||||
|
||||
def __unicode__(self):
|
||||
return '%s - %s' % (unicode(self.document_page), self.get_transformation_display())
|
||||
return u'%s - %s' % (unicode(self.document_page), self.get_transformation_display())
|
||||
|
||||
class Meta:
|
||||
ordering = ('order',)
|
||||
|
||||
@@ -20,7 +20,7 @@ def get_language_backend():
|
||||
try:
|
||||
module = import_module(u'.'.join([u'ocr', u'lang', TESSERACT_LANGUAGE]))
|
||||
except ImportError:
|
||||
sys.stderr.write('\nError: No OCR app language backend for language: %s\n\n' % TESSERACT_LANGUAGE)
|
||||
sys.stderr.write(u'\nError: No OCR app language backend for language: %s\n\n' % TESSERACT_LANGUAGE)
|
||||
return None
|
||||
return module
|
||||
|
||||
@@ -40,9 +40,9 @@ def cleanup(filename):
|
||||
|
||||
|
||||
def run_tesseract(input_filename, output_filename_base, lang=None):
|
||||
command = [TESSERACT_PATH, input_filename, output_filename_base]
|
||||
command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(output_filename_base)]
|
||||
if lang is not None:
|
||||
command += ['-l', lang]
|
||||
command += [u'-l', lang]
|
||||
|
||||
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
|
||||
return_code = proc.wait()
|
||||
@@ -85,7 +85,7 @@ def ocr_cleanup(text):
|
||||
result = word
|
||||
if result:
|
||||
output.append(result)
|
||||
output.append('\n')
|
||||
output.append(u'\n')
|
||||
|
||||
return u' '.join(output)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user