Merge branch 'converter_export' into smart_staging

This commit is contained in:
Roberto Rosario
2011-07-18 04:12:28 -04:00
40 changed files with 1138 additions and 550 deletions

View File

@@ -5,7 +5,7 @@ Open source, Django based document manager with custom metadata indexing, file s
[Website](http://bit.ly/mayan-edms)
Requirements
Basic requirements
---
Python:
@@ -15,6 +15,21 @@ Python:
* django-filetransfers - File upload/download abstraction
* celery - asynchronous task queue/job queue based on distributed message passing
* django-celery - celery Django integration
* django-mptt - Utilities for implementing a modified pre-order traversal tree in django
* python-magic - A python wrapper for libmagic
* django-taggit - Simple tagging for django
* slate - The simplest way to extract text from PDFs in Python
Execute pip install -r requirements/production.txt to install the python/django dependencies automatically.
Executables:
* tesseract-ocr - An OCR Engine that was developed at HP Labs between 1985 and 1995... and now at Google.
* unpaper - post-processing scanned and photocopied book pages
Optional requirements
---
For the GridFS storage backend:
@@ -22,13 +37,12 @@ For the GridFS storage backend:
* GridFS - a storage specification for large objects in MongoDB
* MongoDB - a scalable, open source, document-oriented database
Or execute pip install -r requirements/production.txt to install the dependencies automatically.
Libraries:
Executables:
* libmagic - MIME detection library; if it is not installed, Mayan will fall back to Python's simpler built-in mimetypes library
Mayan has the ability to switch between different image conversion backends, at the moment these two are supported:
* libmagic - MIME detection library
* tesseract-ocr - An OCR Engine that was developed at HP Labs between 1985 and 1995... and now at Google.
* unpaper - post-processing scanned and photocopied book pages
* ImageMagick - Convert, Edit, Or Compose Bitmap Images
* GraphicsMagick - Robust collection of tools and libraries to read, write, and manipulate an image.

View File

@@ -1,11 +1,16 @@
from django.utils.translation import ugettext_lazy as _
from django.core.exceptions import ImproperlyConfigured
from navigation.api import register_sidebar_template
TRANFORMATION_CHOICES = {
u'rotate': u'-rotate %(degrees)d'
}
from converter.utils import load_backend
from converter.conf.settings import GRAPHICS_BACKEND
formats_list = {'text': _('file formats'), 'view': 'formats_list', 'famfam': 'pictures'}
register_sidebar_template(['formats_list'], 'converter_file_formats_help.html')
try:
backend = load_backend().ConverterClass()
except ImproperlyConfigured:
raise ImproperlyConfigured(u'Missing or incorrect converter backend: %s' % GRAPHICS_BACKEND)

View File

@@ -1,66 +1,29 @@
import os
import subprocess
from django.utils.importlib import import_module
from django.template.defaultfilters import slugify
from converter.conf.settings import UNPAPER_PATH
from converter.conf.settings import OCR_OPTIONS
from converter.conf.settings import DEFAULT_OPTIONS
from converter.conf.settings import LOW_QUALITY_OPTIONS
from converter.conf.settings import HIGH_QUALITY_OPTIONS
from converter.conf.settings import PRINT_QUALITY_OPTIONS
from converter.conf.settings import GRAPHICS_BACKEND
from converter.conf.settings import UNOCONV_PATH
from converter.exceptions import UnpaperError, OfficeConversionError
import hashlib
from common import TEMPORARY_DIRECTORY
from documents.utils import document_save_to_temp_dir
DEFAULT_ZOOM_LEVEL = 100
DEFAULT_ROTATION = 0
DEFAULT_PAGE_INDEX_NUMBER = 0
DEFAULT_FILE_FORMAT = u'jpg'
DEFAULT_OCR_FILE_FORMAT = u'tif'
from converter.conf.settings import UNOCONV_PATH
from converter.exceptions import UnpaperError, OfficeConversionError
from converter.literals import DEFAULT_PAGE_NUMBER, \
DEFAULT_OCR_FILE_FORMAT, QUALITY_DEFAULT, DEFAULT_ZOOM_LEVEL, \
DEFAULT_ROTATION, DEFAULT_FILE_FORMAT, QUALITY_HIGH
QUALITY_DEFAULT = u'quality_default'
QUALITY_LOW = u'quality_low'
QUALITY_HIGH = u'quality_high'
QUALITY_PRINT = u'quality_print'
QUALITY_SETTINGS = {
QUALITY_DEFAULT: DEFAULT_OPTIONS,
QUALITY_LOW: LOW_QUALITY_OPTIONS,
QUALITY_HIGH: HIGH_QUALITY_OPTIONS,
QUALITY_PRINT: PRINT_QUALITY_OPTIONS
}
from converter import backend
from converter.literals import TRANSFORMATION_CHOICES
from converter.literals import TRANSFORMATION_RESIZE, \
TRANSFORMATION_ROTATE, TRANSFORMATION_DENSITY, \
TRANSFORMATION_ZOOM
from converter.literals import DIMENSION_SEPARATOR
HASH_FUNCTION = lambda x: hashlib.sha256(x).hexdigest()
CONVERTER_OFFICE_FILE_EXTENSIONS = [
u'ods', u'docx', u'doc'
]
def _lazy_load(fn):
_cached = []
def _decorated():
if not _cached:
_cached.append(fn())
return _cached[0]
return _decorated
@_lazy_load
def _get_backend():
    """
    Import the module named by the GRAPHICS_BACKEND setting
    """
    backend_module = import_module(GRAPHICS_BACKEND)
    return backend_module


# Resolve the backend while this module is imported so that a bad
# setting is reported immediately
try:
    backend = _get_backend()
except ImportError:
    raise ImportError(u'Missing or incorrect converter backend: %s' % GRAPHICS_BACKEND)
def cleanup(filename):
"""
Tries to remove the given filename. Ignores non-existent files
@@ -71,21 +34,6 @@ def cleanup(filename):
pass
def execute_unpaper(input_filepath, output_filepath):
    """
    Executes the program unpaper using subprocess's Popen

    Raises UnpaperError, carrying the first line unpaper wrote to
    stderr, when the program exits with a non zero return code
    """
    command = [UNPAPER_PATH, u'--overwrite', input_filepath, output_filepath]
    proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE)
    # communicate() drains stderr while waiting; wait() alone can block
    # forever once the child fills the pipe buffer
    stderr_data = proc.communicate()[1]
    if proc.returncode != 0:
        error_lines = stderr_data.splitlines(True)
        raise UnpaperError(error_lines[0] if error_lines else stderr_data)
def execute_unoconv(input_filepath, arguments=''):
"""
Executes the program unoconv using subprocess's Popen
@@ -109,19 +57,11 @@ def cache_cleanup(input_filepath, *args, **kwargs):
def create_image_cache_filename(input_filepath, *args, **kwargs):
if input_filepath:
temp_filename, separator = os.path.splitext(os.path.basename(input_filepath))
temp_path = os.path.join(TEMPORARY_DIRECTORY, temp_filename)
final_filepath = []
[final_filepath.append(str(arg)) for arg in args]
final_filepath.extend([u'%s_%s' % (key, value) for key, value in kwargs.items()])
temp_path += slugify(u'_'.join(final_filepath))
return temp_path
hash_value = HASH_FUNCTION(u''.join([input_filepath, unicode(args), unicode(kwargs)]))
return os.path.join(TEMPORARY_DIRECTORY, hash_value)
else:
return None
def convert_office_document(input_filepath):
if os.path.exists(UNOCONV_PATH):
@@ -138,15 +78,14 @@ def convert_document(document, *args, **kwargs):
return convert(document_save_to_temp_dir(document, document.checksum), *args, **kwargs)
def convert(input_filepath, *args, **kwargs):
def convert(input_filepath, cleanup_files=True, *args, **kwargs):
size = kwargs.get('size')
file_format = kwargs.get('file_format', DEFAULT_FILE_FORMAT)
extra_options = kwargs.get('extra_options', u'')
zoom = kwargs.get('zoom', DEFAULT_ZOOM_LEVEL)
rotation = kwargs.get('rotation', DEFAULT_ROTATION)
page = kwargs.get('page', DEFAULT_PAGE_INDEX_NUMBER)
cleanup_files = kwargs.get('cleanup_files', True)
page = kwargs.get('page', DEFAULT_PAGE_NUMBER)
quality = kwargs.get('quality', QUALITY_DEFAULT)
transformations = kwargs.get('transformations', [])
unoconv_output = None
@@ -160,20 +99,32 @@ def convert(input_filepath, *args, **kwargs):
if result:
unoconv_output = result
input_filepath = result
extra_options = u''
input_arg = u'%s[%s]' % (input_filepath, page)
extra_options += u' -resize %s' % size
transformations.append(
{
'transformation': TRANSFORMATION_RESIZE,
'arguments': dict(zip([u'width', u'height'], size.split(DIMENSION_SEPARATOR)))
}
)
if zoom != 100:
extra_options += u' -resize %d%% ' % zoom
transformations.append(
{
'transformation': TRANSFORMATION_ZOOM,
'arguments': {'percent': zoom}
}
)
if rotation != 0 and rotation != 360:
extra_options += u' -rotate %d ' % rotation
transformations.append(
{
'transformation': TRANSFORMATION_ROTATE,
'arguments': {'degrees': rotation}
}
)
if format == u'jpg':
extra_options += u' -quality 85'
try:
backend.execute_convert(input_filepath=input_arg, arguments=extra_options, output_filepath=u'%s:%s' % (file_format, output_filepath), quality=quality)
backend.convert_file(input_filepath=input_filepath, output_filepath=output_filepath, quality=quality, transformations=transformations, page=page, file_format=file_format)
finally:
if cleanup_files:
cleanup(input_filepath)
@@ -184,51 +135,22 @@ def convert(input_filepath, *args, **kwargs):
def get_page_count(input_filepath):
try:
return len(backend.execute_identify(unicode(input_filepath)).splitlines())
except:
#TODO: send to other page number identifying program
return 1
return backend.get_page_count(input_filepath)
def get_document_dimensions(document, *args, **kwargs):
document_filepath = create_image_cache_filename(document.checksum, *args, **kwargs)
if os.path.exists(document_filepath):
options = [u'-format', u'%w %h']
return [int(dimension) for dimension in backend.execute_identify(unicode(document_filepath), options).split()]
return [int(dimension) for dimension in backend.identify_file(unicode(document_filepath), options).split()]
else:
return [0, 0]
def convert_document_for_ocr(document, page=DEFAULT_PAGE_INDEX_NUMBER, file_format=DEFAULT_OCR_FILE_FORMAT):
    """
    Produce an OCR ready image of one page of a document and return
    the path of the resulting file

    The page goes through the stored page transformations, the OCR
    conversion options and unpaper before the final conversion; the
    intermediate files are removed even when a step fails
    """
    # Extract document file
    input_filepath = document_save_to_temp_dir(document, document.uuid)

    # Every working file shares one prefix inside the temporary directory
    base_name, extension = os.path.splitext(os.path.basename(input_filepath))
    temp_path = os.path.join(TEMPORARY_DIRECTORY, base_name)

    convert_output_file = u'%s_ocr%s%s%s' % (temp_path, page, os.extsep, file_format)
    unpaper_output_file = u'%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep)
    unpaper_input_file = u'%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep)
    transformation_output_file = u'%s_trans%s%s%s' % (temp_path, page, os.extsep, file_format)

    # The backend selects a page with the "file[index]" notation
    input_arg = u'%s[%s]' % (input_filepath, page)
    try:
        # Stored page numbers are one higher than the page index used here
        document_page = document.documentpage_set.get(page_number=page + 1)
        transformation_string, unused_warnings = document_page.get_transformation_string()

        # Apply default transformations
        backend.execute_convert(input_filepath=input_arg, quality=QUALITY_HIGH, arguments=transformation_string, output_filepath=transformation_output_file)
        # Do OCR operations
        backend.execute_convert(input_filepath=transformation_output_file, arguments=OCR_OPTIONS, output_filepath=unpaper_input_file)
        # Process by unpaper
        execute_unpaper(input_filepath=unpaper_input_file, output_filepath=unpaper_output_file)
        # Convert to tif
        backend.execute_convert(input_filepath=unpaper_output_file, output_filepath=convert_output_file)
    finally:
        for leftover in (transformation_output_file, unpaper_input_file, unpaper_output_file):
            cleanup(leftover)

    return convert_output_file
def get_available_transformations_choices():
    """
    Build the [name, template] pairs for every transformation the
    active backend supports

    The template is the transformation label followed by its argument
    names, comma separated; required arguments are shown as <name> and
    optional ones as [name]
    """
    choices = []
    for name in backend.get_available_transformations():
        definition = TRANSFORMATION_CHOICES[name]
        label = definition['label']
        placeholders = []
        for argument in definition['arguments']:
            if argument['required']:
                placeholders.append('<%s>' % argument['name'])
            else:
                placeholders.append('[%s]' % argument['name'])
        choices.append([name, u'%s %s' % (label, u','.join(placeholders))])
    return choices

View File

@@ -0,0 +1,18 @@
class ConverterBase(object):
    """
    Base class that all backend classes must inherit

    Every public method here raises NotImplementedError naming the
    concrete class, so a backend that forgets to override one fails
    with an explicit message
    """
    def _not_implemented(self, method_name):
        """
        Build the error raised by a method the backend did not override
        """
        return NotImplementedError("Your %s class has not defined a %s() method, which is required." % (self.__class__.__name__, method_name))

    def convert_file(self, input_filepath, *args, **kwargs):
        raise self._not_implemented('convert_file')

    def convert_document(self, document, *args, **kwargs):
        raise self._not_implemented('convert_document')

    def get_format_list(self):
        raise self._not_implemented('get_format_list')

    def get_available_transformations(self):
        raise self._not_implemented('get_available_transformations')

    def get_page_count(self, input_filepath=None):
        # Accepts the file path the same way backend implementations do;
        # the default keeps argument-less calls working as before
        raise self._not_implemented('get_page_count')

View File

@@ -1,71 +0,0 @@
import subprocess
import re
from converter.conf.settings import GM_PATH
from converter.conf.settings import GM_SETTINGS
from converter.api import QUALITY_DEFAULT, QUALITY_SETTINGS
from converter.exceptions import ConvertError, UnknownFormat, IdentifyError
CONVERTER_ERROR_STRING_NO_DECODER = u'No decode delegate for this image format'
CONVERTER_ERROR_STARTS_WITH = u'starts with'
def execute_identify(input_filepath, arguments=None):
    """
    Run GraphicsMagick's identify on input_filepath and return what it
    wrote to standard output

    arguments is an optional list of extra command line options.
    Raises IdentifyError with the first stderr line when the program
    exits with a non zero return code
    """
    command = [unicode(GM_PATH), u'identify']
    if arguments:
        command.extend(arguments)
    command.append(unicode(input_filepath))

    proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    # communicate() reads both pipes while waiting; wait() alone can
    # block forever once the child fills a pipe buffer
    stdout_data, stderr_data = proc.communicate()
    if proc.returncode != 0:
        error_lines = stderr_data.splitlines(True)
        raise IdentifyError(error_lines[0] if error_lines else stderr_data)
    return stdout_data
def execute_convert(input_filepath, output_filepath, quality=QUALITY_DEFAULT, arguments=None):
    """
    Run GraphicsMagick's convert from input_filepath to output_filepath

    quality selects an entry of QUALITY_SETTINGS and arguments is an
    optional string of extra options, split on whitespace.
    Raises UnknownFormat when the first stderr line shows the input
    format was not recognized, ConvertError for any other failure
    """
    command = [unicode(GM_PATH), u'convert']
    command.extend(unicode(QUALITY_SETTINGS[quality]).split())
    command.extend(unicode(GM_SETTINGS).split())
    command.append(unicode(input_filepath))
    if arguments:
        command.extend(unicode(arguments).split())
    command.append(unicode(output_filepath))

    proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    # communicate() reads both pipes while waiting; wait() alone can
    # block forever once the child fills a pipe buffer
    stderr_data = proc.communicate()[1]
    if proc.returncode != 0:
        # Got an error from convert program
        error_lines = stderr_data.splitlines(True)
        error_line = error_lines[0] if error_lines else stderr_data
        # Try to determine from error message which class of error is it
        if (CONVERTER_ERROR_STRING_NO_DECODER in error_line) or (CONVERTER_ERROR_STARTS_WITH in error_line):
            raise UnknownFormat
        raise ConvertError(error_line)
def get_format_list():
    """
    Call GraphicsMagick to parse all of its supported file formats, and
    return a list of the names and descriptions

    Raises ConvertError with the first stderr line when the program
    exits with a non zero return code
    """
    format_regex = re.compile(r' *([A-Z0-9]+)[*]? +([A-Z0-9]+) +([rw\-+]+) *(.*).*')
    command = [unicode(GM_PATH), u'convert', u'-list', u'formats']

    proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    # The format listing is long; communicate() reads it while waiting,
    # wait() alone can block forever once the pipe buffer is full
    stdout_data, stderr_data = proc.communicate()
    if proc.returncode != 0:
        error_lines = stderr_data.splitlines(True)
        raise ConvertError(error_lines[0] if error_lines else stderr_data)

    formats = []
    for line in stdout_data.splitlines():
        fields = format_regex.findall(line)
        if fields:
            # First capture is the format name, fourth its description
            formats.append((fields[0][0], fields[0][3]))
    return formats

View File

@@ -0,0 +1,119 @@
import subprocess
import re
from converter.conf.settings import GM_PATH
from converter.conf.settings import GM_SETTINGS
from converter.literals import QUALITY_DEFAULT, QUALITY_SETTINGS
from converter.exceptions import ConvertError, UnknownFormat, \
IdentifyError
from converter.backends import ConverterBase
from converter.literals import TRANSFORMATION_RESIZE, \
TRANSFORMATION_ROTATE, TRANSFORMATION_DENSITY, \
TRANSFORMATION_ZOOM
from converter.literals import DIMENSION_SEPARATOR, DEFAULT_PAGE_NUMBER, \
DEFAULT_FILE_FORMAT
CONVERTER_ERROR_STRING_NO_DECODER = u'No decode delegate for this image format'
CONVERTER_ERROR_STARTS_WITH = u'starts with'
class ConverterClass(ConverterBase):
    """
    Converter backend that drives the GraphicsMagick program (GM_PATH)
    """
    def _execute(self, command):
        """
        Run command to completion and return a
        (return_code, stdout_data, first_stderr_line) tuple

        communicate() is used instead of wait() because waiting on a
        child whose output is piped can block forever once the pipe
        buffer fills up
        """
        proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
        stdout_data, stderr_data = proc.communicate()
        error_lines = stderr_data.splitlines(True)
        first_error_line = error_lines[0] if error_lines else stderr_data
        return proc.returncode, stdout_data, first_error_line

    def identify_file(self, input_filepath, arguments=None):
        """
        Run the identify sub command on input_filepath and return what
        it wrote to standard output

        arguments is an optional list of extra command line options.
        Raises IdentifyError with the first stderr line on failure
        """
        command = [unicode(GM_PATH), u'identify']
        if arguments:
            command.extend(arguments)
        command.append(unicode(input_filepath))

        return_code, output, error_line = self._execute(command)
        if return_code != 0:
            raise IdentifyError(error_line)
        return output

    def convert_file(self, input_filepath, output_filepath, transformations=None, quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT):
        """
        Convert one page of input_filepath into output_filepath

        transformations is a list of dictionaries holding a
        'transformation' name and its 'arguments'; resize, zoom and
        rotate are translated to command line options. quality selects
        an entry of QUALITY_SETTINGS and page is 1 based.
        Raises UnknownFormat when the first stderr line shows the input
        format was not recognized, ConvertError for any other failure
        """
        arguments = []
        for transformation in transformations or []:
            if transformation['transformation'] == TRANSFORMATION_RESIZE:
                dimensions = [unicode(transformation['arguments']['width'])]
                if 'height' in transformation['arguments']:
                    dimensions.append(unicode(transformation['arguments']['height']))
                arguments.append(u'-resize')
                arguments.append(u'%s' % DIMENSION_SEPARATOR.join(dimensions))

            elif transformation['transformation'] == TRANSFORMATION_ZOOM:
                arguments.append(u'-resize')
                arguments.append(u'%d%%' % transformation['arguments']['percent'])

            elif transformation['transformation'] == TRANSFORMATION_ROTATE:
                arguments.append(u'-rotate')
                arguments.append(u'%s' % transformation['arguments']['degrees'])

        # Compare the requested file_format; comparing the builtin
        # format() function never matched, so the option was never added
        if file_format == u'jpeg':
            arguments.append(u'-quality')
            arguments.append(u'85')

        # Graphicsmagick page number is 0 base
        input_arg = u'%s[%d]' % (input_filepath, page - 1)

        # Specify the file format next to the output filename
        output_filepath = u'%s:%s' % (file_format, output_filepath)

        command = [unicode(GM_PATH), u'convert']
        command.extend(unicode(QUALITY_SETTINGS[quality]).split())
        command.extend(unicode(GM_SETTINGS).split())
        command.append(unicode(input_arg))
        command.extend(arguments)
        command.append(unicode(output_filepath))

        return_code, output, error_line = self._execute(command)
        if return_code != 0:
            # Got an error from convert program
            # Try to determine from error message which class of error is it
            if (CONVERTER_ERROR_STRING_NO_DECODER in error_line) or (CONVERTER_ERROR_STARTS_WITH in error_line):
                raise UnknownFormat
            raise ConvertError(error_line)

    def get_format_list(self):
        """
        Call GraphicsMagick to parse all of its supported file formats, and
        return a list of the names and descriptions
        """
        format_regex = re.compile(r' *([A-Z0-9]+)[*]? +([A-Z0-9]+) +([rw\-+]+) *(.*).*')
        command = [unicode(GM_PATH), u'convert', u'-list', u'formats']

        return_code, output, error_line = self._execute(command)
        if return_code != 0:
            raise ConvertError(error_line)

        formats = []
        for line in output.splitlines():
            fields = format_regex.findall(line)
            if fields:
                # First capture is the format name, fourth its description
                formats.append((fields[0][0], fields[0][3]))
        return formats

    def get_available_transformations(self):
        """
        Return the names of the transformations convert_file understands
        """
        return [
            TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE,
            TRANSFORMATION_ZOOM
        ]

    def get_page_count(self, input_filepath):
        """
        Return the number of lines identify prints for input_filepath,
        or 1 when the file cannot be identified
        """
        try:
            return len(self.identify_file(unicode(input_filepath)).splitlines())
        except Exception:
            # Best effort, but let SystemExit and KeyboardInterrupt through
            #TODO: send to other page number identifying program
            return 1

View File

@@ -1,68 +0,0 @@
import subprocess
import re
from converter.conf.settings import IM_IDENTIFY_PATH
from converter.conf.settings import IM_CONVERT_PATH
from converter.api import QUALITY_DEFAULT, QUALITY_SETTINGS
from converter.exceptions import ConvertError, UnknownFormat, \
IdentifyError
CONVERTER_ERROR_STRING_NO_DECODER = u'no decode delegate for this image format'
def execute_identify(input_filepath, arguments=None):
    """
    Run ImageMagick's identify program on input_filepath and return
    what it wrote to standard output

    arguments is an optional list of extra command line options.
    Raises IdentifyError with the first stderr line when the program
    exits with a non zero return code
    """
    command = [unicode(IM_IDENTIFY_PATH)]
    if arguments:
        command.extend(arguments)
    command.append(unicode(input_filepath))

    proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    # communicate() reads both pipes while waiting; wait() alone can
    # block forever once the child fills a pipe buffer
    stdout_data, stderr_data = proc.communicate()
    if proc.returncode != 0:
        error_lines = stderr_data.splitlines(True)
        raise IdentifyError(error_lines[0] if error_lines else stderr_data)
    return stdout_data
def execute_convert(input_filepath, output_filepath, quality=QUALITY_DEFAULT, arguments=None):
    """
    Run ImageMagick's convert program from input_filepath to
    output_filepath

    quality selects an entry of QUALITY_SETTINGS and arguments is an
    optional string of extra options, split on whitespace.
    Raises UnknownFormat when the first stderr line reports a missing
    decode delegate, ConvertError for any other failure
    """
    command = [unicode(IM_CONVERT_PATH)]
    command.extend(unicode(QUALITY_SETTINGS[quality]).split())
    command.append(unicode(input_filepath))
    if arguments:
        command.extend(unicode(arguments).split())
    command.append(unicode(output_filepath))

    proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    # communicate() reads both pipes while waiting; wait() alone can
    # block forever once the child fills a pipe buffer
    stderr_data = proc.communicate()[1]
    if proc.returncode != 0:
        # Got an error from convert program
        error_lines = stderr_data.splitlines(True)
        error_line = error_lines[0] if error_lines else stderr_data
        # Try to determine from error message which class of error is it
        if CONVERTER_ERROR_STRING_NO_DECODER in error_line:
            raise UnknownFormat
        raise ConvertError(error_line)
def get_format_list():
    """
    Call ImageMagick to parse all of its supported file formats, and
    return a list of the names and descriptions

    Raises ConvertError with the first stderr line when the program
    exits with a non zero return code
    """
    format_regex = re.compile(r' *([A-Z0-9]+)[*]? +([A-Z0-9]+) +([rw\-+]+) *(.*).*')
    command = [unicode(IM_CONVERT_PATH), u'-list', u'format']

    proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    # The format listing is long; communicate() reads it while waiting,
    # wait() alone can block forever once the pipe buffer is full
    stdout_data, stderr_data = proc.communicate()
    if proc.returncode != 0:
        error_lines = stderr_data.splitlines(True)
        raise ConvertError(error_lines[0] if error_lines else stderr_data)

    formats = []
    for line in stdout_data.splitlines():
        fields = format_regex.findall(line)
        if fields:
            # First capture is the format name, fourth its description
            formats.append((fields[0][0], fields[0][3]))
    return formats

View File

@@ -0,0 +1,118 @@
import subprocess
import re
from converter.conf.settings import IM_IDENTIFY_PATH
from converter.conf.settings import IM_CONVERT_PATH
from converter.literals import QUALITY_DEFAULT, QUALITY_SETTINGS
from converter.exceptions import ConvertError, UnknownFormat, \
IdentifyError
from converter.backends import ConverterBase
from converter.literals import TRANSFORMATION_RESIZE, \
TRANSFORMATION_ROTATE, TRANSFORMATION_DENSITY, \
TRANSFORMATION_ZOOM
from converter.literals import DIMENSION_SEPARATOR, DEFAULT_PAGE_NUMBER, \
DEFAULT_FILE_FORMAT
CONVERTER_ERROR_STRING_NO_DECODER = u'no decode delegate for this image format'
class ConverterClass(ConverterBase):
    """
    Converter backend that drives ImageMagick's convert and identify
    programs (IM_CONVERT_PATH, IM_IDENTIFY_PATH)
    """
    def _execute(self, command):
        """
        Run command to completion and return a
        (return_code, stdout_data, first_stderr_line) tuple

        communicate() is used instead of wait() because waiting on a
        child whose output is piped can block forever once the pipe
        buffer fills up
        """
        proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
        stdout_data, stderr_data = proc.communicate()
        error_lines = stderr_data.splitlines(True)
        first_error_line = error_lines[0] if error_lines else stderr_data
        return proc.returncode, stdout_data, first_error_line

    def identify_file(self, input_filepath, arguments=None):
        """
        Run the identify program on input_filepath and return what it
        wrote to standard output

        arguments is an optional list of extra command line options.
        Raises IdentifyError with the first stderr line on failure
        """
        command = [unicode(IM_IDENTIFY_PATH)]
        if arguments:
            command.extend(arguments)
        command.append(unicode(input_filepath))

        return_code, output, error_line = self._execute(command)
        if return_code != 0:
            raise IdentifyError(error_line)
        return output

    def convert_file(self, input_filepath, output_filepath, transformations=None, quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT):
        """
        Convert one page of input_filepath into output_filepath

        transformations is a list of dictionaries holding a
        'transformation' name and its 'arguments'; resize, zoom and
        rotate are translated to command line options. quality selects
        an entry of QUALITY_SETTINGS and page is 1 based.
        Raises UnknownFormat when the first stderr line reports a
        missing decode delegate, ConvertError for any other failure
        """
        arguments = []
        for transformation in transformations or []:
            if transformation['transformation'] == TRANSFORMATION_RESIZE:
                dimensions = [unicode(transformation['arguments']['width'])]
                if 'height' in transformation['arguments']:
                    dimensions.append(unicode(transformation['arguments']['height']))
                arguments.append(u'-resize')
                arguments.append(u'%s' % DIMENSION_SEPARATOR.join(dimensions))

            elif transformation['transformation'] == TRANSFORMATION_ZOOM:
                arguments.append(u'-resize')
                arguments.append(u'%d%%' % transformation['arguments']['percent'])

            elif transformation['transformation'] == TRANSFORMATION_ROTATE:
                arguments.append(u'-rotate')
                arguments.append(u'%s' % transformation['arguments']['degrees'])

        # Compare the requested file_format; comparing the builtin
        # format() function never matched, so the option was never added
        if file_format == u'jpeg':
            arguments.append(u'-quality')
            arguments.append(u'85')

        # Imagemagick page number is 0 base
        input_arg = u'%s[%d]' % (input_filepath, page - 1)

        # Specify the file format next to the output filename
        output_filepath = u'%s:%s' % (file_format, output_filepath)

        command = [unicode(IM_CONVERT_PATH)]
        command.extend(unicode(QUALITY_SETTINGS[quality]).split())
        command.append(unicode(input_arg))
        command.extend(arguments)
        command.append(unicode(output_filepath))

        return_code, output, error_line = self._execute(command)
        if return_code != 0:
            # Got an error from convert program
            # Try to determine from error message which class of error is it
            if CONVERTER_ERROR_STRING_NO_DECODER in error_line:
                raise UnknownFormat
            raise ConvertError(error_line)

    def get_format_list(self):
        """
        Call ImageMagick to parse all of its supported file formats, and
        return a list of the names and descriptions
        """
        format_regex = re.compile(r' *([A-Z0-9]+)[*]? +([A-Z0-9]+) +([rw\-+]+) *(.*).*')
        command = [unicode(IM_CONVERT_PATH), u'-list', u'format']

        return_code, output, error_line = self._execute(command)
        if return_code != 0:
            raise ConvertError(error_line)

        formats = []
        for line in output.splitlines():
            fields = format_regex.findall(line)
            if fields:
                # First capture is the format name, fourth its description
                formats.append((fields[0][0], fields[0][3]))
        return formats

    def get_available_transformations(self):
        """
        Return the names of the transformations convert_file understands
        """
        return [
            TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE,
            TRANSFORMATION_ZOOM
        ]

    def get_page_count(self, input_filepath):
        """
        Return the number of lines identify prints for input_filepath,
        or 1 when the file cannot be identified
        """
        try:
            return len(self.identify_file(unicode(input_filepath)).splitlines())
        except Exception:
            # Best effort, but let SystemExit and KeyboardInterrupt through
            #TODO: send to other page number identifying program
            return 1

View File

@@ -0,0 +1,3 @@
from PIL import Image
Image.init()

View File

@@ -0,0 +1,93 @@
import slate
from PIL import Image
from django.utils.translation import ugettext_lazy as _
from converter.literals import QUALITY_DEFAULT, QUALITY_SETTINGS
from converter.exceptions import ConvertError, UnknownFormat, IdentifyError
from converter.backends import ConverterBase
from converter.literals import TRANSFORMATION_RESIZE, \
TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM
from converter.literals import QUALITY_DEFAULT, DEFAULT_PAGE_NUMBER, \
DEFAULT_FILE_FORMAT
from converter.utils import get_mimetype
class ConverterClass(ConverterBase):
    """
    Converter backend built on the Python Imaging Library, with slate
    used to count the pages of PDF files
    """
    def get_page_count(self, input_filepath):
        """
        Return the number of pages (frames) found in input_filepath

        PDF files are counted with slate, anything else by stepping
        through the image frames with PIL. Files PIL cannot open count
        as a single page
        """
        page_count = 1

        mimetype, encoding = get_mimetype(input_filepath)
        if mimetype == 'application/pdf':
            # If file is a PDF open it with slate to determine the page
            # count. PDFs are binary data, so read them in binary mode
            with open(input_filepath, 'rb') as fd:
                pages = slate.PDF(fd)
            return len(pages)

        try:
            im = Image.open(input_filepath)
        except IOError:  # cannot identify image file
            # Return a page count of 1, to atleast allow the document
            # to be created
            return 1

        try:
            while True:
                im.seek(im.tell() + 1)
                page_count += 1
        except EOFError:
            pass  # end of sequence

        return page_count

    def convert_file(self, input_filepath, output_filepath, transformations=None, quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT):
        """
        Convert one page of input_filepath into output_filepath

        transformations is a list of dictionaries holding a
        'transformation' name and its 'arguments'; resize, zoom and
        rotate are applied in order. page is 1 based. quality is
        accepted for interface compatibility and not used here.
        Raises UnknownFormat when PIL cannot open the file
        """
        try:
            im = Image.open(input_filepath)
        except Exception:  # Python Imaging Library doesn't recognize it as an image
            raise UnknownFormat

        # page is 1 based while PIL frames are 0 based: step forward one
        # frame at a time until the requested one is current. The
        # previous "==" test moved page 1 to the second frame and never
        # moved at all for any other page
        current_page = 0
        try:
            while current_page < page - 1:
                im.seek(im.tell() + 1)
                current_page += 1
        except EOFError:
            pass  # fewer frames than requested, keep the last one reached

        if transformations:
            for transformation in transformations:
                if transformation['transformation'] == TRANSFORMATION_RESIZE:
                    width = int(transformation['arguments']['width'])
                    # Without a height keep the current aspect ratio
                    aspect = 1.0 * im.size[1] / im.size[0]
                    height = int(transformation['arguments'].get('height', 1.0 * width * aspect))
                    im = im.resize((width, height), Image.ANTIALIAS)
                elif transformation['transformation'] == TRANSFORMATION_ZOOM:
                    decimal_value = float(transformation['arguments']['percent']) / 100
                    # PIL needs integer pixel sizes, the scaled values are floats
                    zoomed_size = (int(im.size[0] * decimal_value), int(im.size[1] * decimal_value))
                    im = im.transform(zoomed_size, Image.EXTENT, (0, 0, im.size[0], im.size[1]))
                elif transformation['transformation'] == TRANSFORMATION_ROTATE:
                    # PIL counts degrees counter-clockwise, reverse them
                    im = im.rotate(360 - transformation['arguments']['degrees'])

        if im.mode not in ('L', 'RGB'):
            im = im.convert('RGB')
        im.save(output_filepath, format=file_format)

    def get_format_list(self):
        """
        Introspect PIL's internal registry to obtain a list of the
        supported file types
        """
        return [(format_name, u'') for format_name in Image.ID]

    def get_available_transformations(self):
        """
        Return the names of the transformations convert_file understands
        """
        return [
            TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE,
            TRANSFORMATION_ZOOM
        ]

View File

@@ -9,12 +9,11 @@ register_settings(
settings=[
{'name': u'IM_CONVERT_PATH', 'global_name': u'CONVERTER_IM_CONVERT_PATH', 'default': u'/usr/bin/convert', 'description': _(u'File path to imagemagick\'s convert program.'), 'exists': True},
{'name': u'IM_IDENTIFY_PATH', 'global_name': u'CONVERTER_IM_IDENTIFY_PATH', 'default': u'/usr/bin/identify', 'description': _(u'File path to imagemagick\'s identify program.'), 'exists': True},
{'name': u'UNPAPER_PATH', 'global_name': u'CONVERTER_UNPAPER_PATH', 'default': u'/usr/bin/unpaper', 'description': _(u'File path to unpaper program.'), 'exists': True},
{'name': u'GM_PATH', 'global_name': u'CONVERTER_GM_PATH', 'default': u'/usr/bin/gm', 'description': _(u'File path to graphicsmagick\'s program.'), 'exists': True},
{'name': u'GM_SETTINGS', 'global_name': u'CONVERTER_GM_SETTINGS', 'default': u''},
{'name': u'GRAPHICS_BACKEND', 'global_name': u'CONVERTER_GRAPHICS_BACKEND', 'default': u'converter.backends.imagemagick', 'description': _(u'Graphics conversion backend to use. Options are: converter.backends.imagemagick and converter.backends.graphicsmagick.')},
{'name': u'GRAPHICS_BACKEND', 'global_name': u'CONVERTER_GRAPHICS_BACKEND', 'default': u'converter.backends.python', 'description': _(u'Graphics conversion backend to use. Options are: converter.backends.imagemagick, converter.backends.graphicsmagick and converter.backends.python.')},
{'name': u'UNOCONV_PATH', 'global_name': u'CONVERTER_UNOCONV_PATH', 'default': u'/usr/bin/unoconv', 'exists': True},
{'name': u'OCR_OPTIONS', 'global_name': u'CONVERTER_OCR_OPTIONS', 'default': u'-colorspace Gray -depth 8 -resample 200x200'},
#{'name': u'OCR_OPTIONS', 'global_name': u'CONVERTER_OCR_OPTIONS', 'default': u'-colorspace Gray -depth 8 -resample 200x200'},
{'name': u'DEFAULT_OPTIONS', 'global_name': u'CONVERTER_DEFAULT_OPTIONS', 'default': u''},
{'name': u'LOW_QUALITY_OPTIONS', 'global_name': u'CONVERTER_LOW_QUALITY_OPTIONS', 'default': u''},
{'name': u'HIGH_QUALITY_OPTIONS', 'global_name': u'CONVERTER_HIGH_QUALITY_OPTIONS', 'default': u'-density 400'},

View File

@@ -0,0 +1,64 @@
from django.utils.translation import ugettext_lazy as _
from converter.conf.settings import DEFAULT_OPTIONS
from converter.conf.settings import LOW_QUALITY_OPTIONS
from converter.conf.settings import HIGH_QUALITY_OPTIONS
from converter.conf.settings import PRINT_QUALITY_OPTIONS
# Defaults shared by the converter API and the backends
DEFAULT_ZOOM_LEVEL = 100
DEFAULT_ROTATION = 0
# Page numbers start at 1
DEFAULT_PAGE_NUMBER = 1
DEFAULT_FILE_FORMAT = u'jpeg'
DEFAULT_OCR_FILE_FORMAT = u'tif'
# Names of the quality presets
QUALITY_DEFAULT = u'quality_default'
QUALITY_LOW = u'quality_low'
QUALITY_HIGH = u'quality_high'
QUALITY_PRINT = u'quality_print'
# Maps each quality preset to the option string taken from the settings
QUALITY_SETTINGS = {
QUALITY_DEFAULT: DEFAULT_OPTIONS,
QUALITY_LOW: LOW_QUALITY_OPTIONS,
QUALITY_HIGH: HIGH_QUALITY_OPTIONS,
QUALITY_PRINT: PRINT_QUALITY_OPTIONS
}
# Separator between width and height in a size string, as in 640x480
DIMENSION_SEPARATOR = u'x'
# Names of the known transformations
TRANSFORMATION_RESIZE = u'resize'
TRANSFORMATION_ROTATE = u'rotate'
TRANSFORMATION_DENSITY = u'density'
TRANSFORMATION_ZOOM = u'zoom'
# Describes every transformation: a translatable label and description
# plus the list of arguments it takes; 'required' tells whether an
# argument must be supplied
TRANSFORMATION_CHOICES = {
TRANSFORMATION_RESIZE: {
'label': _(u'Resize'),
'description': _(u'Resize.'),
'arguments': [
{'name': 'width', 'label': _(u'width'), 'required': True},
{'name': 'height', 'label': _(u'height'), 'required': False},
]
},
TRANSFORMATION_ROTATE: {
'label': _(u'Rotate'),
'description': _(u'Rotate by n degress.'),
'arguments': [
{'name': 'degrees', 'label': _(u'degrees'), 'required': True}
]
},
TRANSFORMATION_DENSITY: {
'label': _(u'Density'),
'description': _(u'Change the resolution (ie: DPI) without resizing.'),
'arguments': [
{'name': 'width', 'label': _(u'width'), 'required': True},
{'name': 'height', 'label': _(u'height'), 'required': False},
]
},
TRANSFORMATION_ZOOM: {
'label': _(u'Zoom'),
'description': _(u'Zoom by n percent.'),
'arguments': [
{'name': 'percent', 'label': _(u'percent'), 'required': True}
]
},
}

View File

@@ -1,6 +1,18 @@
import os
from django.core.exceptions import ImproperlyConfigured
from django.utils.importlib import import_module
# Prefer libmagic (through python-magic) for MIME detection and fall back
# to the standard library's mimetypes module when it cannot be loaded.
# Exception is caught rather than everything so that SystemExit and
# KeyboardInterrupt are not swallowed during import
try:
    from python_magic import magic
    USE_PYTHON_MAGIC = True
except Exception:
    import mimetypes
    mimetypes.init()
    USE_PYTHON_MAGIC = False
#http://stackoverflow.com/questions/123198/how-do-i-copy-a-file-in-python
def copyfile(source, dest, buffer_size=1024 * 1024):
"""
Copy a file from source to dest. source and dest
@@ -21,3 +33,79 @@ def copyfile(source, dest, buffer_size=1024 * 1024):
source.close()
dest.close()
def _lazy_load(fn):
_cached = []
def _decorated():
if not _cached:
_cached.append(fn())
return _cached[0]
return _decorated
@_lazy_load
def load_backend():
    """
    Import and return the ``base`` module of the configured converter
    backend

    The GRAPHICS_BACKEND setting is first tried as a short name inside
    converter.backends and then as a fully qualified package name.
    Raises ImproperlyConfigured when neither import works and the name
    is not one of the directories found in the backends folder;
    otherwise the import error is re-raised
    """
    from converter.conf.settings import GRAPHICS_BACKEND as backend_name
    try:
        module = import_module('.base', 'converter.backends.%s' % backend_name)
        import warnings
        warnings.warn(
            "Short names for CONVERTER_GRAPHICS_BACKEND are deprecated; prepend with 'converter.backends.'",
            PendingDeprecationWarning
        )
        return module
    except ImportError:
        # Look for a fully qualified converter backend name
        try:
            return import_module('.base', backend_name)
        except ImportError as e_user:
            # The converter backend wasn't found. Display a helpful error message
            # listing all possible (built-in) converter backends.
            backend_dir = os.path.join(os.path.dirname(__file__), 'backends')
            try:
                available_backends = [f for f in os.listdir(backend_dir)
                        if os.path.isdir(os.path.join(backend_dir, f))
                        and not f.startswith('.')]
            except EnvironmentError:
                available_backends = []
            available_backends.sort()
            if backend_name not in available_backends:
                error_msg = ("%r isn't an available converter backend. \n" +
                    "Try using converter.backends.XXX, where XXX is one of:\n    %s\n" +
                    "Error was: %s") % \
                    (backend_name, ", ".join(map(repr, available_backends)), e_user)
                raise ImproperlyConfigured(error_msg)
            else:
                raise  # If there's some other error, this must be an error in Mayan itself.
def get_mimetype(filepath):
    """
    Determine a file's mimetype by calling the system's libmagic
    library via python-magic or fallback to use python's mimetypes
    library

    Returns a (mimetype, encoding) tuple.  With python-magic both values
    stay as empty strings when the file does not exist; with the
    mimetypes fallback the values are whatever mimetypes.guess_type()
    returns for the file name.
    """
    file_mimetype = u''
    file_mime_encoding = u''

    if USE_PYTHON_MAGIC:
        if os.path.exists(filepath):
            # Binary mode: libmagic inspects raw bytes, text mode could
            # alter them on some platforms.  The file is opened before the
            # try block so the cleanup never touches an unbound name.
            source = open(filepath, 'rb')
            try:
                # Read once and reuse the buffer for both detections
                data = source.read()
            finally:
                source.close()

            mime = magic.Magic(mime=True)
            file_mimetype = mime.from_buffer(data)
            mime_encoding = magic.Magic(mime_encoding=True)
            file_mime_encoding = mime_encoding.from_buffer(data)
    else:
        # Only the file name is used by the fallback, not the content
        path, filename = os.path.split(filepath)
        file_mimetype, file_mime_encoding = mimetypes.guess_type(filename)

    return file_mimetype, file_mime_encoding

View File

@@ -1,38 +1,18 @@
from django.utils.translation import ugettext_lazy as _
from django.shortcuts import render_to_response
from django.template import RequestContext
from django.utils.importlib import import_module
from converter import backend
from converter.conf.settings import GRAPHICS_BACKEND
def _lazy_load(fn):
_cached = []
def _decorated():
if not _cached:
_cached.append(fn())
return _cached[0]
return _decorated
@_lazy_load
def _get_backend():
return import_module(GRAPHICS_BACKEND)
try:
backend = _get_backend()
except ImportError:
raise ImportError(u'Missing or incorrect converter backend: %s' % GRAPHICS_BACKEND)
def formats_list(request):
#check_permissions(request.user, [PERMISSION_DOCUMENT_VIEW])
context = {
'title': _(u'suported file formats'),
'hide_object': True,
'object_list': backend.get_format_list(),
'object_list': sorted(backend.get_format_list()),
'extra_columns': [
{
'name': _(u'name'),

View File

@@ -18,10 +18,6 @@ def default_uuid():
"""unicode(uuid.uuid4())"""
return unicode(uuid.uuid4())
available_transformations = {
'rotate': {'label': _(u'Rotate [degrees]'), 'arguments': [{'name': 'degrees'}]}
}
register_settings(
namespace=u'documents',
module=u'documents.conf.settings',
@@ -31,8 +27,6 @@ register_settings(
{'name': u'UUID_FUNCTION', 'global_name': u'DOCUMENTS_UUID_FUNCTION', 'default': default_uuid},
# Storage
{'name': u'STORAGE_BACKEND', 'global_name': u'DOCUMENTS_STORAGE_BACKEND', 'default': FileBasedStorage},
# Transformations
{'name': u'AVAILABLE_TRANSFORMATIONS', 'global_name': u'DOCUMENTS_AVAILABLE_TRANSFORMATIONS', 'default': available_transformations},
# Usage
{'name': u'PREVIEW_SIZE', 'global_name': u'DOCUMENTS_PREVIEW_SIZE', 'default': u'640x480'},
{'name': u'PRINT_SIZE', 'global_name': u'DOCUMENTS_PRINT_SIZE', 'default': u'1400'},

View File

@@ -13,3 +13,24 @@ class RecentDocumentManager(models.Manager):
to_delete = self.model.objects.filter(user=user)[RECENT_COUNT:]
for recent_to_delete in to_delete:
recent_to_delete.delete()
class DocumentPageTransformationManager(models.Manager):
    """
    Manager with helpers to retrieve the transformations stored for a
    document page
    """
    def get_for_document_page(self, document_page):
        """
        Return the queryset of transformations that belong to the given
        document page
        """
        return self.model.objects.filter(document_page=document_page)

    def get_for_document_page_as_list(self, document_page):
        """
        Return a (transformations, warnings) tuple for the given document
        page: a list of {'transformation': ..., 'arguments': ...}
        dictionaries with the arguments already evaluated, and the list
        of exceptions raised by the entries that could not be evaluated
        (those entries are left out of the first list)
        """
        warnings = []
        transformations = []
        for transformation in self.get_for_document_page(document_page).values('transformation', 'arguments'):
            try:
                transformations.append(
                    {
                        'transformation': transformation['transformation'],
                        # NOTE(review): eval() runs the stored text as a
                        # Python expression; the empty globals dictionary
                        # does not make that safe.  Confirm who is allowed
                        # to edit the arguments and consider a literal
                        # only parser.
                        'arguments': eval(transformation['arguments'], {})
                    }
                )
            except Exception, e:
                # A bad entry is reported to the caller, not raised
                warnings.append(e)

        return transformations, warnings

View File

@@ -12,15 +12,13 @@ from python_magic import magic
from taggit.managers import TaggableManager
from dynamic_search.api import register
from converter.api import get_page_count
from converter import TRANFORMATION_CHOICES
from converter.api import get_available_transformations_choices
from documents.conf.settings import CHECKSUM_FUNCTION
from documents.conf.settings import UUID_FUNCTION
from documents.conf.settings import STORAGE_BACKEND
from documents.conf.settings import AVAILABLE_TRANSFORMATIONS
from documents.managers import RecentDocumentManager
available_transformations = ([(name, data['label']) for name, data in AVAILABLE_TRANSFORMATIONS.items()])
from documents.managers import RecentDocumentManager, \
DocumentPageTransformationManager
def get_filename_from_uuid(instance, filename):
@@ -92,7 +90,7 @@ class Document(models.Model):
mimetype, page count and transformation when originally created
"""
new_document = not self.pk
transformations = kwargs.pop('transformations', None)
super(Document, self).save(*args, **kwargs)
if new_document:
@@ -101,7 +99,8 @@ class Document(models.Model):
self.update_mimetype(save=False)
self.save()
self.update_page_count(save=False)
self.apply_default_transformations()
if transformations:
self.apply_default_transformations(transformations)
@models.permalink
def get_absolute_url(self):
@@ -202,21 +201,21 @@ class Document(models.Model):
exists in storage
"""
return self.file.storage.exists(self.file.path)
def apply_default_transformations(self):
def apply_default_transformations(self, transformations):
#Only apply default transformations on new documents
if DEFAULT_TRANSFORMATIONS and reduce(lambda x, y: x + y, [page.documentpagetransformation_set.count() for page in self.documentpage_set.all()]) == 0:
for transformation in DEFAULT_TRANSFORMATIONS:
if 'name' in transformation:
for document_page in self.documentpage_set.all():
page_transformation = DocumentPageTransformation(
document_page=document_page,
order=0,
transformation=transformation['name'])
if 'arguments' in transformation:
page_transformation.arguments = transformation['arguments']
if reduce(lambda x, y: x + y, [page.documentpagetransformation_set.count() for page in self.documentpage_set.all()]) == 0:
for transformation in transformations:
for document_page in self.documentpage_set.all():
page_transformation = DocumentPageTransformation(
document_page=document_page,
order=0,
transformation=transformation.get('transformation'),
arguments=transformation.get('arguments')
)
page_transformation.save()
page_transformation.save()
class DocumentTypeFilename(models.Model):
@@ -258,26 +257,13 @@ class DocumentPage(models.Model):
verbose_name = _(u'document page')
verbose_name_plural = _(u'document pages')
def get_transformation_list(self):
return DocumentPageTransformation.objects.get_for_document_page_as_list(self)
@models.permalink
def get_absolute_url(self):
return ('document_page_view', [self.pk])
def get_transformation_string(self):
transformation_list = []
warnings = []
for page_transformation in self.documentpagetransformation_set.all():
try:
if page_transformation.transformation in TRANFORMATION_CHOICES:
transformation_list.append(
TRANFORMATION_CHOICES[page_transformation.transformation] % eval(
page_transformation.arguments
)
)
except Exception, e:
warnings.append(e)
return u' '.join(transformation_list), warnings
class DocumentPageTransformation(models.Model):
"""
@@ -286,9 +272,11 @@ class DocumentPageTransformation(models.Model):
"""
document_page = models.ForeignKey(DocumentPage, verbose_name=_(u'document page'))
order = models.PositiveIntegerField(default=0, blank=True, null=True, verbose_name=_(u'order'), db_index=True)
transformation = models.CharField(choices=available_transformations, max_length=128, verbose_name=_(u'transformation'))
transformation = models.CharField(choices=get_available_transformations_choices(), max_length=128, verbose_name=_(u'transformation'))
arguments = models.TextField(blank=True, null=True, verbose_name=_(u'arguments'), help_text=_(u'Use dictionaries to indentify arguments, example: {\'degrees\':90}'))
objects = DocumentPageTransformationManager()
def __unicode__(self):
return u'"%s" for %s' % (self.get_transformation_display(), unicode(self.document_page))

View File

@@ -1,14 +1,12 @@
from django.conf.urls.defaults import patterns, url
from converter.api import QUALITY_HIGH, QUALITY_PRINT
from converter.literals import QUALITY_HIGH, QUALITY_PRINT
from documents.conf.settings import PREVIEW_SIZE
from documents.conf.settings import PRINT_SIZE
from documents.conf.settings import THUMBNAIL_SIZE
from documents.conf.settings import DISPLAY_SIZE
from documents.conf.settings import MULTIPAGE_PREVIEW_SIZE
#from documents.literals import UPLOAD_SOURCE_LOCAL, \
# UPLOAD_SOURCE_STAGING, UPLOAD_SOURCE_USER_STAGING
urlpatterns = patterns('documents.views',
url(r'^list/$', 'document_list', (), 'document_list'),

View File

@@ -20,10 +20,11 @@ from common.widgets import two_state_template
from common.literals import PAGE_SIZE_DIMENSIONS, \
PAGE_ORIENTATION_PORTRAIT, PAGE_ORIENTATION_LANDSCAPE
from common.conf.settings import DEFAULT_PAPER_SIZE
from converter.api import convert_document, QUALITY_DEFAULT
from converter.api import convert_document
from converter.exceptions import UnkownConvertError, UnknownFormat
from converter.api import DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION, \
DEFAULT_FILE_FORMAT, QUALITY_PRINT
from converter.literals import DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION, \
DEFAULT_FILE_FORMAT, QUALITY_PRINT, QUALITY_DEFAULT, \
DEFAULT_PAGE_NUMBER
from filetransfers.api import serve_file
from grouping.utils import get_document_group_subtemplate
from metadata.api import save_metadata_list, \
@@ -286,38 +287,14 @@ def document_edit(request, document_id):
}, context_instance=RequestContext(request))
def calculate_converter_arguments(document, *args, **kwargs):
size = kwargs.pop('size', PREVIEW_SIZE)
quality = kwargs.pop('quality', QUALITY_DEFAULT)
page = kwargs.pop('page', 1)
file_format = kwargs.pop('file_format', DEFAULT_FILE_FORMAT)
zoom = kwargs.pop('zoom', DEFAULT_ZOOM_LEVEL)
rotation = kwargs.pop('rotation', DEFAULT_ROTATION)
document_page = DocumentPage.objects.get(document=document, page_number=page)
transformation_string, warnings = document_page.get_transformation_string()
arguments = {
'size': size,
'file_format': file_format,
'quality': quality,
'extra_options': transformation_string,
'page': page - 1,
'zoom': zoom,
'rotation': rotation
}
return arguments, warnings
def get_document_image(request, document_id, size=PREVIEW_SIZE, quality=QUALITY_DEFAULT):
check_permissions(request.user, [PERMISSION_DOCUMENT_VIEW])
document = get_object_or_404(Document, pk=document_id)
page = int(request.GET.get('page', 1))
page = int(request.GET.get('page', DEFAULT_PAGE_NUMBER))
zoom = int(request.GET.get('zoom', 100))
zoom = int(request.GET.get('zoom', DEFAULT_ZOOM_LEVEL))
if zoom < ZOOM_MIN_LEVEL:
zoom = ZOOM_MIN_LEVEL
@@ -325,16 +302,17 @@ def get_document_image(request, document_id, size=PREVIEW_SIZE, quality=QUALITY_
if zoom > ZOOM_MAX_LEVEL:
zoom = ZOOM_MAX_LEVEL
rotation = int(request.GET.get('rotation', 0)) % 360
rotation = int(request.GET.get('rotation', DEFAULT_ROTATION)) % 360
arguments, warnings = calculate_converter_arguments(document, size=size, file_format=DEFAULT_FILE_FORMAT, quality=quality, page=page, zoom=zoom, rotation=rotation)
document_page = get_object_or_404(document.documentpage_set, page_number=page)
transformations, warnings = document_page.get_transformation_list()
if warnings and (request.user.is_staff or request.user.is_superuser):
for warning in warnings:
messages.warning(request, _(u'Page transformation error: %s') % warning)
try:
output_file = convert_document(document, **arguments)
output_file = convert_document(document, size=size, file_format=DEFAULT_FILE_FORMAT, quality=quality, page=page, zoom=zoom, rotation=rotation, transformations=transformations)
except UnkownConvertError, e:
if request.user.is_staff or request.user.is_superuser:
messages.error(request, e)
@@ -592,13 +570,13 @@ def document_page_view(request, document_page_id):
document_page = get_object_or_404(DocumentPage, pk=document_page_id)
zoom = int(request.GET.get('zoom', 100))
rotation = int(request.GET.get('rotation', 0))
zoom = int(request.GET.get('zoom', DEFAULT_ZOOM_LEVEL))
rotation = int(request.GET.get('rotation', DEFAULT_ROTATION))
document_page_form = DocumentPageForm(instance=document_page, zoom=zoom, rotation=rotation)
base_title = _(u'details for: %s') % document_page
if zoom != 100:
if zoom != DEFAULT_ZOOM_LEVEL:
zoom_text = u'(%d%%)' % zoom
else:
zoom_text = u''

View File

@@ -9,7 +9,7 @@ from documents.models import Document
from main.api import register_tool
from ocr.conf.settings import AUTOMATIC_OCR
from ocr.models import DocumentQueue
from ocr.models import DocumentQueue, QueueTransformation
#Permissions
PERMISSION_OCR_DOCUMENT = {'namespace': 'ocr', 'name': 'ocr_document', 'label': _(u'Submit document for OCR')}
@@ -30,20 +30,27 @@ re_queue_multiple_document = {'text': _('re-queue'), 'view': 're_queue_multiple_
queue_document_delete = {'text': _(u'delete'), 'view': 'queue_document_delete', 'args': 'object.id', 'famfam': 'hourglass_delete', 'permissions': [PERMISSION_OCR_DOCUMENT_DELETE]}
queue_document_multiple_delete = {'text': _(u'delete'), 'view': 'queue_document_multiple_delete', 'famfam': 'hourglass_delete', 'permissions': [PERMISSION_OCR_DOCUMENT_DELETE]}
document_queue_disable = {'text': _(u'stop queue'), 'view': 'document_queue_disable', 'args': 'object.id', 'famfam': 'control_stop_blue', 'permissions': [PERMISSION_OCR_QUEUE_ENABLE_DISABLE]}
document_queue_enable = {'text': _(u'activate queue'), 'view': 'document_queue_enable', 'args': 'object.id', 'famfam': 'control_play_blue', 'permissions': [PERMISSION_OCR_QUEUE_ENABLE_DISABLE]}
document_queue_disable = {'text': _(u'stop queue'), 'view': 'document_queue_disable', 'args': 'queue.id', 'famfam': 'control_stop_blue', 'permissions': [PERMISSION_OCR_QUEUE_ENABLE_DISABLE]}
document_queue_enable = {'text': _(u'activate queue'), 'view': 'document_queue_enable', 'args': 'queue.id', 'famfam': 'control_play_blue', 'permissions': [PERMISSION_OCR_QUEUE_ENABLE_DISABLE]}
all_document_ocr_cleanup = {'text': _(u'clean up pages content'), 'view': 'all_document_ocr_cleanup', 'famfam': 'text_strikethrough', 'permissions': [PERMISSION_OCR_CLEAN_ALL_PAGES], 'description': _(u'Runs a language filter to remove common OCR mistakes from document pages content.')}
queue_document_list = {'text': _(u'queue document list'), 'view': 'queue_document_list', 'famfam': 'hourglass', 'permissions': [PERMISSION_OCR_DOCUMENT]}
node_active_list = {'text': _(u'active tasks'), 'view': 'node_active_list', 'famfam': 'server_chart', 'permissions': [PERMISSION_OCR_DOCUMENT]}
setup_queue_transformation_list = {'text': _(u'transformations'), 'view': 'setup_queue_transformation_list', 'args': 'queue.pk', 'famfam': 'shape_move_front'}
setup_queue_transformation_create = {'text': _(u'add transformation'), 'view': 'setup_queue_transformation_create', 'args': 'queue.pk', 'famfam': 'shape_square_add'}
setup_queue_transformation_edit = {'text': _(u'edit'), 'view': 'setup_queue_transformation_edit', 'args': 'transformation.pk', 'famfam': 'shape_square_edit'}
setup_queue_transformation_delete = {'text': _(u'delete'), 'view': 'setup_queue_transformation_delete', 'args': 'transformation.pk', 'famfam': 'shape_square_delete'}
register_links(Document, [submit_document])
register_links(DocumentQueue, [document_queue_disable, document_queue_enable])
register_links(DocumentQueue, [document_queue_disable, document_queue_enable, setup_queue_transformation_list])
register_links(QueueTransformation, [setup_queue_transformation_edit, setup_queue_transformation_delete])
register_multi_item_links(['queue_document_list'], [re_queue_multiple_document, queue_document_multiple_delete])
register_links(['queue_document_list', 'node_active_list'], [queue_document_list, node_active_list], menu_name='secondary_menu')
register_links(['setup_queue_transformation_create', 'setup_queue_transformation_edit', 'setup_queue_transformation_delete', 'document_queue_disable', 'document_queue_enable', 'queue_document_list', 'node_active_list', 'setup_queue_transformation_list'], [queue_document_list, node_active_list], menu_name='secondary_menu')
register_links(['setup_queue_transformation_edit', 'setup_queue_transformation_delete', 'setup_queue_transformation_list', 'setup_queue_transformation_create'], [setup_queue_transformation_create], menu_name='sidebar')
register_tool(all_document_ocr_cleanup, namespace='ocr', title=_(u'OCR'))

View File

@@ -9,13 +9,15 @@ import sys
from django.utils.translation import ugettext as _
from django.utils.importlib import import_module
from converter.api import convert_document_for_ocr
from converter.api import convert
from documents.models import DocumentPage
from ocr.conf.settings import TESSERACT_PATH
from ocr.conf.settings import TESSERACT_LANGUAGE
from ocr.conf.settings import PDFTOTEXT_PATH
from ocr.exceptions import TesseractError, PdftotextError
from ocr.exceptions import TesseractError
from ocr.conf.settings import UNPAPER_PATH
from ocr.parsers import parse_document_page
from ocr.parsers.exceptions import ParserError, ParserUnknownFile
def get_language_backend():
@@ -30,7 +32,7 @@ def get_language_backend():
return None
return module
backend = get_language_backend()
language_backend = get_language_backend()
def cleanup(filename):
@@ -58,63 +60,38 @@ def run_tesseract(input_filename, output_filename_base, lang=None):
raise TesseractError(error_text)
def run_pdftotext(input_filename, output_filename, page_number=None):
"""
Execute the command line binary of pdftotext
"""
command = [unicode(PDFTOTEXT_PATH)]
if page_number:
command.extend([u'-nopgbrk', u'-f', unicode(page_number), u'-l', unicode(page_number)])
command.extend([unicode(input_filename), unicode(output_filename)])
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
return_code = proc.wait()
if return_code != 0:
error_text = proc.stderr.read()
raise PdftotextError(error_text)
def do_document_ocr(document):
"""
Do OCR on all the pages of the given document object, first
trying to extract text from PDF using pdftotext then by calling
tesseract
first try to extract text from document pages using the registered
parser if the parser fails or if there is no parser registered for
the document mimetype do a visual OCR by calling tesseract
"""
for page_index, document_page in enumerate(document.documentpage_set.all()):
desc, filepath = tempfile.mkstemp()
imagefile = None
source = u''
for document_page in document.documentpage_set.all():
try:
if document.file_mimetype == u'application/pdf':
pdf_filename = os.extsep.join([filepath, u'pdf'])
document.save_to_file(pdf_filename)
run_pdftotext(pdf_filename, filepath, document_page.page_number)
cleanup(pdf_filename)
if os.stat(filepath).st_size == 0:
#PDF page had no text, run tesseract on the page
imagefile = convert_document_for_ocr(document, page=page_index)
run_tesseract(imagefile, filepath, TESSERACT_LANGUAGE)
ocr_output = os.extsep.join([filepath, u'txt'])
source = _(u'Text from OCR')
else:
ocr_output = filepath
source = _(u'Text extracted from PDF')
else:
imagefile = convert_document_for_ocr(document, page=page_index)
run_tesseract(imagefile, filepath, TESSERACT_LANGUAGE)
ocr_output = os.extsep.join([filepath, u'txt'])
source = _(u'Text from OCR')
f = codecs.open(ocr_output, 'r', 'utf-8')
document_page = document.documentpage_set.get(page_number=page_index + 1)
document_page.content = ocr_cleanup(f.read().strip())
document_page.page_label = source
document_page.save()
f.close()
cleanup(ocr_output)
finally:
os.close(desc)
cleanup(filepath)
if imagefile:
cleanup(imagefile)
# Try to extract text by means of a parser
parse_document_page(document_page)
except (ParserError, ParserUnknownFile):
# Fall back to doing visual OCR
pass
#desc, filepath = tempfile.mkstemp()
#imagefile = None
#source = u''
#imagefile = convert_document_for_ocr(document, page=document_page.page_number)
#run_tesseract(imagefile, filepath, TESSERACT_LANGUAGE)
#ocr_output = os.extsep.join([filepath, u'txt'])
#source = _(u'Text from OCR')
#f = codecs.open(ocr_output, 'r', 'utf-8')
#document_page.content = ocr_cleanup(f.read().strip())
#document_page.page_label = source
#document_page.save()
#f.close()
#cleanup(ocr_output)
#finally:
# pass
#os.close(desc)
#cleanup(filepath)
#if imagefile:
# cleanup(imagefile)
def ocr_cleanup(text):
@@ -127,8 +104,8 @@ def ocr_cleanup(text):
for line in text.splitlines():
line = line.strip()
for word in line.split():
if backend:
result = backend.check_word(word)
if language_backend:
result = language_backend.check_word(word)
else:
result = word
if result:
@@ -147,3 +124,53 @@ def clean_pages():
if page.content:
page.content = ocr_cleanup(page.content)
page.save()
def execute_unpaper(input_filepath, output_filepath):
    """
    Executes the program unpaper using subprocess's Popen

    input_filepath and output_filepath are passed to the program as is,
    after the '--overwrite' option.  Raises UnpaperError, with the first
    line unpaper wrote to stderr, when the program exits with a non zero
    return code.
    """
    command = [UNPAPER_PATH, u'--overwrite', input_filepath, output_filepath]
    proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE)
    # communicate() keeps reading stderr while waiting for the process to
    # end; wait() with a piped stderr can block forever once the program
    # fills the pipe buffer
    stderr_output = proc.communicate()[1]
    if proc.returncode != 0:
        # Keep reporting only the first line of the error output
        error_lines = stderr_output.splitlines(True)
        if error_lines:
            error_text = error_lines[0]
        else:
            error_text = stderr_output
        # NOTE(review): UnpaperError is not among the imports visible in
        # this part of the file - confirm it is in scope
        raise UnpaperError(error_text)
'''
def convert_document_for_ocr(document, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_OCR_FILE_FORMAT):
#Extract document file
input_filepath = document_save_to_temp_dir(document, document.uuid)
#Convert for OCR
temp_filename, separator = os.path.splitext(os.path.basename(input_filepath))
temp_path = os.path.join(TEMPORARY_DIRECTORY, temp_filename)
transformation_output_file = u'%s_trans%s%s%s' % (temp_path, page, os.extsep, file_format)
unpaper_input_file = u'%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep)
unpaper_output_file = u'%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep)
convert_output_file = u'%s_ocr%s%s%s' % (temp_path, page, os.extsep, file_format)
try:
document_page = document.documentpage_set.get(page_number=page)
transformations, warnings = document_page.get_transformation_list()
#Apply default transformations
backend.convert_file(input_filepath=input_filepath, page=page, quality=QUALITY_HIGH, transformations=transformations, output_filepath=transformation_output_file)
#Do OCR operations
backend.convert_file(input_filepath=transformation_output_file, arguments=OCR_OPTIONS, output_filepath=unpaper_input_file)
# Process by unpaper
execute_unpaper(input_filepath=unpaper_input_file, output_filepath=unpaper_output_file)
# Convert to tif
backend.convert_file(input_filepath=unpaper_output_file, output_filepath=convert_output_file)
finally:
cleanup(transformation_output_file)
cleanup(unpaper_input_file)
cleanup(unpaper_output_file)
return convert_output_file
'''

View File

@@ -13,8 +13,9 @@ register_settings(
{'name': u'REPLICATION_DELAY', 'global_name': u'OCR_REPLICATION_DELAY', 'default': 10, 'description': _(u'Amount of seconds to delay OCR of documents to allow for the node\'s storage replication overhead.')},
{'name': u'NODE_CONCURRENT_EXECUTION', 'global_name': u'OCR_NODE_CONCURRENT_EXECUTION', 'default': 1, 'description': _(u'Maximum amount of concurrent document OCRs a node can perform.')},
{'name': u'AUTOMATIC_OCR', 'global_name': u'OCR_AUTOMATIC_OCR', 'default': False, 'description': _(u'Automatically queue newly created documents for OCR.')},
{'name': u'PDFTOTEXT_PATH', 'global_name': u'OCR_PDFTOTEXT_PATH', 'default': u'/usr/bin/pdftotext', 'exists': True},
{'name': u'QUEUE_PROCESSING_INTERVAL', 'global_name': u'OCR_QUEUE_PROCESSING_INTERVAL', 'default': 10},
{'name': u'CACHE_URI', 'global_name': u'OCR_CACHE_URI', 'default': None, 'description': _(u'URI in the form: "memcached://127.0.0.1:11211/" to specify a cache backend to use for locking. Multiple hosts can be specified separated by a semicolon.')}
{'name': u'CACHE_URI', 'global_name': u'OCR_CACHE_URI', 'default': None, 'description': _(u'URI in the form: "memcached://127.0.0.1:11211/" to specify a cache backend to use for locking. Multiple hosts can be specified separated by a semicolon.')},
{'name': u'UNPAPER_PATH', 'global_name': u'OCR_UNPAPER_PATH', 'default': u'/usr/bin/unpaper', 'description': _(u'File path to unpaper program.'), 'exists': True},
{'name': u'PARSERS_PDFTOTEXT_PATH', 'global_name': u'OCR_PARSERS_PDFTOTEXT_PATH', 'default': u'/usr/bin/pdftotext', 'exists': True},
]
)

View File

@@ -4,7 +4,3 @@ class AlreadyQueued(Exception):
class TesseractError(Exception):
pass
class PdftotextError(Exception):
pass

21
apps/ocr/forms.py Normal file
View File

@@ -0,0 +1,21 @@
from django import forms
from django.utils.translation import ugettext_lazy as _
from django.utils.translation import ugettext
from ocr.models import QueueTransformation
class QueueTransformationForm(forms.ModelForm):
    """
    Model form for QueueTransformation in which the two generic relation
    fields are rendered as hidden inputs
    """
    class Meta:
        model = QueueTransformation

    def __init__(self, *args, **kwargs):
        super(QueueTransformationForm, self).__init__(*args, **kwargs)
        # Give each generic relation field its own hidden widget
        for field_name in ('content_type', 'object_id'):
            self.fields[field_name].widget = forms.HiddenInput()
class QueueTransformationForm_create(forms.ModelForm):
    """
    Model form for QueueTransformation that leaves the generic relation
    fields (content_type and object_id) out of the form
    """
    class Meta:
        model = QueueTransformation
        exclude = ('content_type', 'object_id')

View File

@@ -1,18 +0,0 @@
from django.db import models
from ocr.exceptions import AlreadyQueued
class DocumentQueueManager(models.Manager):
"""
Module manager class to handle adding documents to an OCR document
queue
"""
def queue_document(self, document, queue_name='default'):
document_queue = self.model.objects.get(name=queue_name)
if document_queue.queuedocument_set.filter(document=document):
raise AlreadyQueued
document_queue.queuedocument_set.create(document=document, delay=True)
return document_queue

41
apps/ocr/managers.py Normal file
View File

@@ -0,0 +1,41 @@
from django.db import models
from django.contrib.contenttypes.models import ContentType
from ocr.exceptions import AlreadyQueued
class DocumentQueueManager(models.Manager):
    """
    Module manager class to handle adding documents to an OCR document
    queue
    """
    def queue_document(self, document, queue_name='default'):
        """
        Add the document to the queue called queue_name and return that
        queue

        Raises AlreadyQueued when the document is already in the queue.
        The queue is looked up with get(), so a lookup failure is not
        handled here and propagates to the caller.
        """
        document_queue = self.model.objects.get(name=queue_name)
        # exists() asks the database a yes/no question instead of loading
        # every matching row just to test the queryset for truth
        if document_queue.queuedocument_set.filter(document=document).exists():
            raise AlreadyQueued

        document_queue.queuedocument_set.create(document=document, delay=True)

        return document_queue
class QueueTransformationManager(models.Manager):
    """
    Manager with helpers to retrieve the transformations attached to an
    object through the content type / object id pair
    """
    def get_for_object(self, obj):
        """
        Return the queryset of transformations whose content type and
        object id match the given object
        """
        ct = ContentType.objects.get_for_model(obj)
        return self.model.objects.filter(content_type=ct).filter(object_id=obj.pk)

    def get_for_object_as_list(self, obj):
        """
        Return a (transformations, warnings) tuple for the given object:
        a list of {'transformation': ..., 'arguments': ...} dictionaries
        with the arguments already evaluated, and the list of exceptions
        raised by the entries that could not be evaluated (those entries
        are left out of the first list)
        """
        warnings = []
        transformations = []
        for transformation in self.get_for_object(obj).values('transformation', 'arguments'):
            try:
                transformations.append(
                    {
                        'transformation': transformation['transformation'],
                        # NOTE(review): eval() runs the stored text as a
                        # Python expression; the empty globals dictionary
                        # does not make that safe.  Confirm who is allowed
                        # to edit the arguments and consider a literal
                        # only parser.
                        'arguments': eval(transformation['arguments'], {})
                    }
                )
            except Exception, e:
                # A bad entry is reported to the caller, not raised
                warnings.append(e)

        return transformations, warnings

View File

@@ -2,13 +2,16 @@ from django.db import models
from django.utils.translation import ugettext_lazy as _
from django.utils.translation import ugettext
from django.core.exceptions import ObjectDoesNotExist
from django.contrib.contenttypes.models import ContentType
from django.contrib.contenttypes import generic
from documents.models import Document
from converter.api import get_available_transformations_choices
from ocr.literals import DOCUMENTQUEUE_STATE_STOPPED, \
DOCUMENTQUEUE_STATE_CHOICES, QUEUEDOCUMENT_STATE_PENDING, \
QUEUEDOCUMENT_STATE_CHOICES
from ocr.manager import DocumentQueueManager
from ocr.managers import DocumentQueueManager, QueueTransformationManager
class DocumentQueue(models.Model):
@@ -51,3 +54,26 @@ class QueueDocument(models.Model):
return unicode(self.document)
except ObjectDoesNotExist:
return ugettext(u'Missing document.')
class QueueTransformation(models.Model):
    """
    Model that stores the transformation and transformation arguments
    for a given document queue
    """
    # Generic relation: content type plus object id of the related object
    content_type = models.ForeignKey(ContentType)
    object_id = models.PositiveIntegerField()
    content_object = generic.GenericForeignKey('content_type', 'object_id')
    # Entries are returned sorted by this value (see Meta.ordering)
    order = models.PositiveIntegerField(default=0, blank=True, null=True, verbose_name=_(u'order'), db_index=True)
    transformation = models.CharField(choices=get_available_transformations_choices(), max_length=128, verbose_name=_(u'transformation'))
    # Text of a dictionary literal holding the transformation arguments.
    # Fixed the misspelling "indentify" in the help text.
    # NOTE(review): applying % to the lazy translation presumably resolves
    # it when this module is imported instead of per request - confirm.
    arguments = models.TextField(blank=True, null=True, verbose_name=_(u'arguments'), help_text=_(u'Use dictionaries to identify arguments, example: %s') % u'{\'degrees\':90}')

    objects = QueueTransformationManager()

    def __unicode__(self):
        # Display label of the selected transformation choice
        return self.get_transformation_display()

    class Meta:
        ordering = ('order',)
        verbose_name = _(u'document queue transformation')
        verbose_name_plural = _(u'document queue transformations')

View File

@@ -0,0 +1,40 @@
import codecs
import os
import subprocess
import tempfile
import sys
import slate
from django.utils.translation import ugettext as _
from ocr.parsers.exceptions import ParserError, ParserUnknownFile
# Lookup table of parsers: mimetype -> {'function': parser callable}
mimetype_registry = {}


def register_parser(mimetype, function):
    """
    Store function as the parser for mimetype, replacing any parser
    registered earlier for the same mimetype
    """
    entry = {'function': function}
    mimetype_registry[mimetype] = entry
def pdf_parser(document_page):
    """
    Extract the text of a PDF document page with slate and save it as
    the page content, labelling the page as text extracted from PDF

    Raises ParserError when the text of the page is just a form feed
    character or when the PDF has no page at the position given by the
    page number.
    """
    fd = document_page.document.open()
    try:
        pdf_pages = slate.PDF(fd)
    finally:
        # Release the file even when slate fails to parse it
        fd.close()

    try:
        # Page numbers start at 1, the parsed pages are indexed from 0
        page_text = pdf_pages[document_page.page_number - 1]
    except IndexError:
        # Reported as a parser failure so that callers expecting
        # ParserError can fall back instead of crashing
        raise ParserError

    # A lone form feed is treated as "nothing was extracted"
    if page_text == '\x0c':
        raise ParserError

    document_page.content = page_text
    document_page.page_label = _(u'Text extracted from PDF')
    document_page.save()
def parse_document_page(document_page):
    """
    Run the parser registered for the mimetype of the page's document

    Raises ParserUnknownFile when no parser is registered for that
    mimetype.  Exceptions raised by the parser itself are not caught.
    """
    try:
        parser = mimetype_registry[document_page.document.file_mimetype]['function']
    except KeyError:
        raise ParserUnknownFile

    # Called outside the try block, so a KeyError raised inside the parser
    # is not mistaken for a missing registration
    parser(document_page)
register_parser('application/pdf', pdf_parser)

View File

@@ -0,0 +1,10 @@
class ParserError(Exception):
    """
    Raised when a text parser fails to understand the file it has been
    passed or when the resulting parsed text is invalid
    """
class ParserUnknownFile(Exception):
    """
    Raised when no parser is registered for the mimetype of a document
    """

View File

@@ -1,16 +1,22 @@
from django.conf.urls.defaults import patterns, url
urlpatterns = patterns('ocr.views',
url(r'^(?P<document_id>\d+)/submit/$', 'submit_document', (), 'submit_document'),
url(r'^ocr/queue/document/list/$', 'queue_document_list', (), 'queue_document_list'),
url(r'^ocr/queue/document/(?P<queue_document_id>\d+)/delete/$', 'queue_document_delete', (), 'queue_document_delete'),
url(r'^ocr/queue/document/multiple/delete/$', 'queue_document_multiple_delete', (), 'queue_document_multiple_delete'),
url(r'^ocr/queue/document/(?P<queue_document_id>\d+)/re-queue/$', 're_queue_document', (), 're_queue_document'),
url(r'^ocr/queue/document/multiple/re-queue/$', 're_queue_multiple_document', (), 're_queue_multiple_document'),
url(r'^document/(?P<document_id>\d+)/submit/$', 'submit_document', (), 'submit_document'),
url(r'^queue/document/list/$', 'queue_document_list', (), 'queue_document_list'),
url(r'^queue/document/(?P<queue_document_id>\d+)/delete/$', 'queue_document_delete', (), 'queue_document_delete'),
url(r'^queue/document/multiple/delete/$', 'queue_document_multiple_delete', (), 'queue_document_multiple_delete'),
url(r'^queue/document/(?P<queue_document_id>\d+)/re-queue/$', 're_queue_document', (), 're_queue_document'),
url(r'^queue/document/multiple/re-queue/$', 're_queue_multiple_document', (), 're_queue_multiple_document'),
url(r'^ocr/queue/(?P<document_queue_id>\d+)/enable/$', 'document_queue_enable', (), 'document_queue_enable'),
url(r'^ocr/queue/(?P<document_queue_id>\d+)/disable/$', 'document_queue_disable', (), 'document_queue_disable'),
url(r'^queue/(?P<document_queue_id>\d+)/enable/$', 'document_queue_enable', (), 'document_queue_enable'),
url(r'^queue/(?P<document_queue_id>\d+)/disable/$', 'document_queue_disable', (), 'document_queue_disable'),
url(r'^ocr/document/all/clean_up/$', 'all_document_ocr_cleanup', (), 'all_document_ocr_cleanup'),
url(r'^ocr/node/active/list/$', 'node_active_list', (), 'node_active_list'),
url(r'^document/all/clean_up/$', 'all_document_ocr_cleanup', (), 'all_document_ocr_cleanup'),
url(r'^node/active/list/$', 'node_active_list', (), 'node_active_list'),
url(r'^queue/(?P<document_queue_id>\d+)/transformation/list/$', 'setup_queue_transformation_list', (), 'setup_queue_transformation_list'),
url(r'^queue/(?P<document_queue_id>\w+)/transformation/create/$', 'setup_queue_transformation_create', (), 'setup_queue_transformation_create'),
url(r'^queue/transformation/(?P<transformation_id>\w+)/edit/$', 'setup_queue_transformation_edit', (), 'setup_queue_transformation_edit'),
url(r'^queue/transformation/(?P<transformation_id>\w+)/delete/$', 'setup_queue_transformation_delete', (), 'setup_queue_transformation_delete'),
)

View File

@@ -6,9 +6,8 @@ from django.shortcuts import render_to_response, get_object_or_404
from django.template import RequestContext
from django.contrib import messages
from django.views.generic.list_detail import object_list
from django.core.urlresolvers import reverse
from django.utils.translation import ugettext_lazy as _
from django.conf import settings
from django.core.urlresolvers import reverse
from celery.task.control import inspect
from permissions.api import check_permissions
@@ -18,12 +17,13 @@ from documents.widgets import document_link, document_thumbnail
from ocr import PERMISSION_OCR_DOCUMENT, PERMISSION_OCR_DOCUMENT_DELETE, \
PERMISSION_OCR_QUEUE_ENABLE_DISABLE, PERMISSION_OCR_CLEAN_ALL_PAGES
from ocr.models import DocumentQueue, QueueDocument
from ocr.models import DocumentQueue, QueueDocument, QueueTransformation
from ocr.literals import QUEUEDOCUMENT_STATE_PENDING, \
QUEUEDOCUMENT_STATE_PROCESSING, DOCUMENTQUEUE_STATE_STOPPED, \
DOCUMENTQUEUE_STATE_ACTIVE
from ocr.exceptions import AlreadyQueued
from ocr.api import clean_pages
from ocr.forms import QueueTransformationForm, QueueTransformationForm_create
def queue_document_list(request, queue_name='default'):
@@ -38,8 +38,10 @@ def queue_document_list(request, queue_name='default'):
extra_context={
'title': _(u'documents in queue: %s') % document_queue,
'hide_object': True,
'object': document_queue,
'queue': document_queue,
'object_name': _(u'document queue'),
'navigation_object_name': 'queue',
'list_object_variable_name': 'queue_document',
'extra_columns': [
{'name': 'document', 'attribute': lambda x: document_link(x.document) if hasattr(x, 'document') else _(u'Missing document.')},
{'name': _(u'thumbnail'), 'attribute': lambda x: document_thumbnail(x.document)},
@@ -212,7 +214,8 @@ def document_queue_disable(request, document_queue_id):
return HttpResponseRedirect(next)
return render_to_response('generic_confirm.html', {
'object': document_queue,
'queue': document_queue,
'navigation_object_name': 'queue',
'title': _(u'Are you sure you wish to disable document queue: %s') % document_queue,
'next': next,
'previous': previous,
@@ -238,7 +241,8 @@ def document_queue_enable(request, document_queue_id):
return HttpResponseRedirect(next)
return render_to_response('generic_confirm.html', {
'object': document_queue,
'queue': document_queue,
'navigation_object_name': 'queue',
'title': _(u'Are you sure you wish to activate document queue: %s') % document_queue,
'next': next,
'previous': previous,
@@ -317,3 +321,141 @@ def node_active_list(request):
{'name': _(u'related object'), 'attribute': lambda x: display_link(x['related_object']) if x['related_object'] else u''}
],
}, context_instance=RequestContext(request))
def setup_queue_transformation_list(request, document_queue_id):
    """
    Render the list of transformations attached to a document queue.

    The queue is looked up by primary key (404 when it does not exist)
    and its transformations are shown through the generic list template
    with the order, transformation label and arguments as columns.
    """
    # TODO: no permission is enforced in this view; the original
    # check_permissions(request.user, [PERMISSION_SOURCES_SETUP_EDIT])
    # call is disabled.
    queue = get_object_or_404(DocumentQueue, pk=document_queue_id)

    columns = [
        {'name': _(u'order'), 'attribute': 'order'},
        {
            'name': _(u'transformation'),
            'attribute': lambda item: item.get_transformation_display()
        },
        {'name': _(u'arguments'), 'attribute': 'arguments'}
    ]

    return render_to_response(
        'generic_list.html',
        {
            'object_list': QueueTransformation.objects.get_for_object(queue),
            'title': _(u'transformations for: %s') % queue,
            # The queue is exposed under its own name and declared as the
            # navigation object through 'navigation_object_name'.
            'queue': queue,
            'object_name': _(u'document queue'),
            'navigation_object_name': 'queue',
            'list_object_variable_name': 'transformation',
            'extra_columns': columns,
            'hide_link': True,
            'hide_object': True,
        },
        context_instance=RequestContext(request)
    )
def setup_queue_transformation_edit(request, transformation_id):
#check_permissions(request.user, [PERMISSION_SOURCES_SETUP_EDIT])
transformation = get_object_or_404(QueueTransformation, pk=transformation_id)
redirect_view = reverse('setup_queue_transformation_list', args=[transformation.content_object.pk])
next = request.POST.get('next', request.GET.get('next', request.META.get('HTTP_REFERER', redirect_view)))
if request.method == 'POST':
form = QueueTransformationForm(instance=transformation, data=request.POST)
if form.is_valid():
try:
# Test the validity of the argument field
eval(form.cleaned_data['arguments'], {})
except:
messages.error(request, _(u'Queue transformation argument error.'))
else:
try:
form.save()
messages.success(request, _(u'Queue transformation edited successfully'))
return HttpResponseRedirect(next)
except Exception, e:
messages.error(request, _(u'Error editing queue transformation; %s') % e)
else:
form = QueueTransformationForm(instance=transformation)
return render_to_response('generic_form.html', {
'title': _(u'Edit transformation: %s') % transformation,
'form': form,
'queue': transformation.content_object,
'transformation': transformation,
'navigation_object_list': [
{'object': 'queue', 'name': _(u'document queue')},
{'object': 'transformation', 'name': _(u'transformation')}
],
'next': next,
},
context_instance=RequestContext(request))
def setup_queue_transformation_delete(request, transformation_id):
#check_permissions(request.user, [PERMISSION_SOURCES_SETUP_EDIT])
transformation = get_object_or_404(QueueTransformation, pk=transformation_id)
redirect_view = reverse('setup_queue_transformation_list', args=[transformation.content_object.pk])
previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', redirect_view)))
if request.method == 'POST':
try:
transformation.delete()
messages.success(request, _(u'Queue transformation deleted successfully.'))
except Exception, e:
messages.error(request, _(u'Error deleting queue transformation; %(error)s') % {
'error': e}
)
return HttpResponseRedirect(redirect_view)
return render_to_response('generic_confirm.html', {
'delete_view': True,
'transformation': transformation,
'queue': transformation.content_object,
'navigation_object_list': [
{'object': 'queue', 'name': _(u'document queue')},
{'object': 'transformation', 'name': _(u'transformation')}
],
'title': _(u'Are you sure you wish to delete queue transformation "%(transformation)s"') % {
'transformation': transformation.get_transformation_display(),
},
'previous': previous,
'form_icon': u'shape_square_delete.png',
},
context_instance=RequestContext(request))
def setup_queue_transformation_create(request, document_queue_id):
#check_permissions(request.user, [PERMISSION_SOURCES_SETUP_EDIT])
document_queue = get_object_or_404(DocumentQueue, pk=document_queue_id)
redirect_view = reverse('setup_queue_transformation_list', args=[document_queue.pk])
previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', redirect_view)))
if request.method == 'POST':
form = QueueTransformationForm_create(request.POST)
if form.is_valid():
try:
# Test the validity of the argument field
eval(form.cleaned_data['arguments'], {})
except:
messages.error(request, _(u'Queue transformation argument error.'))
else:
try:
queue_tranformation = form.save(commit=False)
queue_tranformation.content_object = document_queue
queue_tranformation.save()
messages.success(request, _(u'Queue transformation created successfully'))
return HttpResponseRedirect(redirect_view)
except Exception, e:
messages.error(request, _(u'Error creating queue transformation; %s') % e)
else:
form = QueueTransformationForm_create()
return render_to_response('generic_form.html', {
'form': form,
'queue': document_queue,
'object_name': _(u'document queue'),
'navigation_object_name': 'queue',
'title': _(u'Create new transformation for queue: %s') % document_queue,
}, context_instance=RequestContext(request))

View File

@@ -6,3 +6,19 @@ class SourceTransformationManager(models.Manager):
def get_for_object(self, obj):
ct = ContentType.objects.get_for_model(obj)
return self.model.objects.filter(content_type=ct).filter(object_id=obj.pk)
def get_for_object_as_list(self, obj):
warnings = []
transformations = []
for transformation in self.get_for_object(obj).values('transformation', 'arguments'):
try:
transformations.append(
{
'transformation': transformation['transformation'],
'arguments': eval(transformation['arguments'], {})
}
)
except Exception, e:
warnings.append(e)
return transformations, warnings

View File

@@ -4,14 +4,13 @@ from django.contrib.contenttypes.models import ContentType
from django.contrib.contenttypes import generic
from documents.models import DocumentType
from documents.conf.settings import AVAILABLE_TRANSFORMATIONS
from documents.managers import RecentDocumentManager
from metadata.models import MetadataType
from converter.api import get_available_transformations_choices
from converter.literals import DIMENSION_SEPARATOR
from sources.managers import SourceTransformationManager
available_transformations = ([(name, data['label']) for name, data in AVAILABLE_TRANSFORMATIONS.items()])
SOURCE_UNCOMPRESS_CHOICE_Y = 'y'
SOURCE_UNCOMPRESS_CHOICE_N = 'n'
SOURCE_UNCOMPRESS_CHOICE_ASK = 'a'
@@ -120,7 +119,7 @@ class StagingFolder(InteractiveBaseModel):
if self.preview_height:
dimensions.append(unicode(self.preview_height))
return u'x'.join(dimensions)
return DIMENSION_SEPARATOR.join(dimensions)
class Meta(InteractiveBaseModel.Meta):
verbose_name = _(u'staging folder')
@@ -164,8 +163,8 @@ class SourceTransformation(models.Model):
object_id = models.PositiveIntegerField()
content_object = generic.GenericForeignKey('content_type', 'object_id')
order = models.PositiveIntegerField(default=0, blank=True, null=True, verbose_name=_(u'order'), db_index=True)
transformation = models.CharField(choices=available_transformations, max_length=128, verbose_name=_(u'transformation'))
arguments = models.TextField(blank=True, null=True, verbose_name=_(u'arguments'), help_text=_(u'Use dictionaries to indentify arguments, example: {\'degrees\':90}'))
transformation = models.CharField(choices=get_available_transformations_choices(), max_length=128, verbose_name=_(u'transformation'))
arguments = models.TextField(blank=True, null=True, verbose_name=_(u'arguments'), help_text=_(u'Use dictionaries to indentify arguments, example: %s') % u'{\'degrees\':90}')
objects = SourceTransformationManager()

View File

@@ -8,11 +8,9 @@ from django.utils.translation import ugettext
from django.contrib import messages
from django.utils.translation import ugettext_lazy as _
from converter import TRANFORMATION_CHOICES
from converter.api import convert, cache_cleanup
DEFAULT_STAGING_DIRECTORY = u'/tmp'
#from documents.conf.settings import DEFAULT_TRANSFORMATIONS
HASH_FUNCTION = lambda x: hashlib.sha256(x).hexdigest()
#TODO: Do benchmarks
@@ -107,16 +105,15 @@ class StagingFile(object):
def upload(self):
"""
Return a StagingFile encapsulated in a File class instance to
allow for easier upload a staging files
allow for easier upload of staging files
"""
try:
return File(file(self.filepath, 'rb'), name=self.filename)
except Exception, exc:
raise Exception(ugettext(u'Unable to upload staging file: %s') % exc)
def delete(self, preview_size):
# tranformation_string, errors = get_transformation_string(DEFAULT_TRANSFORMATIONS)
cache_cleanup(self.filepath, size=preview_size)# , extra_options=tranformation_string)
def delete(self, preview_size, transformations):
cache_cleanup(self.filepath, size=preview_size, transformations=transformations)
try:
os.unlink(self.filepath)
except OSError, exc:
@@ -125,24 +122,7 @@ class StagingFile(object):
else:
raise OSError(ugettext(u'Unable to delete staging file: %s') % exc)
def preview(self, preview_size):
def preview(self, preview_size, transformations):
errors = []
# tranformation_string, errors = get_transformation_string(DEFAULT_TRANSFORMATIONS)
# output_file = convert(self.filepath, size=STAGING_FILES_PREVIEW_SIZE, extra_options=tranformation_string, cleanup_files=False)
output_file = convert(self.filepath, size=preview_size, cleanup_files=False)
output_file = convert(self.filepath, size=preview_size, cleanup_files=False, transformations=transformations)
return output_file, errors
def get_transformation_string(transformations):
transformation_list = []
errors = []
for transformation in transformations:
try:
if transformation['name'] in TRANFORMATION_CHOICES:
output = TRANFORMATION_CHOICES[transformation['name']] % eval(transformation['arguments'])
transformation_list.append(output)
except Exception, e:
errors.append(e)
tranformation_string = ' '.join(transformation_list)
return tranformation_string, errors

View File

@@ -16,12 +16,12 @@ urlpatterns = patterns('sources.views',
url(r'^setup/interactive/staging_folder/list/$', 'setup_source_list', {'source_type': SOURCE_CHOICE_STAGING}, 'setup_staging_folder_list'),
url(r'^setup/interactive/(?P<source_type>\w+)/list/$', 'setup_source_list', (), 'setup_source_list'),
url(r'^setup/interactive/(?P<source_type>\w+)/(?P<source_id>\w+)/edit/$', 'setup_source_edit', (), 'setup_source_edit'),
url(r'^setup/interactive/(?P<source_type>\w+)/(?P<source_id>\w+)/delete/$', 'setup_source_delete', (), 'setup_source_delete'),
url(r'^setup/interactive/(?P<source_type>\w+)/(?P<source_id>\d+)/edit/$', 'setup_source_edit', (), 'setup_source_edit'),
url(r'^setup/interactive/(?P<source_type>\w+)/(?P<source_id>\d+)/delete/$', 'setup_source_delete', (), 'setup_source_delete'),
url(r'^setup/interactive/(?P<source_type>\w+)/create/$', 'setup_source_create', (), 'setup_source_create'),
url(r'^setup/interactive/(?P<source_type>\w+)/(?P<source_id>\w+)/transformation/list/$', 'setup_source_transformation_list', (), 'setup_source_transformation_list'),
url(r'^setup/interactive/(?P<source_type>\w+)/(?P<source_id>\w+)/transformation/create/$', 'setup_source_transformation_create', (), 'setup_source_transformation_create'),
url(r'^setup/interactive/source/transformation/(?P<transformation_id>\w+)/edit/$', 'setup_source_transformation_edit', (), 'setup_source_transformation_edit'),
url(r'^setup/interactive/source/transformation/(?P<transformation_id>\w+)/delete/$', 'setup_source_transformation_delete', (), 'setup_source_transformation_delete'),
url(r'^setup/interactive/(?P<source_type>\w+)/(?P<source_id>\d+)/transformation/list/$', 'setup_source_transformation_list', (), 'setup_source_transformation_list'),
url(r'^setup/interactive/(?P<source_type>\w+)/(?P<source_id>\d+)/transformation/create/$', 'setup_source_transformation_create', (), 'setup_source_transformation_create'),
url(r'^setup/interactive/source/transformation/(?P<transformation_id>\d+)/edit/$', 'setup_source_transformation_edit', (), 'setup_source_transformation_edit'),
url(r'^setup/interactive/source/transformation/(?P<transformation_id>\d+)/delete/$', 'setup_source_transformation_delete', (), 'setup_source_transformation_delete'),
)

View File

@@ -129,9 +129,13 @@ def upload_interactive(request, source_type=None, source_id=None):
expand = True
else:
expand = False
if (not expand) or (expand and not _handle_zip_file(request, request.FILES['file'], document_type)):
transformations, errors = SourceTransformation.objects.get_for_object_as_list(web_form)
if (not expand) or (expand and not _handle_zip_file(request, request.FILES['file'], document_type=document_type, transformations=transformations)):
instance = form.save()
instance.save()
instance.apply_default_transformations(transformations)
if document_type:
instance.document_type = document_type
_handle_save_document(request, instance, form)
@@ -174,16 +178,18 @@ def upload_interactive(request, source_type=None, source_id=None):
expand = True
else:
expand = False
if (not expand) or (expand and not _handle_zip_file(request, staging_file.upload(), document_type)):
transformations, errors = SourceTransformation.objects.get_for_object_as_list(staging_folder)
if (not expand) or (expand and not _handle_zip_file(request, staging_file.upload(), document_type=document_type, transformations=transformations)):
document = Document(file=staging_file.upload())
if document_type:
document.document_type = document_type
document.save()
document.apply_default_transformations(transformations)
_handle_save_document(request, document, form)
messages.success(request, _(u'Staging file: %s, uploaded successfully.') % staging_file.filename)
if staging_folder.delete_after_upload:
staging_file.delete(staging_folder.get_preview_size())
staging_file.delete(preview_size=staging_folder.get_preview_size(), transformations=transformations)
messages.success(request, _(u'Staging file: %s, deleted successfully.') % staging_file.filename)
except Exception, e:
messages.error(request, e)
@@ -260,7 +266,7 @@ def _handle_save_document(request, document, form=None):
create_history(HISTORY_DOCUMENT_CREATED, document, {'user': request.user})
def _handle_zip_file(request, uploaded_file, document_type=None):
def _handle_zip_file(request, uploaded_file, document_type=None, transformations=None):
filename = getattr(uploaded_file, 'filename', getattr(uploaded_file, 'name', ''))
if filename.lower().endswith('zip'):
zfobj = zipfile.ZipFile(uploaded_file)
@@ -285,7 +291,12 @@ def staging_file_preview(request, source_type, source_id, staging_file_id):
staging_folder = get_object_or_404(StagingFolder, pk=source_id)
StagingFile = create_staging_file_class(request, staging_folder.folder_path)
try:
output_file, errors = StagingFile.get(staging_file_id).preview(staging_folder.get_preview_size())
transformations, errors=SourceTransformation.objects.get_for_object_as_list(staging_folder)
output_file, errors = StagingFile.get(staging_file_id).preview(
preview_size=staging_folder.get_preview_size(),
transformations=transformations
)
if errors and (request.user.is_staff or request.user.is_superuser):
for error in errors:
messages.warning(request, _(u'Staging file transformation error: %(error)s') % {
@@ -313,15 +324,19 @@ def staging_file_delete(request, source_type, source_id, staging_file_id):
StagingFile = create_staging_file_class(request, staging_folder.folder_path)
staging_file = StagingFile.get(staging_file_id)
next = request.POST.get('next', request.GET.get('next', request.META.get('HTTP_REFERER', None)))
previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', None)))
next = request.POST.get('next', request.GET.get('next', request.META.get('HTTP_REFERER', '/')))
previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', '/')))
if request.method == 'POST':
try:
staging_file.delete(staging_folder.get_preview_size())
transformations, errors=SourceTransformation.objects.get_for_object_as_list(staging_folder)
staging_file.delete(
preview_size=staging_folder.get_preview_size(),
transformations=transformations
)
messages.success(request, _(u'Staging file delete successfully.'))
except Exception, e:
messages.error(request, e)
messages.error(request, _(u'Staging file delete error; %s.') % e)
return HttpResponseRedirect(next)
results = get_active_tab_links()
@@ -509,11 +524,17 @@ def setup_source_transformation_edit(request, transformation_id):
form = SourceTransformationForm(instance=source_transformation, data=request.POST)
if form.is_valid():
try:
form.save()
messages.success(request, _(u'Source transformation edited successfully'))
return HttpResponseRedirect(next)
except Exception, e:
messages.error(request, _(u'Error editing source transformation; %s') % e)
# Test the validity of the argument field
eval(form.cleaned_data['arguments'], {})
except:
messages.error(request, _(u'Source transformation argument error.'))
else:
try:
form.save()
messages.success(request, _(u'Source transformation edited successfully'))
return HttpResponseRedirect(next)
except Exception, e:
messages.error(request, _(u'Error editing source transformation; %s') % e)
else:
form = SourceTransformationForm(instance=source_transformation)
@@ -541,9 +562,9 @@ def setup_source_transformation_delete(request, transformation_id):
if request.method == 'POST':
try:
source_transformation.delete()
messages.success(request, _(u'Transformation deleted successfully.'))
messages.success(request, _(u'Source transformation deleted successfully.'))
except Exception, e:
messages.error(request, _(u'Error deleting transformation; %(error)s') % {
messages.error(request, _(u'Error deleting source transformation; %(error)s') % {
'error': e}
)
return HttpResponseRedirect(redirect_view)
@@ -556,7 +577,7 @@ def setup_source_transformation_delete(request, transformation_id):
{'object': 'source', 'name': _(u'source')},
{'object': 'transformation', 'name': _(u'transformation')}
],
'title': _(u'Are you sure you wish to delete transformation "%(transformation)s"') % {
'title': _(u'Are you sure you wish to delete source transformation "%(transformation)s"') % {
'transformation': source_transformation.get_transformation_display(),
},
'previous': previous,
@@ -598,13 +619,19 @@ def setup_source_transformation_create(request, source_type, source_id):
form = SourceTransformationForm_create(request.POST)
if form.is_valid():
try:
source_tranformation = form.save(commit=False)
source_tranformation.content_object = source
source_tranformation.save()
messages.success(request, _(u'Source transformation created successfully'))
return HttpResponseRedirect(redirect_view)
except Exception, e:
messages.error(request, _(u'Error creating source transformation; %s') % e)
# Test the validity of the argument field
eval(form.cleaned_data['arguments'], {})
except:
messages.error(request, _(u'Source transformation argument error.'))
else:
try:
source_tranformation = form.save(commit=False)
source_tranformation.content_object = source
source_tranformation.save()
messages.success(request, _(u'Source transformation created successfully'))
return HttpResponseRedirect(redirect_view)
except Exception, e:
messages.error(request, _(u'Error creating source transformation; %s') % e)
else:
form = SourceTransformationForm_create()

View File

@@ -9,3 +9,5 @@ django-celery==2.2.2
django-sentry==1.6.0
django-taggit==0.9.3
-e git://github.com/django-mptt/django-mptt.git@0af02a95877041b2fd6d458bd95413dc1666c321#egg=django-mptt
slate==0.3
PIL==1.1.7

View File

@@ -6,3 +6,5 @@ django-celery==2.2.2
django-sentry==1.6.0
django-taggit==0.9.3
-e git://github.com/django-mptt/django-mptt.git@0af02a95877041b2fd6d458bd95413dc1666c321#egg=django-mptt
slate==0.3
PIL==1.1.7