Merge branch 'converter_export' into smart_staging
This commit is contained in:
26
README.md
26
README.md
@@ -5,7 +5,7 @@ Open source, Django based document manager with custom metadata indexing, file s
|
||||
|
||||
[Website](http://bit.ly/mayan-edms)
|
||||
|
||||
Requirements
|
||||
Basic requirements
|
||||
---
|
||||
|
||||
Python:
|
||||
@@ -15,6 +15,21 @@ Python:
|
||||
* django-filetransfers - File upload/download abstraction
|
||||
* celery - asynchronous task queue/job queue based on distributed message passing
|
||||
* django-celery - celery Django integration
|
||||
* django-mptt - Utilities for implementing a modified pre-order traversal tree in django
|
||||
* python-magic - A python wrapper for libmagic
|
||||
* django-taggit - Simple tagging for django
|
||||
* slate - The simplest way to extract text from PDFs in Python
|
||||
|
||||
|
||||
Execute pip install -r requirements/production.txt to install the python/django dependencies automatically.
|
||||
|
||||
Executables:
|
||||
|
||||
* tesseract-ocr - An OCR Engine that was developed at HP Labs between 1985 and 1995... and now at Google.
|
||||
* unpaper - post-processing scanned and photocopied book pages
|
||||
|
||||
Optional requirements
|
||||
---
|
||||
|
||||
For the GridFS storage backend:
|
||||
|
||||
@@ -22,13 +37,12 @@ For the GridFS storage backend:
|
||||
* GridFS - a storage specification for large objects in MongoDB
|
||||
* MongoDB - a scalable, open source, document-oriented database
|
||||
|
||||
Or execute pip install -r requirements/production.txt to install the dependencies automatically.
|
||||
Libraries:
|
||||
|
||||
Executables:
|
||||
* libmagic - MIME detection library; if it is not installed, Mayan will fall back to Python's simpler built-in mimetypes library
|
||||
|
||||
Mayan has the ability to switch between different image conversion backends, at the moment these two are supported:
|
||||
|
||||
* libmagic - MIME detection library
|
||||
* tesseract-ocr - An OCR Engine that was developed at HP Labs between 1985 and 1995... and now at Google.
|
||||
* unpaper - post-processing scanned and photocopied book pages
|
||||
* ImageMagick - Convert, Edit, Or Compose Bitmap Images
|
||||
* GraphicsMagick - Robust collection of tools and libraries to read, write, and manipulate an image.
|
||||
|
||||
|
||||
@@ -1,11 +1,16 @@
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
from django.core.exceptions import ImproperlyConfigured
|
||||
|
||||
from navigation.api import register_sidebar_template
|
||||
|
||||
TRANFORMATION_CHOICES = {
|
||||
u'rotate': u'-rotate %(degrees)d'
|
||||
}
|
||||
from converter.utils import load_backend
|
||||
from converter.conf.settings import GRAPHICS_BACKEND
|
||||
|
||||
formats_list = {'text': _('file formats'), 'view': 'formats_list', 'famfam': 'pictures'}
|
||||
|
||||
register_sidebar_template(['formats_list'], 'converter_file_formats_help.html')
|
||||
|
||||
try:
|
||||
backend = load_backend().ConverterClass()
|
||||
except ImproperlyConfigured:
|
||||
raise ImproperlyConfigured(u'Missing or incorrect converter backend: %s' % GRAPHICS_BACKEND)
|
||||
|
||||
@@ -1,66 +1,29 @@
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
from django.utils.importlib import import_module
|
||||
from django.template.defaultfilters import slugify
|
||||
|
||||
from converter.conf.settings import UNPAPER_PATH
|
||||
from converter.conf.settings import OCR_OPTIONS
|
||||
from converter.conf.settings import DEFAULT_OPTIONS
|
||||
from converter.conf.settings import LOW_QUALITY_OPTIONS
|
||||
from converter.conf.settings import HIGH_QUALITY_OPTIONS
|
||||
from converter.conf.settings import PRINT_QUALITY_OPTIONS
|
||||
from converter.conf.settings import GRAPHICS_BACKEND
|
||||
from converter.conf.settings import UNOCONV_PATH
|
||||
|
||||
from converter.exceptions import UnpaperError, OfficeConversionError
|
||||
import hashlib
|
||||
|
||||
from common import TEMPORARY_DIRECTORY
|
||||
from documents.utils import document_save_to_temp_dir
|
||||
|
||||
DEFAULT_ZOOM_LEVEL = 100
|
||||
DEFAULT_ROTATION = 0
|
||||
DEFAULT_PAGE_INDEX_NUMBER = 0
|
||||
DEFAULT_FILE_FORMAT = u'jpg'
|
||||
DEFAULT_OCR_FILE_FORMAT = u'tif'
|
||||
from converter.conf.settings import UNOCONV_PATH
|
||||
from converter.exceptions import UnpaperError, OfficeConversionError
|
||||
from converter.literals import DEFAULT_PAGE_NUMBER, \
|
||||
DEFAULT_OCR_FILE_FORMAT, QUALITY_DEFAULT, DEFAULT_ZOOM_LEVEL, \
|
||||
DEFAULT_ROTATION, DEFAULT_FILE_FORMAT, QUALITY_HIGH
|
||||
|
||||
QUALITY_DEFAULT = u'quality_default'
|
||||
QUALITY_LOW = u'quality_low'
|
||||
QUALITY_HIGH = u'quality_high'
|
||||
QUALITY_PRINT = u'quality_print'
|
||||
|
||||
QUALITY_SETTINGS = {
|
||||
QUALITY_DEFAULT: DEFAULT_OPTIONS,
|
||||
QUALITY_LOW: LOW_QUALITY_OPTIONS,
|
||||
QUALITY_HIGH: HIGH_QUALITY_OPTIONS,
|
||||
QUALITY_PRINT: PRINT_QUALITY_OPTIONS
|
||||
}
|
||||
from converter import backend
|
||||
from converter.literals import TRANSFORMATION_CHOICES
|
||||
from converter.literals import TRANSFORMATION_RESIZE, \
|
||||
TRANSFORMATION_ROTATE, TRANSFORMATION_DENSITY, \
|
||||
TRANSFORMATION_ZOOM
|
||||
from converter.literals import DIMENSION_SEPARATOR
|
||||
|
||||
HASH_FUNCTION = lambda x: hashlib.sha256(x).hexdigest()
|
||||
|
||||
CONVERTER_OFFICE_FILE_EXTENSIONS = [
|
||||
u'ods', u'docx', u'doc'
|
||||
]
|
||||
|
||||
|
||||
def _lazy_load(fn):
|
||||
_cached = []
|
||||
|
||||
def _decorated():
|
||||
if not _cached:
|
||||
_cached.append(fn())
|
||||
return _cached[0]
|
||||
return _decorated
|
||||
|
||||
|
||||
@_lazy_load
|
||||
def _get_backend():
|
||||
return import_module(GRAPHICS_BACKEND)
|
||||
|
||||
try:
|
||||
backend = _get_backend()
|
||||
except ImportError:
|
||||
raise ImportError(u'Missing or incorrect converter backend: %s' % GRAPHICS_BACKEND)
|
||||
|
||||
|
||||
def cleanup(filename):
|
||||
"""
|
||||
Tries to remove the given filename. Ignores non-existent files
|
||||
@@ -71,21 +34,6 @@ def cleanup(filename):
|
||||
pass
|
||||
|
||||
|
||||
def execute_unpaper(input_filepath, output_filepath):
|
||||
"""
|
||||
Executes the program unpaper using subprocess's Popen
|
||||
"""
|
||||
command = []
|
||||
command.append(UNPAPER_PATH)
|
||||
command.append(u'--overwrite')
|
||||
command.append(input_filepath)
|
||||
command.append(output_filepath)
|
||||
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE)
|
||||
return_code = proc.wait()
|
||||
if return_code != 0:
|
||||
raise UnpaperError(proc.stderr.readline())
|
||||
|
||||
|
||||
def execute_unoconv(input_filepath, arguments=''):
|
||||
"""
|
||||
Executes the program unoconv using subprocess's Popen
|
||||
@@ -109,19 +57,11 @@ def cache_cleanup(input_filepath, *args, **kwargs):
|
||||
|
||||
def create_image_cache_filename(input_filepath, *args, **kwargs):
|
||||
if input_filepath:
|
||||
temp_filename, separator = os.path.splitext(os.path.basename(input_filepath))
|
||||
temp_path = os.path.join(TEMPORARY_DIRECTORY, temp_filename)
|
||||
|
||||
final_filepath = []
|
||||
[final_filepath.append(str(arg)) for arg in args]
|
||||
final_filepath.extend([u'%s_%s' % (key, value) for key, value in kwargs.items()])
|
||||
|
||||
temp_path += slugify(u'_'.join(final_filepath))
|
||||
|
||||
return temp_path
|
||||
hash_value = HASH_FUNCTION(u''.join([input_filepath, unicode(args), unicode(kwargs)]))
|
||||
return os.path.join(TEMPORARY_DIRECTORY, hash_value)
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
|
||||
def convert_office_document(input_filepath):
|
||||
if os.path.exists(UNOCONV_PATH):
|
||||
@@ -138,15 +78,14 @@ def convert_document(document, *args, **kwargs):
|
||||
return convert(document_save_to_temp_dir(document, document.checksum), *args, **kwargs)
|
||||
|
||||
|
||||
def convert(input_filepath, *args, **kwargs):
|
||||
def convert(input_filepath, cleanup_files=True, *args, **kwargs):
|
||||
size = kwargs.get('size')
|
||||
file_format = kwargs.get('file_format', DEFAULT_FILE_FORMAT)
|
||||
extra_options = kwargs.get('extra_options', u'')
|
||||
zoom = kwargs.get('zoom', DEFAULT_ZOOM_LEVEL)
|
||||
rotation = kwargs.get('rotation', DEFAULT_ROTATION)
|
||||
page = kwargs.get('page', DEFAULT_PAGE_INDEX_NUMBER)
|
||||
cleanup_files = kwargs.get('cleanup_files', True)
|
||||
page = kwargs.get('page', DEFAULT_PAGE_NUMBER)
|
||||
quality = kwargs.get('quality', QUALITY_DEFAULT)
|
||||
transformations = kwargs.get('transformations', [])
|
||||
|
||||
unoconv_output = None
|
||||
|
||||
@@ -160,20 +99,32 @@ def convert(input_filepath, *args, **kwargs):
|
||||
if result:
|
||||
unoconv_output = result
|
||||
input_filepath = result
|
||||
extra_options = u''
|
||||
|
||||
input_arg = u'%s[%s]' % (input_filepath, page)
|
||||
extra_options += u' -resize %s' % size
|
||||
transformations.append(
|
||||
{
|
||||
'transformation': TRANSFORMATION_RESIZE,
|
||||
'arguments': dict(zip([u'width', u'height'], size.split(DIMENSION_SEPARATOR)))
|
||||
}
|
||||
)
|
||||
|
||||
if zoom != 100:
|
||||
extra_options += u' -resize %d%% ' % zoom
|
||||
transformations.append(
|
||||
{
|
||||
'transformation': TRANSFORMATION_ZOOM,
|
||||
'arguments': {'percent': zoom}
|
||||
}
|
||||
)
|
||||
|
||||
if rotation != 0 and rotation != 360:
|
||||
extra_options += u' -rotate %d ' % rotation
|
||||
transformations.append(
|
||||
{
|
||||
'transformation': TRANSFORMATION_ROTATE,
|
||||
'arguments': {'degrees': rotation}
|
||||
}
|
||||
)
|
||||
|
||||
if format == u'jpg':
|
||||
extra_options += u' -quality 85'
|
||||
try:
|
||||
backend.execute_convert(input_filepath=input_arg, arguments=extra_options, output_filepath=u'%s:%s' % (file_format, output_filepath), quality=quality)
|
||||
backend.convert_file(input_filepath=input_filepath, output_filepath=output_filepath, quality=quality, transformations=transformations, page=page, file_format=file_format)
|
||||
finally:
|
||||
if cleanup_files:
|
||||
cleanup(input_filepath)
|
||||
@@ -184,51 +135,22 @@ def convert(input_filepath, *args, **kwargs):
|
||||
|
||||
|
||||
def get_page_count(input_filepath):
|
||||
try:
|
||||
return len(backend.execute_identify(unicode(input_filepath)).splitlines())
|
||||
except:
|
||||
#TODO: send to other page number identifying program
|
||||
return 1
|
||||
return backend.get_page_count(input_filepath)
|
||||
|
||||
|
||||
def get_document_dimensions(document, *args, **kwargs):
|
||||
document_filepath = create_image_cache_filename(document.checksum, *args, **kwargs)
|
||||
if os.path.exists(document_filepath):
|
||||
options = [u'-format', u'%w %h']
|
||||
return [int(dimension) for dimension in backend.execute_identify(unicode(document_filepath), options).split()]
|
||||
return [int(dimension) for dimension in backend.identify_file(unicode(document_filepath), options).split()]
|
||||
else:
|
||||
return [0, 0]
|
||||
|
||||
|
||||
def convert_document_for_ocr(document, page=DEFAULT_PAGE_INDEX_NUMBER, file_format=DEFAULT_OCR_FILE_FORMAT):
|
||||
#Extract document file
|
||||
input_filepath = document_save_to_temp_dir(document, document.uuid)
|
||||
|
||||
#Convert for OCR
|
||||
temp_filename, separator = os.path.splitext(os.path.basename(input_filepath))
|
||||
temp_path = os.path.join(TEMPORARY_DIRECTORY, temp_filename)
|
||||
transformation_output_file = u'%s_trans%s%s%s' % (temp_path, page, os.extsep, file_format)
|
||||
unpaper_input_file = u'%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep)
|
||||
unpaper_output_file = u'%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep)
|
||||
convert_output_file = u'%s_ocr%s%s%s' % (temp_path, page, os.extsep, file_format)
|
||||
|
||||
input_arg = u'%s[%s]' % (input_filepath, page)
|
||||
|
||||
try:
|
||||
document_page = document.documentpage_set.get(page_number=page + 1)
|
||||
transformation_string, warnings = document_page.get_transformation_string()
|
||||
|
||||
#Apply default transformations
|
||||
backend.execute_convert(input_filepath=input_arg, quality=QUALITY_HIGH, arguments=transformation_string, output_filepath=transformation_output_file)
|
||||
#Do OCR operations
|
||||
backend.execute_convert(input_filepath=transformation_output_file, arguments=OCR_OPTIONS, output_filepath=unpaper_input_file)
|
||||
# Process by unpaper
|
||||
execute_unpaper(input_filepath=unpaper_input_file, output_filepath=unpaper_output_file)
|
||||
# Convert to tif
|
||||
backend.execute_convert(input_filepath=unpaper_output_file, output_filepath=convert_output_file)
|
||||
finally:
|
||||
cleanup(transformation_output_file)
|
||||
cleanup(unpaper_input_file)
|
||||
cleanup(unpaper_output_file)
|
||||
|
||||
return convert_output_file
|
||||
def get_available_transformations_choices():
    """
    Build the list of transformation choices offered by the active backend.

    Returns a list of ``[transformation, template]`` pairs, where the
    template is the transformation label followed by its comma separated
    argument names: required arguments are shown as ``<name>`` and
    optional ones as ``[name]``.
    """
    choices = []

    for name in backend.get_available_transformations():
        definition = TRANSFORMATION_CHOICES[name]
        label = definition['label']

        placeholders = []
        for argument in definition['arguments']:
            if argument['required']:
                placeholders.append('<%s>' % argument['name'])
            else:
                placeholders.append('[%s]' % argument['name'])

        template = u'%s %s' % (label, u','.join(placeholders))
        choices.append([name, template])

    return choices
|
||||
|
||||
@@ -0,0 +1,18 @@
|
||||
class ConverterBase(object):
    """
    Base class that all backend classes must inherit

    Every method here only raises NotImplementedError naming the concrete
    class and the missing method; backends are expected to override them.
    """

    def _not_implemented(self, method_name):
        """
        Build the NotImplementedError raised for a missing backend method.
        """
        return NotImplementedError(
            "Your %s class has not defined a %s() method, which is required." % (
                self.__class__.__name__, method_name
            )
        )

    def convert_file(self, input_filepath, *args, **kwargs):
        """Convert the file at ``input_filepath``; must be overridden."""
        raise self._not_implemented('convert_file')

    def convert_document(self, document, *args, **kwargs):
        """Convert ``document``; must be overridden."""
        raise self._not_implemented('convert_document')

    def get_format_list(self):
        """Return the supported file formats; must be overridden."""
        raise self._not_implemented('get_format_list')

    def get_available_transformations(self):
        """Return the supported transformations; must be overridden."""
        raise self._not_implemented('get_available_transformations')

    def get_page_count(self, input_filepath=None):
        """
        Return the page count of ``input_filepath``; must be overridden.

        The backends define this method with an ``input_filepath``
        argument, so the base accepts it too: a backend that forgets to
        override it then fails with the explanatory NotImplementedError
        instead of a TypeError about the argument count.  The default
        keeps argument-less calls working.
        """
        raise self._not_implemented('get_page_count')
|
||||
|
||||
@@ -1,71 +0,0 @@
|
||||
import subprocess
|
||||
import re
|
||||
|
||||
from converter.conf.settings import GM_PATH
|
||||
from converter.conf.settings import GM_SETTINGS
|
||||
from converter.api import QUALITY_DEFAULT, QUALITY_SETTINGS
|
||||
from converter.exceptions import ConvertError, UnknownFormat, IdentifyError
|
||||
|
||||
CONVERTER_ERROR_STRING_NO_DECODER = u'No decode delegate for this image format'
|
||||
CONVERTER_ERROR_STARTS_WITH = u'starts with'
|
||||
|
||||
|
||||
def execute_identify(input_filepath, arguments=None):
|
||||
command = []
|
||||
command.append(unicode(GM_PATH))
|
||||
command.append(u'identify')
|
||||
if arguments:
|
||||
command.extend(arguments)
|
||||
command.append(unicode(input_filepath))
|
||||
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
|
||||
return_code = proc.wait()
|
||||
if return_code != 0:
|
||||
raise IdentifyError(proc.stderr.readline())
|
||||
return proc.stdout.read()
|
||||
|
||||
|
||||
def execute_convert(input_filepath, output_filepath, quality=QUALITY_DEFAULT, arguments=None):
|
||||
command = []
|
||||
command.append(unicode(GM_PATH))
|
||||
command.append(u'convert')
|
||||
command.extend(unicode(QUALITY_SETTINGS[quality]).split())
|
||||
command.extend(unicode(GM_SETTINGS).split())
|
||||
command.append(unicode(input_filepath))
|
||||
if arguments:
|
||||
command.extend(unicode(arguments).split())
|
||||
command.append(unicode(output_filepath))
|
||||
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
|
||||
return_code = proc.wait()
|
||||
if return_code != 0:
|
||||
#Got an error from convert program
|
||||
error_line = proc.stderr.readline()
|
||||
if (CONVERTER_ERROR_STRING_NO_DECODER in error_line) or (CONVERTER_ERROR_STARTS_WITH in error_line):
|
||||
#Try to determine from error message which class of error is it
|
||||
raise UnknownFormat
|
||||
else:
|
||||
raise ConvertError(error_line)
|
||||
|
||||
|
||||
def get_format_list():
|
||||
"""
|
||||
Call GraphicsMagick to parse all of it's supported file formats, and
|
||||
return a list of the names and descriptions
|
||||
"""
|
||||
format_regex = re.compile(' *([A-Z0-9]+)[*]? +([A-Z0-9]+) +([rw\-+]+) *(.*).*')
|
||||
formats = []
|
||||
command = []
|
||||
command.append(unicode(GM_PATH))
|
||||
command.append(u'convert')
|
||||
command.append(u'-list')
|
||||
command.append(u'formats')
|
||||
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
|
||||
return_code = proc.wait()
|
||||
if return_code != 0:
|
||||
raise ConvertError(proc.stderr.readline())
|
||||
|
||||
for line in proc.stdout.readlines():
|
||||
fields = format_regex.findall(line)
|
||||
if fields:
|
||||
formats.append((fields[0][0], fields[0][3]))
|
||||
|
||||
return formats
|
||||
119
apps/converter/backends/graphicsmagick/base.py
Normal file
119
apps/converter/backends/graphicsmagick/base.py
Normal file
@@ -0,0 +1,119 @@
|
||||
import subprocess
|
||||
import re
|
||||
|
||||
from converter.conf.settings import GM_PATH
|
||||
from converter.conf.settings import GM_SETTINGS
|
||||
from converter.literals import QUALITY_DEFAULT, QUALITY_SETTINGS
|
||||
from converter.exceptions import ConvertError, UnknownFormat, \
|
||||
IdentifyError
|
||||
from converter.backends import ConverterBase
|
||||
from converter.literals import TRANSFORMATION_RESIZE, \
|
||||
TRANSFORMATION_ROTATE, TRANSFORMATION_DENSITY, \
|
||||
TRANSFORMATION_ZOOM
|
||||
from converter.literals import DIMENSION_SEPARATOR, DEFAULT_PAGE_NUMBER, \
|
||||
DEFAULT_FILE_FORMAT
|
||||
|
||||
CONVERTER_ERROR_STRING_NO_DECODER = u'No decode delegate for this image format'
|
||||
CONVERTER_ERROR_STARTS_WITH = u'starts with'
|
||||
|
||||
|
||||
class ConverterClass(ConverterBase):
    """
    Converter backend that shells out to the GraphicsMagick program
    configured in GM_PATH.
    """

    def _execute(self, command):
        """
        Run ``command`` and return a ``(return_code, stdout, stderr)`` tuple.
        """
        # communicate() drains both pipes while waiting for the child.
        # Calling wait() first and reading afterwards can deadlock as soon
        # as the child writes more than a pipe buffer of output.
        proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
        stdout, stderr = proc.communicate()
        return proc.returncode, stdout, stderr

    def _first_line(self, output):
        """
        Return the first line of ``output`` with its line ending, or
        ``output`` unchanged when it is empty.
        """
        lines = output.splitlines(True)
        if lines:
            return lines[0]
        return output

    def identify_file(self, input_filepath, arguments=None):
        """
        Run ``gm identify`` on ``input_filepath`` and return its stdout.

        ``arguments`` is an optional list of extra command line options
        placed before the file path.  Raises IdentifyError carrying the
        first stderr line when the program exits with a non zero code.
        """
        command = [unicode(GM_PATH), u'identify']
        if arguments:
            command.extend(arguments)
        command.append(unicode(input_filepath))

        return_code, stdout, stderr = self._execute(command)
        if return_code != 0:
            raise IdentifyError(self._first_line(stderr))
        return stdout

    def convert_file(self, input_filepath, output_filepath, transformations=None, quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT):
        """
        Convert one page of ``input_filepath`` into ``output_filepath``.

        ``transformations`` is a list of dictionaries with a
        'transformation' key and an 'arguments' dictionary; resize, zoom
        and rotate are translated to command line options, any other
        transformation is ignored.  ``page`` is one based.  Raises
        UnknownFormat when the error output indicates an unsupported
        input format and ConvertError for any other failure.
        """
        arguments = []
        for transformation in transformations or []:
            if transformation['transformation'] == TRANSFORMATION_RESIZE:
                dimensions = [unicode(transformation['arguments']['width'])]
                if 'height' in transformation['arguments']:
                    dimensions.append(unicode(transformation['arguments']['height']))
                arguments.append(u'-resize')
                arguments.append(u'%s' % DIMENSION_SEPARATOR.join(dimensions))

            elif transformation['transformation'] == TRANSFORMATION_ZOOM:
                arguments.append(u'-resize')
                arguments.append(u'%d%%' % transformation['arguments']['percent'])

            elif transformation['transformation'] == TRANSFORMATION_ROTATE:
                arguments.append(u'-rotate')
                arguments.append(u'%s' % transformation['arguments']['degrees'])

        # The requested output format is ``file_format``; the previous
        # check compared the builtin ``format`` function against a string
        # and therefore never matched.
        if file_format in (u'jpg', u'jpeg'):
            arguments.append(u'-quality')
            arguments.append(u'85')

        # Graphicsmagick page number is 0 base
        input_arg = u'%s[%d]' % (input_filepath, page - 1)

        # Specify the file format next to the output filename
        output_filepath = u'%s:%s' % (file_format, output_filepath)

        command = [unicode(GM_PATH), u'convert']
        command.extend(unicode(QUALITY_SETTINGS[quality]).split())
        command.extend(unicode(GM_SETTINGS).split())
        command.append(unicode(input_arg))
        if arguments:
            command.extend(arguments)
        command.append(unicode(output_filepath))

        return_code, stdout, stderr = self._execute(command)
        if return_code != 0:
            # Got an error from convert program
            error_line = self._first_line(stderr)
            if (CONVERTER_ERROR_STRING_NO_DECODER in error_line) or (CONVERTER_ERROR_STARTS_WITH in error_line):
                # Try to determine from error message which class of error is it
                raise UnknownFormat
            else:
                raise ConvertError(error_line)

    def get_format_list(self):
        """
        Call GraphicsMagick to parse all of its supported file formats, and
        return a list of the names and descriptions
        """
        format_regex = re.compile(r' *([A-Z0-9]+)[*]? +([A-Z0-9]+) +([rw\-+]+) *(.*).*')
        formats = []
        command = [unicode(GM_PATH), u'convert', u'-list', u'formats']

        return_code, stdout, stderr = self._execute(command)
        if return_code != 0:
            raise ConvertError(self._first_line(stderr))

        for line in stdout.splitlines():
            fields = format_regex.findall(line)
            if fields:
                formats.append((fields[0][0], fields[0][3]))

        return formats

    def get_available_transformations(self):
        """
        Return the transformations convert_file() knows how to apply.
        """
        return [
            TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE,
            TRANSFORMATION_ZOOM
        ]

    def get_page_count(self, input_filepath):
        """
        Return the number of lines ``gm identify`` prints for the file,
        falling back to 1 when identification fails.
        """
        try:
            return len(self.identify_file(unicode(input_filepath)).splitlines())
        except Exception:
            # Deliberate best effort, but no longer swallows
            # KeyboardInterrupt / SystemExit like a bare except did.
            #TODO: send to other page number identifying program
            return 1
|
||||
@@ -1,68 +0,0 @@
|
||||
import subprocess
|
||||
import re
|
||||
|
||||
from converter.conf.settings import IM_IDENTIFY_PATH
|
||||
from converter.conf.settings import IM_CONVERT_PATH
|
||||
from converter.api import QUALITY_DEFAULT, QUALITY_SETTINGS
|
||||
from converter.exceptions import ConvertError, UnknownFormat, \
|
||||
IdentifyError
|
||||
|
||||
CONVERTER_ERROR_STRING_NO_DECODER = u'no decode delegate for this image format'
|
||||
|
||||
|
||||
def execute_identify(input_filepath, arguments=None):
|
||||
command = []
|
||||
command.append(unicode(IM_IDENTIFY_PATH))
|
||||
if arguments:
|
||||
command.extend(arguments)
|
||||
command.append(unicode(input_filepath))
|
||||
|
||||
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
|
||||
return_code = proc.wait()
|
||||
if return_code != 0:
|
||||
raise IdentifyError(proc.stderr.readline())
|
||||
return proc.stdout.read()
|
||||
|
||||
|
||||
def execute_convert(input_filepath, output_filepath, quality=QUALITY_DEFAULT, arguments=None):
|
||||
command = []
|
||||
command.append(unicode(IM_CONVERT_PATH))
|
||||
command.extend(unicode(QUALITY_SETTINGS[quality]).split())
|
||||
command.append(unicode(input_filepath))
|
||||
if arguments:
|
||||
command.extend(unicode(arguments).split())
|
||||
command.append(unicode(output_filepath))
|
||||
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
|
||||
return_code = proc.wait()
|
||||
if return_code != 0:
|
||||
#Got an error from convert program
|
||||
error_line = proc.stderr.readline()
|
||||
if CONVERTER_ERROR_STRING_NO_DECODER in error_line:
|
||||
#Try to determine from error message which class of error is it
|
||||
raise UnknownFormat
|
||||
else:
|
||||
raise ConvertError(error_line)
|
||||
|
||||
|
||||
def get_format_list():
|
||||
"""
|
||||
Call ImageMagick to parse all of it's supported file formats, and
|
||||
return a list of the names and descriptions
|
||||
"""
|
||||
format_regex = re.compile(' *([A-Z0-9]+)[*]? +([A-Z0-9]+) +([rw\-+]+) *(.*).*')
|
||||
formats = []
|
||||
command = []
|
||||
command.append(unicode(IM_CONVERT_PATH))
|
||||
command.append(u'-list')
|
||||
command.append(u'format')
|
||||
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
|
||||
return_code = proc.wait()
|
||||
if return_code != 0:
|
||||
raise ConvertError(proc.stderr.readline())
|
||||
|
||||
for line in proc.stdout.readlines():
|
||||
fields = format_regex.findall(line)
|
||||
if fields:
|
||||
formats.append((fields[0][0], fields[0][3]))
|
||||
|
||||
return formats
|
||||
0
apps/converter/backends/imagemagick/__init__.py
Normal file
0
apps/converter/backends/imagemagick/__init__.py
Normal file
118
apps/converter/backends/imagemagick/base.py
Normal file
118
apps/converter/backends/imagemagick/base.py
Normal file
@@ -0,0 +1,118 @@
|
||||
import subprocess
|
||||
import re
|
||||
|
||||
from converter.conf.settings import IM_IDENTIFY_PATH
|
||||
from converter.conf.settings import IM_CONVERT_PATH
|
||||
from converter.literals import QUALITY_DEFAULT, QUALITY_SETTINGS
|
||||
from converter.exceptions import ConvertError, UnknownFormat, \
|
||||
IdentifyError
|
||||
from converter.backends import ConverterBase
|
||||
from converter.literals import TRANSFORMATION_RESIZE, \
|
||||
TRANSFORMATION_ROTATE, TRANSFORMATION_DENSITY, \
|
||||
TRANSFORMATION_ZOOM
|
||||
from converter.literals import DIMENSION_SEPARATOR, DEFAULT_PAGE_NUMBER, \
|
||||
DEFAULT_FILE_FORMAT
|
||||
|
||||
CONVERTER_ERROR_STRING_NO_DECODER = u'no decode delegate for this image format'
|
||||
|
||||
|
||||
class ConverterClass(ConverterBase):
    """
    Converter backend that shells out to the ImageMagick programs
    configured in IM_IDENTIFY_PATH and IM_CONVERT_PATH.
    """

    def _execute(self, command):
        """
        Run ``command`` and return a ``(return_code, stdout, stderr)`` tuple.
        """
        # communicate() drains both pipes while waiting for the child.
        # Calling wait() first and reading afterwards can deadlock as soon
        # as the child writes more than a pipe buffer of output.
        proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
        stdout, stderr = proc.communicate()
        return proc.returncode, stdout, stderr

    def _first_line(self, output):
        """
        Return the first line of ``output`` with its line ending, or
        ``output`` unchanged when it is empty.
        """
        lines = output.splitlines(True)
        if lines:
            return lines[0]
        return output

    def identify_file(self, input_filepath, arguments=None):
        """
        Run the identify program on ``input_filepath`` and return its stdout.

        ``arguments`` is an optional list of extra command line options
        placed before the file path.  Raises IdentifyError carrying the
        first stderr line when the program exits with a non zero code.
        """
        command = [unicode(IM_IDENTIFY_PATH)]
        if arguments:
            command.extend(arguments)
        command.append(unicode(input_filepath))

        return_code, stdout, stderr = self._execute(command)
        if return_code != 0:
            raise IdentifyError(self._first_line(stderr))
        return stdout

    def convert_file(self, input_filepath, output_filepath, transformations=None, quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT):
        """
        Convert one page of ``input_filepath`` into ``output_filepath``.

        ``transformations`` is a list of dictionaries with a
        'transformation' key and an 'arguments' dictionary; resize, zoom
        and rotate are translated to command line options, any other
        transformation is ignored.  ``page`` is one based.  Raises
        UnknownFormat when the error output reports a missing decode
        delegate and ConvertError for any other failure.
        """
        arguments = []
        for transformation in transformations or []:
            if transformation['transformation'] == TRANSFORMATION_RESIZE:
                dimensions = [unicode(transformation['arguments']['width'])]
                if 'height' in transformation['arguments']:
                    dimensions.append(unicode(transformation['arguments']['height']))
                arguments.append(u'-resize')
                arguments.append(u'%s' % DIMENSION_SEPARATOR.join(dimensions))

            elif transformation['transformation'] == TRANSFORMATION_ZOOM:
                arguments.append(u'-resize')
                arguments.append(u'%d%%' % transformation['arguments']['percent'])

            elif transformation['transformation'] == TRANSFORMATION_ROTATE:
                arguments.append(u'-rotate')
                arguments.append(u'%s' % transformation['arguments']['degrees'])

        # The requested output format is ``file_format``; the previous
        # check compared the builtin ``format`` function against a string
        # and therefore never matched.
        if file_format in (u'jpg', u'jpeg'):
            arguments.append(u'-quality')
            arguments.append(u'85')

        # Imagemagick page number is 0 base
        input_arg = u'%s[%d]' % (input_filepath, page - 1)

        # Specify the file format next to the output filename
        output_filepath = u'%s:%s' % (file_format, output_filepath)

        command = [unicode(IM_CONVERT_PATH)]
        command.extend(unicode(QUALITY_SETTINGS[quality]).split())
        command.append(unicode(input_arg))
        if arguments:
            command.extend(arguments)
        command.append(unicode(output_filepath))

        return_code, stdout, stderr = self._execute(command)
        if return_code != 0:
            # Got an error from convert program
            error_line = self._first_line(stderr)
            if CONVERTER_ERROR_STRING_NO_DECODER in error_line:
                # Try to determine from error message which class of error is it
                raise UnknownFormat
            else:
                raise ConvertError(error_line)

    def get_format_list(self):
        """
        Call ImageMagick to parse all of its supported file formats, and
        return a list of the names and descriptions
        """
        format_regex = re.compile(r' *([A-Z0-9]+)[*]? +([A-Z0-9]+) +([rw\-+]+) *(.*).*')
        formats = []
        command = [unicode(IM_CONVERT_PATH), u'-list', u'format']

        return_code, stdout, stderr = self._execute(command)
        if return_code != 0:
            raise ConvertError(self._first_line(stderr))

        for line in stdout.splitlines():
            fields = format_regex.findall(line)
            if fields:
                formats.append((fields[0][0], fields[0][3]))

        return formats

    def get_available_transformations(self):
        """
        Return the transformations convert_file() knows how to apply.
        """
        return [
            TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE,
            TRANSFORMATION_ZOOM
        ]

    def get_page_count(self, input_filepath):
        """
        Return the number of lines the identify program prints for the
        file, falling back to 1 when identification fails.
        """
        try:
            return len(self.identify_file(unicode(input_filepath)).splitlines())
        except Exception:
            # Deliberate best effort, but no longer swallows
            # KeyboardInterrupt / SystemExit like a bare except did.
            #TODO: send to other page number identifying program
            return 1
|
||||
3
apps/converter/backends/python/__init__.py
Normal file
3
apps/converter/backends/python/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from PIL import Image
|
||||
|
||||
Image.init()
|
||||
93
apps/converter/backends/python/base.py
Normal file
93
apps/converter/backends/python/base.py
Normal file
@@ -0,0 +1,93 @@
|
||||
import slate
|
||||
from PIL import Image
|
||||
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from converter.literals import QUALITY_DEFAULT, QUALITY_SETTINGS
|
||||
from converter.exceptions import ConvertError, UnknownFormat, IdentifyError
|
||||
from converter.backends import ConverterBase
|
||||
from converter.literals import TRANSFORMATION_RESIZE, \
|
||||
TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM
|
||||
from converter.literals import QUALITY_DEFAULT, DEFAULT_PAGE_NUMBER, \
|
||||
DEFAULT_FILE_FORMAT
|
||||
from converter.utils import get_mimetype
|
||||
|
||||
|
||||
class ConverterClass(ConverterBase):
    """
    Converter backend built on the Python Imaging Library, using slate to
    count the pages of PDF files.
    """

    def get_page_count(self, input_filepath):
        """
        Return the number of pages (PDF) or frames (image) in the file.

        Files that PIL cannot open are reported as having a single page.
        """
        page_count = 1

        mimetype, encoding = get_mimetype(input_filepath)
        if mimetype == 'application/pdf':
            # If file is a PDF open it with slate to determine the page
            # count.  PDFs are binary data, so open the file in binary
            # mode to avoid newline translation on platforms that do it.
            with open(input_filepath, 'rb') as fd:
                pages = slate.PDF(fd)
            return len(pages)

        try:
            im = Image.open(input_filepath)
        except IOError:  # cannot identify image file
            # Return a page count of 1, to at least allow the document
            # to be created
            return 1

        try:
            # Advance one frame at a time until PIL signals the end
            while 1:
                im.seek(im.tell() + 1)
                page_count += 1
        except EOFError:
            pass  # end of sequence

        return page_count

    def convert_file(self, input_filepath, output_filepath, transformations=None, quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT):
        """
        Convert one page of ``input_filepath`` into ``output_filepath``.

        ``page`` is one based.  ``transformations`` is a list of
        dictionaries with a 'transformation' key and an 'arguments'
        dictionary; resize, zoom and rotate are applied in order.
        ``quality`` is accepted for interface compatibility but not used
        here.  Raises UnknownFormat when PIL cannot open the input file.
        """
        try:
            im = Image.open(input_filepath)
        except Exception:  # Python Imaging Library doesn't recognize it as an image
            raise UnknownFormat

        # Step forward until the requested (one based) page is the current
        # frame.  The previous ``==`` comparison moved page 1 to the second
        # frame and never moved at all for any later page.
        current_page = 0
        try:
            while current_page < page - 1:
                im.seek(im.tell() + 1)
                current_page += 1
        except EOFError:
            pass  # end of sequence

        if transformations:
            for transformation in transformations:
                aspect = 1.0 * im.size[1] / im.size[0]
                if transformation['transformation'] == TRANSFORMATION_RESIZE:
                    width = int(transformation['arguments']['width'])
                    # Without an explicit height keep the aspect ratio
                    height = int(transformation['arguments'].get('height', 1.0 * width * aspect))
                    im = im.resize((width, height), Image.ANTIALIAS)
                elif transformation['transformation'] == TRANSFORMATION_ZOOM:
                    decimal_value = float(transformation['arguments']['percent']) / 100
                    # Image sizes must be integers; the scaled values are floats
                    zoomed_size = (int(im.size[0] * decimal_value), int(im.size[1] * decimal_value))
                    im = im.transform(zoomed_size, Image.EXTENT, (0, 0, im.size[0], im.size[1]))
                elif transformation['transformation'] == TRANSFORMATION_ROTATE:
                    # PIL counts degrees counter-clockwise, reverse them
                    im = im.rotate(360 - transformation['arguments']['degrees'])

        if im.mode not in ('L', 'RGB'):
            im = im.convert('RGB')

        # NOTE(review): PIL looks formats up by registered name (e.g.
        # 'JPEG'); confirm DEFAULT_FILE_FORMAT is a name PIL accepts.
        im.save(output_filepath, format=file_format)

    def get_format_list(self):
        """
        Introspect PIL's internal registry to obtain a list of the
        supported file types
        """
        formats = []
        for format_name in Image.ID:
            formats.append((format_name, u''))

        return formats

    def get_available_transformations(self):
        """
        Return the transformations convert_file() knows how to apply.
        """
        return [
            TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE,
            TRANSFORMATION_ZOOM
        ]
|
||||
@@ -9,12 +9,11 @@ register_settings(
|
||||
settings=[
|
||||
{'name': u'IM_CONVERT_PATH', 'global_name': u'CONVERTER_IM_CONVERT_PATH', 'default': u'/usr/bin/convert', 'description': _(u'File path to imagemagick\'s convert program.'), 'exists': True},
|
||||
{'name': u'IM_IDENTIFY_PATH', 'global_name': u'CONVERTER_IM_IDENTIFY_PATH', 'default': u'/usr/bin/identify', 'description': _(u'File path to imagemagick\'s identify program.'), 'exists': True},
|
||||
{'name': u'UNPAPER_PATH', 'global_name': u'CONVERTER_UNPAPER_PATH', 'default': u'/usr/bin/unpaper', 'description': _(u'File path to unpaper program.'), 'exists': True},
|
||||
{'name': u'GM_PATH', 'global_name': u'CONVERTER_GM_PATH', 'default': u'/usr/bin/gm', 'description': _(u'File path to graphicsmagick\'s program.'), 'exists': True},
|
||||
{'name': u'GM_SETTINGS', 'global_name': u'CONVERTER_GM_SETTINGS', 'default': u''},
|
||||
{'name': u'GRAPHICS_BACKEND', 'global_name': u'CONVERTER_GRAPHICS_BACKEND', 'default': u'converter.backends.imagemagick', 'description': _(u'Graphics conversion backend to use. Options are: converter.backends.imagemagick and converter.backends.graphicsmagick.')},
|
||||
{'name': u'GRAPHICS_BACKEND', 'global_name': u'CONVERTER_GRAPHICS_BACKEND', 'default': u'converter.backends.python', 'description': _(u'Graphics conversion backend to use. Options are: converter.backends.imagemagick, converter.backends.graphicsmagick and converter.backends.python.')},
|
||||
{'name': u'UNOCONV_PATH', 'global_name': u'CONVERTER_UNOCONV_PATH', 'default': u'/usr/bin/unoconv', 'exists': True},
|
||||
{'name': u'OCR_OPTIONS', 'global_name': u'CONVERTER_OCR_OPTIONS', 'default': u'-colorspace Gray -depth 8 -resample 200x200'},
|
||||
#{'name': u'OCR_OPTIONS', 'global_name': u'CONVERTER_OCR_OPTIONS', 'default': u'-colorspace Gray -depth 8 -resample 200x200'},
|
||||
{'name': u'DEFAULT_OPTIONS', 'global_name': u'CONVERTER_DEFAULT_OPTIONS', 'default': u''},
|
||||
{'name': u'LOW_QUALITY_OPTIONS', 'global_name': u'CONVERTER_LOW_QUALITY_OPTIONS', 'default': u''},
|
||||
{'name': u'HIGH_QUALITY_OPTIONS', 'global_name': u'CONVERTER_HIGH_QUALITY_OPTIONS', 'default': u'-density 400'},
|
||||
|
||||
64
apps/converter/literals.py
Normal file
64
apps/converter/literals.py
Normal file
@@ -0,0 +1,64 @@
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from converter.conf.settings import DEFAULT_OPTIONS
|
||||
from converter.conf.settings import LOW_QUALITY_OPTIONS
|
||||
from converter.conf.settings import HIGH_QUALITY_OPTIONS
|
||||
from converter.conf.settings import PRINT_QUALITY_OPTIONS
|
||||
|
||||
# Defaults used when a caller does not specify a value
DEFAULT_ZOOM_LEVEL = 100
DEFAULT_ROTATION = 0
DEFAULT_PAGE_NUMBER = 1
DEFAULT_FILE_FORMAT = u'jpeg'
DEFAULT_OCR_FILE_FORMAT = u'tif'

# Quality level identifiers
QUALITY_DEFAULT = u'quality_default'
QUALITY_LOW = u'quality_low'
QUALITY_HIGH = u'quality_high'
QUALITY_PRINT = u'quality_print'

# Maps each quality level to its configured options setting
QUALITY_SETTINGS = {
    QUALITY_DEFAULT: DEFAULT_OPTIONS,
    QUALITY_LOW: LOW_QUALITY_OPTIONS,
    QUALITY_HIGH: HIGH_QUALITY_OPTIONS,
    QUALITY_PRINT: PRINT_QUALITY_OPTIONS
}

DIMENSION_SEPARATOR = u'x'

# Transformation identifiers
TRANSFORMATION_RESIZE = u'resize'
TRANSFORMATION_ROTATE = u'rotate'
TRANSFORMATION_DENSITY = u'density'
TRANSFORMATION_ZOOM = u'zoom'

# Label, description and argument list of every known transformation
TRANSFORMATION_CHOICES = {
    TRANSFORMATION_RESIZE: {
        'label': _(u'Resize'),
        'description': _(u'Resize.'),
        'arguments': [
            {'name': 'width', 'label': _(u'width'), 'required': True},
            {'name': 'height', 'label': _(u'height'), 'required': False},
        ]
    },
    TRANSFORMATION_ROTATE: {
        'label': _(u'Rotate'),
        'description': _(u'Rotate by n degrees.'),
        'arguments': [
            {'name': 'degrees', 'label': _(u'degrees'), 'required': True}
        ]
    },
    TRANSFORMATION_DENSITY: {
        'label': _(u'Density'),
        'description': _(u'Change the resolution (ie: DPI) without resizing.'),
        'arguments': [
            {'name': 'width', 'label': _(u'width'), 'required': True},
            {'name': 'height', 'label': _(u'height'), 'required': False},
        ]
    },
    TRANSFORMATION_ZOOM: {
        'label': _(u'Zoom'),
        'description': _(u'Zoom by n percent.'),
        'arguments': [
            {'name': 'percent', 'label': _(u'percent'), 'required': True}
        ]
    },
}
|
||||
@@ -1,6 +1,18 @@
|
||||
import os
|
||||
|
||||
from django.core.exceptions import ImproperlyConfigured
|
||||
from django.utils.importlib import import_module
|
||||
|
||||
# Prefer python-magic for mimetype detection and fall back to the
# standard library's mimetypes module when it cannot be loaded.
try:
    from python_magic import magic
    USE_PYTHON_MAGIC = True
except Exception:
    # Deliberately broader than ImportError: the wrapper may fail in other
    # ways when the underlying library is missing.  KeyboardInterrupt and
    # SystemExit are no longer swallowed.
    import mimetypes
    mimetypes.init()
    USE_PYTHON_MAGIC = False
|
||||
|
||||
|
||||
#http://stackoverflow.com/questions/123198/how-do-i-copy-a-file-in-python
|
||||
|
||||
|
||||
def copyfile(source, dest, buffer_size=1024 * 1024):
|
||||
"""
|
||||
Copy a file from source to dest. source and dest
|
||||
@@ -21,3 +33,79 @@ def copyfile(source, dest, buffer_size=1024 * 1024):
|
||||
|
||||
source.close()
|
||||
dest.close()
|
||||
|
||||
|
||||
def _lazy_load(fn):
|
||||
_cached = []
|
||||
|
||||
def _decorated():
|
||||
if not _cached:
|
||||
_cached.append(fn())
|
||||
return _cached[0]
|
||||
return _decorated
|
||||
|
||||
|
||||
@_lazy_load
def load_backend():
    """
    Import and return the 'base' module of the configured graphics
    backend (the GRAPHICS_BACKEND setting).

    The setting is first tried as a short name relative to
    'converter.backends' (deprecated, emits a PendingDeprecationWarning)
    and then as a fully qualified package name.

    Raises ImproperlyConfigured, listing the built-in backends, when the
    backend cannot be imported and its name is not one of the built-in
    backend directories; otherwise the import error is re-raised.
    """
    from converter.conf.settings import GRAPHICS_BACKEND as backend_name

    try:
        module = import_module('.base', 'converter.backends.%s' % backend_name)
        import warnings
        warnings.warn(
            "Short names for CONVERTER_GRAPHICS_BACKEND are deprecated; prepend with 'converter.backends.'",
            PendingDeprecationWarning
        )
        return module
    except ImportError:
        # Look for a fully qualified converter backend name
        try:
            return import_module('.base', backend_name)
        except ImportError as e_user:
            # The converter backend wasn't found. Display a helpful error message
            # listing all possible (built-in) converter backends.
            backend_dir = os.path.join(os.path.dirname(__file__), 'backends')
            try:
                available_backends = [f for f in os.listdir(backend_dir)
                    if os.path.isdir(os.path.join(backend_dir, f))
                    and not f.startswith('.')]
            except EnvironmentError:
                available_backends = []
            available_backends.sort()
            if backend_name not in available_backends:
                error_msg = ("%r isn't an available converter backend. \n" +
                    "Try using converter.backends.XXX, where XXX is one of:\n %s\n" +
                    "Error was: %s") % \
                    (backend_name, ", ".join(map(repr, available_backends)), e_user)
                raise ImproperlyConfigured(error_msg)
            else:
                raise  # If there's some other error, this must be an error in Mayan itself.
|
||||
|
||||
|
||||
def get_mimetype(filepath):
    """
    Determine a file's mimetype by calling the system's libmagic
    library via python-magic or fallback to use python's mimetypes
    library

    Returns a (mimetype, encoding) tuple.  With python-magic both values
    are empty strings when filepath does not exist; the mimetypes
    fallback guesses from the file name only and returns whatever
    mimetypes.guess_type() produces.
    """
    file_mimetype = u''
    file_mime_encoding = u''

    if USE_PYTHON_MAGIC:
        if os.path.exists(filepath):
            # Read the content once, in binary mode, and make sure the
            # file is closed even if reading fails.
            with open(filepath, 'rb') as source:
                content = source.read()

            mime = magic.Magic(mime=True)
            file_mimetype = mime.from_buffer(content)
            mime_encoding = magic.Magic(mime_encoding=True)
            file_mime_encoding = mime_encoding.from_buffer(content)
    else:
        path, filename = os.path.split(filepath)
        file_mimetype, file_mime_encoding = mimetypes.guess_type(filename)

    return file_mimetype, file_mime_encoding
|
||||
|
||||
|
||||
@@ -1,38 +1,18 @@
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
from django.shortcuts import render_to_response
|
||||
from django.template import RequestContext
|
||||
from django.utils.importlib import import_module
|
||||
|
||||
from converter import backend
|
||||
|
||||
from converter.conf.settings import GRAPHICS_BACKEND
|
||||
|
||||
|
||||
def _lazy_load(fn):
|
||||
_cached = []
|
||||
|
||||
def _decorated():
|
||||
if not _cached:
|
||||
_cached.append(fn())
|
||||
return _cached[0]
|
||||
return _decorated
|
||||
|
||||
|
||||
@_lazy_load
|
||||
def _get_backend():
|
||||
return import_module(GRAPHICS_BACKEND)
|
||||
|
||||
try:
|
||||
backend = _get_backend()
|
||||
except ImportError:
|
||||
raise ImportError(u'Missing or incorrect converter backend: %s' % GRAPHICS_BACKEND)
|
||||
|
||||
|
||||
def formats_list(request):
|
||||
#check_permissions(request.user, [PERMISSION_DOCUMENT_VIEW])
|
||||
|
||||
context = {
|
||||
'title': _(u'suported file formats'),
|
||||
'hide_object': True,
|
||||
'object_list': backend.get_format_list(),
|
||||
'object_list': sorted(backend.get_format_list()),
|
||||
'extra_columns': [
|
||||
{
|
||||
'name': _(u'name'),
|
||||
|
||||
@@ -18,10 +18,6 @@ def default_uuid():
|
||||
"""unicode(uuid.uuid4())"""
|
||||
return unicode(uuid.uuid4())
|
||||
|
||||
available_transformations = {
|
||||
'rotate': {'label': _(u'Rotate [degrees]'), 'arguments': [{'name': 'degrees'}]}
|
||||
}
|
||||
|
||||
register_settings(
|
||||
namespace=u'documents',
|
||||
module=u'documents.conf.settings',
|
||||
@@ -31,8 +27,6 @@ register_settings(
|
||||
{'name': u'UUID_FUNCTION', 'global_name': u'DOCUMENTS_UUID_FUNCTION', 'default': default_uuid},
|
||||
# Storage
|
||||
{'name': u'STORAGE_BACKEND', 'global_name': u'DOCUMENTS_STORAGE_BACKEND', 'default': FileBasedStorage},
|
||||
# Transformations
|
||||
{'name': u'AVAILABLE_TRANSFORMATIONS', 'global_name': u'DOCUMENTS_AVAILABLE_TRANSFORMATIONS', 'default': available_transformations},
|
||||
# Usage
|
||||
{'name': u'PREVIEW_SIZE', 'global_name': u'DOCUMENTS_PREVIEW_SIZE', 'default': u'640x480'},
|
||||
{'name': u'PRINT_SIZE', 'global_name': u'DOCUMENTS_PRINT_SIZE', 'default': u'1400'},
|
||||
|
||||
@@ -13,3 +13,24 @@ class RecentDocumentManager(models.Manager):
|
||||
to_delete = self.model.objects.filter(user=user)[RECENT_COUNT:]
|
||||
for recent_to_delete in to_delete:
|
||||
recent_to_delete.delete()
|
||||
|
||||
|
||||
class DocumentPageTransformationManager(models.Manager):
    """
    Manager with helpers to fetch the transformations of a document
    page.
    """

    def get_for_document_page(self, document_page):
        """
        Return the queryset of transformations of document_page.
        """
        return self.model.objects.filter(document_page=document_page)

    def get_for_document_page_as_list(self, document_page):
        """
        Return a (transformations, warnings) tuple for document_page.

        transformations is a list of dictionaries with the keys
        'transformation' and 'arguments' (the evaluated arguments
        text); warnings holds the exception of every entry that could
        not be evaluated, those entries are left out of the list.
        """
        parsed = []
        errors = []
        rows = self.get_for_document_page(document_page).values('transformation', 'arguments')
        for row in rows:
            try:
                # NOTE(security): eval() executes whatever expression is
                # stored in the arguments field; the empty globals
                # dictionary does not remove the builtins.  Consider
                # ast.literal_eval if only literals are expected.
                entry = {
                    'transformation': row['transformation'],
                    'arguments': eval(row['arguments'], {})
                }
            except Exception as exc:
                errors.append(exc)
            else:
                parsed.append(entry)

        return parsed, errors
|
||||
|
||||
@@ -12,15 +12,13 @@ from python_magic import magic
|
||||
from taggit.managers import TaggableManager
|
||||
from dynamic_search.api import register
|
||||
from converter.api import get_page_count
|
||||
from converter import TRANFORMATION_CHOICES
|
||||
from converter.api import get_available_transformations_choices
|
||||
|
||||
from documents.conf.settings import CHECKSUM_FUNCTION
|
||||
from documents.conf.settings import UUID_FUNCTION
|
||||
from documents.conf.settings import STORAGE_BACKEND
|
||||
from documents.conf.settings import AVAILABLE_TRANSFORMATIONS
|
||||
from documents.managers import RecentDocumentManager
|
||||
|
||||
available_transformations = ([(name, data['label']) for name, data in AVAILABLE_TRANSFORMATIONS.items()])
|
||||
from documents.managers import RecentDocumentManager, \
|
||||
DocumentPageTransformationManager
|
||||
|
||||
|
||||
def get_filename_from_uuid(instance, filename):
|
||||
@@ -92,7 +90,7 @@ class Document(models.Model):
|
||||
mimetype, page count and transformation when originally created
|
||||
"""
|
||||
new_document = not self.pk
|
||||
|
||||
transformations = kwargs.pop('transformations', None)
|
||||
super(Document, self).save(*args, **kwargs)
|
||||
|
||||
if new_document:
|
||||
@@ -101,7 +99,8 @@ class Document(models.Model):
|
||||
self.update_mimetype(save=False)
|
||||
self.save()
|
||||
self.update_page_count(save=False)
|
||||
self.apply_default_transformations()
|
||||
if transformations:
|
||||
self.apply_default_transformations(transformations)
|
||||
|
||||
@models.permalink
|
||||
def get_absolute_url(self):
|
||||
@@ -202,21 +201,21 @@ class Document(models.Model):
|
||||
exists in storage
|
||||
"""
|
||||
return self.file.storage.exists(self.file.path)
|
||||
|
||||
|
||||
def apply_default_transformations(self):
|
||||
def apply_default_transformations(self, transformations):
|
||||
#Only apply default transformations on new documents
|
||||
if DEFAULT_TRANSFORMATIONS and reduce(lambda x, y: x + y, [page.documentpagetransformation_set.count() for page in self.documentpage_set.all()]) == 0:
|
||||
for transformation in DEFAULT_TRANSFORMATIONS:
|
||||
if 'name' in transformation:
|
||||
for document_page in self.documentpage_set.all():
|
||||
page_transformation = DocumentPageTransformation(
|
||||
document_page=document_page,
|
||||
order=0,
|
||||
transformation=transformation['name'])
|
||||
if 'arguments' in transformation:
|
||||
page_transformation.arguments = transformation['arguments']
|
||||
if reduce(lambda x, y: x + y, [page.documentpagetransformation_set.count() for page in self.documentpage_set.all()]) == 0:
|
||||
for transformation in transformations:
|
||||
for document_page in self.documentpage_set.all():
|
||||
page_transformation = DocumentPageTransformation(
|
||||
document_page=document_page,
|
||||
order=0,
|
||||
transformation=transformation.get('transformation'),
|
||||
arguments=transformation.get('arguments')
|
||||
)
|
||||
|
||||
page_transformation.save()
|
||||
page_transformation.save()
|
||||
|
||||
|
||||
class DocumentTypeFilename(models.Model):
|
||||
@@ -258,26 +257,13 @@ class DocumentPage(models.Model):
|
||||
verbose_name = _(u'document page')
|
||||
verbose_name_plural = _(u'document pages')
|
||||
|
||||
def get_transformation_list(self):
|
||||
return DocumentPageTransformation.objects.get_for_document_page_as_list(self)
|
||||
|
||||
@models.permalink
|
||||
def get_absolute_url(self):
|
||||
return ('document_page_view', [self.pk])
|
||||
|
||||
def get_transformation_string(self):
|
||||
transformation_list = []
|
||||
warnings = []
|
||||
for page_transformation in self.documentpagetransformation_set.all():
|
||||
try:
|
||||
if page_transformation.transformation in TRANFORMATION_CHOICES:
|
||||
transformation_list.append(
|
||||
TRANFORMATION_CHOICES[page_transformation.transformation] % eval(
|
||||
page_transformation.arguments
|
||||
)
|
||||
)
|
||||
except Exception, e:
|
||||
warnings.append(e)
|
||||
|
||||
return u' '.join(transformation_list), warnings
|
||||
|
||||
|
||||
class DocumentPageTransformation(models.Model):
|
||||
"""
|
||||
@@ -286,9 +272,11 @@ class DocumentPageTransformation(models.Model):
|
||||
"""
|
||||
document_page = models.ForeignKey(DocumentPage, verbose_name=_(u'document page'))
|
||||
order = models.PositiveIntegerField(default=0, blank=True, null=True, verbose_name=_(u'order'), db_index=True)
|
||||
transformation = models.CharField(choices=available_transformations, max_length=128, verbose_name=_(u'transformation'))
|
||||
transformation = models.CharField(choices=get_available_transformations_choices(), max_length=128, verbose_name=_(u'transformation'))
|
||||
arguments = models.TextField(blank=True, null=True, verbose_name=_(u'arguments'), help_text=_(u'Use dictionaries to indentify arguments, example: {\'degrees\':90}'))
|
||||
|
||||
objects = DocumentPageTransformationManager()
|
||||
|
||||
def __unicode__(self):
|
||||
return u'"%s" for %s' % (self.get_transformation_display(), unicode(self.document_page))
|
||||
|
||||
|
||||
@@ -1,14 +1,12 @@
|
||||
from django.conf.urls.defaults import patterns, url
|
||||
|
||||
from converter.api import QUALITY_HIGH, QUALITY_PRINT
|
||||
from converter.literals import QUALITY_HIGH, QUALITY_PRINT
|
||||
|
||||
from documents.conf.settings import PREVIEW_SIZE
|
||||
from documents.conf.settings import PRINT_SIZE
|
||||
from documents.conf.settings import THUMBNAIL_SIZE
|
||||
from documents.conf.settings import DISPLAY_SIZE
|
||||
from documents.conf.settings import MULTIPAGE_PREVIEW_SIZE
|
||||
#from documents.literals import UPLOAD_SOURCE_LOCAL, \
|
||||
# UPLOAD_SOURCE_STAGING, UPLOAD_SOURCE_USER_STAGING
|
||||
|
||||
urlpatterns = patterns('documents.views',
|
||||
url(r'^list/$', 'document_list', (), 'document_list'),
|
||||
|
||||
@@ -20,10 +20,11 @@ from common.widgets import two_state_template
|
||||
from common.literals import PAGE_SIZE_DIMENSIONS, \
|
||||
PAGE_ORIENTATION_PORTRAIT, PAGE_ORIENTATION_LANDSCAPE
|
||||
from common.conf.settings import DEFAULT_PAPER_SIZE
|
||||
from converter.api import convert_document, QUALITY_DEFAULT
|
||||
from converter.api import convert_document
|
||||
from converter.exceptions import UnkownConvertError, UnknownFormat
|
||||
from converter.api import DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION, \
|
||||
DEFAULT_FILE_FORMAT, QUALITY_PRINT
|
||||
from converter.literals import DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION, \
|
||||
DEFAULT_FILE_FORMAT, QUALITY_PRINT, QUALITY_DEFAULT, \
|
||||
DEFAULT_PAGE_NUMBER
|
||||
from filetransfers.api import serve_file
|
||||
from grouping.utils import get_document_group_subtemplate
|
||||
from metadata.api import save_metadata_list, \
|
||||
@@ -286,38 +287,14 @@ def document_edit(request, document_id):
|
||||
}, context_instance=RequestContext(request))
|
||||
|
||||
|
||||
def calculate_converter_arguments(document, *args, **kwargs):
|
||||
size = kwargs.pop('size', PREVIEW_SIZE)
|
||||
quality = kwargs.pop('quality', QUALITY_DEFAULT)
|
||||
page = kwargs.pop('page', 1)
|
||||
file_format = kwargs.pop('file_format', DEFAULT_FILE_FORMAT)
|
||||
zoom = kwargs.pop('zoom', DEFAULT_ZOOM_LEVEL)
|
||||
rotation = kwargs.pop('rotation', DEFAULT_ROTATION)
|
||||
|
||||
document_page = DocumentPage.objects.get(document=document, page_number=page)
|
||||
transformation_string, warnings = document_page.get_transformation_string()
|
||||
|
||||
arguments = {
|
||||
'size': size,
|
||||
'file_format': file_format,
|
||||
'quality': quality,
|
||||
'extra_options': transformation_string,
|
||||
'page': page - 1,
|
||||
'zoom': zoom,
|
||||
'rotation': rotation
|
||||
}
|
||||
|
||||
return arguments, warnings
|
||||
|
||||
|
||||
def get_document_image(request, document_id, size=PREVIEW_SIZE, quality=QUALITY_DEFAULT):
|
||||
check_permissions(request.user, [PERMISSION_DOCUMENT_VIEW])
|
||||
|
||||
document = get_object_or_404(Document, pk=document_id)
|
||||
|
||||
page = int(request.GET.get('page', 1))
|
||||
page = int(request.GET.get('page', DEFAULT_PAGE_NUMBER))
|
||||
|
||||
zoom = int(request.GET.get('zoom', 100))
|
||||
zoom = int(request.GET.get('zoom', DEFAULT_ZOOM_LEVEL))
|
||||
|
||||
if zoom < ZOOM_MIN_LEVEL:
|
||||
zoom = ZOOM_MIN_LEVEL
|
||||
@@ -325,16 +302,17 @@ def get_document_image(request, document_id, size=PREVIEW_SIZE, quality=QUALITY_
|
||||
if zoom > ZOOM_MAX_LEVEL:
|
||||
zoom = ZOOM_MAX_LEVEL
|
||||
|
||||
rotation = int(request.GET.get('rotation', 0)) % 360
|
||||
rotation = int(request.GET.get('rotation', DEFAULT_ROTATION)) % 360
|
||||
|
||||
arguments, warnings = calculate_converter_arguments(document, size=size, file_format=DEFAULT_FILE_FORMAT, quality=quality, page=page, zoom=zoom, rotation=rotation)
|
||||
document_page = get_object_or_404(document.documentpage_set, page_number=page)
|
||||
transformations, warnings = document_page.get_transformation_list()
|
||||
|
||||
if warnings and (request.user.is_staff or request.user.is_superuser):
|
||||
for warning in warnings:
|
||||
messages.warning(request, _(u'Page transformation error: %s') % warning)
|
||||
|
||||
|
||||
try:
|
||||
output_file = convert_document(document, **arguments)
|
||||
output_file = convert_document(document, size=size, file_format=DEFAULT_FILE_FORMAT, quality=quality, page=page, zoom=zoom, rotation=rotation, transformations=transformations)
|
||||
except UnkownConvertError, e:
|
||||
if request.user.is_staff or request.user.is_superuser:
|
||||
messages.error(request, e)
|
||||
@@ -592,13 +570,13 @@ def document_page_view(request, document_page_id):
|
||||
|
||||
document_page = get_object_or_404(DocumentPage, pk=document_page_id)
|
||||
|
||||
zoom = int(request.GET.get('zoom', 100))
|
||||
rotation = int(request.GET.get('rotation', 0))
|
||||
zoom = int(request.GET.get('zoom', DEFAULT_ZOOM_LEVEL))
|
||||
rotation = int(request.GET.get('rotation', DEFAULT_ROTATION))
|
||||
document_page_form = DocumentPageForm(instance=document_page, zoom=zoom, rotation=rotation)
|
||||
|
||||
base_title = _(u'details for: %s') % document_page
|
||||
|
||||
if zoom != 100:
|
||||
if zoom != DEFAULT_ZOOM_LEVEL:
|
||||
zoom_text = u'(%d%%)' % zoom
|
||||
else:
|
||||
zoom_text = u''
|
||||
|
||||
@@ -9,7 +9,7 @@ from documents.models import Document
|
||||
from main.api import register_tool
|
||||
|
||||
from ocr.conf.settings import AUTOMATIC_OCR
|
||||
from ocr.models import DocumentQueue
|
||||
from ocr.models import DocumentQueue, QueueTransformation
|
||||
|
||||
#Permissions
|
||||
PERMISSION_OCR_DOCUMENT = {'namespace': 'ocr', 'name': 'ocr_document', 'label': _(u'Submit document for OCR')}
|
||||
@@ -30,20 +30,27 @@ re_queue_multiple_document = {'text': _('re-queue'), 'view': 're_queue_multiple_
|
||||
queue_document_delete = {'text': _(u'delete'), 'view': 'queue_document_delete', 'args': 'object.id', 'famfam': 'hourglass_delete', 'permissions': [PERMISSION_OCR_DOCUMENT_DELETE]}
|
||||
queue_document_multiple_delete = {'text': _(u'delete'), 'view': 'queue_document_multiple_delete', 'famfam': 'hourglass_delete', 'permissions': [PERMISSION_OCR_DOCUMENT_DELETE]}
|
||||
|
||||
document_queue_disable = {'text': _(u'stop queue'), 'view': 'document_queue_disable', 'args': 'object.id', 'famfam': 'control_stop_blue', 'permissions': [PERMISSION_OCR_QUEUE_ENABLE_DISABLE]}
|
||||
document_queue_enable = {'text': _(u'activate queue'), 'view': 'document_queue_enable', 'args': 'object.id', 'famfam': 'control_play_blue', 'permissions': [PERMISSION_OCR_QUEUE_ENABLE_DISABLE]}
|
||||
document_queue_disable = {'text': _(u'stop queue'), 'view': 'document_queue_disable', 'args': 'queue.id', 'famfam': 'control_stop_blue', 'permissions': [PERMISSION_OCR_QUEUE_ENABLE_DISABLE]}
|
||||
document_queue_enable = {'text': _(u'activate queue'), 'view': 'document_queue_enable', 'args': 'queue.id', 'famfam': 'control_play_blue', 'permissions': [PERMISSION_OCR_QUEUE_ENABLE_DISABLE]}
|
||||
|
||||
all_document_ocr_cleanup = {'text': _(u'clean up pages content'), 'view': 'all_document_ocr_cleanup', 'famfam': 'text_strikethrough', 'permissions': [PERMISSION_OCR_CLEAN_ALL_PAGES], 'description': _(u'Runs a language filter to remove common OCR mistakes from document pages content.')}
|
||||
|
||||
queue_document_list = {'text': _(u'queue document list'), 'view': 'queue_document_list', 'famfam': 'hourglass', 'permissions': [PERMISSION_OCR_DOCUMENT]}
|
||||
node_active_list = {'text': _(u'active tasks'), 'view': 'node_active_list', 'famfam': 'server_chart', 'permissions': [PERMISSION_OCR_DOCUMENT]}
|
||||
|
||||
setup_queue_transformation_list = {'text': _(u'transformations'), 'view': 'setup_queue_transformation_list', 'args': 'queue.pk', 'famfam': 'shape_move_front'}
|
||||
setup_queue_transformation_create = {'text': _(u'add transformation'), 'view': 'setup_queue_transformation_create', 'args': 'queue.pk', 'famfam': 'shape_square_add'}
|
||||
setup_queue_transformation_edit = {'text': _(u'edit'), 'view': 'setup_queue_transformation_edit', 'args': 'transformation.pk', 'famfam': 'shape_square_edit'}
|
||||
setup_queue_transformation_delete = {'text': _(u'delete'), 'view': 'setup_queue_transformation_delete', 'args': 'transformation.pk', 'famfam': 'shape_square_delete'}
|
||||
|
||||
register_links(Document, [submit_document])
|
||||
register_links(DocumentQueue, [document_queue_disable, document_queue_enable])
|
||||
register_links(DocumentQueue, [document_queue_disable, document_queue_enable, setup_queue_transformation_list])
|
||||
register_links(QueueTransformation, [setup_queue_transformation_edit, setup_queue_transformation_delete])
|
||||
|
||||
register_multi_item_links(['queue_document_list'], [re_queue_multiple_document, queue_document_multiple_delete])
|
||||
|
||||
register_links(['queue_document_list', 'node_active_list'], [queue_document_list, node_active_list], menu_name='secondary_menu')
|
||||
register_links(['setup_queue_transformation_create', 'setup_queue_transformation_edit', 'setup_queue_transformation_delete', 'document_queue_disable', 'document_queue_enable', 'queue_document_list', 'node_active_list', 'setup_queue_transformation_list'], [queue_document_list, node_active_list], menu_name='secondary_menu')
|
||||
register_links(['setup_queue_transformation_edit', 'setup_queue_transformation_delete', 'setup_queue_transformation_list', 'setup_queue_transformation_create'], [setup_queue_transformation_create], menu_name='sidebar')
|
||||
|
||||
|
||||
register_tool(all_document_ocr_cleanup, namespace='ocr', title=_(u'OCR'))
|
||||
|
||||
145
apps/ocr/api.py
145
apps/ocr/api.py
@@ -9,13 +9,15 @@ import sys
|
||||
from django.utils.translation import ugettext as _
|
||||
from django.utils.importlib import import_module
|
||||
|
||||
from converter.api import convert_document_for_ocr
|
||||
from converter.api import convert
|
||||
from documents.models import DocumentPage
|
||||
|
||||
from ocr.conf.settings import TESSERACT_PATH
|
||||
from ocr.conf.settings import TESSERACT_LANGUAGE
|
||||
from ocr.conf.settings import PDFTOTEXT_PATH
|
||||
from ocr.exceptions import TesseractError, PdftotextError
|
||||
from ocr.exceptions import TesseractError
|
||||
from ocr.conf.settings import UNPAPER_PATH
|
||||
from ocr.parsers import parse_document_page
|
||||
from ocr.parsers.exceptions import ParserError, ParserUnknownFile
|
||||
|
||||
|
||||
def get_language_backend():
|
||||
@@ -30,7 +32,7 @@ def get_language_backend():
|
||||
return None
|
||||
return module
|
||||
|
||||
backend = get_language_backend()
|
||||
language_backend = get_language_backend()
|
||||
|
||||
|
||||
def cleanup(filename):
|
||||
@@ -58,63 +60,38 @@ def run_tesseract(input_filename, output_filename_base, lang=None):
|
||||
raise TesseractError(error_text)
|
||||
|
||||
|
||||
def run_pdftotext(input_filename, output_filename, page_number=None):
|
||||
"""
|
||||
Execute the command line binary of pdftotext
|
||||
"""
|
||||
command = [unicode(PDFTOTEXT_PATH)]
|
||||
if page_number:
|
||||
command.extend([u'-nopgbrk', u'-f', unicode(page_number), u'-l', unicode(page_number)])
|
||||
command.extend([unicode(input_filename), unicode(output_filename)])
|
||||
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
|
||||
return_code = proc.wait()
|
||||
if return_code != 0:
|
||||
error_text = proc.stderr.read()
|
||||
raise PdftotextError(error_text)
|
||||
|
||||
|
||||
def do_document_ocr(document):
|
||||
"""
|
||||
Do OCR on all the pages of the given document object, first
|
||||
trying to extract text from PDF using pdftotext then by calling
|
||||
tesseract
|
||||
first try to extract text from document pages using the registered
|
||||
parser if the parser fails or if there is no parser registered for
|
||||
the document mimetype do a visual OCR by calling tesseract
|
||||
"""
|
||||
for page_index, document_page in enumerate(document.documentpage_set.all()):
|
||||
desc, filepath = tempfile.mkstemp()
|
||||
imagefile = None
|
||||
source = u''
|
||||
for document_page in document.documentpage_set.all():
|
||||
try:
|
||||
if document.file_mimetype == u'application/pdf':
|
||||
pdf_filename = os.extsep.join([filepath, u'pdf'])
|
||||
document.save_to_file(pdf_filename)
|
||||
run_pdftotext(pdf_filename, filepath, document_page.page_number)
|
||||
cleanup(pdf_filename)
|
||||
if os.stat(filepath).st_size == 0:
|
||||
#PDF page had no text, run tesseract on the page
|
||||
imagefile = convert_document_for_ocr(document, page=page_index)
|
||||
run_tesseract(imagefile, filepath, TESSERACT_LANGUAGE)
|
||||
ocr_output = os.extsep.join([filepath, u'txt'])
|
||||
source = _(u'Text from OCR')
|
||||
else:
|
||||
ocr_output = filepath
|
||||
source = _(u'Text extracted from PDF')
|
||||
else:
|
||||
imagefile = convert_document_for_ocr(document, page=page_index)
|
||||
run_tesseract(imagefile, filepath, TESSERACT_LANGUAGE)
|
||||
ocr_output = os.extsep.join([filepath, u'txt'])
|
||||
source = _(u'Text from OCR')
|
||||
f = codecs.open(ocr_output, 'r', 'utf-8')
|
||||
document_page = document.documentpage_set.get(page_number=page_index + 1)
|
||||
document_page.content = ocr_cleanup(f.read().strip())
|
||||
document_page.page_label = source
|
||||
document_page.save()
|
||||
f.close()
|
||||
cleanup(ocr_output)
|
||||
finally:
|
||||
os.close(desc)
|
||||
cleanup(filepath)
|
||||
if imagefile:
|
||||
cleanup(imagefile)
|
||||
# Try to extract text by means of a parser
|
||||
parse_document_page(document_page)
|
||||
except (ParserError, ParserUnknownFile):
|
||||
# Fall back to doing visual OCR
|
||||
pass
|
||||
#desc, filepath = tempfile.mkstemp()
|
||||
#imagefile = None
|
||||
#source = u''
|
||||
#imagefile = convert_document_for_ocr(document, page=document_page.page_number)
|
||||
#run_tesseract(imagefile, filepath, TESSERACT_LANGUAGE)
|
||||
#ocr_output = os.extsep.join([filepath, u'txt'])
|
||||
#source = _(u'Text from OCR')
|
||||
#f = codecs.open(ocr_output, 'r', 'utf-8')
|
||||
#document_page.content = ocr_cleanup(f.read().strip())
|
||||
#document_page.page_label = source
|
||||
#document_page.save()
|
||||
#f.close()
|
||||
#cleanup(ocr_output)
|
||||
#finally:
|
||||
# pass
|
||||
#os.close(desc)
|
||||
#cleanup(filepath)
|
||||
#if imagefile:
|
||||
# cleanup(imagefile)
|
||||
|
||||
|
||||
def ocr_cleanup(text):
|
||||
@@ -127,8 +104,8 @@ def ocr_cleanup(text):
|
||||
for line in text.splitlines():
|
||||
line = line.strip()
|
||||
for word in line.split():
|
||||
if backend:
|
||||
result = backend.check_word(word)
|
||||
if language_backend:
|
||||
result = language_backend.check_word(word)
|
||||
else:
|
||||
result = word
|
||||
if result:
|
||||
@@ -147,3 +124,53 @@ def clean_pages():
|
||||
if page.content:
|
||||
page.content = ocr_cleanup(page.content)
|
||||
page.save()
|
||||
|
||||
|
||||
def execute_unpaper(input_filepath, output_filepath):
    """
    Executes the program unpaper using subprocess's Popen

    The executable path is read from UNPAPER_PATH and it is always called
    with the --overwrite flag followed by the input and output file paths.

    Raises UnpaperError, carrying the first line unpaper wrote to stderr,
    when the process finishes with a non-zero return code.
    """
    command = [UNPAPER_PATH, u'--overwrite', input_filepath, output_filepath]
    proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE)
    # communicate() drains stderr while waiting for the process to end;
    # wait() alone can deadlock once the child fills the stderr pipe.
    stderr_output = proc.communicate()[1]
    if proc.returncode != 0:
        # Keep reporting only the first line of the error output
        error_lines = stderr_output.splitlines(True)
        raise UnpaperError(error_lines[0] if error_lines else stderr_output)
|
||||
|
||||
'''
|
||||
def convert_document_for_ocr(document, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_OCR_FILE_FORMAT):
|
||||
#Extract document file
|
||||
input_filepath = document_save_to_temp_dir(document, document.uuid)
|
||||
|
||||
#Convert for OCR
|
||||
temp_filename, separator = os.path.splitext(os.path.basename(input_filepath))
|
||||
temp_path = os.path.join(TEMPORARY_DIRECTORY, temp_filename)
|
||||
transformation_output_file = u'%s_trans%s%s%s' % (temp_path, page, os.extsep, file_format)
|
||||
unpaper_input_file = u'%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep)
|
||||
unpaper_output_file = u'%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep)
|
||||
convert_output_file = u'%s_ocr%s%s%s' % (temp_path, page, os.extsep, file_format)
|
||||
|
||||
try:
|
||||
document_page = document.documentpage_set.get(page_number=page)
|
||||
transformations, warnings = document_page.get_transformation_list()
|
||||
|
||||
#Apply default transformations
|
||||
backend.convert_file(input_filepath=input_filepath, page=page, quality=QUALITY_HIGH, transformations=transformations, output_filepath=transformation_output_file)
|
||||
#Do OCR operations
|
||||
backend.convert_file(input_filepath=transformation_output_file, arguments=OCR_OPTIONS, output_filepath=unpaper_input_file)
|
||||
# Process by unpaper
|
||||
execute_unpaper(input_filepath=unpaper_input_file, output_filepath=unpaper_output_file)
|
||||
# Convert to tif
|
||||
backend.convert_file(input_filepath=unpaper_output_file, output_filepath=convert_output_file)
|
||||
finally:
|
||||
cleanup(transformation_output_file)
|
||||
cleanup(unpaper_input_file)
|
||||
cleanup(unpaper_output_file)
|
||||
|
||||
return convert_output_file
|
||||
'''
|
||||
|
||||
|
||||
|
||||
@@ -13,8 +13,9 @@ register_settings(
|
||||
{'name': u'REPLICATION_DELAY', 'global_name': u'OCR_REPLICATION_DELAY', 'default': 10, 'description': _(u'Amount of seconds to delay OCR of documents to allow for the node\'s storage replication overhead.')},
|
||||
{'name': u'NODE_CONCURRENT_EXECUTION', 'global_name': u'OCR_NODE_CONCURRENT_EXECUTION', 'default': 1, 'description': _(u'Maximum amount of concurrent document OCRs a node can perform.')},
|
||||
{'name': u'AUTOMATIC_OCR', 'global_name': u'OCR_AUTOMATIC_OCR', 'default': False, 'description': _(u'Automatically queue newly created documents for OCR.')},
|
||||
{'name': u'PDFTOTEXT_PATH', 'global_name': u'OCR_PDFTOTEXT_PATH', 'default': u'/usr/bin/pdftotext', 'exists': True},
|
||||
{'name': u'QUEUE_PROCESSING_INTERVAL', 'global_name': u'OCR_QUEUE_PROCESSING_INTERVAL', 'default': 10},
|
||||
{'name': u'CACHE_URI', 'global_name': u'OCR_CACHE_URI', 'default': None, 'description': _(u'URI in the form: "memcached://127.0.0.1:11211/" to specify a cache backend to use for locking. Multiple hosts can be specified separated by a semicolon.')}
|
||||
{'name': u'CACHE_URI', 'global_name': u'OCR_CACHE_URI', 'default': None, 'description': _(u'URI in the form: "memcached://127.0.0.1:11211/" to specify a cache backend to use for locking. Multiple hosts can be specified separated by a semicolon.')},
|
||||
{'name': u'UNPAPER_PATH', 'global_name': u'OCR_UNPAPER_PATH', 'default': u'/usr/bin/unpaper', 'description': _(u'File path to unpaper program.'), 'exists': True},
|
||||
{'name': u'PARSERS_PDFTOTEXT_PATH', 'global_name': u'OCR_PARSERS_PDFTOTEXT_PATH', 'default': u'/usr/bin/pdftotext', 'exists': True},
|
||||
]
|
||||
)
|
||||
|
||||
@@ -4,7 +4,3 @@ class AlreadyQueued(Exception):
|
||||
|
||||
class TesseractError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class PdftotextError(Exception):
|
||||
pass
|
||||
|
||||
21
apps/ocr/forms.py
Normal file
21
apps/ocr/forms.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from django import forms
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
from django.utils.translation import ugettext
|
||||
|
||||
from ocr.models import QueueTransformation
|
||||
|
||||
|
||||
class QueueTransformationForm(forms.ModelForm):
    """
    Model form for QueueTransformation whose content_type and object_id
    fields are rendered as hidden inputs
    """
    class Meta:
        model = QueueTransformation

    def __init__(self, *args, **kwargs):
        super(QueueTransformationForm, self).__init__(*args, **kwargs)
        # The generic relation fields stay in the form but are not shown
        for field_name in ('content_type', 'object_id'):
            self.fields[field_name].widget = forms.HiddenInput()
|
||||
|
||||
|
||||
class QueueTransformationForm_create(forms.ModelForm):
    """
    Model form for QueueTransformation that leaves out the generic
    relation fields (content_type and object_id)
    """
    class Meta:
        model = QueueTransformation
        exclude = ['content_type', 'object_id']
|
||||
@@ -1,18 +0,0 @@
|
||||
from django.db import models
|
||||
|
||||
from ocr.exceptions import AlreadyQueued
|
||||
|
||||
|
||||
class DocumentQueueManager(models.Manager):
    """
    Module manager class to handle adding documents to an OCR document
    queue
    """
    def queue_document(self, document, queue_name='default'):
        """
        Add document to the queue named queue_name and return that queue

        Raises AlreadyQueued when the queue already holds an entry for
        the document.  The queue is fetched with get(), so a missing
        queue name surfaces as the exception get() raises.
        """
        document_queue = self.model.objects.get(name=queue_name)
        # exists() only asks the database whether a row is there, instead
        # of loading every matching entry just to test for truth
        if document_queue.queuedocument_set.filter(document=document).exists():
            raise AlreadyQueued

        document_queue.queuedocument_set.create(document=document, delay=True)

        return document_queue
|
||||
41
apps/ocr/managers.py
Normal file
41
apps/ocr/managers.py
Normal file
@@ -0,0 +1,41 @@
|
||||
from django.db import models
|
||||
from django.contrib.contenttypes.models import ContentType
|
||||
|
||||
from ocr.exceptions import AlreadyQueued
|
||||
|
||||
|
||||
class DocumentQueueManager(models.Manager):
    """
    Module manager class to handle adding documents to an OCR document
    queue
    """
    def queue_document(self, document, queue_name='default'):
        """
        Add document to the queue named queue_name and return that queue

        Raises AlreadyQueued when the queue already holds an entry for
        the document.  The queue is fetched with get(), so a missing
        queue name surfaces as the exception get() raises.
        """
        document_queue = self.model.objects.get(name=queue_name)
        # exists() only asks the database whether a row is there, instead
        # of loading every matching entry just to test for truth
        if document_queue.queuedocument_set.filter(document=document).exists():
            raise AlreadyQueued

        document_queue.queuedocument_set.create(document=document, delay=True)

        return document_queue
|
||||
|
||||
|
||||
class QueueTransformationManager(models.Manager):
    """
    Manager that looks up the transformations attached to an object
    through the content_type / object_id pair
    """
    def get_for_object(self, obj):
        """
        Return the queryset of transformations attached to obj
        """
        ct = ContentType.objects.get_for_model(obj)
        return self.model.objects.filter(content_type=ct).filter(object_id=obj.pk)

    def get_for_object_as_list(self, obj):
        """
        Return a (transformations, warnings) tuple for obj

        transformations is a list of dictionaries with the keys
        'transformation' and 'arguments', the latter being the parsed
        value of the stored arguments text.  Entries whose arguments
        cannot be parsed are left out and the exception raised while
        parsing them is appended to warnings instead.
        """
        # Imported here so the method does not depend on the module's
        # import block
        import ast

        warnings = []
        transformations = []
        for transformation in self.get_for_object(obj).values('transformation', 'arguments'):
            try:
                # SECURITY: the arguments text is entered by users and was
                # previously run through eval(), which executes arbitrary
                # expressions.  literal_eval() only accepts Python
                # literals such as {'degrees': 90}.
                arguments = ast.literal_eval((transformation['arguments'] or u'').strip())
            except Exception as e:
                warnings.append(e)
            else:
                transformations.append(
                    {
                        'transformation': transformation['transformation'],
                        'arguments': arguments
                    }
                )

        return transformations, warnings
|
||||
@@ -2,13 +2,16 @@ from django.db import models
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
from django.utils.translation import ugettext
|
||||
from django.core.exceptions import ObjectDoesNotExist
|
||||
from django.contrib.contenttypes.models import ContentType
|
||||
from django.contrib.contenttypes import generic
|
||||
|
||||
from documents.models import Document
|
||||
from converter.api import get_available_transformations_choices
|
||||
|
||||
from ocr.literals import DOCUMENTQUEUE_STATE_STOPPED, \
|
||||
DOCUMENTQUEUE_STATE_CHOICES, QUEUEDOCUMENT_STATE_PENDING, \
|
||||
QUEUEDOCUMENT_STATE_CHOICES
|
||||
from ocr.manager import DocumentQueueManager
|
||||
from ocr.managers import DocumentQueueManager, QueueTransformationManager
|
||||
|
||||
|
||||
class DocumentQueue(models.Model):
|
||||
@@ -51,3 +54,26 @@ class QueueDocument(models.Model):
|
||||
return unicode(self.document)
|
||||
except ObjectDoesNotExist:
|
||||
return ugettext(u'Missing document.')
|
||||
|
||||
|
||||
class QueueTransformation(models.Model):
    """
    Stores one transformation, its arguments and its position in the
    order, attached to an object through a generic relation
    """
    # Generic relation to the object the transformation belongs to
    content_type = models.ForeignKey(ContentType)
    object_id = models.PositiveIntegerField()
    content_object = generic.GenericForeignKey('content_type', 'object_id')

    order = models.PositiveIntegerField(
        default=0,
        blank=True,
        null=True,
        verbose_name=_(u'order'),
        db_index=True
    )
    transformation = models.CharField(
        choices=get_available_transformations_choices(),
        max_length=128,
        verbose_name=_(u'transformation')
    )
    arguments = models.TextField(
        blank=True,
        null=True,
        verbose_name=_(u'arguments'),
        help_text=_(u'Use dictionaries to indentify arguments, example: %s') % u'{\'degrees\':90}'
    )

    objects = QueueTransformationManager()

    def __unicode__(self):
        # Show the human readable label of the chosen transformation
        label = self.get_transformation_display()
        return label

    class Meta:
        ordering = ('order',)
        verbose_name = _(u'document queue transformation')
        verbose_name_plural = _(u'document queue transformations')
|
||||
|
||||
40
apps/ocr/parsers/__init__.py
Normal file
40
apps/ocr/parsers/__init__.py
Normal file
@@ -0,0 +1,40 @@
|
||||
import codecs
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
import sys
|
||||
|
||||
import slate
|
||||
|
||||
from django.utils.translation import ugettext as _
|
||||
|
||||
from ocr.parsers.exceptions import ParserError, ParserUnknownFile
|
||||
|
||||
mimetype_registry = {}
|
||||
|
||||
|
||||
def register_parser(mimetype, function):
    """
    Store function as the parser for mimetype in mimetype_registry,
    replacing any entry already registered for that mimetype
    """
    entry = {'function': function}
    mimetype_registry[mimetype] = entry
|
||||
|
||||
|
||||
def pdf_parser(document_page):
    """
    Fill document_page with the text slate extracts for it from the PDF

    The page's document is opened and handed to slate; the entry at
    position page_number - 1 of the result becomes the page content,
    the page label is set to 'Text extracted from PDF' and the page is
    saved.

    Raises ParserError when the extracted text for the page is just a
    form feed character, i.e. the page produced no text.
    """
    fd = document_page.document.open()
    try:
        pdf_pages = slate.PDF(fd)
    finally:
        # Release the file even when slate fails to read the document
        fd.close()

    page_text = pdf_pages[document_page.page_number - 1]
    if page_text == '\x0c':
        raise ParserError

    document_page.content = page_text
    document_page.page_label = _(u'Text extracted from PDF')
    document_page.save()
|
||||
|
||||
|
||||
def parse_document_page(document_page):
    """
    Run the parser registered for the mimetype of the page's document

    Raises ParserUnknownFile when mimetype_registry has no parser for
    that mimetype.  Exceptions raised by the parser itself are passed
    on untouched.
    """
    try:
        parser = mimetype_registry[document_page.document.file_mimetype]['function']
    except KeyError:
        raise ParserUnknownFile

    # Called outside of the try block so that a KeyError raised inside
    # the parser is not mistaken for a missing registry entry
    parser(document_page)
|
||||
|
||||
|
||||
register_parser('application/pdf', pdf_parser)
|
||||
10
apps/ocr/parsers/exceptions.py
Normal file
10
apps/ocr/parsers/exceptions.py
Normal file
@@ -0,0 +1,10 @@
|
||||
class ParserError(Exception):
    """
    Raised when a text parser fails to understand a file it has been
    passed, or when the resulting parsed text is invalid
    """
|
||||
|
||||
|
||||
class ParserUnknownFile(Exception):
    """
    Raised when no parser is registered for the type of file that was
    passed
    """
|
||||
@@ -1,16 +1,22 @@
|
||||
from django.conf.urls.defaults import patterns, url
|
||||
|
||||
urlpatterns = patterns('ocr.views',
|
||||
url(r'^(?P<document_id>\d+)/submit/$', 'submit_document', (), 'submit_document'),
|
||||
url(r'^ocr/queue/document/list/$', 'queue_document_list', (), 'queue_document_list'),
|
||||
url(r'^ocr/queue/document/(?P<queue_document_id>\d+)/delete/$', 'queue_document_delete', (), 'queue_document_delete'),
|
||||
url(r'^ocr/queue/document/multiple/delete/$', 'queue_document_multiple_delete', (), 'queue_document_multiple_delete'),
|
||||
url(r'^ocr/queue/document/(?P<queue_document_id>\d+)/re-queue/$', 're_queue_document', (), 're_queue_document'),
|
||||
url(r'^ocr/queue/document/multiple/re-queue/$', 're_queue_multiple_document', (), 're_queue_multiple_document'),
|
||||
url(r'^document/(?P<document_id>\d+)/submit/$', 'submit_document', (), 'submit_document'),
|
||||
url(r'^queue/document/list/$', 'queue_document_list', (), 'queue_document_list'),
|
||||
url(r'^queue/document/(?P<queue_document_id>\d+)/delete/$', 'queue_document_delete', (), 'queue_document_delete'),
|
||||
url(r'^queue/document/multiple/delete/$', 'queue_document_multiple_delete', (), 'queue_document_multiple_delete'),
|
||||
url(r'^queue/document/(?P<queue_document_id>\d+)/re-queue/$', 're_queue_document', (), 're_queue_document'),
|
||||
url(r'^queue/document/multiple/re-queue/$', 're_queue_multiple_document', (), 're_queue_multiple_document'),
|
||||
|
||||
url(r'^ocr/queue/(?P<document_queue_id>\d+)/enable/$', 'document_queue_enable', (), 'document_queue_enable'),
|
||||
url(r'^ocr/queue/(?P<document_queue_id>\d+)/disable/$', 'document_queue_disable', (), 'document_queue_disable'),
|
||||
url(r'^queue/(?P<document_queue_id>\d+)/enable/$', 'document_queue_enable', (), 'document_queue_enable'),
|
||||
url(r'^queue/(?P<document_queue_id>\d+)/disable/$', 'document_queue_disable', (), 'document_queue_disable'),
|
||||
|
||||
url(r'^ocr/document/all/clean_up/$', 'all_document_ocr_cleanup', (), 'all_document_ocr_cleanup'),
|
||||
url(r'^ocr/node/active/list/$', 'node_active_list', (), 'node_active_list'),
|
||||
url(r'^document/all/clean_up/$', 'all_document_ocr_cleanup', (), 'all_document_ocr_cleanup'),
|
||||
url(r'^node/active/list/$', 'node_active_list', (), 'node_active_list'),
|
||||
|
||||
url(r'^queue/(?P<document_queue_id>\d+)/transformation/list/$', 'setup_queue_transformation_list', (), 'setup_queue_transformation_list'),
|
||||
url(r'^queue/(?P<document_queue_id>\w+)/transformation/create/$', 'setup_queue_transformation_create', (), 'setup_queue_transformation_create'),
|
||||
url(r'^queue/transformation/(?P<transformation_id>\w+)/edit/$', 'setup_queue_transformation_edit', (), 'setup_queue_transformation_edit'),
|
||||
url(r'^queue/transformation/(?P<transformation_id>\w+)/delete/$', 'setup_queue_transformation_delete', (), 'setup_queue_transformation_delete'),
|
||||
|
||||
)
|
||||
|
||||
@@ -6,9 +6,8 @@ from django.shortcuts import render_to_response, get_object_or_404
|
||||
from django.template import RequestContext
|
||||
from django.contrib import messages
|
||||
from django.views.generic.list_detail import object_list
|
||||
from django.core.urlresolvers import reverse
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
from django.conf import settings
|
||||
from django.core.urlresolvers import reverse
|
||||
|
||||
from celery.task.control import inspect
|
||||
from permissions.api import check_permissions
|
||||
@@ -18,12 +17,13 @@ from documents.widgets import document_link, document_thumbnail
|
||||
from ocr import PERMISSION_OCR_DOCUMENT, PERMISSION_OCR_DOCUMENT_DELETE, \
|
||||
PERMISSION_OCR_QUEUE_ENABLE_DISABLE, PERMISSION_OCR_CLEAN_ALL_PAGES
|
||||
|
||||
from ocr.models import DocumentQueue, QueueDocument
|
||||
from ocr.models import DocumentQueue, QueueDocument, QueueTransformation
|
||||
from ocr.literals import QUEUEDOCUMENT_STATE_PENDING, \
|
||||
QUEUEDOCUMENT_STATE_PROCESSING, DOCUMENTQUEUE_STATE_STOPPED, \
|
||||
DOCUMENTQUEUE_STATE_ACTIVE
|
||||
from ocr.exceptions import AlreadyQueued
|
||||
from ocr.api import clean_pages
|
||||
from ocr.forms import QueueTransformationForm, QueueTransformationForm_create
|
||||
|
||||
|
||||
def queue_document_list(request, queue_name='default'):
|
||||
@@ -38,8 +38,10 @@ def queue_document_list(request, queue_name='default'):
|
||||
extra_context={
|
||||
'title': _(u'documents in queue: %s') % document_queue,
|
||||
'hide_object': True,
|
||||
'object': document_queue,
|
||||
'queue': document_queue,
|
||||
'object_name': _(u'document queue'),
|
||||
'navigation_object_name': 'queue',
|
||||
'list_object_variable_name': 'queue_document',
|
||||
'extra_columns': [
|
||||
{'name': 'document', 'attribute': lambda x: document_link(x.document) if hasattr(x, 'document') else _(u'Missing document.')},
|
||||
{'name': _(u'thumbnail'), 'attribute': lambda x: document_thumbnail(x.document)},
|
||||
@@ -212,7 +214,8 @@ def document_queue_disable(request, document_queue_id):
|
||||
return HttpResponseRedirect(next)
|
||||
|
||||
return render_to_response('generic_confirm.html', {
|
||||
'object': document_queue,
|
||||
'queue': document_queue,
|
||||
'navigation_object_name': 'queue',
|
||||
'title': _(u'Are you sure you wish to disable document queue: %s') % document_queue,
|
||||
'next': next,
|
||||
'previous': previous,
|
||||
@@ -238,7 +241,8 @@ def document_queue_enable(request, document_queue_id):
|
||||
return HttpResponseRedirect(next)
|
||||
|
||||
return render_to_response('generic_confirm.html', {
|
||||
'object': document_queue,
|
||||
'queue': document_queue,
|
||||
'navigation_object_name': 'queue',
|
||||
'title': _(u'Are you sure you wish to activate document queue: %s') % document_queue,
|
||||
'next': next,
|
||||
'previous': previous,
|
||||
@@ -317,3 +321,141 @@ def node_active_list(request):
|
||||
{'name': _(u'related object'), 'attribute': lambda x: display_link(x['related_object']) if x['related_object'] else u''}
|
||||
],
|
||||
}, context_instance=RequestContext(request))
|
||||
|
||||
|
||||
def setup_queue_transformation_list(request, document_queue_id):
    """
    List the transformations attached to a document queue

    Responds with a 404 when no DocumentQueue has the given id.
    """
    # NOTE: the permission check is disabled in this view
    #check_permissions(request.user, [PERMISSION_SOURCES_SETUP_EDIT])

    queue = get_object_or_404(DocumentQueue, pk=document_queue_id)

    columns = [
        {'name': _(u'order'), 'attribute': 'order'},
        {'name': _(u'transformation'), 'attribute': lambda entry: entry.get_transformation_display()},
        {'name': _(u'arguments'), 'attribute': 'arguments'}
    ]

    return render_to_response(
        'generic_list.html',
        {
            'object_list': QueueTransformation.objects.get_for_object(queue),
            'title': _(u'transformations for: %s') % queue,
            'queue': queue,
            'object_name': _(u'document queue'),
            'navigation_object_name': 'queue',
            'list_object_variable_name': 'transformation',
            'extra_columns': columns,
            'hide_link': True,
            'hide_object': True,
        },
        context_instance=RequestContext(request)
    )
|
||||
|
||||
|
||||
def setup_queue_transformation_edit(request, transformation_id):
    """
    Edit an existing queue transformation

    On a valid POST the arguments text is checked first; when it cannot
    be parsed an error message is shown and the form is displayed again,
    otherwise the form is saved and the user is redirected to `next`.
    Responds with a 404 when no QueueTransformation has the given id.
    """
    # Imported here so the view does not depend on the module's import
    # block
    import ast

    # SECURITY NOTE: the permission check is disabled, so this view is
    # reachable without the setup permission
    #check_permissions(request.user, [PERMISSION_SOURCES_SETUP_EDIT])

    transformation = get_object_or_404(QueueTransformation, pk=transformation_id)
    redirect_view = reverse('setup_queue_transformation_list', args=[transformation.content_object.pk])
    next = request.POST.get('next', request.GET.get('next', request.META.get('HTTP_REFERER', redirect_view)))

    if request.method == 'POST':
        form = QueueTransformationForm(instance=transformation, data=request.POST)
        if form.is_valid():
            try:
                # Test the validity of the argument field.
                # SECURITY: this text comes straight from the request and
                # was previously passed to eval(), which executes
                # arbitrary expressions; literal_eval() only accepts
                # Python literals such as {'degrees': 90}.
                ast.literal_eval((form.cleaned_data['arguments'] or u'').strip())
            except Exception:
                # Any parsing failure means the arguments are not usable
                messages.error(request, _(u'Queue transformation argument error.'))
            else:
                try:
                    form.save()
                    messages.success(request, _(u'Queue transformation edited successfully'))
                    return HttpResponseRedirect(next)
                except Exception as e:
                    messages.error(request, _(u'Error editing queue transformation; %s') % e)
    else:
        form = QueueTransformationForm(instance=transformation)

    return render_to_response('generic_form.html', {
        'title': _(u'Edit transformation: %s') % transformation,
        'form': form,
        'queue': transformation.content_object,
        'transformation': transformation,
        'navigation_object_list': [
            {'object': 'queue', 'name': _(u'document queue')},
            {'object': 'transformation', 'name': _(u'transformation')}
        ],
        'next': next,
    },
    context_instance=RequestContext(request))
|
||||
|
||||
|
||||
def setup_queue_transformation_delete(request, transformation_id):
    """
    Ask for confirmation and, on POST, delete a queue transformation

    After a POST the user is always sent back to the transformation list
    of the queue, with a success or an error message.  Responds with a
    404 when no QueueTransformation has the given id.
    """
    # NOTE: the permission check is disabled in this view
    #check_permissions(request.user, [PERMISSION_SOURCES_SETUP_EDIT])

    transformation = get_object_or_404(QueueTransformation, pk=transformation_id)
    queue = transformation.content_object
    redirect_view = reverse('setup_queue_transformation_list', args=[queue.pk])

    # Resolution order for the "previous" URL: POST, GET, referer, list view
    referer = request.META.get('HTTP_REFERER', redirect_view)
    previous = request.POST.get('previous', request.GET.get('previous', referer))

    if request.method == 'POST':
        try:
            transformation.delete()
        except Exception as e:
            messages.error(
                request,
                _(u'Error deleting queue transformation; %(error)s') % {'error': e}
            )
        else:
            messages.success(request, _(u'Queue transformation deleted successfully.'))
        return HttpResponseRedirect(redirect_view)

    context = {
        'delete_view': True,
        'transformation': transformation,
        'queue': queue,
        'navigation_object_list': [
            {'object': 'queue', 'name': _(u'document queue')},
            {'object': 'transformation', 'name': _(u'transformation')}
        ],
        'title': _(u'Are you sure you wish to delete queue transformation "%(transformation)s"') % {
            'transformation': transformation.get_transformation_display(),
        },
        'previous': previous,
        'form_icon': u'shape_square_delete.png',
    }
    return render_to_response('generic_confirm.html', context,
        context_instance=RequestContext(request))
|
||||
|
||||
|
||||
def setup_queue_transformation_create(request, document_queue_id):
    """
    Create a new transformation for a document queue

    On a valid POST the arguments text is checked first; when it cannot
    be parsed an error message is shown and the form is displayed again,
    otherwise the transformation is attached to the queue, saved, and the
    user is redirected to the queue's transformation list.  Responds with
    a 404 when no DocumentQueue has the given id.
    """
    # Imported here so the view does not depend on the module's import
    # block
    import ast

    # SECURITY NOTE: the permission check is disabled, so this view is
    # reachable without the setup permission
    #check_permissions(request.user, [PERMISSION_SOURCES_SETUP_EDIT])

    document_queue = get_object_or_404(DocumentQueue, pk=document_queue_id)

    redirect_view = reverse('setup_queue_transformation_list', args=[document_queue.pk])

    if request.method == 'POST':
        form = QueueTransformationForm_create(request.POST)
        if form.is_valid():
            try:
                # Test the validity of the argument field.
                # SECURITY: this text comes straight from the request and
                # was previously passed to eval(), which executes
                # arbitrary expressions; literal_eval() only accepts
                # Python literals such as {'degrees': 90}.
                ast.literal_eval((form.cleaned_data['arguments'] or u'').strip())
            except Exception:
                # Any parsing failure means the arguments are not usable
                messages.error(request, _(u'Queue transformation argument error.'))
            else:
                try:
                    queue_tranformation = form.save(commit=False)
                    queue_tranformation.content_object = document_queue
                    queue_tranformation.save()
                    messages.success(request, _(u'Queue transformation created successfully'))
                    return HttpResponseRedirect(redirect_view)
                except Exception as e:
                    messages.error(request, _(u'Error creating queue transformation; %s') % e)
    else:
        form = QueueTransformationForm_create()

    return render_to_response('generic_form.html', {
        'form': form,
        'queue': document_queue,
        'object_name': _(u'document queue'),
        'navigation_object_name': 'queue',
        'title': _(u'Create new transformation for queue: %s') % document_queue,
    }, context_instance=RequestContext(request))
|
||||
|
||||
|
||||
@@ -6,3 +6,19 @@ class SourceTransformationManager(models.Manager):
|
||||
def get_for_object(self, obj):
    """
    Return the queryset of this manager's model entries attached to obj
    """
    content_type = ContentType.objects.get_for_model(obj)
    for_content_type = self.model.objects.filter(content_type=content_type)
    return for_content_type.filter(object_id=obj.pk)
|
||||
|
||||
def get_for_object_as_list(self, obj):
    """
    Return a (transformations, warnings) tuple for obj

    transformations is a list of dictionaries with the keys
    'transformation' and 'arguments', the latter being the parsed value
    of the stored arguments text.  Entries whose arguments cannot be
    parsed are left out and the exception raised while parsing them is
    appended to warnings instead.
    """
    # Imported here so the method does not depend on the module's import
    # block
    import ast

    warnings = []
    transformations = []
    for transformation in self.get_for_object(obj).values('transformation', 'arguments'):
        try:
            # SECURITY: the arguments text is entered by users and was
            # previously run through eval(), which executes arbitrary
            # expressions.  literal_eval() only accepts Python literals
            # such as {'degrees': 90}.
            arguments = ast.literal_eval((transformation['arguments'] or u'').strip())
        except Exception as e:
            warnings.append(e)
        else:
            transformations.append(
                {
                    'transformation': transformation['transformation'],
                    'arguments': arguments
                }
            )

    return transformations, warnings
|
||||
|
||||
@@ -4,14 +4,13 @@ from django.contrib.contenttypes.models import ContentType
|
||||
from django.contrib.contenttypes import generic
|
||||
|
||||
from documents.models import DocumentType
|
||||
from documents.conf.settings import AVAILABLE_TRANSFORMATIONS
|
||||
from documents.managers import RecentDocumentManager
|
||||
from metadata.models import MetadataType
|
||||
from converter.api import get_available_transformations_choices
|
||||
from converter.literals import DIMENSION_SEPARATOR
|
||||
|
||||
from sources.managers import SourceTransformationManager
|
||||
|
||||
available_transformations = ([(name, data['label']) for name, data in AVAILABLE_TRANSFORMATIONS.items()])
|
||||
|
||||
SOURCE_UNCOMPRESS_CHOICE_Y = 'y'
|
||||
SOURCE_UNCOMPRESS_CHOICE_N = 'n'
|
||||
SOURCE_UNCOMPRESS_CHOICE_ASK = 'a'
|
||||
@@ -120,7 +119,7 @@ class StagingFolder(InteractiveBaseModel):
|
||||
if self.preview_height:
|
||||
dimensions.append(unicode(self.preview_height))
|
||||
|
||||
return u'x'.join(dimensions)
|
||||
return DIMENSION_SEPARATOR.join(dimensions)
|
||||
|
||||
class Meta(InteractiveBaseModel.Meta):
|
||||
verbose_name = _(u'staging folder')
|
||||
@@ -164,8 +163,8 @@ class SourceTransformation(models.Model):
|
||||
object_id = models.PositiveIntegerField()
|
||||
content_object = generic.GenericForeignKey('content_type', 'object_id')
|
||||
order = models.PositiveIntegerField(default=0, blank=True, null=True, verbose_name=_(u'order'), db_index=True)
|
||||
transformation = models.CharField(choices=available_transformations, max_length=128, verbose_name=_(u'transformation'))
|
||||
arguments = models.TextField(blank=True, null=True, verbose_name=_(u'arguments'), help_text=_(u'Use dictionaries to indentify arguments, example: {\'degrees\':90}'))
|
||||
transformation = models.CharField(choices=get_available_transformations_choices(), max_length=128, verbose_name=_(u'transformation'))
|
||||
arguments = models.TextField(blank=True, null=True, verbose_name=_(u'arguments'), help_text=_(u'Use dictionaries to indentify arguments, example: %s') % u'{\'degrees\':90}')
|
||||
|
||||
objects = SourceTransformationManager()
|
||||
|
||||
|
||||
@@ -8,11 +8,9 @@ from django.utils.translation import ugettext
|
||||
from django.contrib import messages
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from converter import TRANFORMATION_CHOICES
|
||||
from converter.api import convert, cache_cleanup
|
||||
|
||||
DEFAULT_STAGING_DIRECTORY = u'/tmp'
|
||||
#from documents.conf.settings import DEFAULT_TRANSFORMATIONS
|
||||
|
||||
HASH_FUNCTION = lambda x: hashlib.sha256(x).hexdigest()
|
||||
#TODO: Do benchmarks
|
||||
@@ -107,16 +105,15 @@ class StagingFile(object):
|
||||
def upload(self):
|
||||
"""
|
||||
Return a StagingFile encapsulated in a File class instance to
|
||||
allow for easier upload a staging files
|
||||
allow for easier upload of staging files
|
||||
"""
|
||||
try:
|
||||
return File(file(self.filepath, 'rb'), name=self.filename)
|
||||
except Exception, exc:
|
||||
raise Exception(ugettext(u'Unable to upload staging file: %s') % exc)
|
||||
|
||||
def delete(self, preview_size):
|
||||
# tranformation_string, errors = get_transformation_string(DEFAULT_TRANSFORMATIONS)
|
||||
cache_cleanup(self.filepath, size=preview_size)# , extra_options=tranformation_string)
|
||||
def delete(self, preview_size, transformations):
|
||||
cache_cleanup(self.filepath, size=preview_size, transformations=transformations)
|
||||
try:
|
||||
os.unlink(self.filepath)
|
||||
except OSError, exc:
|
||||
@@ -125,24 +122,7 @@ class StagingFile(object):
|
||||
else:
|
||||
raise OSError(ugettext(u'Unable to delete staging file: %s') % exc)
|
||||
|
||||
def preview(self, preview_size):
|
||||
def preview(self, preview_size, transformations):
|
||||
errors = []
|
||||
# tranformation_string, errors = get_transformation_string(DEFAULT_TRANSFORMATIONS)
|
||||
# output_file = convert(self.filepath, size=STAGING_FILES_PREVIEW_SIZE, extra_options=tranformation_string, cleanup_files=False)
|
||||
output_file = convert(self.filepath, size=preview_size, cleanup_files=False)
|
||||
output_file = convert(self.filepath, size=preview_size, cleanup_files=False, transformations=transformations)
|
||||
return output_file, errors
|
||||
|
||||
|
||||
def get_transformation_string(transformations):
    """
    Build a space separated string from a list of transformations

    Each entry whose 'name' is a key of TRANFORMATION_CHOICES is rendered
    by formatting the matching template with the evaluated 'arguments'
    text.  Returns a (string, errors) tuple; errors holds the exceptions
    raised by entries that could not be rendered.
    """
    errors = []
    fragments = []
    for entry in transformations:
        try:
            if entry['name'] not in TRANFORMATION_CHOICES:
                continue
            template = TRANFORMATION_CHOICES[entry['name']]
            fragments.append(template % eval(entry['arguments']))
        except Exception as e:
            errors.append(e)

    return ' '.join(fragments), errors
|
||||
|
||||
@@ -16,12 +16,12 @@ urlpatterns = patterns('sources.views',
|
||||
url(r'^setup/interactive/staging_folder/list/$', 'setup_source_list', {'source_type': SOURCE_CHOICE_STAGING}, 'setup_staging_folder_list'),
|
||||
|
||||
url(r'^setup/interactive/(?P<source_type>\w+)/list/$', 'setup_source_list', (), 'setup_source_list'),
|
||||
url(r'^setup/interactive/(?P<source_type>\w+)/(?P<source_id>\w+)/edit/$', 'setup_source_edit', (), 'setup_source_edit'),
|
||||
url(r'^setup/interactive/(?P<source_type>\w+)/(?P<source_id>\w+)/delete/$', 'setup_source_delete', (), 'setup_source_delete'),
|
||||
url(r'^setup/interactive/(?P<source_type>\w+)/(?P<source_id>\d+)/edit/$', 'setup_source_edit', (), 'setup_source_edit'),
|
||||
url(r'^setup/interactive/(?P<source_type>\w+)/(?P<source_id>\d+)/delete/$', 'setup_source_delete', (), 'setup_source_delete'),
|
||||
url(r'^setup/interactive/(?P<source_type>\w+)/create/$', 'setup_source_create', (), 'setup_source_create'),
|
||||
|
||||
url(r'^setup/interactive/(?P<source_type>\w+)/(?P<source_id>\w+)/transformation/list/$', 'setup_source_transformation_list', (), 'setup_source_transformation_list'),
|
||||
url(r'^setup/interactive/(?P<source_type>\w+)/(?P<source_id>\w+)/transformation/create/$', 'setup_source_transformation_create', (), 'setup_source_transformation_create'),
|
||||
url(r'^setup/interactive/source/transformation/(?P<transformation_id>\w+)/edit/$', 'setup_source_transformation_edit', (), 'setup_source_transformation_edit'),
|
||||
url(r'^setup/interactive/source/transformation/(?P<transformation_id>\w+)/delete/$', 'setup_source_transformation_delete', (), 'setup_source_transformation_delete'),
|
||||
url(r'^setup/interactive/(?P<source_type>\w+)/(?P<source_id>\d+)/transformation/list/$', 'setup_source_transformation_list', (), 'setup_source_transformation_list'),
|
||||
url(r'^setup/interactive/(?P<source_type>\w+)/(?P<source_id>\d+)/transformation/create/$', 'setup_source_transformation_create', (), 'setup_source_transformation_create'),
|
||||
url(r'^setup/interactive/source/transformation/(?P<transformation_id>\d+)/edit/$', 'setup_source_transformation_edit', (), 'setup_source_transformation_edit'),
|
||||
url(r'^setup/interactive/source/transformation/(?P<transformation_id>\d+)/delete/$', 'setup_source_transformation_delete', (), 'setup_source_transformation_delete'),
|
||||
)
|
||||
|
||||
@@ -129,9 +129,13 @@ def upload_interactive(request, source_type=None, source_id=None):
|
||||
expand = True
|
||||
else:
|
||||
expand = False
|
||||
if (not expand) or (expand and not _handle_zip_file(request, request.FILES['file'], document_type)):
|
||||
|
||||
transformations, errors = SourceTransformation.objects.get_for_object_as_list(web_form)
|
||||
|
||||
if (not expand) or (expand and not _handle_zip_file(request, request.FILES['file'], document_type=document_type, transformations=transformations)):
|
||||
instance = form.save()
|
||||
instance.save()
|
||||
instance.apply_default_transformations(transformations)
|
||||
if document_type:
|
||||
instance.document_type = document_type
|
||||
_handle_save_document(request, instance, form)
|
||||
@@ -174,16 +178,18 @@ def upload_interactive(request, source_type=None, source_id=None):
|
||||
expand = True
|
||||
else:
|
||||
expand = False
|
||||
if (not expand) or (expand and not _handle_zip_file(request, staging_file.upload(), document_type)):
|
||||
transformations, errors = SourceTransformation.objects.get_for_object_as_list(staging_folder)
|
||||
if (not expand) or (expand and not _handle_zip_file(request, staging_file.upload(), document_type=document_type, transformations=transformations)):
|
||||
document = Document(file=staging_file.upload())
|
||||
if document_type:
|
||||
document.document_type = document_type
|
||||
document.save()
|
||||
document.apply_default_transformations(transformations)
|
||||
_handle_save_document(request, document, form)
|
||||
messages.success(request, _(u'Staging file: %s, uploaded successfully.') % staging_file.filename)
|
||||
|
||||
if staging_folder.delete_after_upload:
|
||||
staging_file.delete(staging_folder.get_preview_size())
|
||||
staging_file.delete(preview_size=staging_folder.get_preview_size(), transformations=transformations)
|
||||
messages.success(request, _(u'Staging file: %s, deleted successfully.') % staging_file.filename)
|
||||
except Exception, e:
|
||||
messages.error(request, e)
|
||||
@@ -260,7 +266,7 @@ def _handle_save_document(request, document, form=None):
|
||||
create_history(HISTORY_DOCUMENT_CREATED, document, {'user': request.user})
|
||||
|
||||
|
||||
def _handle_zip_file(request, uploaded_file, document_type=None):
|
||||
def _handle_zip_file(request, uploaded_file, document_type=None, transformations=None):
|
||||
filename = getattr(uploaded_file, 'filename', getattr(uploaded_file, 'name', ''))
|
||||
if filename.lower().endswith('zip'):
|
||||
zfobj = zipfile.ZipFile(uploaded_file)
|
||||
@@ -285,7 +291,12 @@ def staging_file_preview(request, source_type, source_id, staging_file_id):
|
||||
staging_folder = get_object_or_404(StagingFolder, pk=source_id)
|
||||
StagingFile = create_staging_file_class(request, staging_folder.folder_path)
|
||||
try:
|
||||
output_file, errors = StagingFile.get(staging_file_id).preview(staging_folder.get_preview_size())
|
||||
transformations, errors=SourceTransformation.objects.get_for_object_as_list(staging_folder)
|
||||
|
||||
output_file, errors = StagingFile.get(staging_file_id).preview(
|
||||
preview_size=staging_folder.get_preview_size(),
|
||||
transformations=transformations
|
||||
)
|
||||
if errors and (request.user.is_staff or request.user.is_superuser):
|
||||
for error in errors:
|
||||
messages.warning(request, _(u'Staging file transformation error: %(error)s') % {
|
||||
@@ -313,15 +324,19 @@ def staging_file_delete(request, source_type, source_id, staging_file_id):
|
||||
StagingFile = create_staging_file_class(request, staging_folder.folder_path)
|
||||
|
||||
staging_file = StagingFile.get(staging_file_id)
|
||||
next = request.POST.get('next', request.GET.get('next', request.META.get('HTTP_REFERER', None)))
|
||||
previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', None)))
|
||||
next = request.POST.get('next', request.GET.get('next', request.META.get('HTTP_REFERER', '/')))
|
||||
previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', '/')))
|
||||
|
||||
if request.method == 'POST':
|
||||
try:
|
||||
staging_file.delete(staging_folder.get_preview_size())
|
||||
transformations, errors=SourceTransformation.objects.get_for_object_as_list(staging_folder)
|
||||
staging_file.delete(
|
||||
preview_size=staging_folder.get_preview_size(),
|
||||
transformations=transformations
|
||||
)
|
||||
messages.success(request, _(u'Staging file delete successfully.'))
|
||||
except Exception, e:
|
||||
messages.error(request, e)
|
||||
messages.error(request, _(u'Staging file delete error; %s.') % e)
|
||||
return HttpResponseRedirect(next)
|
||||
|
||||
results = get_active_tab_links()
|
||||
@@ -509,11 +524,17 @@ def setup_source_transformation_edit(request, transformation_id):
|
||||
form = SourceTransformationForm(instance=source_transformation, data=request.POST)
|
||||
if form.is_valid():
|
||||
try:
|
||||
form.save()
|
||||
messages.success(request, _(u'Source transformation edited successfully'))
|
||||
return HttpResponseRedirect(next)
|
||||
except Exception, e:
|
||||
messages.error(request, _(u'Error editing source transformation; %s') % e)
|
||||
# Test the validity of the argument field
|
||||
eval(form.cleaned_data['arguments'], {})
|
||||
except:
|
||||
messages.error(request, _(u'Source transformation argument error.'))
|
||||
else:
|
||||
try:
|
||||
form.save()
|
||||
messages.success(request, _(u'Source transformation edited successfully'))
|
||||
return HttpResponseRedirect(next)
|
||||
except Exception, e:
|
||||
messages.error(request, _(u'Error editing source transformation; %s') % e)
|
||||
else:
|
||||
form = SourceTransformationForm(instance=source_transformation)
|
||||
|
||||
@@ -541,9 +562,9 @@ def setup_source_transformation_delete(request, transformation_id):
|
||||
if request.method == 'POST':
|
||||
try:
|
||||
source_transformation.delete()
|
||||
messages.success(request, _(u'Transformation deleted successfully.'))
|
||||
messages.success(request, _(u'Source transformation deleted successfully.'))
|
||||
except Exception, e:
|
||||
messages.error(request, _(u'Error deleting transformation; %(error)s') % {
|
||||
messages.error(request, _(u'Error deleting source transformation; %(error)s') % {
|
||||
'error': e}
|
||||
)
|
||||
return HttpResponseRedirect(redirect_view)
|
||||
@@ -556,7 +577,7 @@ def setup_source_transformation_delete(request, transformation_id):
|
||||
{'object': 'source', 'name': _(u'source')},
|
||||
{'object': 'transformation', 'name': _(u'transformation')}
|
||||
],
|
||||
'title': _(u'Are you sure you wish to delete transformation "%(transformation)s"') % {
|
||||
'title': _(u'Are you sure you wish to delete source transformation "%(transformation)s"') % {
|
||||
'transformation': source_transformation.get_transformation_display(),
|
||||
},
|
||||
'previous': previous,
|
||||
@@ -598,13 +619,19 @@ def setup_source_transformation_create(request, source_type, source_id):
|
||||
form = SourceTransformationForm_create(request.POST)
|
||||
if form.is_valid():
|
||||
try:
|
||||
source_tranformation = form.save(commit=False)
|
||||
source_tranformation.content_object = source
|
||||
source_tranformation.save()
|
||||
messages.success(request, _(u'Source transformation created successfully'))
|
||||
return HttpResponseRedirect(redirect_view)
|
||||
except Exception, e:
|
||||
messages.error(request, _(u'Error creating source transformation; %s') % e)
|
||||
# Test the validity of the argument field
|
||||
eval(form.cleaned_data['arguments'], {})
|
||||
except:
|
||||
messages.error(request, _(u'Source transformation argument error.'))
|
||||
else:
|
||||
try:
|
||||
source_tranformation = form.save(commit=False)
|
||||
source_tranformation.content_object = source
|
||||
source_tranformation.save()
|
||||
messages.success(request, _(u'Source transformation created successfully'))
|
||||
return HttpResponseRedirect(redirect_view)
|
||||
except Exception, e:
|
||||
messages.error(request, _(u'Error creating source transformation; %s') % e)
|
||||
else:
|
||||
form = SourceTransformationForm_create()
|
||||
|
||||
|
||||
@@ -9,3 +9,5 @@ django-celery==2.2.2
|
||||
django-sentry==1.6.0
|
||||
django-taggit==0.9.3
|
||||
-e git://github.com/django-mptt/django-mptt.git@0af02a95877041b2fd6d458bd95413dc1666c321#egg=django-mptt
|
||||
slate==0.3
|
||||
PIL==1.1.7
|
||||
|
||||
@@ -6,3 +6,5 @@ django-celery==2.2.2
|
||||
django-sentry==1.6.0
|
||||
django-taggit==0.9.3
|
||||
-e git://github.com/django-mptt/django-mptt.git@0af02a95877041b2fd6d458bd95413dc1666c321#egg=django-mptt
|
||||
slate==0.3
|
||||
PIL==1.1.7
|
||||
|
||||
Reference in New Issue
Block a user