Merge branch 'feature/improved_office_doc_support' into development

2011-11-21 06:28:30 -04:00
parent 91203a9593 e9ff82f696
commit 9212a7027f
22 changed files with 236 additions and 86 deletions
--- a/apps/common/templates/generic_confirm.html
+++ b/apps/common/templates/generic_confirm.html
@@ -22,7 +22,11 @@
            <form action="" method="post" class="form login">{% csrf_token %}
                {% if next %}
                    <input name="next" type="hidden" value="{{ next }}" />
-                {% endif %}          
+                {% endif %}   
+                       
+                {% if previous %}
+                    <input name="previous" type="hidden" value="{{ previous }}" />
+                {% endif %}   
                
                <div style="float: left; margin-right: 10px;">
                    <img style="margin-top: 12px;" src="{{ STATIC_URL }}images/icons/{{ form_icon|default:'question.png' }}" alt="{% trans 'form icon' %}" />
--- a/apps/converter/api.py
+++ b/apps/converter/api.py
@@ -4,8 +4,6 @@ import hashlib

 from common.conf.settings import TEMPORARY_DIRECTORY

-from converter.conf.settings import UNOCONV_PATH
-from converter.exceptions import OfficeConversionError
 from converter.literals import DEFAULT_PAGE_NUMBER, \
    DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION, DEFAULT_FILE_FORMAT

@@ -16,28 +14,12 @@ from converter.literals import TRANSFORMATION_RESIZE, \
 from converter.literals import DIMENSION_SEPARATOR
 from converter.literals import FILE_FORMATS
 from converter.utils import cleanup
+from converter.office_converter import OfficeConverter
+

 HASH_FUNCTION = lambda x: hashlib.sha256(x).hexdigest()
-    
-CONVERTER_OFFICE_FILE_EXTENSIONS = [
-    u'ods', u'docx', u'doc'
-]
-
-
-def execute_unoconv(input_filepath, arguments=''):
-    """
-    Executes the program unoconv using subprocess's Popen
-    """
-    command = []
-    command.append(UNOCONV_PATH)
-    command.extend(unicode(arguments).split())
-    command.append(input_filepath)
-    proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE)
-    return_code = proc.wait()
-    if return_code != 0:
-        raise OfficeConversionError(proc.stderr.readline())
-

+            
 def cache_cleanup(input_filepath, *args, **kwargs):
    try:
        os.remove(create_image_cache_filename(input_filepath, *args, **kwargs))
@@ -53,37 +35,34 @@ def create_image_cache_filename(input_filepath, *args, **kwargs):
        return None


-def convert_office_document(input_filepath):
-    if os.path.exists(UNOCONV_PATH):
-        execute_unoconv(input_filepath, arguments='-f pdf')
-        return input_filepath + u'.pdf'
-    return None
-
-
-def convert(input_filepath, output_filepath=None, cleanup_files=False, *args, **kwargs):
+def convert(input_filepath, output_filepath=None, cleanup_files=False, mimetype=None, *args, **kwargs):
    size = kwargs.get('size')
    file_format = kwargs.get('file_format', DEFAULT_FILE_FORMAT)
    zoom = kwargs.get('zoom', DEFAULT_ZOOM_LEVEL)
    rotation = kwargs.get('rotation', DEFAULT_ROTATION)
    page = kwargs.get('page', DEFAULT_PAGE_NUMBER)
    transformations = kwargs.get('transformations', [])
+    
    if transformations is None:
        transformations = []

-    unoconv_output = None
-
    if output_filepath is None:
        output_filepath = create_image_cache_filename(input_filepath, *args, **kwargs)
        
    if os.path.exists(output_filepath):
        return output_filepath
-
-    path, extension = os.path.splitext(input_filepath)
-    if extension[1:].lower() in CONVERTER_OFFICE_FILE_EXTENSIONS:
-        result = convert_office_document(input_filepath)
-        if result:
-            unoconv_output = result
-            input_filepath = result
+    
+    office_converter = OfficeConverter()
+    office_converter.convert(input_filepath, mimetype=mimetype)
+    if office_converter:
+        try:
+            input_filepath = office_converter.output_filepath
+            mimetype = 'application/pdf'
+        except OfficeConverter:
+            raise UnknownFileFormat('office converter exception')
+    else:
+        # Recycle the already detected mimetype
+        mimetype = office_converter.mimetype

    if size:
        transformations.append(
@@ -110,17 +89,23 @@ def convert(input_filepath, output_filepath=None, cleanup_files=False, *args, **
        )           

    try:
-        backend.convert_file(input_filepath=input_filepath, output_filepath=output_filepath, transformations=transformations, page=page, file_format=file_format)
+        backend.convert_file(input_filepath=input_filepath, output_filepath=output_filepath, transformations=transformations, page=page, file_format=file_format, mimetype=mimetype)
    finally:
        if cleanup_files:
            cleanup(input_filepath)
-        if unoconv_output:
-            cleanup(unoconv_output)

    return output_filepath


 def get_page_count(input_filepath):
+    # Office documents are converted first so that the graphics backend
+    # counts the pages of the converted file instead of the office file.
+    office_converter = OfficeConverter()
+    office_converter.convert(input_filepath)
+    # The converter is truthy only once a converted file exists on disk.
+    if office_converter:
+        input_filepath = office_converter.output_filepath
+
     return backend.get_page_count(input_filepath)


--- a/apps/converter/backends/graphicsmagick/base.py
+++ b/apps/converter/backends/graphicsmagick/base.py
@@ -29,7 +29,7 @@ class ConverterClass(ConverterBase):
            raise IdentifyError(proc.stderr.readline())
        return proc.stdout.read()

-    def convert_file(self, input_filepath, output_filepath, transformations=None, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT):
+    def convert_file(self, input_filepath, output_filepath, transformations=None, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT, **kwargs):
        arguments = []

        try:
--- a/apps/converter/backends/imagemagick/base.py
+++ b/apps/converter/backends/imagemagick/base.py
@@ -29,7 +29,7 @@ class ConverterClass(ConverterBase):
            raise IdentifyError(proc.stderr.readline())
        return proc.stdout.read()

-    def convert_file(self, input_filepath, output_filepath, transformations=None, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT):
+    def convert_file(self, input_filepath, output_filepath, transformations=None, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT, **kwargs):
        arguments = []
        try:
            if transformations:
--- a/apps/converter/backends/python/base.py
+++ b/apps/converter/backends/python/base.py
@@ -25,7 +25,7 @@ class ConverterClass(ConverterBase):
    def get_page_count(self, input_filepath):
        page_count = 1

-        mimetype, encoding = get_mimetype(open(input_filepath, 'rb'), input_filepath)
+        mimetype, encoding = get_mimetype(open(input_filepath, 'rb'), input_filepath, mimetype_only=True)
        if mimetype == 'application/pdf':
            # If file is a PDF open it with slate to determine the page
            # count
@@ -48,9 +48,12 @@ class ConverterClass(ConverterBase):
            
        return page_count

-    def convert_file(self, input_filepath, output_filepath, transformations=None, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT):
+    def convert_file(self, input_filepath, output_filepath, transformations=None, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT, **kwargs):
        tmpfile = None
-        mimetype, encoding = get_mimetype(open(input_filepath, 'rb'), input_filepath)
+        mimetype = kwargs.get('mimetype', None)
+        if not mimetype:
+            mimetype, encoding = get_mimetype(open(input_filepath, 'rb'), input_filepath, mimetype_only=True)
+
        if mimetype == 'application/pdf' and USE_GHOSTSCRIPT:
            # If file is a PDF open it with ghostscript and convert it to
            # TIFF
@@ -64,7 +67,7 @@ class ConverterClass(ConverterBase):
                'gs', '-q', '-dQUIET', '-dSAFER', '-dBATCH',
                '-dNOPAUSE', '-dNOPROMPT', 
                first_page_tmpl, last_page_tmpl,
-                '-sDEVICE=jpeg', '-dJPEGQ=75',
+                '-sDEVICE=jpeg', '-dJPEGQ=95',
                '-r150', output_file_tmpl,
                input_file_tmpl,
                '-c "60000000 setvmthreshold"',  # use 30MB
--- a/apps/converter/conf/settings.py
+++ b/apps/converter/conf/settings.py
@@ -1,4 +1,5 @@
-"""Configuration options for the converter app"""
+'''Configuration options for the converter app'''
+
 from django.utils.translation import ugettext_lazy as _

 from smart_settings.api import register_settings
@@ -12,7 +13,9 @@ register_settings(
        {'name': u'GM_PATH', 'global_name': u'CONVERTER_GM_PATH', 'default': u'/usr/bin/gm', 'description': _(u'File path to graphicsmagick\'s program.'), 'exists': True},
        {'name': u'GM_SETTINGS', 'global_name': u'CONVERTER_GM_SETTINGS', 'default': u''},
        {'name': u'GRAPHICS_BACKEND', 'global_name': u'CONVERTER_GRAPHICS_BACKEND', 'default': u'converter.backends.python', 'description': _(u'Graphics conversion backend to use.  Options are: converter.backends.imagemagick, converter.backends.graphicsmagick and converter.backends.python.')},
-        {'name': u'UNOCONV_PATH', 'global_name': u'CONVERTER_UNOCONV_PATH', 'default': u'/usr/bin/unoconv', 'exists': True},
+        {'name': u'UNOCONV_PATH', 'global_name': u'CONVERTER_UNOCONV_PATH', 'default': u'/usr/bin/unoconv', 'exists': True, 'description': _(u'Path to the unoconv program.')},
+        {'name': u'UNOCONV_USE_PIPE', 'global_name': u'CONVERTER_UNOCONV_USE_PIPE', 'default': True, 'description': _(u'Use alternate method of connection to LibreOffice using a pipe, it is slower but less prone to segmentation faults.')},
+        
        #{'name': u'OCR_OPTIONS', 'global_name': u'CONVERTER_OCR_OPTIONS', 'default': u'-colorspace Gray -depth 8 -resample 200x200'},
        #{'name': u'HIGH_QUALITY_OPTIONS', 'global_name': u'CONVERTER_HIGH_QUALITY_OPTIONS', 'default': u'-density 400'},
        #{'name': u'PRINT_QUALITY_OPTIONS', 'global_name': u'CONVERTER_PRINT_QUALITY_OPTIONS', 'default': u'-density 500'},
--- a/apps/converter/exceptions.py
+++ b/apps/converter/exceptions.py
@@ -29,3 +29,7 @@ class UnkownConvertError(ConvertError):

 class OfficeConversionError(ConvertError):
    pass
+
+
+class OfficeBackendError(OfficeConversionError):
+    """Raised when the office conversion backend cannot be run or fails."""
--- a/apps/converter/office_converter.py
+++ b/apps/converter/office_converter.py
@@ -0,0 +1,105 @@
+import os
+import subprocess
+
+from mimetype.api import get_mimetype
+from common.conf.settings import TEMPORARY_DIRECTORY
+
+from converter.conf.settings import UNOCONV_PATH, UNOCONV_USE_PIPE
+from converter.exceptions import (OfficeConversionError,
+    OfficeBackendError, UnknownFileFormat)
+
+CACHED_FILE_SUFFIX = u'_office_converter'
+    
+CONVERTER_OFFICE_FILE_MIMETYPES = [
+    'application/msword',
+    'application/mswrite',
+    'application/mspowerpoint',
+    'application/msexcel',
+    'application/vnd.ms-excel',
+    'application/vnd.ms-powerpoint',    
+    'text/plain',
+    'application/vnd.oasis.opendocument.presentation',
+    'application/vnd.oasis.opendocument.text',
+    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+    'application/vnd.oasis.opendocument.spreadsheet',
+    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+    'application/vnd.oasis.opendocument.graphics',
+]
+
+
+class OfficeConverter(object):
+    """Convert office documents to PDF, caching the converted file on disk."""
+    def __init__(self):
+        self.backend_class = OfficeConverterBackendUnoconv
+        self.exists = False  # True once a converted file is known to exist
+        self.mimetype = None
+        self.encoding = None
+
+    def mimetypes(self):
+        return CONVERTER_OFFICE_FILE_MIMETYPES
+
+    def convert(self, input_filepath, mimetype=None):
+        """Convert an office file to PDF (reusing a cached copy); raise UnknownFileFormat when the backend fails."""
+        self.input_filepath = input_filepath
+        if mimetype:
+            self.mimetype = mimetype
+        else:
+            with open(self.input_filepath, 'rb') as descriptor:  # binary mode; closed after detection
+                self.mimetype, self.encoding = get_mimetype(descriptor, self.input_filepath, mimetype_only=True)
+        if self.mimetype in CONVERTER_OFFICE_FILE_MIMETYPES:
+            # Cache results of conversion
+            self.output_filepath = os.path.join(TEMPORARY_DIRECTORY, u''.join([self.input_filepath, CACHED_FILE_SUFFIX]))
+            self.exists = os.path.exists(self.output_filepath)
+            if not self.exists:
+                try:
+                    self.backend = self.backend_class()
+                    self.backend.convert(self.input_filepath, self.output_filepath)
+                    self.exists = True
+                except OfficeBackendError, msg:
+                    # convert exception so that at least the mime type icon is displayed
+                    raise UnknownFileFormat(msg)
+
+    def __unicode__(self):
+        return getattr(self, 'output_filepath', None)
+
+    def __str__(self):
+        return str(self.__unicode__())
+
+    def __nonzero__(self):
+        return self.exists
+
+    __bool__ = __nonzero__  # same truth-value hook under its Python 3 name
+    
+
+class OfficeConverterBackendUnoconv(object):
+    """Office conversion backend that shells out to the unoconv program."""
+    def __init__(self):
+        self.unoconv_path = UNOCONV_PATH if UNOCONV_PATH else u'/usr/bin/unoconv'
+        if not os.path.exists(self.unoconv_path):
+            raise OfficeBackendError('cannot find unoconv executable')
+
+    def convert(self, input_filepath, output_filepath):
+        """
+        Executes the program unoconv using subprocess's Popen to write a PDF
+        of input_filepath to output_filepath; raises OfficeBackendError on failure
+        """
+        self.input_filepath = input_filepath
+        self.output_filepath = output_filepath
+
+        command = [self.unoconv_path]
+        if UNOCONV_USE_PIPE:
+            command.extend([u'--pipe', u'mayan'])
+        command.append(u'--format=pdf')
+        command.append(u'--output=%s' % self.output_filepath)
+        command.append(self.input_filepath)
+
+        try:
+            proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
+            # communicate() drains both pipes while waiting; wait() alone can
+            # deadlock once the child process fills a pipe buffer
+            _stdout, errors = proc.communicate()
+        except OSError, msg:
+            raise OfficeBackendError(msg)
+        if proc.returncode != 0:
+            # Report the first line written to stderr as the failure message
+            raise OfficeBackendError(errors.splitlines()[0] if errors else errors)
--- a/apps/document_indexing/init.py
+++ b/apps/document_indexing/init.py
@@ -3,7 +3,7 @@ from django.utils.translation import ugettext_lazy as _
 from navigation.api import register_top_menu, register_sidebar_template, \
    register_links
 from permissions.api import register_permission, set_namespace_title
-from main.api import register_maintenance
+from main.api import register_maintenance_links
 from documents.literals import PERMISSION_DOCUMENT_VIEW
 from documents.models import Document

@@ -24,7 +24,7 @@ register_top_menu('indexes', link={'text': _('indexes'), 'famfam': 'folder_page'

 rebuild_index_instances = {'text': _('rebuild indexes'), 'view': 'rebuild_index_instances', 'famfam': 'folder_page', 'permissions': [PERMISSION_DOCUMENT_INDEXING_REBUILD_INDEXES], 'description': _(u'Deletes and creates from scratch all the document indexes.')}

-register_maintenance(rebuild_index_instances, namespace='document_indexing', title=_(u'Indexes'))
+register_maintenance_links([rebuild_index_instances], namespace='document_indexing', title=_(u'Indexes'))

 register_sidebar_template(['index_instance_list'], 'indexing_help.html')

--- a/apps/documents/init.py
+++ b/apps/documents/init.py
@@ -6,7 +6,7 @@ from common.utils import validate_path, encapsulate
 from navigation.api import register_links, register_top_menu, \
    register_model_list_columns, register_multi_item_links, \
    register_sidebar_template
-from main.api import register_diagnostic, register_maintenance
+from main.api import register_diagnostic, register_maintenance_links
 from permissions.api import register_permission, set_namespace_title
 from tags.widgets import get_tags_inline_widget_simple
 from history.api import register_history_type
@@ -29,7 +29,6 @@ from documents.conf.settings import ZOOM_MIN_LEVEL
 from documents.conf import settings as document_settings
 from documents.widgets import document_thumbnail

-
 # Document page links expressions
 def is_first_page(context):
    return context['page'].page_number <= 1
@@ -81,6 +80,7 @@ document_preview = {'text': _(u'preview'), 'class': 'fancybox', 'view': 'documen
 document_download = {'text': _(u'download'), 'view': 'document_download', 'args': 'object.id', 'famfam': 'page_save', 'permissions': [PERMISSION_DOCUMENT_DOWNLOAD]}
 document_find_duplicates = {'text': _(u'find duplicates'), 'view': 'document_find_duplicates', 'args': 'object.id', 'famfam': 'page_refresh', 'permissions': [PERMISSION_DOCUMENT_VIEW]}
 document_find_all_duplicates = {'text': _(u'find all duplicates'), 'view': 'document_find_all_duplicates', 'famfam': 'page_refresh', 'permissions': [PERMISSION_DOCUMENT_VIEW], 'description': _(u'Search all the documents\' checksums and return a list of the exact matches.')}
+document_update_page_count = {'text': _(u'update office documents\' page count'), 'view': 'document_update_page_count', 'famfam': 'page_white_csharp', 'permissions': [PERMISSION_DOCUMENT_TOOLS], 'description': _(u'Update the page count of the office type documents.  This is useful when enabling office document support after there were already office type documents in the database.')}
 document_clear_transformations = {'text': _(u'clear transformations'), 'view': 'document_clear_transformations', 'args': 'object.id', 'famfam': 'page_paintbrush', 'permissions': [PERMISSION_DOCUMENT_TRANSFORM]}
 document_multiple_clear_transformations = {'text': _(u'clear transformations'), 'view': 'document_multiple_clear_transformations', 'famfam': 'page_paintbrush', 'permissions': [PERMISSION_DOCUMENT_TRANSFORM]}
 document_print = {'text': _(u'print'), 'view': 'document_print', 'args': 'object.id', 'famfam': 'printer', 'permissions': [PERMISSION_DOCUMENT_VIEW]}
@@ -158,17 +158,16 @@ register_links(['document_page_transformation_edit', 'document_page_transformati

 register_diagnostic('documents', _(u'Documents'), document_missing_list)

-register_maintenance(document_find_all_duplicates, namespace='documents', title=_(u'documents'))
+register_maintenance_links([document_find_all_duplicates, document_update_page_count], namespace='documents', title=_(u'documents'))

-
-def document_exists(document):
-    try:
-        if document.exists():
-            return u'<span class="famfam active famfam-tick"></span>'
-        else:
-            return u'<span class="famfam active famfam-cross"></span>'
-    except Exception, exc:
-        return exc
+#def document_exists(document):
+#    try:
+#        if document.exists():
+#            return u'<span class="famfam active famfam-tick"></span>'
+#        else:
+#            return u'<span class="famfam active famfam-cross"></span>'
+#    except Exception, exc:
+#        return exc

 register_model_list_columns(Document, [
        {'name':_(u'thumbnail'), 'attribute':
--- a/apps/documents/models.py
+++ b/apps/documents/models.py
@@ -194,6 +194,10 @@ class Document(models.Model):
            self.save()

        return detected_pages
+        
+    @property
+    def page_count(self):
+        return self.documentpage_set.count()

    def save_to_file(self, filepath, buffer_size=1024 * 1024):
        """
@@ -246,7 +250,7 @@ class Document(models.Model):
            return cache_file_path
        else:
            document_file = document_save_to_temp_dir(self, self.checksum)
-            return convert(document_file, output_filepath=cache_file_path, page=page, transformations=transformations)
+            return convert(document_file, output_filepath=cache_file_path, page=page, transformations=transformations, mimetype=self.file_mimetype)

    def get_valid_image(self, size=DISPLAY_SIZE, page=DEFAULT_PAGE_NUMBER, zoom=DEFAULT_ZOOM_LEVEL, rotation=DEFAULT_ROTATION):
        image_cache_name = self.get_image_cache_name(page=page)
@@ -274,6 +278,13 @@ class Document(models.Model):
    def delete(self, *args, **kwargs):
        super(Document, self).delete(*args, **kwargs)
        return self.file.storage.delete(self.file.path)
+        
+    @property
+    def size(self):
+        """Size reported by the file's storage, or None when exists() is false."""
+        if not self.exists():
+            return None
+        return self.file.storage.size(self.file.path)
            

 class DocumentTypeFilename(models.Model):
--- a/apps/documents/static/images/icons/page_white_csharp.png
+++ b/apps/documents/static/images/icons/page_white_csharp.png
--- a/apps/documents/urls.py
+++ b/apps/documents/urls.py
@@ -32,6 +32,7 @@ urlpatterns = patterns('documents.views',

    url(r'^multiple/clear_transformations/$', 'document_multiple_clear_transformations', (), 'document_multiple_clear_transformations'),
    url(r'^duplicates/list/$', 'document_find_all_duplicates', (), 'document_find_all_duplicates'),
+    url(r'^maintenance/update_page_count/$', 'document_update_page_count', (), 'document_update_page_count'),

    url(r'^page/(?P<document_page_id>\d+)/$', 'document_page_view', (), 'document_page_view'),
    url(r'^page/(?P<document_page_id>\d+)/text/$', 'document_page_text', (), 'document_page_text'),
--- a/apps/documents/views.py
+++ b/apps/documents/views.py
@@ -19,6 +19,7 @@ from common.literals import PAGE_SIZE_DIMENSIONS, \
 from common.conf.settings import DEFAULT_PAPER_SIZE
 from converter.literals import DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION, \
    DEFAULT_PAGE_NUMBER
+from converter.office_converter import OfficeConverter
 from filetransfers.api import serve_file
 from metadata.forms import MetadataFormSet, MetadataSelectionForm
 from navigation.utils import resolve_to_name
@@ -40,7 +41,7 @@ from documents.literals import PERMISSION_DOCUMENT_CREATE, \
    PERMISSION_DOCUMENT_VIEW, \
    PERMISSION_DOCUMENT_DELETE, PERMISSION_DOCUMENT_DOWNLOAD, \
    PERMISSION_DOCUMENT_TRANSFORM, \
-    PERMISSION_DOCUMENT_EDIT
+    PERMISSION_DOCUMENT_EDIT, PERMISSION_DOCUMENT_TOOLS
 from documents.literals import HISTORY_DOCUMENT_CREATED, \
    HISTORY_DOCUMENT_EDITED, HISTORY_DOCUMENT_DELETED

@@ -117,14 +118,14 @@ def document_view(request, document_id, advanced=False):
            {'label': _(u'File extension'), 'field': 'file_extension'},
            {'label': _(u'File mimetype'), 'field': 'file_mimetype'},
            {'label': _(u'File mime encoding'), 'field': 'file_mime_encoding'},
-            {'label': _(u'File size'), 'field':lambda x: pretty_size(x.file.storage.size(x.file.path)) if x.exists() else '-'},
+            {'label': _(u'File size'), 'field':lambda x: pretty_size(x.size) if x.size else '-'},
            {'label': _(u'Exists in storage'), 'field': 'exists'},
            {'label': _(u'File path in storage'), 'field': 'file'},
            {'label': _(u'Date added'), 'field':lambda x: x.date_added.date()},
            {'label': _(u'Time added'), 'field':lambda x: unicode(x.date_added.time()).split('.')[0]},
            {'label': _(u'Checksum'), 'field': 'checksum'},
            {'label': _(u'UUID'), 'field': 'uuid'},
-            {'label': _(u'Pages'), 'field': lambda x: x.documentpage_set.count()},
+            {'label': _(u'Pages'), 'field': 'page_count'},
        ])

        subtemplates_list.append(
@@ -471,6 +472,37 @@ def document_find_all_duplicates(request):
    return _find_duplicate_list(request, include_source=True)


+def document_update_page_count(request):
+    """Ask for confirmation, then update the page count of the office type documents."""
+    check_permissions(request.user, [PERMISSION_DOCUMENT_TOOLS])
+
+    referer = request.META.get('HTTP_REFERER', '/')
+    previous = request.POST.get('previous', request.GET.get('previous', referer))
+    office_mimetypes = OfficeConverter().mimetypes()
+    documents = Document.objects.exclude(file_extension__iendswith='dxf').filter(file_mimetype__in=office_mimetypes)
+
+    if request.method != 'POST':
+        # Anything but a POST gets the confirmation form
+        return render_to_response('generic_confirm.html', {
+            'previous': previous,
+            'title': _(u'Are you sure you wish to update the page count for the office documents (%d)?') % documents.count(),
+            'message': _(u'On large databases this operation may take some time to execute.'),
+            'form_icon': u'page_white_csharp.png',
+        }, context_instance=RequestContext(request))
+
+    # Confirmed: recount every document and tally how many counts changed
+    processed = changed = 0
+    for document in documents:
+        page_count_before = document.page_count
+        document.update_page_count()
+        processed += 1
+        changed += int(page_count_before != document.page_count)
+
+    summary = {'total': processed, 'change': changed}
+    messages.success(request, _(u'Page count update complete.  Documents processed: %(total)d, documents with changed page count: %(change)d') % summary)
+    return HttpResponseRedirect(previous)
+
+
 def document_clear_transformations(request, document_id=None, document_id_list=None):
    check_permissions(request.user, [PERMISSION_DOCUMENT_TRANSFORM])

--- a/apps/documents/widgets.py
+++ b/apps/documents/widgets.py
@@ -48,7 +48,7 @@ def document_html_widget(document, size='document_thumbnail', click_view=None, p
        if click_view:
            result.append('</a>')
        result.append('</div>')
-    except UnknownFileFormat, UnkownConvertError:
+    except (UnknownFileFormat, UnkownConvertError):
        result.append('<div class="tc">')
        result.append('<img class="lazy-load" data-href="%s" src="%s/images/ajax-loader.gif" alt="%s" />' % (preview_view, settings.STATIC_URL, alt_text))
        result.append('<noscript><img src="%s" alt="%s" /></noscript>' % (preview_view, alt_text))
--- a/apps/main/api.py
+++ b/apps/main/api.py
@@ -14,9 +14,10 @@ def register_diagnostic(namespace, title, link):
    diagnostics[namespace] = namespace_dict


-def register_maintenance(link, title=None, namespace=None):
+def register_maintenance_links(links, title=None, namespace=None):
    namespace_dict = tools.get(namespace, {'title': None, 'links': []})
    namespace_dict['title'] = title
-    link['url'] = link.get('url', reverse_lazy(link['view']))
-    namespace_dict['links'].append(link)
+    for link in links:
+        link['url'] = link.get('url', reverse_lazy(link['view']))
+        namespace_dict['links'].append(link)
    tools[namespace] = namespace_dict
--- a/apps/main/views.py
+++ b/apps/main/views.py
@@ -24,14 +24,14 @@ def home(request):
 def maintenance_menu(request):
    user_tools = {}
    for namespace, values in tools.items():
+        user_tools[namespace] = {
+            'title': values['title']
+            }
+        user_tools[namespace].setdefault('links', [])
        for link in values['links']:
            try:
                permissions = link.get('permissions', [])
                check_permissions(request.user, permissions)
-                user_tools[namespace] = {
-                    'title': values['title']
-                    }
-                user_tools[namespace].setdefault('links', [])
                user_tools[namespace]['links'].append(link)
            except PermissionDenied:
                pass
--- a/apps/mimetype/api.py
+++ b/apps/mimetype/api.py
@@ -84,7 +84,7 @@ def get_error_icon_file_path():
        return os.path.join(settings.STATIC_ROOT, MIMETYPE_ICONS_DIRECTORY_NAME, ERROR_FILE_NAME)

   
-def get_mimetype(file_description, filepath):
+def get_mimetype(file_description, filepath, mimetype_only=False):
    """
    Determine a file's mimetype by calling the system's libmagic
    library via python-magic or fallback to use python's mimetypes
@@ -95,9 +95,10 @@ def get_mimetype(file_description, filepath):
    if USE_PYTHON_MAGIC:
        mime = magic.Magic(mime=True)
        file_mimetype = mime.from_buffer(file_description.read())
-        file_description.seek(0)
-        mime_encoding = magic.Magic(mime_encoding=True)
-        file_mime_encoding = mime_encoding.from_buffer(file_description.read())
+        if not mimetype_only:
+            file_description.seek(0)
+            mime_encoding = magic.Magic(mime_encoding=True)
+            file_mime_encoding = mime_encoding.from_buffer(file_description.read())
    else:
        path, filename = os.path.split(filepath)
        file_mimetype, file_mime_encoding = mimetypes.guess_type(filename)
--- a/apps/ocr/init.py
+++ b/apps/ocr/init.py
@@ -14,7 +14,7 @@ from django.db.models.signals import post_save
 from navigation.api import register_links, register_top_menu, register_multi_item_links
 from permissions.api import register_permission, set_namespace_title
 from documents.models import Document
-from main.api import register_maintenance
+from main.api import register_maintenance_links
 from project_tools.api import register_tool

 from scheduler.api import register_interval_job
@@ -71,7 +71,7 @@ register_multi_item_links(['queue_document_list'], [re_queue_multiple_document,
 register_links(['setup_queue_transformation_create', 'setup_queue_transformation_edit', 'setup_queue_transformation_delete', 'document_queue_disable', 'document_queue_enable', 'queue_document_list', 'node_active_list', 'setup_queue_transformation_list'], [queue_document_list, node_active_list], menu_name='secondary_menu')
 register_links(['setup_queue_transformation_edit', 'setup_queue_transformation_delete', 'setup_queue_transformation_list', 'setup_queue_transformation_create'], [setup_queue_transformation_create], menu_name='sidebar')

-register_maintenance(all_document_ocr_cleanup, namespace='ocr', title=_(u'OCR'))
+register_maintenance_links([all_document_ocr_cleanup], namespace='ocr', title=_(u'OCR'))


@transaction.commit_manually
--- a/apps/scheduler/init.py
+++ b/apps/scheduler/init.py
@@ -1,4 +1 @@
-from apscheduler.scheduler import Scheduler

-scheduler = Scheduler()
-scheduler.start()
--- a/apps/scheduler/api.py
+++ b/apps/scheduler/api.py
@@ -1,4 +1,4 @@
-from scheduler import scheduler
+from scheduler.runtime import scheduler
 from scheduler.exceptions import AlreadyScheduled

 registered_jobs = {}
--- a/apps/scheduler/runtime.py
+++ b/apps/scheduler/runtime.py
@@ -0,0 +1,4 @@
+from apscheduler.scheduler import Scheduler
+
+scheduler = Scheduler()
+scheduler.start()