class ConvertError(Exception):
    '''
    Raised when the external convert program reports a failure.

    status  -- exit status returned by the convert process
    message -- error text taken from what the process wrote to stderr
    '''
    def __init__(self, status, message):
        self.status = status
        self.message = message


def get_errors(error_string):
    '''
    Return the first line of error_string (the stderr output of a child
    process).

    An empty string is returned when error_string contains no lines, so
    a process that fails without printing anything does not trigger an
    IndexError here.
    '''
    lines = error_string.splitlines()
    if not lines:
        return ''
    return lines[0]


def execute_convert(input_filepath, arguments, output_filepath):
    '''
    Run the program at CONVERT_PATH as
    "<CONVERT_PATH> <input_filepath> <arguments...> <output_filepath>".

    arguments is a single option string; it is split into separate
    argv entries with shell-like quoting rules (shlex.split), no shell
    is involved.

    Returns a (exit_status, stderr_output) tuple.
    '''
    command = [CONVERT_PATH, input_filepath]
    command.extend(shlex.split(str(arguments)))
    command.append(output_filepath)

    proc = subprocess.Popen(command, stderr=subprocess.PIPE)
    # communicate() keeps draining stderr while it waits for the child.
    # Calling wait() first and reading afterwards can block forever once
    # the child has filled the pipe buffer with diagnostics.
    unused_stdout, error_output = proc.communicate()
    return (proc.returncode, error_output)
#TODO: slugify OCR_OPTIONS and add to file name to cache
def convert_document_for_ocr(document, page=0, format='tif'):
    '''
    Render one page of document into an image file suitable for OCR.

    The document's stored file is first copied to a temporary file, then
    converted with the OCR_OPTIONS arguments via execute_convert().

    document -- object exposing .file (with .open(), .storage and .path)
                and .uuid
    page     -- index of the page to extract; passed to the converter
                as "<input file>[<page>]"
    format   -- file extension of the generated image

    Returns the path of the generated image, located inside
    TEMPORARY_DIRECTORY.

    Raises ConvertError when the converter exits with a non-zero status.
    '''
    #Extract document file
    document.file.open()
    desc = document.file.storage.open(document.file.path)
    input_filepath = from_descriptor_to_tempfile(desc, document.uuid)

    #Convert for OCR
    temp_filename = os.path.splitext(os.path.basename(input_filepath))[0]
    temp_path = os.path.join(TEMPORARY_DIRECTORY, temp_filename)
    output_arg = '%s_ocr%s%s%s' % (temp_path, page, os.extsep, format)
    input_arg = '%s[%s]' % (input_filepath, page)

    status, error_string = execute_convert(input_arg, OCR_OPTIONS, output_arg)
    if status:
        # Let the failure reach the caller. The previous
        # "finally: return output_arg" silently discarded this exception
        # and handed back the path of a file that was never created.
        raise ConvertError(status, get_errors(error_string))

    return output_arg
django.utils.translation import ugettext_lazy as _ from common.api import register_links, register_menu @@ -5,6 +7,9 @@ from common.api import register_links, register_menu from models import Document from staging import StagingFile +from documents.conf import settings as documents_settings + + document_list = {'text':_(u'documents list'), 'view':'document_list', 'famfam':'page'} document_create = {'text':_('upload a document'), 'view':'document_create', 'famfam':'page_add'} document_create_multiple = {'text':_('upload multiple documents'), 'view':'document_create_multiple', 'famfam':'page_add'} @@ -28,3 +33,4 @@ register_menu([ document_list ],'famfam':'page','position':4}]) +TEMPORARY_DIRECTORY = documents_settings.TEMPORARY_DIRECTORY if documents_settings.TEMPORARY_DIRECTORY else tempfile.mkdtemp() diff --git a/apps/documents/admin.py b/apps/documents/admin.py index 660d8b781b..3347d39e8d 100644 --- a/apps/documents/admin.py +++ b/apps/documents/admin.py @@ -2,7 +2,7 @@ from django.contrib import admin from models import MetadataType, DocumentType, Document, \ DocumentTypeMetadataType, DocumentMetadata, DocumentTypeFilename, \ - MetadataIndex, DocumentMetadataIndex + MetadataIndex, DocumentMetadataIndex, DocumentPage class MetadataTypeAdmin(admin.ModelAdmin): @@ -49,9 +49,15 @@ class DocumentMetadataIndexInline(admin.StackedInline): allow_add = True readonly_fields = ('metadata_index', 'filename') +class DocumentPageInline(admin.StackedInline): + model = DocumentPage + extra = 1 + classes = ('collapse-open',) + allow_add = True + class DocumentAdmin(admin.ModelAdmin): - inlines = [DocumentMetadataInline, DocumentMetadataIndexInline] + inlines = [DocumentMetadataInline, DocumentMetadataIndexInline, DocumentPageInline] list_display = ('uuid', 'file_filename', 'file_extension') diff --git a/apps/documents/conf/settings.py b/apps/documents/conf/settings.py index 8381a95809..27fc6575d2 100644 --- a/apps/documents/conf/settings.py +++ b/apps/documents/conf/settings.py 
@@ -40,3 +40,5 @@ FILESYSTEM_SLUGIFY_PATHS = getattr(settings, 'DOCUMENTS_SLUGIFY_PATHS', False) FILESYSTEM_MAX_RENAME_COUNT = getattr(settings, 'DOCUMENTS_FILESYSTEM_MAX_RENAME_COUNT', 200) #misc TEMPORARY_DIRECTORY = getattr(settings, 'DOCUMENTS_TEMPORARY_DIRECTORY', u'/tmp') + + diff --git a/apps/documents/forms.py b/apps/documents/forms.py index c63a0edf7d..3ca7498305 100644 --- a/apps/documents/forms.py +++ b/apps/documents/forms.py @@ -35,6 +35,7 @@ class DocumentForm(forms.ModelForm): class Meta: model = Document + exclude = ('description',) class DocumentForm_view(DetailForm): diff --git a/apps/documents/models.py b/apps/documents/models.py index 74931ffb29..a9479c7de0 100644 --- a/apps/documents/models.py +++ b/apps/documents/models.py @@ -65,6 +65,7 @@ class Document(models.Model): date_added = models.DateTimeField(verbose_name=_(u'added'), auto_now_add=True) date_updated = models.DateTimeField(verbose_name=_(u'updated'), auto_now=True) checksum = models.TextField(blank=True, null=True, verbose_name=_(u'checksum'), editable=False) + description = models.TextField(blank=True, null=True, verbose_name=_(u'description')) class Meta: verbose_name = _(u'document') @@ -98,7 +99,7 @@ class Document(models.Model): #topics/db/queries.html#topics-db-queries-delete self.delete_fs_links() super(Document, self).delete(*args, **kwargs) - + def create_fs_links(self): if FILESYSTEM_FILESERVING_ENABLE: metadata_dict = {'document':self} @@ -292,6 +293,20 @@ class DocumentTypeFilename(models.Model): ordering = ['filename'] verbose_name = _(u'document type filename') verbose_name_plural = _(u'document types filenames') - -register(Document, _(u'document'), ['document_type__name', 'file_mimetype', 'file_filename', 'file_extension', 'documentmetadata__value']) + +class DocumentPage(models.Model): + document = models.ForeignKey(Document, verbose_name=_(u'document')) + content = models.TextField(blank=True, null=True, verbose_name=_(u'content')) + page_label = 
models.CharField(max_length=32, blank=True, null=True, verbose_name=_(u'page label')) + page_number = models.PositiveIntegerField(default=0, verbose_name=_(u'page number')) + + def __unicode__(self): + return '%s - %s' % (self.page_number, self.page_label) + + class Meta: + verbose_name = _(u'document page') + verbose_name_plural = _(u'document pages') + + +register(Document, _(u'document'), ['document_type__name', 'file_mimetype', 'file_filename', 'file_extension', 'documentmetadata__value', 'documentpage__content']) diff --git a/apps/documents/utils.py b/apps/documents/utils.py index 5a0e01b042..3d61962159 100644 --- a/apps/documents/utils.py +++ b/apps/documents/utils.py @@ -1,7 +1,7 @@ import os import tempfile -from documents.conf.settings import TEMPORARY_DIRECTORY +from documents import TEMPORARY_DIRECTORY #http://stackoverflow.com/questions/123198/how-do-i-copy-a-file-in-python diff --git a/apps/documents/views.py b/apps/documents/views.py index e9b19a7ed6..7969fdb6cd 100644 --- a/apps/documents/views.py +++ b/apps/documents/views.py @@ -292,7 +292,6 @@ def get_document_image(request, document_id, size=PREVIEW_SIZE): desc = document.file.storage.open(document.file.path) filepath = from_descriptor_to_tempfile(desc, document.uuid) output_file = convert(filepath, size) - print document_id, output_file return serve_file(request, File(file=open(output_file, 'r'))) except Exception, e: if size == THUMBNAIL_SIZE: @@ -328,5 +327,5 @@ def staging_file_preview(request, staging_file_id): output_file = convert(filepath, STAGING_FILES_PREVIEW_SIZE) return serve_file(request, File(file=open(output_file, 'r'))) except Exception, e: - #messages.error(request, e) - return HttpResponse('') + return serve_file(request, File(file=open('%simages/1297211435_error.png' % settings.MEDIA_ROOT, 'r'))) + diff --git a/apps/ocr/__init__.py b/apps/ocr/__init__.py new file mode 100644 index 0000000000..d0dd088682 --- /dev/null +++ b/apps/ocr/__init__.py @@ -0,0 +1,15 @@ +from 
#Some code from http://wiki.github.com/hoffstaetter/python-tesseract

def cleanup(filename):
    '''
    Try to remove the given filename.

    Best effort only: an OSError (for instance because the file does
    not exist) is deliberately ignored.
    '''
    try:
        os.remove(filename)
    except OSError:
        pass


class TesseractError(Exception):
    '''
    Raised when the tesseract program reports a failure.

    status  -- exit status returned by the tesseract process
    message -- error text taken from what the process wrote to stderr
    '''
    def __init__(self, status, message):
        self.status = status
        self.message = message


def run_tesseract(input_filename, output_filename_base, lang=None):
    '''
    Run the program at TESSERACT_PATH as
    "<TESSERACT_PATH> <input_filename> <output_filename_base> [-l <lang>]".

    The "-l <lang>" option is only added when lang is not None.

    Returns a (exit_status, stderr_output) tuple.
    '''
    command = [TESSERACT_PATH, input_filename, output_filename_base]
    if lang is not None:
        command += ['-l', lang]

    proc = subprocess.Popen(command, stderr=subprocess.PIPE)
    # communicate() drains stderr while waiting for the child; wait()
    # followed by read() can block forever on a full pipe buffer.
    unused_stdout, error_output = proc.communicate()
    return (proc.returncode, error_output)


def ocr_document(document):
    '''
    Run OCR on document and store the recognized text.

    For every page (currently a single page, total_pages is fixed at 1)
    the page is converted to an image, tesseract is run on it and the
    resulting text is saved in the DocumentPage for that document and
    page number, which is created when it does not exist yet.

    Raises TesseractError when tesseract exits with a non-zero status.
    All temporary files are removed whether or not the page succeeded.
    '''
    # Imported here because this module's import block does not provide
    # get_errors; converter.api is already a dependency of this module.
    from converter.api import get_errors

    total_pages = 1
    page = 0
    while page < total_pages:
        imagefile = convert_document_for_ocr(document, page=page)
        descriptor, filepath = tempfile.mkstemp()
        # Only the unique name is needed; close the descriptor returned
        # by mkstemp() so it is not leaked on every page.
        os.close(descriptor)
        # The text is read from "<filepath>.txt", i.e. the output base
        # name handed to tesseract plus the txt extension.
        ocr_output = os.extsep.join([filepath, 'txt'])
        try:
            status, error_string = run_tesseract(imagefile, filepath)
            if status:
                raise TesseractError(status, get_errors(error_string))

            # Reached only on success, so a tesseract failure is no
            # longer masked by an IOError for the missing output file.
            f = open(ocr_output)
            try:
                content = f.read().strip()
            finally:
                f.close()

            document_page, created = DocumentPage.objects.get_or_create(
                document=document, page_number=page)
            document_page.content = content
            document_page.page_label = _(u'Text from OCR')
            document_page.save()
        finally:
            cleanup(filepath)
            cleanup(ocr_output)
            cleanup(imagefile)

        page += 1
def submit_document(request, document_id):
    '''
    Run OCR on the document with primary key document_id and send the
    user back to the page the request came from.

    Responds with a 404 when no such document exists. The outcome is
    reported through the messages framework: a success message, or the
    text of whatever exception ocr_document() raised.
    '''
    import sys  # only needed to fetch the exception being handled

    document = get_object_or_404(Document, pk=document_id)

    # The Referer header is optional; fall back to the site root rather
    # than failing with a KeyError when the client did not send it.
    previous = request.META.get('HTTP_REFERER', '/')

    try:
        ocr_document(document)
    except Exception:
        # sys.exc_info() is used instead of binding the exception in the
        # except clause so this code parses on any Python 2 or 3 version.
        error = sys.exc_info()[1]
        # Not every exception carries a non-empty .message attribute;
        # fall back to the exception's own text so the user always sees
        # something meaningful.
        messages.error(request, getattr(error, 'message', None) or u'%s' % error)
    else:
        messages.success(request, _(u'Document OCR was successful.'))

    return HttpResponseRedirect(previous)
'filetransfers', 'converter', + 'ocr', ) TEMPLATE_CONTEXT_PROCESSORS = ( @@ -191,6 +192,7 @@ LOGIN_EXEMPT_URLS = ( #DOCUMENTS_FILESYSTEM_MAX_RENAME_COUNT = 200 # Misc #DOCUMENTS_TEMPORARY_DIRECTORY = u'/tmp' +#CONVERTER_CONVERT_PATH = u'/usr/bin/convert' #======== End of configuration options ======= try: diff --git a/urls.py b/urls.py index e3a9d4662a..9ba2632073 100644 --- a/urls.py +++ b/urls.py @@ -9,6 +9,7 @@ urlpatterns = patterns('', (r'^', include('main.urls')), (r'^documents/', include('documents.urls')), (r'^search/', include('dynamic_search.urls')), + (r'^ocr/', include('ocr.urls')), (r'^admin/doc/', include('django.contrib.admindocs.urls')), (r'^admin/', include(admin.site.urls)), )