Issue #87, Per document language selection

This commit is contained in:
Roberto Rosario
2014-10-22 02:35:16 -04:00
parent 9b1b89cc21
commit e8762e4792
13 changed files with 167 additions and 32 deletions

View File

@@ -127,6 +127,15 @@ Default: ``image_cache`` (inside the `media` folder)
The path where the visual representations of the documents are stored for fast display.
.. setting:: DOCUMENTS_LANGUAGE
**DOCUMENTS_LANGUAGE**
Default: ``eng``
Default language selection when creating a document.
Converter
=========
.. setting:: CONVERTER_GRAPHICS_BACKEND
@@ -279,15 +288,6 @@ File path to the ``tesseract`` executable, used to perform OCR on document
page's images.
.. setting:: OCR_TESSERACT_LANGUAGE
**OCR_TESSERACT_LANGUAGE**
Default: ``eng``
Language code passed to the ``tesseract`` executable.
.. setting:: OCR_UNPAPER_PATH
**OCR_UNPAPER_PATH**

View File

@@ -178,6 +178,7 @@ class DocumentForm_edit(DocumentForm):
if kwargs['instance'].latest_version:
self.fields.pop('version_update')
self.fields.pop('comment')
self.fields['language'].initial = kwargs['instance'].language
else:
self.fields.pop('new_filename')
@@ -190,7 +191,7 @@ class DocumentPropertiesForm(DetailForm):
"""
class Meta:
model = Document
fields = ('document_type', 'description',)
fields = ('document_type', 'description', 'language')
class DocumentContentForm(forms.Form):

View File

@@ -12,6 +12,8 @@ try:
except ImportError:
from StringIO import StringIO
import pycountry
from django.db import models
from django.contrib.auth.models import User
from django.core.exceptions import ValidationError
@@ -35,7 +37,7 @@ from .literals import (VERSION_UPDATE_MAJOR, VERSION_UPDATE_MICRO,
from .managers import (DocumentPageTransformationManager, DocumentTypeManager,
RecentDocumentManager)
from .runtime import storage_backend
from .settings import (CACHE_PATH, CHECKSUM_FUNCTION, DISPLAY_SIZE,
from .settings import (CACHE_PATH, CHECKSUM_FUNCTION, DISPLAY_SIZE, LANGUAGE,
UUID_FUNCTION, ZOOM_MAX_LEVEL, ZOOM_MIN_LEVEL)
HASH_FUNCTION = lambda x: hashlib.sha256(x).hexdigest() # document image cache name hash function
@@ -83,6 +85,10 @@ class Document(models.Model):
document_type = models.ForeignKey(DocumentType, verbose_name=_(u'Document type'), related_name='documents')
description = models.TextField(blank=True, null=True, verbose_name=_(u'Description'))
date_added = models.DateTimeField(verbose_name=_(u'Added'), auto_now_add=True)
language = models.CharField(
choices=[(i.bibliographic, i.name) for i in list(pycountry.languages)],
default=LANGUAGE, max_length=8, verbose_name=_('Language')
)
@staticmethod
def clear_image_cache():
@@ -338,6 +344,8 @@ class DocumentVersion(models.Model):
file = models.FileField(upload_to=get_filename_from_uuid, storage=storage_backend, verbose_name=_(u'File'))
mimetype = models.CharField(max_length=255, null=True, blank=True, editable=False)
encoding = models.CharField(max_length=64, null=True, blank=True, editable=False)
# TODO: move filename to Document model; it should not be a version's field
filename = models.CharField(max_length=255, default=u'', editable=False, db_index=True)
checksum = models.TextField(blank=True, null=True, verbose_name=_(u'Checksum'), editable=False)

View File

@@ -42,5 +42,6 @@ register_settings(
{'name': u'ROTATION_STEP', 'global_name': u'DOCUMENTS_ROTATION_STEP', 'default': 90, 'description': _(u'Amount in degrees to rotate a document page per user interaction.')},
#
{'name': u'CACHE_PATH', 'global_name': u'DOCUMENTS_CACHE_PATH', 'default': os.path.join(settings.MEDIA_ROOT, 'image_cache'), 'exists': True},
{'name': u'LANGUAGE', 'global_name': u'DOCUMENTS_LANGUAGE', 'default': u'eng', 'description': _('Default documents language (in ISO639-2 format).')},
]
)

View File

@@ -0,0 +1,121 @@
# -*- coding: utf-8 -*-
from south.utils import datetime_utils as datetime
from south.db import db
from south.v2 import SchemaMigration
from django.db import models
class Migration(SchemaMigration):
    """South schema migration that adds the ``language`` column to documents.

    ``forwards`` creates the column on the ``documents_document`` table and
    ``backwards`` drops it again. The ``models`` dictionary below is the
    snapshot of model definitions that accompanies this migration.
    """

    def forwards(self, orm):
        """Apply the migration: add ``Document.language``.

        The column is a ``CharField`` with ``max_length=8`` and a default of
        ``u'eng'``.
        """
        # Adding field 'Document.language'
        # NOTE(review): per South's ``db.add_column`` documentation,
        # ``keep_default=False`` uses the default only to populate existing
        # rows and does not keep it as a database-level default -- confirm
        # against the South version in use.
        db.add_column(u'documents_document', 'language',
                      self.gf('django.db.models.fields.CharField')(default=u'eng', max_length=8),
                      keep_default=False)

    def backwards(self, orm):
        """Revert the migration: drop the ``language`` column."""
        # Deleting field 'Document.language'
        db.delete_column(u'documents_document', 'language')

    # Frozen model definitions, keyed by ``app_label.model_name``. Each field
    # maps to a (field class path, positional args, keyword args) triple.
    # This block looks auto-generated; do not edit it by hand.
    models = {
        u'auth.group': {
            'Meta': {'object_name': 'Group'},
            u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'name': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '80'}),
            'permissions': ('django.db.models.fields.related.ManyToManyField', [], {'to': u"orm['auth.Permission']", 'symmetrical': 'False', 'blank': 'True'})
        },
        u'auth.permission': {
            'Meta': {'ordering': "(u'content_type__app_label', u'content_type__model', u'codename')", 'unique_together': "((u'content_type', u'codename'),)", 'object_name': 'Permission'},
            'codename': ('django.db.models.fields.CharField', [], {'max_length': '100'}),
            'content_type': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['contenttypes.ContentType']"}),
            u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'name': ('django.db.models.fields.CharField', [], {'max_length': '50'})
        },
        u'auth.user': {
            'Meta': {'object_name': 'User'},
            'date_joined': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}),
            'email': ('django.db.models.fields.EmailField', [], {'max_length': '75', 'blank': 'True'}),
            'first_name': ('django.db.models.fields.CharField', [], {'max_length': '30', 'blank': 'True'}),
            'groups': ('django.db.models.fields.related.ManyToManyField', [], {'symmetrical': 'False', 'related_name': "u'user_set'", 'blank': 'True', 'to': u"orm['auth.Group']"}),
            u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'is_active': ('django.db.models.fields.BooleanField', [], {'default': 'True'}),
            'is_staff': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
            'is_superuser': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
            'last_login': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}),
            'last_name': ('django.db.models.fields.CharField', [], {'max_length': '30', 'blank': 'True'}),
            'password': ('django.db.models.fields.CharField', [], {'max_length': '128'}),
            'user_permissions': ('django.db.models.fields.related.ManyToManyField', [], {'symmetrical': 'False', 'related_name': "u'user_set'", 'blank': 'True', 'to': u"orm['auth.Permission']"}),
            'username': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '30'})
        },
        u'contenttypes.contenttype': {
            'Meta': {'ordering': "('name',)", 'unique_together': "(('app_label', 'model'),)", 'object_name': 'ContentType', 'db_table': "'django_content_type'"},
            'app_label': ('django.db.models.fields.CharField', [], {'max_length': '100'}),
            u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'model': ('django.db.models.fields.CharField', [], {'max_length': '100'}),
            'name': ('django.db.models.fields.CharField', [], {'max_length': '100'})
        },
        u'documents.document': {
            'Meta': {'ordering': "['-date_added']", 'object_name': 'Document'},
            'date_added': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'}),
            'description': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}),
            'document_type': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'documents'", 'to': u"orm['documents.DocumentType']"}),
            u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            # The field introduced by this migration.
            'language': ('django.db.models.fields.CharField', [], {'default': "u'eng'", 'max_length': '8'}),
            'uuid': ('django.db.models.fields.CharField', [], {'max_length': '48', 'blank': 'True'})
        },
        u'documents.documentpage': {
            'Meta': {'ordering': "['page_number']", 'object_name': 'DocumentPage'},
            'content': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}),
            'document_version': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'pages'", 'to': u"orm['documents.DocumentVersion']"}),
            u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'page_label': ('django.db.models.fields.CharField', [], {'max_length': '40', 'null': 'True', 'blank': 'True'}),
            'page_number': ('django.db.models.fields.PositiveIntegerField', [], {'default': '1', 'db_index': 'True'})
        },
        u'documents.documentpagetransformation': {
            'Meta': {'ordering': "('order',)", 'object_name': 'DocumentPageTransformation'},
            'arguments': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}),
            'document_page': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['documents.DocumentPage']"}),
            u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'order': ('django.db.models.fields.PositiveIntegerField', [], {'default': '0', 'null': 'True', 'db_index': 'True', 'blank': 'True'}),
            'transformation': ('django.db.models.fields.CharField', [], {'max_length': '128'})
        },
        u'documents.documenttype': {
            'Meta': {'ordering': "['name']", 'object_name': 'DocumentType'},
            u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'name': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '32'}),
            'ocr': ('django.db.models.fields.BooleanField', [], {'default': 'True'})
        },
        u'documents.documenttypefilename': {
            'Meta': {'ordering': "['filename']", 'object_name': 'DocumentTypeFilename'},
            'document_type': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['documents.DocumentType']"}),
            'enabled': ('django.db.models.fields.BooleanField', [], {'default': 'True'}),
            'filename': ('django.db.models.fields.CharField', [], {'max_length': '128', 'db_index': 'True'}),
            u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'})
        },
        u'documents.documentversion': {
            'Meta': {'unique_together': "(('document', 'major', 'minor', 'micro'),)", 'object_name': 'DocumentVersion'},
            'checksum': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}),
            'comment': ('django.db.models.fields.TextField', [], {'blank': 'True'}),
            'document': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'versions'", 'to': u"orm['documents.Document']"}),
            'encoding': ('django.db.models.fields.CharField', [], {'max_length': '64', 'null': 'True', 'blank': 'True'}),
            'file': ('django.db.models.fields.files.FileField', [], {'max_length': '100'}),
            'filename': ('django.db.models.fields.CharField', [], {'default': "u''", 'max_length': '255', 'db_index': 'True'}),
            u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'major': ('django.db.models.fields.PositiveIntegerField', [], {'default': '1'}),
            'micro': ('django.db.models.fields.PositiveIntegerField', [], {'default': '0'}),
            'mimetype': ('django.db.models.fields.CharField', [], {'max_length': '255', 'null': 'True', 'blank': 'True'}),
            'minor': ('django.db.models.fields.PositiveIntegerField', [], {'default': '0'}),
            'timestamp': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'})
        },
        u'documents.recentdocument': {
            'Meta': {'ordering': "('-datetime_accessed',)", 'object_name': 'RecentDocument'},
            'datetime_accessed': ('django.db.models.fields.DateTimeField', [], {'auto_now': 'True', 'db_index': 'True', 'blank': 'True'}),
            'document': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['documents.Document']"}),
            u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'user': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['auth.User']"})
        }
    }

    # Presumably the apps whose models are completely described by ``models``
    # above (South convention) -- verify against the South documentation.
    complete_apps = ['documents']

View File

@@ -237,6 +237,7 @@ def document_edit(request, document_id):
if form.is_valid():
document.filename = form.cleaned_data['new_filename']
document.description = form.cleaned_data['description']
document.language = form.cleaned_data['language']
if 'document_type_available_filenames' in form.cleaned_data:
if form.cleaned_data['document_type_available_filenames']:

View File

@@ -9,7 +9,7 @@ import sh
from django.utils.translation import ugettext as _
from common.settings import TEMPORARY_DIRECTORY
from common.utils import fs_cleanup
from common.utils import fs_cleanup, load_backend
from converter.api import convert
from documents.models import DocumentPage
@@ -18,8 +18,8 @@ from .literals import (DEFAULT_OCR_FILE_EXTENSION, DEFAULT_OCR_FILE_FORMAT,
UNPAPER_FILE_FORMAT)
from .parsers import parse_document_page
from .parsers.exceptions import ParserError, ParserUnknownFile
from .runtime import language_backend, ocr_backend
from .settings import UNPAPER_PATH, LANGUAGE
from .runtime import ocr_backend
from .settings import UNPAPER_PATH
logger = logging.getLogger(__name__)
@@ -68,9 +68,9 @@ def do_document_ocr(document):
os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext)
try:
ocr_text = ocr_backend.execute(pre_ocr_filepath_w_ext, LANGUAGE)
ocr_text = ocr_backend.execute(pre_ocr_filepath_w_ext, document.language)
document_page.content = ocr_cleanup(ocr_text)
document_page.content = ocr_cleanup(document.language, ocr_text)
document_page.page_label = _(u'Text from OCR')
document_page.save()
finally:
@@ -80,11 +80,15 @@ def do_document_ocr(document):
fs_cleanup(unpaper_output)
def ocr_cleanup(text):
def ocr_cleanup(language, text):
"""
Clean up the OCR's output by passing it through the selected language's
cleanup filter
"""
try:
language_backend = load_backend(u'.'.join([u'ocr', u'lang', language, u'LanguageBackend']))()
except ImportError:
language_backend = None
output = []
for line in text.splitlines():
@@ -108,7 +112,7 @@ def clean_pages():
"""
for page in DocumentPage.objects.all():
if page.content:
page.content = ocr_cleanup(page.content)
page.content = ocr_cleanup(document.language, page.content)
page.save()

View File

@@ -2,11 +2,6 @@ from __future__ import absolute_import
from common.utils import load_backend
from .settings import BACKEND, LANGUAGE
try:
language_backend = load_backend(u'.'.join([u'ocr', u'lang', LANGUAGE, u'LanguageBackend']))()
except ImportError:
language_backend = None
from .settings import BACKEND
ocr_backend = load_backend(BACKEND)()

View File

@@ -9,7 +9,6 @@ register_settings(
module=u'ocr.settings',
settings=[
{'name': u'TESSERACT_PATH', 'global_name': u'OCR_TESSERACT_PATH', 'default': u'/usr/bin/tesseract', 'exists': True},
{'name': u'LANGUAGE', 'global_name': u'OCR_LANGUAGE', 'default': u'eng'},
{'name': u'UNPAPER_PATH', 'global_name': u'OCR_UNPAPER_PATH', 'default': u'/usr/bin/unpaper', 'description': _(u'File path to unpaper program.'), 'exists': True},
{'name': u'PDFTOTEXT_PATH', 'global_name': u'OCR_PDFTOTEXT_PATH', 'default': u'/usr/bin/pdftotext', 'description': _(u'File path to poppler\'s pdftotext program used to extract text from PDF files.'), 'exists': True},
{'name': u'BACKEND', 'global_name': u'OCR_BACKEND', 'default': u'ocr.backends.tesseract.Tesseract', 'description': _(u'Full path to the backend to be used to do OCR.')},

View File

@@ -59,7 +59,7 @@ class Source(models.Model):
def get_transformation_list(self):
return SourceTransformation.transformations.get_for_object_as_list(self)
def upload_file(self, file_object, filename=None, use_file_name=False, document_type=None, expand=False, metadata_dict_list=None, user=None, document=None, new_version_data=None, command_line=False, description=None):
def upload_file(self, file_object, filename=None, use_file_name=False, document_type=None, expand=False, metadata_dict_list=None, user=None, document=None, new_version_data=None, command_line=False, description=None, language=None):
is_compressed = None
if expand:
@@ -69,7 +69,7 @@ class Source(models.Model):
for fp in cf.children():
if command_line:
print 'Uploading file #%d: %s' % (count, fp)
self.upload_single_file(file_object=fp, filename=None, document_type=document_type, metadata_dict_list=metadata_dict_list, user=user, description=description)
self.upload_single_file(file_object=fp, filename=None, document_type=document_type, metadata_dict_list=metadata_dict_list, user=user, description=description, language=language)
fp.close()
count += 1
@@ -78,17 +78,17 @@ class Source(models.Model):
logging.debug('Exception: NotACompressedFile')
if command_line:
raise
self.upload_single_file(file_object=file_object, filename=filename, document_type=document_type, metadata_dict_list=metadata_dict_list, user=user, description=description)
self.upload_single_file(file_object=file_object, filename=filename, document_type=document_type, metadata_dict_list=metadata_dict_list, user=user, description=description, language=language)
else:
is_compressed = True
else:
self.upload_single_file(file_object, filename, use_file_name, document_type, metadata_dict_list, user, document, new_version_data, description=description)
self.upload_single_file(file_object, filename, use_file_name, document_type, metadata_dict_list, user, document, new_version_data, description=description, language=language)
file_object.close()
return {'is_compressed': is_compressed}
@transaction.atomic
def upload_single_file(self, file_object, filename=None, use_file_name=False, document_type=None, metadata_dict_list=None, user=None, document=None, new_version_data=None, description=None):
def upload_single_file(self, file_object, filename=None, use_file_name=False, document_type=None, metadata_dict_list=None, user=None, document=None, new_version_data=None, description=None, language=None):
new_document = not document
if new_document:
@@ -99,6 +99,9 @@ class Source(models.Model):
if description:
document.description = description
if language:
document.language = language
document.save(user=user)
else:
if use_file_name:

View File

@@ -21,7 +21,7 @@ def task_check_interval_source(source_id):
@app.task(ignore_result=True)
def task_upload_document(source_id, file_path, filename=None, use_file_name=False, document_type_id=None, expand=False, metadata_dict_list=None, user_id=None, document_id=None, new_version_data=None, command_line=False, description=None):
def task_upload_document(source_id, file_path, filename=None, use_file_name=False, document_type_id=None, expand=False, metadata_dict_list=None, user_id=None, document_id=None, new_version_data=None, command_line=False, description=None, language=None):
source = Source.objects.get_subclass(pk=source_id)
if document_type_id:
@@ -41,7 +41,7 @@ def task_upload_document(source_id, file_path, filename=None, use_file_name=Fals
with File(file=open(file_path, mode='rb')) as file_object:
#try:
result = source.upload_file(file_object, filename, use_file_name, document_type, expand, metadata_dict_list, user, document, new_version_data, command_line, description)
result = source.upload_file(file_object, filename=filename, use_file_name=use_file_name, document_type=document_type, expand=expand, metadata_dict_list=metadata_dict_list, user=user, document=document, new_version_data=new_version_data, command_line=command_line, description=description, language=language)
#except NewDocumentVersionNotAllowed:
# messages.error(request, _(u'New version uploads are not allowed for this document.'))

View File

@@ -217,6 +217,7 @@ def upload_interactive(request, source_id=None, document_pk=None):
document_id=document_id,
new_version_data=form.cleaned_data.get('new_version_data'),
description=form.cleaned_data.get('description'),
language=form.cleaned_data.get('language')
), queue='uploads')
# TODO: Notify user

View File

@@ -21,6 +21,7 @@ Pillow==2.5.0
PyYAML==3.11
pdfminer==20110227
psutil==2.1.1
pycountry==1.8
pytz==2014.4
python-gnupg==0.3.6
python-hkp==0.1.3