diff --git a/docs/topics/settings.rst b/docs/topics/settings.rst index 88777a055b..31b6eda67d 100644 --- a/docs/topics/settings.rst +++ b/docs/topics/settings.rst @@ -127,6 +127,15 @@ Default: ``image_cache`` (inside the `media` folder) The path where the visual representations of the documents are stored for fast display. +.. setting:: DOCUMENTS_LANGUAGE + +**DOCUMENTS_LANGUAGE** + +Default: ``eng`` + +Default language selection when creating a document. + + Converter ========= .. setting:: CONVERTER_GRAPHICS_BACKEND @@ -279,15 +288,6 @@ File path to the ``tesseract`` executable, used to perform OCR on document page's images. -.. setting:: OCR_TESSERACT_LANGUAGE - -**OCR_TESSERACT_LANGUAGE** - -Default: ``eng`` - -Language code passed to the ``tesseract`` executable. - - .. setting:: OCR_UNPAPER_PATH **OCR_UNPAPER_PATH** diff --git a/mayan/apps/documents/forms.py b/mayan/apps/documents/forms.py index fd61a25d4a..5f3fa9d59d 100644 --- a/mayan/apps/documents/forms.py +++ b/mayan/apps/documents/forms.py @@ -178,6 +178,7 @@ class DocumentForm_edit(DocumentForm): if kwargs['instance'].latest_version: self.fields.pop('version_update') self.fields.pop('comment') + self.fields['language'].initial = kwargs['instance'].language else: self.fields.pop('new_filename') @@ -190,7 +191,7 @@ class DocumentPropertiesForm(DetailForm): """ class Meta: model = Document - fields = ('document_type', 'description',) + fields = ('document_type', 'description', 'language') class DocumentContentForm(forms.Form): diff --git a/mayan/apps/documents/models.py b/mayan/apps/documents/models.py index 0fbb42f94b..04d68ff4e4 100644 --- a/mayan/apps/documents/models.py +++ b/mayan/apps/documents/models.py @@ -12,6 +12,8 @@ try: except ImportError: from StringIO import StringIO +import pycountry + from django.db import models from django.contrib.auth.models import User from django.core.exceptions import ValidationError @@ -35,7 +37,7 @@ from .literals import (VERSION_UPDATE_MAJOR, 
VERSION_UPDATE_MICRO, from .managers import (DocumentPageTransformationManager, DocumentTypeManager, RecentDocumentManager) from .runtime import storage_backend -from .settings import (CACHE_PATH, CHECKSUM_FUNCTION, DISPLAY_SIZE, +from .settings import (CACHE_PATH, CHECKSUM_FUNCTION, DISPLAY_SIZE, LANGUAGE, UUID_FUNCTION, ZOOM_MAX_LEVEL, ZOOM_MIN_LEVEL) HASH_FUNCTION = lambda x: hashlib.sha256(x).hexdigest() # document image cache name hash function @@ -83,6 +85,10 @@ class Document(models.Model): document_type = models.ForeignKey(DocumentType, verbose_name=_(u'Document type'), related_name='documents') description = models.TextField(blank=True, null=True, verbose_name=_(u'Description')) date_added = models.DateTimeField(verbose_name=_(u'Added'), auto_now_add=True) + language = models.CharField( + choices=[(i.bibliographic, i.name) for i in list(pycountry.languages)], + default=LANGUAGE, max_length=8, verbose_name=_('Language') + ) @staticmethod def clear_image_cache(): @@ -338,6 +344,8 @@ class DocumentVersion(models.Model): file = models.FileField(upload_to=get_filename_from_uuid, storage=storage_backend, verbose_name=_(u'File')) mimetype = models.CharField(max_length=255, null=True, blank=True, editable=False) encoding = models.CharField(max_length=64, null=True, blank=True, editable=False) + + # TODO: move filename to Document model, it should not be a version's field filename = models.CharField(max_length=255, default=u'', editable=False, db_index=True) checksum = models.TextField(blank=True, null=True, verbose_name=_(u'Checksum'), editable=False) diff --git a/mayan/apps/documents/settings.py b/mayan/apps/documents/settings.py index cd2ecd868e..964472c33c 100644 --- a/mayan/apps/documents/settings.py +++ b/mayan/apps/documents/settings.py @@ -42,5 +42,6 @@ register_settings( {'name': u'ROTATION_STEP', 'global_name': u'DOCUMENTS_ROTATION_STEP', 'default': 90, 'description': _(u'Amount in degrees to rotate a document page per user interaction.')}, # {'name': 
u'CACHE_PATH', 'global_name': u'DOCUMENTS_CACHE_PATH', 'default': os.path.join(settings.MEDIA_ROOT, 'image_cache'), 'exists': True}, + {'name': u'LANGUAGE', 'global_name': u'DOCUMENTS_LANGUAGE', 'default': u'eng', 'description': _('Default documents language (in ISO639-2 format).')}, ] ) diff --git a/mayan/apps/documents/south_migrations/0025_auto__add_field_document_language.py b/mayan/apps/documents/south_migrations/0025_auto__add_field_document_language.py new file mode 100644 index 0000000000..a26246fe71 --- /dev/null +++ b/mayan/apps/documents/south_migrations/0025_auto__add_field_document_language.py @@ -0,0 +1,121 @@ +# -*- coding: utf-8 -*- +from south.utils import datetime_utils as datetime +from south.db import db +from south.v2 import SchemaMigration +from django.db import models + + +class Migration(SchemaMigration): + + def forwards(self, orm): + # Adding field 'Document.language' + db.add_column(u'documents_document', 'language', + self.gf('django.db.models.fields.CharField')(default=u'eng', max_length=8), + keep_default=False) + + + def backwards(self, orm): + # Deleting field 'Document.language' + db.delete_column(u'documents_document', 'language') + + + models = { + u'auth.group': { + 'Meta': {'object_name': 'Group'}, + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'name': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '80'}), + 'permissions': ('django.db.models.fields.related.ManyToManyField', [], {'to': u"orm['auth.Permission']", 'symmetrical': 'False', 'blank': 'True'}) + }, + u'auth.permission': { + 'Meta': {'ordering': "(u'content_type__app_label', u'content_type__model', u'codename')", 'unique_together': "((u'content_type', u'codename'),)", 'object_name': 'Permission'}, + 'codename': ('django.db.models.fields.CharField', [], {'max_length': '100'}), + 'content_type': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['contenttypes.ContentType']"}), + u'id': 
('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'name': ('django.db.models.fields.CharField', [], {'max_length': '50'}) + }, + u'auth.user': { + 'Meta': {'object_name': 'User'}, + 'date_joined': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}), + 'email': ('django.db.models.fields.EmailField', [], {'max_length': '75', 'blank': 'True'}), + 'first_name': ('django.db.models.fields.CharField', [], {'max_length': '30', 'blank': 'True'}), + 'groups': ('django.db.models.fields.related.ManyToManyField', [], {'symmetrical': 'False', 'related_name': "u'user_set'", 'blank': 'True', 'to': u"orm['auth.Group']"}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'is_active': ('django.db.models.fields.BooleanField', [], {'default': 'True'}), + 'is_staff': ('django.db.models.fields.BooleanField', [], {'default': 'False'}), + 'is_superuser': ('django.db.models.fields.BooleanField', [], {'default': 'False'}), + 'last_login': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}), + 'last_name': ('django.db.models.fields.CharField', [], {'max_length': '30', 'blank': 'True'}), + 'password': ('django.db.models.fields.CharField', [], {'max_length': '128'}), + 'user_permissions': ('django.db.models.fields.related.ManyToManyField', [], {'symmetrical': 'False', 'related_name': "u'user_set'", 'blank': 'True', 'to': u"orm['auth.Permission']"}), + 'username': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '30'}) + }, + u'contenttypes.contenttype': { + 'Meta': {'ordering': "('name',)", 'unique_together': "(('app_label', 'model'),)", 'object_name': 'ContentType', 'db_table': "'django_content_type'"}, + 'app_label': ('django.db.models.fields.CharField', [], {'max_length': '100'}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'model': ('django.db.models.fields.CharField', [], {'max_length': '100'}), + 'name': 
('django.db.models.fields.CharField', [], {'max_length': '100'}) + }, + u'documents.document': { + 'Meta': {'ordering': "['-date_added']", 'object_name': 'Document'}, + 'date_added': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'}), + 'description': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}), + 'document_type': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'documents'", 'to': u"orm['documents.DocumentType']"}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'language': ('django.db.models.fields.CharField', [], {'default': "u'eng'", 'max_length': '8'}), + 'uuid': ('django.db.models.fields.CharField', [], {'max_length': '48', 'blank': 'True'}) + }, + u'documents.documentpage': { + 'Meta': {'ordering': "['page_number']", 'object_name': 'DocumentPage'}, + 'content': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}), + 'document_version': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'pages'", 'to': u"orm['documents.DocumentVersion']"}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'page_label': ('django.db.models.fields.CharField', [], {'max_length': '40', 'null': 'True', 'blank': 'True'}), + 'page_number': ('django.db.models.fields.PositiveIntegerField', [], {'default': '1', 'db_index': 'True'}) + }, + u'documents.documentpagetransformation': { + 'Meta': {'ordering': "('order',)", 'object_name': 'DocumentPageTransformation'}, + 'arguments': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}), + 'document_page': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['documents.DocumentPage']"}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'order': ('django.db.models.fields.PositiveIntegerField', [], {'default': '0', 'null': 'True', 'db_index': 'True', 'blank': 'True'}), + 'transformation': 
('django.db.models.fields.CharField', [], {'max_length': '128'}) + }, + u'documents.documenttype': { + 'Meta': {'ordering': "['name']", 'object_name': 'DocumentType'}, + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'name': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '32'}), + 'ocr': ('django.db.models.fields.BooleanField', [], {'default': 'True'}) + }, + u'documents.documenttypefilename': { + 'Meta': {'ordering': "['filename']", 'object_name': 'DocumentTypeFilename'}, + 'document_type': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['documents.DocumentType']"}), + 'enabled': ('django.db.models.fields.BooleanField', [], {'default': 'True'}), + 'filename': ('django.db.models.fields.CharField', [], {'max_length': '128', 'db_index': 'True'}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}) + }, + u'documents.documentversion': { + 'Meta': {'unique_together': "(('document', 'major', 'minor', 'micro'),)", 'object_name': 'DocumentVersion'}, + 'checksum': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}), + 'comment': ('django.db.models.fields.TextField', [], {'blank': 'True'}), + 'document': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'versions'", 'to': u"orm['documents.Document']"}), + 'encoding': ('django.db.models.fields.CharField', [], {'max_length': '64', 'null': 'True', 'blank': 'True'}), + 'file': ('django.db.models.fields.files.FileField', [], {'max_length': '100'}), + 'filename': ('django.db.models.fields.CharField', [], {'default': "u''", 'max_length': '255', 'db_index': 'True'}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'major': ('django.db.models.fields.PositiveIntegerField', [], {'default': '1'}), + 'micro': ('django.db.models.fields.PositiveIntegerField', [], {'default': '0'}), + 'mimetype': ('django.db.models.fields.CharField', [], {'max_length': '255', 'null': 
'True', 'blank': 'True'}), + 'minor': ('django.db.models.fields.PositiveIntegerField', [], {'default': '0'}), + 'timestamp': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'}) + }, + u'documents.recentdocument': { + 'Meta': {'ordering': "('-datetime_accessed',)", 'object_name': 'RecentDocument'}, + 'datetime_accessed': ('django.db.models.fields.DateTimeField', [], {'auto_now': 'True', 'db_index': 'True', 'blank': 'True'}), + 'document': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['documents.Document']"}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'user': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['auth.User']"}) + } + } + + complete_apps = ['documents'] \ No newline at end of file diff --git a/mayan/apps/documents/views.py b/mayan/apps/documents/views.py index 4f327ed848..cb4c9576e3 100644 --- a/mayan/apps/documents/views.py +++ b/mayan/apps/documents/views.py @@ -237,6 +237,7 @@ def document_edit(request, document_id): if form.is_valid(): document.filename = form.cleaned_data['new_filename'] document.description = form.cleaned_data['description'] + document.language = form.cleaned_data['language'] if 'document_type_available_filenames' in form.cleaned_data: if form.cleaned_data['document_type_available_filenames']: diff --git a/mayan/apps/ocr/api.py b/mayan/apps/ocr/api.py index e9bacce502..a6c701a19d 100644 --- a/mayan/apps/ocr/api.py +++ b/mayan/apps/ocr/api.py @@ -9,7 +9,7 @@ import sh from django.utils.translation import ugettext as _ from common.settings import TEMPORARY_DIRECTORY -from common.utils import fs_cleanup +from common.utils import fs_cleanup, load_backend from converter.api import convert from documents.models import DocumentPage @@ -18,8 +18,8 @@ from .literals import (DEFAULT_OCR_FILE_EXTENSION, DEFAULT_OCR_FILE_FORMAT, UNPAPER_FILE_FORMAT) from .parsers import parse_document_page from .parsers.exceptions import ParserError, 
ParserUnknownFile -from .runtime import language_backend, ocr_backend -from .settings import UNPAPER_PATH, LANGUAGE +from .runtime import ocr_backend +from .settings import UNPAPER_PATH logger = logging.getLogger(__name__) @@ -68,9 +68,9 @@ def do_document_ocr(document): os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext) try: - ocr_text = ocr_backend.execute(pre_ocr_filepath_w_ext, LANGUAGE) + ocr_text = ocr_backend.execute(pre_ocr_filepath_w_ext, document.language) - document_page.content = ocr_cleanup(ocr_text) + document_page.content = ocr_cleanup(document.language, ocr_text) document_page.page_label = _(u'Text from OCR') document_page.save() finally: @@ -80,11 +80,15 @@ def do_document_ocr(document): fs_cleanup(unpaper_output) -def ocr_cleanup(text): +def ocr_cleanup(language, text): """ Cleanup the OCR's output passing it thru the selected language's cleanup filter """ + try: + language_backend = load_backend(u'.'.join([u'ocr', u'lang', language, u'LanguageBackend']))() + except ImportError: + language_backend = None output = [] for line in text.splitlines(): @@ -108,7 +112,7 @@ def clean_pages(): """ for page in DocumentPage.objects.all(): if page.content: - page.content = ocr_cleanup(page.content) + page.content = ocr_cleanup(page.document_version.document.language, page.content) page.save() diff --git a/mayan/apps/ocr/runtime.py b/mayan/apps/ocr/runtime.py index 1ebd58b366..eef63478c0 100644 --- a/mayan/apps/ocr/runtime.py +++ b/mayan/apps/ocr/runtime.py @@ -2,11 +2,6 @@ from __future__ import absolute_import from common.utils import load_backend -from .settings import BACKEND, LANGUAGE - -try: - language_backend = load_backend(u'.'.join([u'ocr', u'lang', LANGUAGE, u'LanguageBackend']))() -except ImportError: - language_backend = None +from .settings import BACKEND ocr_backend = load_backend(BACKEND)() diff --git a/mayan/apps/ocr/settings.py b/mayan/apps/ocr/settings.py index 068c11f913..ddfec29592 100644 --- a/mayan/apps/ocr/settings.py +++ b/mayan/apps/ocr/settings.py @@ -9,7 
+9,6 @@ register_settings( module=u'ocr.settings', settings=[ {'name': u'TESSERACT_PATH', 'global_name': u'OCR_TESSERACT_PATH', 'default': u'/usr/bin/tesseract', 'exists': True}, - {'name': u'LANGUAGE', 'global_name': u'OCR_LANGUAGE', 'default': u'eng'}, {'name': u'UNPAPER_PATH', 'global_name': u'OCR_UNPAPER_PATH', 'default': u'/usr/bin/unpaper', 'description': _(u'File path to unpaper program.'), 'exists': True}, {'name': u'PDFTOTEXT_PATH', 'global_name': u'OCR_PDFTOTEXT_PATH', 'default': u'/usr/bin/pdftotext', 'description': _(u'File path to poppler\'s pdftotext program used to extract text from PDF files.'), 'exists': True}, {'name': u'BACKEND', 'global_name': u'OCR_BACKEND', 'default': u'ocr.backends.tesseract.Tesseract', 'description': _(u'Full path to the backend to be used to do OCR.')}, diff --git a/mayan/apps/sources/models.py b/mayan/apps/sources/models.py index 92c72c1326..b4e65a4b87 100644 --- a/mayan/apps/sources/models.py +++ b/mayan/apps/sources/models.py @@ -59,7 +59,7 @@ class Source(models.Model): def get_transformation_list(self): return SourceTransformation.transformations.get_for_object_as_list(self) - def upload_file(self, file_object, filename=None, use_file_name=False, document_type=None, expand=False, metadata_dict_list=None, user=None, document=None, new_version_data=None, command_line=False, description=None): + def upload_file(self, file_object, filename=None, use_file_name=False, document_type=None, expand=False, metadata_dict_list=None, user=None, document=None, new_version_data=None, command_line=False, description=None, language=None): is_compressed = None if expand: @@ -69,7 +69,7 @@ class Source(models.Model): for fp in cf.children(): if command_line: print 'Uploading file #%d: %s' % (count, fp) - self.upload_single_file(file_object=fp, filename=None, document_type=document_type, metadata_dict_list=metadata_dict_list, user=user, description=description) + self.upload_single_file(file_object=fp, filename=None, 
document_type=document_type, metadata_dict_list=metadata_dict_list, user=user, description=description, language=language) fp.close() count += 1 @@ -78,17 +78,17 @@ class Source(models.Model): logging.debug('Exception: NotACompressedFile') if command_line: raise - self.upload_single_file(file_object=file_object, filename=filename, document_type=document_type, metadata_dict_list=metadata_dict_list, user=user, description=description) + self.upload_single_file(file_object=file_object, filename=filename, document_type=document_type, metadata_dict_list=metadata_dict_list, user=user, description=description, language=language) else: is_compressed = True else: - self.upload_single_file(file_object, filename, use_file_name, document_type, metadata_dict_list, user, document, new_version_data, description=description) + self.upload_single_file(file_object, filename, use_file_name, document_type, metadata_dict_list, user, document, new_version_data, description=description, language=language) file_object.close() return {'is_compressed': is_compressed} @transaction.atomic - def upload_single_file(self, file_object, filename=None, use_file_name=False, document_type=None, metadata_dict_list=None, user=None, document=None, new_version_data=None, description=None): + def upload_single_file(self, file_object, filename=None, use_file_name=False, document_type=None, metadata_dict_list=None, user=None, document=None, new_version_data=None, description=None, language=None): new_document = not document if new_document: @@ -99,6 +99,9 @@ class Source(models.Model): if description: document.description = description + if language: + document.language = language + document.save(user=user) else: if use_file_name: diff --git a/mayan/apps/sources/tasks.py b/mayan/apps/sources/tasks.py index 5516e7b4fb..fe2c11dd91 100644 --- a/mayan/apps/sources/tasks.py +++ b/mayan/apps/sources/tasks.py @@ -21,7 +21,7 @@ def task_check_interval_source(source_id): @app.task(ignore_result=True) -def 
task_upload_document(source_id, file_path, filename=None, use_file_name=False, document_type_id=None, expand=False, metadata_dict_list=None, user_id=None, document_id=None, new_version_data=None, command_line=False, description=None): +def task_upload_document(source_id, file_path, filename=None, use_file_name=False, document_type_id=None, expand=False, metadata_dict_list=None, user_id=None, document_id=None, new_version_data=None, command_line=False, description=None, language=None): source = Source.objects.get_subclass(pk=source_id) if document_type_id: @@ -41,7 +41,7 @@ def task_upload_document(source_id, file_path, filename=None, use_file_name=Fals with File(file=open(file_path, mode='rb')) as file_object: #try: - result = source.upload_file(file_object, filename, use_file_name, document_type, expand, metadata_dict_list, user, document, new_version_data, command_line, description) + result = source.upload_file(file_object, filename=filename, use_file_name=use_file_name, document_type=document_type, expand=expand, metadata_dict_list=metadata_dict_list, user=user, document=document, new_version_data=new_version_data, command_line=command_line, description=description, language=language) #except NewDocumentVersionNotAllowed: # messages.error(request, _(u'New version uploads are not allowed for this document.')) diff --git a/mayan/apps/sources/views.py b/mayan/apps/sources/views.py index db467fbc1a..a57e3b73ea 100644 --- a/mayan/apps/sources/views.py +++ b/mayan/apps/sources/views.py @@ -217,6 +217,7 @@ def upload_interactive(request, source_id=None, document_pk=None): document_id=document_id, new_version_data=form.cleaned_data.get('new_version_data'), description=form.cleaned_data.get('description'), + language=form.cleaned_data.get('language') ), queue='uploads') # TODO: Notify user diff --git a/requirements/common.txt b/requirements/common.txt index 2f0d4ca992..558563da8f 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -21,6 +21,7 @@ 
Pillow==2.5.0 PyYAML==3.11 pdfminer==20110227 psutil==2.1.1 +pycountry==1.8 pytz==2014.4 python-gnupg==0.3.6 python-hkp==0.1.3