diff --git a/mayan/apps/documents/migrations/0018_auto__chg_field_documentpage_page_label.py b/mayan/apps/documents/migrations/0018_auto__chg_field_documentpage_page_label.py new file mode 100644 index 0000000000..9c3e9d123b --- /dev/null +++ b/mayan/apps/documents/migrations/0018_auto__chg_field_documentpage_page_label.py @@ -0,0 +1,146 @@ +# -*- coding: utf-8 -*- +import datetime +from south.db import db +from south.v2 import SchemaMigration +from django.db import models + + +class Migration(SchemaMigration): + + def forwards(self, orm): + + # Changing field 'DocumentPage.page_label' + db.alter_column('documents_documentpage', 'page_label', self.gf('django.db.models.fields.CharField')(max_length=40, null=True)) + + def backwards(self, orm): + + # Changing field 'DocumentPage.page_label' + db.alter_column('documents_documentpage', 'page_label', self.gf('django.db.models.fields.CharField')(max_length=32, null=True)) + + models = { + 'auth.group': { + 'Meta': {'object_name': 'Group'}, + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'name': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '80'}), + 'permissions': ('django.db.models.fields.related.ManyToManyField', [], {'to': "orm['auth.Permission']", 'symmetrical': 'False', 'blank': 'True'}) + }, + 'auth.permission': { + 'Meta': {'ordering': "('content_type__app_label', 'content_type__model', 'codename')", 'unique_together': "(('content_type', 'codename'),)", 'object_name': 'Permission'}, + 'codename': ('django.db.models.fields.CharField', [], {'max_length': '100'}), + 'content_type': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['contenttypes.ContentType']"}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'name': ('django.db.models.fields.CharField', [], {'max_length': '50'}) + }, + 'auth.user': { + 'Meta': {'object_name': 'User'}, + 'date_joined': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}), + 'email': ('django.db.models.fields.EmailField', [], {'max_length': '75', 'blank': 'True'}), + 'first_name': ('django.db.models.fields.CharField', [], {'max_length': '30', 'blank': 'True'}), + 'groups': ('django.db.models.fields.related.ManyToManyField', [], {'to': "orm['auth.Group']", 'symmetrical': 'False', 'blank': 'True'}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'is_active': ('django.db.models.fields.BooleanField', [], {'default': 'True'}), + 'is_staff': ('django.db.models.fields.BooleanField', [], {'default': 'False'}), + 'is_superuser': ('django.db.models.fields.BooleanField', [], {'default': 'False'}), + 'last_login': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}), + 'last_name': ('django.db.models.fields.CharField', [], {'max_length': '30', 'blank': 'True'}), + 'password': ('django.db.models.fields.CharField', [], {'max_length': '128'}), + 'user_permissions': ('django.db.models.fields.related.ManyToManyField', [], {'to': "orm['auth.Permission']", 'symmetrical': 'False', 'blank': 'True'}), + 'username': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '30'}) + }, + 'comments.comment': { + 'Meta': {'ordering': "('submit_date',)", 'object_name': 'Comment', 'db_table': "'django_comments'"}, + 'comment': ('django.db.models.fields.TextField', [], {'max_length': '3000'}), + 'content_type': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'content_type_set_for_comment'", 'to': "orm['contenttypes.ContentType']"}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'ip_address': ('django.db.models.fields.IPAddressField', [], {'max_length': '15', 'null': 'True', 'blank': 'True'}), + 'is_public': ('django.db.models.fields.BooleanField', [], {'default': 'True'}), + 'is_removed': ('django.db.models.fields.BooleanField', [], {'default': 'False'}), + 'object_pk': ('django.db.models.fields.TextField', [], {}), + 'site': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['sites.Site']"}), + 'submit_date': ('django.db.models.fields.DateTimeField', [], {'default': 'None'}), + 'user': ('django.db.models.fields.related.ForeignKey', [], {'blank': 'True', 'related_name': "'comment_comments'", 'null': 'True', 'to': "orm['auth.User']"}), + 'user_email': ('django.db.models.fields.EmailField', [], {'max_length': '75', 'blank': 'True'}), + 'user_name': ('django.db.models.fields.CharField', [], {'max_length': '50', 'blank': 'True'}), + 'user_url': ('django.db.models.fields.URLField', [], {'max_length': '200', 'blank': 'True'}) + }, + 'contenttypes.contenttype': { + 'Meta': {'ordering': "('name',)", 'unique_together': "(('app_label', 'model'),)", 'object_name': 'ContentType', 'db_table': "'django_content_type'"}, + 'app_label': ('django.db.models.fields.CharField', [], {'max_length': '100'}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'model': ('django.db.models.fields.CharField', [], {'max_length': '100'}), + 'name': ('django.db.models.fields.CharField', [], {'max_length': '100'}) + }, + 'documents.document': { + 'Meta': {'ordering': "['-date_added']", 'object_name': 'Document'}, + 'date_added': ('django.db.models.fields.DateTimeField', [], {'db_index': 'True'}), + 'description': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}), + 'document_type': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['documents.DocumentType']", 'null': 'True', 'blank': 'True'}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'uuid': ('django.db.models.fields.CharField', [], {'max_length': '48', 'blank': 'True'}) + }, + 'documents.documentpage': { + 'Meta': {'ordering': "['page_number']", 'object_name': 'DocumentPage'}, + 'content': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}), + 'document_version': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['documents.DocumentVersion']"}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'page_label': ('django.db.models.fields.CharField', [], {'max_length': '40', 'null': 'True', 'blank': 'True'}), + 'page_number': ('django.db.models.fields.PositiveIntegerField', [], {'default': '1', 'db_index': 'True'}) + }, + 'documents.documenttype': { + 'Meta': {'ordering': "['name']", 'object_name': 'DocumentType'}, + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'name': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '32'}) + }, + 'documents.documenttypefilename': { + 'Meta': {'ordering': "['filename']", 'object_name': 'DocumentTypeFilename'}, + 'document_type': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['documents.DocumentType']"}), + 'enabled': ('django.db.models.fields.BooleanField', [], {'default': 'True'}), + 'filename': ('django.db.models.fields.CharField', [], {'max_length': '128', 'db_index': 'True'}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}) + }, + 'documents.documentversion': { + 'Meta': {'unique_together': "(('document', 'major', 'minor', 'micro', 'release_level', 'serial'),)", 'object_name': 'DocumentVersion'}, + 'checksum': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}), + 'comment': ('django.db.models.fields.TextField', [], {'blank': 'True'}), + 'document': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['documents.Document']"}), + 'encoding': ('django.db.models.fields.CharField', [], {'max_length': '64', 'null': 'True', 'blank': 'True'}), + 'file': ('django.db.models.fields.files.FileField', [], {'max_length': '100'}), + 'filename': ('django.db.models.fields.CharField', [], {'default': "u''", 'max_length': '255', 'db_index': 'True'}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'major': ('django.db.models.fields.PositiveIntegerField', [], {'default': '1'}), + 'micro': ('django.db.models.fields.PositiveIntegerField', [], {'default': '0'}), + 'mimetype': ('django.db.models.fields.CharField', [], {'max_length': '64', 'null': 'True', 'blank': 'True'}), + 'minor': ('django.db.models.fields.PositiveIntegerField', [], {'default': '0'}), + 'release_level': ('django.db.models.fields.PositiveIntegerField', [], {'default': '1'}), + 'serial': ('django.db.models.fields.PositiveIntegerField', [], {'default': '0'}), + 'timestamp': ('django.db.models.fields.DateTimeField', [], {}) + }, + 'documents.recentdocument': { + 'Meta': {'ordering': "('-datetime_accessed',)", 'object_name': 'RecentDocument'}, + 'datetime_accessed': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime(2014, 2, 16, 0, 0)', 'db_index': 'True'}), + 'document': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['documents.Document']"}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'user': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['auth.User']"}) + }, + 'sites.site': { + 'Meta': {'ordering': "('domain',)", 'object_name': 'Site', 'db_table': "'django_site'"}, + 'domain': ('django.db.models.fields.CharField', [], {'max_length': '100'}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'name': ('django.db.models.fields.CharField', [], {'max_length': '50'}) + }, + 'taggit.tag': { + 'Meta': {'object_name': 'Tag'}, + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'name': ('django.db.models.fields.CharField', [], {'max_length': '100'}), + 'slug': ('django.db.models.fields.SlugField', [], {'unique': 'True', 'max_length': '100'}) + }, + 'taggit.taggeditem': { + 'Meta': {'object_name': 'TaggedItem'}, + 'content_type': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'taggit_taggeditem_tagged_items'", 'to': "orm['contenttypes.ContentType']"}), + 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'object_id': ('django.db.models.fields.IntegerField', [], {'db_index': 'True'}), + 'tag': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'taggit_taggeditem_items'", 'to': "orm['taggit.Tag']"}) + } + } + + complete_apps = ['documents'] \ No newline at end of file diff --git a/mayan/apps/documents/models.py b/mayan/apps/documents/models.py index 498a6e84c9..f3f5579719 100644 --- a/mayan/apps/documents/models.py +++ b/mayan/apps/documents/models.py @@ -552,7 +552,7 @@ class DocumentPage(models.Model): """ document_version = models.ForeignKey(DocumentVersion, verbose_name=_(u'document version'), related_name='pages') content = models.TextField(blank=True, null=True, verbose_name=_(u'content')) - page_label = models.CharField(max_length=32, blank=True, null=True, verbose_name=_(u'page label')) + page_label = models.CharField(max_length=40, blank=True, null=True, verbose_name=_(u'page label')) page_number = models.PositiveIntegerField(default=1, editable=False, verbose_name=_(u'page number'), db_index=True) def __unicode__(self): diff --git a/mayan/apps/ocr/lang/deu.py b/mayan/apps/ocr/lang/deu.py new file mode 100644 index 0000000000..16e4e359c7 --- /dev/null +++ b/mayan/apps/ocr/lang/deu.py @@ -0,0 +1,43 @@ +# -*- coding: utf-8 -*- +import re + + +def check_word(word): + ALL_ALPHANUM = re.compile('([0-9a-zäöüß])', re.I) + NON_ALPHANUM = re.compile('([^0-9a-zäöüß])', re.I) + ALL_ALPHANUM = re.compile('([0-9a-z])', re.I) + NON_ALPHANUM = re.compile('([^0-9a-z])', re.I) + + TOO_MANY_VOWELS = re.compile('[aäeioöuü]{4}', re.I) + TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnpqrstvwxyz]{4}', re.I) + ALL_ALPHA = re.compile('^[a-z]+$', re.I) + # SINGLE_LETTER_WORDS = re.compile('^$', re.I) + + #(L) If a string is longer than 40 characters, it is considered as garbage + # http://www.duden.de/sprachwissen/sprachratgeber/die-laengsten-woerter-im-dudenkorpus + # http://www.duden.de/sprachwissen/sprachratgeber/durchschnittliche-laenge-eines-deutschen-wortes + if len(word) > 40: + return None + + #(A) If a string's ratio of alphanumeric characters to total + #characters is less than 50%, the string is garbage + if len(ALL_ALPHANUM.findall(word)) < len(word) / 2: + return None + + #Remove word if all the letters in the word are non alphanumeric + if len(NON_ALPHANUM.findall(word)) == len(word): + return None + + #Removed words with too many consecutie vowels + if TOO_MANY_VOWELS.findall(word): + return None + + #Removed words with too many consecutie consonants + if TOO_MANY_CONSONANTS.findall(word): + return None + + # No single letter words in German + if len(word) == 1: + return None + + return word diff --git a/mayan/apps/ocr/lang/fra.py b/mayan/apps/ocr/lang/fra.py new file mode 100644 index 0000000000..cb0e37cc67 --- /dev/null +++ b/mayan/apps/ocr/lang/fra.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- +import re + + +def check_word(word): + return word + + ALL_ALPHANUM = re.compile('([0-9a-záéíóúüñ])', re.I) + NON_ALPHANUM = re.compile('([^0-9a-záéíóúüñ])', re.I) + + TOO_MANY_VOWELS = re.compile('[aáeéiíoóuúü]{3}', re.I) + TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnñpqrstvwxyz]{5}', re.I) + ALL_ALPHA = re.compile('^[a-z]+$', re.I) + SINGLE_LETTER_WORDS = re.compile('^[aeoóuy]$', re.I) + + #(L) If a string is longer than 20 characters, it is garbage + if len(word) > 20: + return None + + #(A) If a string’s ratio of alphanumeric characters to total + #characters is less than 50%, the string is garbage + if len(ALL_ALPHANUM.findall(word)) < len(word) / 2: + return None + + #Remove word if all the letters in the word are non alphanumeric + if len(NON_ALPHANUM.findall(word)) == len(word): + return None + + #Removed words with too many consecutie vowels + if TOO_MANY_VOWELS.findall(word): + return None + + #Removed words with too many consecutie consonants + if TOO_MANY_CONSONANTS.findall(word): + return None + + #Only allow specific single letter words + if len(word) == 1 and not SINGLE_LETTER_WORDS.findall(word): + return None + + return word