Merge remote-tracking branch 'origin/master'
This commit is contained in:
@@ -0,0 +1,146 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import datetime
|
||||||
|
from south.db import db
|
||||||
|
from south.v2 import SchemaMigration
|
||||||
|
from django.db import models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(SchemaMigration):
    """
    Schema migration for the ``documents`` app that changes the
    ``DocumentPage.page_label`` column from ``max_length=32`` to
    ``max_length=40``.
    """

    def forwards(self, orm):
        """Apply the change: alter ``page_label`` to ``max_length=40``."""
        # Changing field 'DocumentPage.page_label'
        db.alter_column('documents_documentpage', 'page_label', self.gf('django.db.models.fields.CharField')(max_length=40, null=True))

    def backwards(self, orm):
        """Revert the change: alter ``page_label`` back to ``max_length=32``."""
        # Changing field 'DocumentPage.page_label'
        # NOTE(review): labels longer than 32 characters stored after the
        # forward migration may be truncated or rejected here, depending on
        # the database backend -- confirm before rolling back.
        db.alter_column('documents_documentpage', 'page_label', self.gf('django.db.models.fields.CharField')(max_length=32, null=True))

    # Frozen model definitions for this migration. Every value is kept
    # verbatim as a string; 'documents.documentpage' already carries the new
    # page_label max_length of 40.
    models = {
        'auth.group': {
            'Meta': {'object_name': 'Group'},
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'name': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '80'}),
            'permissions': ('django.db.models.fields.related.ManyToManyField', [], {'to': "orm['auth.Permission']", 'symmetrical': 'False', 'blank': 'True'})
        },
        'auth.permission': {
            'Meta': {'ordering': "('content_type__app_label', 'content_type__model', 'codename')", 'unique_together': "(('content_type', 'codename'),)", 'object_name': 'Permission'},
            'codename': ('django.db.models.fields.CharField', [], {'max_length': '100'}),
            'content_type': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['contenttypes.ContentType']"}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'name': ('django.db.models.fields.CharField', [], {'max_length': '50'})
        },
        'auth.user': {
            'Meta': {'object_name': 'User'},
            'date_joined': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}),
            'email': ('django.db.models.fields.EmailField', [], {'max_length': '75', 'blank': 'True'}),
            'first_name': ('django.db.models.fields.CharField', [], {'max_length': '30', 'blank': 'True'}),
            'groups': ('django.db.models.fields.related.ManyToManyField', [], {'to': "orm['auth.Group']", 'symmetrical': 'False', 'blank': 'True'}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'is_active': ('django.db.models.fields.BooleanField', [], {'default': 'True'}),
            'is_staff': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
            'is_superuser': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
            'last_login': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}),
            'last_name': ('django.db.models.fields.CharField', [], {'max_length': '30', 'blank': 'True'}),
            'password': ('django.db.models.fields.CharField', [], {'max_length': '128'}),
            'user_permissions': ('django.db.models.fields.related.ManyToManyField', [], {'to': "orm['auth.Permission']", 'symmetrical': 'False', 'blank': 'True'}),
            'username': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '30'})
        },
        'comments.comment': {
            'Meta': {'ordering': "('submit_date',)", 'object_name': 'Comment', 'db_table': "'django_comments'"},
            'comment': ('django.db.models.fields.TextField', [], {'max_length': '3000'}),
            'content_type': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'content_type_set_for_comment'", 'to': "orm['contenttypes.ContentType']"}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'ip_address': ('django.db.models.fields.IPAddressField', [], {'max_length': '15', 'null': 'True', 'blank': 'True'}),
            'is_public': ('django.db.models.fields.BooleanField', [], {'default': 'True'}),
            'is_removed': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
            'object_pk': ('django.db.models.fields.TextField', [], {}),
            'site': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['sites.Site']"}),
            'submit_date': ('django.db.models.fields.DateTimeField', [], {'default': 'None'}),
            'user': ('django.db.models.fields.related.ForeignKey', [], {'blank': 'True', 'related_name': "'comment_comments'", 'null': 'True', 'to': "orm['auth.User']"}),
            'user_email': ('django.db.models.fields.EmailField', [], {'max_length': '75', 'blank': 'True'}),
            'user_name': ('django.db.models.fields.CharField', [], {'max_length': '50', 'blank': 'True'}),
            'user_url': ('django.db.models.fields.URLField', [], {'max_length': '200', 'blank': 'True'})
        },
        'contenttypes.contenttype': {
            'Meta': {'ordering': "('name',)", 'unique_together': "(('app_label', 'model'),)", 'object_name': 'ContentType', 'db_table': "'django_content_type'"},
            'app_label': ('django.db.models.fields.CharField', [], {'max_length': '100'}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'model': ('django.db.models.fields.CharField', [], {'max_length': '100'}),
            'name': ('django.db.models.fields.CharField', [], {'max_length': '100'})
        },
        'documents.document': {
            'Meta': {'ordering': "['-date_added']", 'object_name': 'Document'},
            'date_added': ('django.db.models.fields.DateTimeField', [], {'db_index': 'True'}),
            'description': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}),
            'document_type': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['documents.DocumentType']", 'null': 'True', 'blank': 'True'}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'uuid': ('django.db.models.fields.CharField', [], {'max_length': '48', 'blank': 'True'})
        },
        'documents.documentpage': {
            'Meta': {'ordering': "['page_number']", 'object_name': 'DocumentPage'},
            'content': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}),
            'document_version': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['documents.DocumentVersion']"}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'page_label': ('django.db.models.fields.CharField', [], {'max_length': '40', 'null': 'True', 'blank': 'True'}),
            'page_number': ('django.db.models.fields.PositiveIntegerField', [], {'default': '1', 'db_index': 'True'})
        },
        'documents.documenttype': {
            'Meta': {'ordering': "['name']", 'object_name': 'DocumentType'},
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'name': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '32'})
        },
        'documents.documenttypefilename': {
            'Meta': {'ordering': "['filename']", 'object_name': 'DocumentTypeFilename'},
            'document_type': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['documents.DocumentType']"}),
            'enabled': ('django.db.models.fields.BooleanField', [], {'default': 'True'}),
            'filename': ('django.db.models.fields.CharField', [], {'max_length': '128', 'db_index': 'True'}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'})
        },
        'documents.documentversion': {
            'Meta': {'unique_together': "(('document', 'major', 'minor', 'micro', 'release_level', 'serial'),)", 'object_name': 'DocumentVersion'},
            'checksum': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}),
            'comment': ('django.db.models.fields.TextField', [], {'blank': 'True'}),
            'document': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['documents.Document']"}),
            'encoding': ('django.db.models.fields.CharField', [], {'max_length': '64', 'null': 'True', 'blank': 'True'}),
            'file': ('django.db.models.fields.files.FileField', [], {'max_length': '100'}),
            'filename': ('django.db.models.fields.CharField', [], {'default': "u''", 'max_length': '255', 'db_index': 'True'}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'major': ('django.db.models.fields.PositiveIntegerField', [], {'default': '1'}),
            'micro': ('django.db.models.fields.PositiveIntegerField', [], {'default': '0'}),
            'mimetype': ('django.db.models.fields.CharField', [], {'max_length': '64', 'null': 'True', 'blank': 'True'}),
            'minor': ('django.db.models.fields.PositiveIntegerField', [], {'default': '0'}),
            'release_level': ('django.db.models.fields.PositiveIntegerField', [], {'default': '1'}),
            'serial': ('django.db.models.fields.PositiveIntegerField', [], {'default': '0'}),
            'timestamp': ('django.db.models.fields.DateTimeField', [], {})
        },
        'documents.recentdocument': {
            'Meta': {'ordering': "('-datetime_accessed',)", 'object_name': 'RecentDocument'},
            'datetime_accessed': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime(2014, 2, 16, 0, 0)', 'db_index': 'True'}),
            'document': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['documents.Document']"}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'user': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['auth.User']"})
        },
        'sites.site': {
            'Meta': {'ordering': "('domain',)", 'object_name': 'Site', 'db_table': "'django_site'"},
            'domain': ('django.db.models.fields.CharField', [], {'max_length': '100'}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'name': ('django.db.models.fields.CharField', [], {'max_length': '50'})
        },
        'taggit.tag': {
            'Meta': {'object_name': 'Tag'},
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'name': ('django.db.models.fields.CharField', [], {'max_length': '100'}),
            'slug': ('django.db.models.fields.SlugField', [], {'unique': 'True', 'max_length': '100'})
        },
        'taggit.taggeditem': {
            'Meta': {'object_name': 'TaggedItem'},
            'content_type': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'taggit_taggeditem_tagged_items'", 'to': "orm['contenttypes.ContentType']"}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'object_id': ('django.db.models.fields.IntegerField', [], {'db_index': 'True'}),
            'tag': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'taggit_taggeditem_items'", 'to': "orm['taggit.Tag']"})
        }
    }

    # Apps whose models are fully described by the frozen definitions above.
    complete_apps = ['documents']
|
||||||
@@ -552,7 +552,7 @@ class DocumentPage(models.Model):
|
|||||||
"""
|
"""
|
||||||
document_version = models.ForeignKey(DocumentVersion, verbose_name=_(u'document version'), related_name='pages')
|
document_version = models.ForeignKey(DocumentVersion, verbose_name=_(u'document version'), related_name='pages')
|
||||||
content = models.TextField(blank=True, null=True, verbose_name=_(u'content'))
|
content = models.TextField(blank=True, null=True, verbose_name=_(u'content'))
|
||||||
page_label = models.CharField(max_length=32, blank=True, null=True, verbose_name=_(u'page label'))
|
page_label = models.CharField(max_length=40, blank=True, null=True, verbose_name=_(u'page label'))
|
||||||
page_number = models.PositiveIntegerField(default=1, editable=False, verbose_name=_(u'page number'), db_index=True)
|
page_number = models.PositiveIntegerField(default=1, editable=False, verbose_name=_(u'page number'), db_index=True)
|
||||||
|
|
||||||
def __unicode__(self):
|
def __unicode__(self):
|
||||||
|
|||||||
43
mayan/apps/ocr/lang/deu.py
Normal file
43
mayan/apps/ocr/lang/deu.py
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
def check_word(word):
    """
    Decide whether an OCR token looks like a plausible German word.

    The token is run through a series of heuristics; the first one that
    fails marks it as garbage.

    :param word: the token to check (a text string).
    :return: ``word`` unchanged when it passes every check, ``None`` when
        it is considered garbage.
    """
    # Umlauts and the sharp s count as alphanumeric for German text.
    # These patterns must not be redefined with an ASCII-only class further
    # down, otherwise words such as "süß" are rejected as garbage.
    ALL_ALPHANUM = re.compile('([0-9a-zäöüß])', re.I)
    NON_ALPHANUM = re.compile('([^0-9a-zäöüß])', re.I)

    TOO_MANY_VOWELS = re.compile('[aäeioöuü]{4}', re.I)
    TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnpqrstvwxyz]{4}', re.I)

    # (L) If a string is longer than 40 characters, it is considered garbage
    # http://www.duden.de/sprachwissen/sprachratgeber/die-laengsten-woerter-im-dudenkorpus
    # http://www.duden.de/sprachwissen/sprachratgeber/durchschnittliche-laenge-eines-deutschen-wortes
    if len(word) > 40:
        return None

    # (A) If a string's ratio of alphanumeric characters to total
    # characters is less than 50%, the string is garbage.
    # Compared by multiplication so the result does not depend on how the
    # interpreter divides integers.
    if len(ALL_ALPHANUM.findall(word)) * 2 < len(word):
        return None

    # Remove the word if every character is non alphanumeric. Because the
    # ratio check above already rejects such non-empty tokens, in practice
    # this is what rejects the empty string.
    if len(NON_ALPHANUM.findall(word)) == len(word):
        return None

    # Remove words with too many consecutive vowels
    if TOO_MANY_VOWELS.search(word):
        return None

    # Remove words with too many consecutive consonants
    if TOO_MANY_CONSONANTS.search(word):
        return None

    # No single letter words in German
    if len(word) == 1:
        return None

    return word
|
||||||
41
mayan/apps/ocr/lang/fra.py
Normal file
41
mayan/apps/ocr/lang/fra.py
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
def check_word(word):
    """
    Return ``word`` unchanged; no garbage filtering is applied for French.

    :param word: the OCR token to check.
    :return: always ``word`` itself.
    """
    # TODO: add French-specific heuristics. The draft rules that used to
    # follow this return statement were unreachable, and their character
    # classes (á, í, ó, ú, ñ) did not include French letters such as
    # è, ê, ç or à, so they were removed rather than enabled.
    return word
|
||||||
Reference in New Issue
Block a user