Delete language processing backends

2015-06-09 03:24:17 -04:00
parent e9be14f2af
commit d073685680
6 changed files with 0 additions and 219 deletions
--- a/mayan/apps/ocr/lang/__init__.py
+++ b/mayan/apps/ocr/lang/__init__.py
@@ -1,3 +0,0 @@
-class BackendBase(object):
-    def check_word(self, word):
-        raise NotImplementedError
--- a/mayan/apps/ocr/lang/deu.py
+++ b/mayan/apps/ocr/lang/deu.py
@@ -1,45 +0,0 @@
-# -*- coding: utf-8 -*-
-from __future__ import unicode_literals
-
-import re
-
-from . import BackendBase
-
-
-class LanguageBackend(BackendBase):
-    def check_word(self, word):
-        ALL_ALPHANUM = re.compile('([0-9a-zäöüß])', re.I)
-        NON_ALPHANUM = re.compile('([^0-9a-zäöüß])', re.I)
-
-        TOO_MANY_VOWELS = re.compile('[aäeioöuü]{4}', re.I)
-        TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnpqrstvwxyz]{4}', re.I)
-        # SINGLE_LETTER_WORDS = re.compile('^$', re.I)
-
-        # (L) If a string is longer than 40 characters, it is considered as garbage
-        # http://www.duden.de/sprachwissen/sprachratgeber/die-laengsten-woerter-im-dudenkorpus
-        # http://www.duden.de/sprachwissen/sprachratgeber/durchschnittliche-laenge-eines-deutschen-wortes
-        if len(word) > 40:
-            return None
-
-        # (A) If a string's ratio of alphanumeric characters to total
-        # characters is less than 50%, the string is garbage
-        if len(ALL_ALPHANUM.findall(word)) < len(word) / 2:
-            return None
-
-        # Remove word if all the letters in the word are non alphanumeric
-        if len(NON_ALPHANUM.findall(word)) == len(word):
-            return None
-
-        # Remove words with too many consecutive vowels
-        if TOO_MANY_VOWELS.findall(word):
-            return None
-
-        # Remove words with too many consecutive consonants
-        if TOO_MANY_CONSONANTS.findall(word):
-            return None
-
-        # No single letter words in German
-        if len(word) == 1:
-            return None
-
-        return word
--- a/mayan/apps/ocr/lang/eng.py
+++ b/mayan/apps/ocr/lang/eng.py
@@ -1,42 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-
-from . import BackendBase
-
-
-class LanguageBackend(BackendBase):
-    def check_word(self, word):
-        ALL_ALPHANUM = re.compile('([0-9a-z])', re.I)
-        NON_ALPHANUM = re.compile('([^0-9a-z])', re.I)
-
-        TOO_MANY_VOWELS = re.compile('[aeiou]{3}', re.I)
-        TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnpqrstvwxyz]{5}', re.I)
-        SINGLE_LETTER_WORDS = re.compile('^[ai]$', re.I)
-
-        # (L) If a string is longer than 20 characters, it is garbage
-        if len(word) > 20:
-            return None
-
-        # (A) If a string's ratio of alphanumeric characters to total
-        # characters is less than 50%, the string is garbage
-        if len(ALL_ALPHANUM.findall(word)) < len(word) / 2:
-            return None
-
-        # Remove word if all the letters in the word are non alphanumeric
-        if len(NON_ALPHANUM.findall(word)) == len(word):
-            return None
-
-        # Remove words with too many consecutive vowels
-        if TOO_MANY_VOWELS.findall(word):
-            return None
-
-        # Remove words with too many consecutive consonants
-        if TOO_MANY_CONSONANTS.findall(word):
-            return None
-
-        # Only allow specific single letter words
-        if len(word) == 1 and not SINGLE_LETTER_WORDS.findall(word):
-            return None
-
-        return word
--- a/mayan/apps/ocr/lang/fra.py
+++ b/mayan/apps/ocr/lang/fra.py
@@ -1,43 +0,0 @@
-# -*- coding: utf-8 -*-
-from __future__ import absolute_import
-
-import re
-
-from . import BackendBase
-
-
-class LanguageBackend(BackendBase):
-    def check_word(self, word):
-        ALL_ALPHANUM = re.compile('([0-9a-záéíóúüñ])', re.I)
-        NON_ALPHANUM = re.compile('([^0-9a-záéíóúüñ])', re.I)
-
-        TOO_MANY_VOWELS = re.compile('[aáeéiíoóuúü]{3}', re.I)
-        TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnñpqrstvwxyz]{5}', re.I)
-        SINGLE_LETTER_WORDS = re.compile('^[aeoóuy]$', re.I)
-
-        # (L) If a string is longer than 20 characters, it is garbage
-        if len(word) > 20:
-            return None
-
-        # (A) If a string’s ratio of alphanumeric characters to total
-        # characters is less than 50%, the string is garbage
-        if len(ALL_ALPHANUM.findall(word)) < len(word) / 2:
-            return None
-
-        # Remove word if all the letters in the word are non alphanumeric
-        if len(NON_ALPHANUM.findall(word)) == len(word):
-            return None
-
-        # Remove words with too many consecutive vowels
-        if TOO_MANY_VOWELS.findall(word):
-            return None
-
-        # Remove words with too many consecutive consonants
-        if TOO_MANY_CONSONANTS.findall(word):
-            return None
-
-        # Only allow specific single letter words
-        if len(word) == 1 and not SINGLE_LETTER_WORDS.findall(word):
-            return None
-
-        return word
--- a/mayan/apps/ocr/lang/rus.py
+++ b/mayan/apps/ocr/lang/rus.py
@@ -1,43 +0,0 @@
-# -*- coding: utf-8 -*-
-from __future__ import unicode_literals
-
-import re
-
-from . import BackendBase
-
-
-class LanguageBackend(BackendBase):
-    def check_word(self, word):
-        ALL_ALPHANUM = re.compile('([0-9ёйцукенгшщзхъфывапролджэячсмитьбю])', re.I)
-        NON_ALPHANUM = re.compile('([^0-9ёйцукенгшщзхъфывапролджэячсмитьбю])', re.I)
-
-        TOO_MANY_VOWELS = re.compile('[ёуеыаоэяию]{3}', re.I)
-        TOO_MANY_CONSONANTS = re.compile('[йцкнгшщзхъфвпрлджчсмтьб{5}', re.I)
-        SINGLE_LETTER_WORDS = re.compile('^[уквояси]$', re.I)
-
-        # (L) If a string is longer than 25 characters, it is garbage
-        if len(word) > 25:
-            return None
-
-        # (A) If a string's ratio of alphanumeric characters to total
-        # characters is less than 50%, the string is garbage
-        if len(ALL_ALPHANUM.findall(word)) < len(word) / 2:
-            return None
-
-        # Remove word if all the letters in the word are non alphanumeric
-        if len(NON_ALPHANUM.findall(word)) == len(word):
-            return None
-
-        # Remove words with too many consecutive vowels
-        if TOO_MANY_VOWELS.findall(word):
-            return None
-
-        # Remove words with too many consecutive consonants
-        if TOO_MANY_CONSONANTS.findall(word):
-            return None
-
-        # Only allow specific single letter words
-        if len(word) == 1 and not SINGLE_LETTER_WORDS.findall(word):
-            return None
-
-        return word
--- a/mayan/apps/ocr/lang/spa.py
+++ b/mayan/apps/ocr/lang/spa.py
@@ -1,43 +0,0 @@
-# -*- coding: utf-8 -*-
-from __future__ import unicode_literals
-
-import re
-
-from . import BackendBase
-
-
-class LanguageBackend(BackendBase):
-    def check_word(self, word):
-        ALL_ALPHANUM = re.compile('([0-9a-záéíóúüñ])', re.I)
-        NON_ALPHANUM = re.compile('([^0-9a-záéíóúüñ])', re.I)
-
-        TOO_MANY_VOWELS = re.compile('[aáeéiíoóuúü]{3}', re.I)
-        TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnñpqrstvwxyz]{5}', re.I)
-        SINGLE_LETTER_WORDS = re.compile('^[aeoóuy]$', re.I)
-
-        # (L) If a string is longer than 20 characters, it is garbage
-        if len(word) > 20:
-            return None
-
-        # (A) If a string’s ratio of alphanumeric characters to total
-        # characters is less than 50%, the string is garbage
-        if len(ALL_ALPHANUM.findall(word)) < len(word) / 2:
-            return None
-
-        # Remove word if all the letters in the word are non alphanumeric
-        if len(NON_ALPHANUM.findall(word)) == len(word):
-            return None
-
-        # Remove words with too many consecutive vowels
-        if TOO_MANY_VOWELS.findall(word):
-            return None
-
-        # Remove words with too many consecutive consonants
-        if TOO_MANY_CONSONANTS.findall(word):
-            return None
-
-        # Only allow specific single letter words
-        if len(word) == 1 and not SINGLE_LETTER_WORDS.findall(word):
-            return None
-
-        return word