From d073685680b696492f00843d1680e87aeefe3c83 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Tue, 9 Jun 2015 03:24:17 -0400 Subject: [PATCH] Delete language processing backends --- mayan/apps/ocr/lang/__init__.py | 3 --- mayan/apps/ocr/lang/deu.py | 45 --------------------------------- mayan/apps/ocr/lang/eng.py | 42 ------------------------------ mayan/apps/ocr/lang/fra.py | 43 ------------------------------- mayan/apps/ocr/lang/rus.py | 43 ------------------------------- mayan/apps/ocr/lang/spa.py | 43 ------------------------------- 6 files changed, 219 deletions(-) delete mode 100644 mayan/apps/ocr/lang/__init__.py delete mode 100644 mayan/apps/ocr/lang/deu.py delete mode 100644 mayan/apps/ocr/lang/eng.py delete mode 100644 mayan/apps/ocr/lang/fra.py delete mode 100644 mayan/apps/ocr/lang/rus.py delete mode 100644 mayan/apps/ocr/lang/spa.py diff --git a/mayan/apps/ocr/lang/__init__.py b/mayan/apps/ocr/lang/__init__.py deleted file mode 100644 index 9a45b1aaab..0000000000 --- a/mayan/apps/ocr/lang/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -class BackendBase(object): - def check_word(self, word): - raise NotImplementedError diff --git a/mayan/apps/ocr/lang/deu.py b/mayan/apps/ocr/lang/deu.py deleted file mode 100644 index ccff3eba7d..0000000000 --- a/mayan/apps/ocr/lang/deu.py +++ /dev/null @@ -1,45 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import unicode_literals - -import re - -from . import BackendBase - - -class LanguageBackend(BackendBase): - def check_word(self, word): - ALL_ALPHANUM = re.compile('([0-9a-zäöüß])', re.I) - NON_ALPHANUM = re.compile('([^0-9a-zäöüß])', re.I) - - TOO_MANY_VOWELS = re.compile('[aäeioöuü]{4}', re.I) - TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnpqrstvwxyz]{4}', re.I) - # SINGLE_LETTER_WORDS = re.compile('^$', re.I) - - # (L) If a string is longer than 40 characters, it is considered as garbage - # http://www.duden.de/sprachwissen/sprachratgeber/die-laengsten-woerter-im-dudenkorpus - # http://www.duden.de/sprachwissen/sprachratgeber/durchschnittliche-laenge-eines-deutschen-wortes - if len(word) > 40: - return None - - # (A) If a string's ratio of alphanumeric characters to total - # characters is less than 50%, the string is garbage - if len(ALL_ALPHANUM.findall(word)) < len(word) / 2: - return None - - # Remove word if all the letters in the word are non alphanumeric - if len(NON_ALPHANUM.findall(word)) == len(word): - return None - - # Removed words with too many consecutie vowels - if TOO_MANY_VOWELS.findall(word): - return None - - # Removed words with too many consecutie consonants - if TOO_MANY_CONSONANTS.findall(word): - return None - - # No single letter words in German - if len(word) == 1: - return None - - return word diff --git a/mayan/apps/ocr/lang/eng.py b/mayan/apps/ocr/lang/eng.py deleted file mode 100644 index 5025db136d..0000000000 --- a/mayan/apps/ocr/lang/eng.py +++ /dev/null @@ -1,42 +0,0 @@ -from __future__ import unicode_literals - -import re - -from . import BackendBase - - -class LanguageBackend(BackendBase): - def check_word(self, word): - ALL_ALPHANUM = re.compile('([0-9a-z])', re.I) - NON_ALPHANUM = re.compile('([^0-9a-z])', re.I) - - TOO_MANY_VOWELS = re.compile('[aeiou]{3}', re.I) - TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnpqrstvwxyz]{5}', re.I) - SINGLE_LETTER_WORDS = re.compile('^[ai]$', re.I) - - # (L) If a string is longer than 20 characters, it is garbage - if len(word) > 20: - return None - - # (A) If a string's ratio of alphanumeric characters to total - # characters is less than 50%, the string is garbage - if len(ALL_ALPHANUM.findall(word)) < len(word) / 2: - return None - - # Remove word if all the letters in the word are non alphanumeric - if len(NON_ALPHANUM.findall(word)) == len(word): - return None - - # Removed words with too many consecutie vowels - if TOO_MANY_VOWELS.findall(word): - return None - - # Removed words with too many consecutie consonants - if TOO_MANY_CONSONANTS.findall(word): - return None - - # Only allow specific single letter words - if len(word) == 1 and not SINGLE_LETTER_WORDS.findall(word): - return None - - return word diff --git a/mayan/apps/ocr/lang/fra.py b/mayan/apps/ocr/lang/fra.py deleted file mode 100644 index eb4d9ead45..0000000000 --- a/mayan/apps/ocr/lang/fra.py +++ /dev/null @@ -1,43 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import - -import re - -from . import BackendBase - - -class LanguageBackend(BackendBase): - def check_word(self, word): - ALL_ALPHANUM = re.compile('([0-9a-záéíóúüñ])', re.I) - NON_ALPHANUM = re.compile('([^0-9a-záéíóúüñ])', re.I) - - TOO_MANY_VOWELS = re.compile('[aáeéiíoóuúü]{3}', re.I) - TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnñpqrstvwxyz]{5}', re.I) - SINGLE_LETTER_WORDS = re.compile('^[aeoóuy]$', re.I) - - # (L) If a string is longer than 20 characters, it is garbage - if len(word) > 20: - return None - - # (A) If a string’s ratio of alphanumeric characters to total - # characters is less than 50%, the string is garbage - if len(ALL_ALPHANUM.findall(word)) < len(word) / 2: - return None - - # Remove word if all the letters in the word are non alphanumeric - if len(NON_ALPHANUM.findall(word)) == len(word): - return None - - # Removed words with too many consecutie vowels - if TOO_MANY_VOWELS.findall(word): - return None - - # Removed words with too many consecutie consonants - if TOO_MANY_CONSONANTS.findall(word): - return None - - # Only allow specific single letter words - if len(word) == 1 and not SINGLE_LETTER_WORDS.findall(word): - return None - - return word diff --git a/mayan/apps/ocr/lang/rus.py b/mayan/apps/ocr/lang/rus.py deleted file mode 100644 index e7b7588358..0000000000 --- a/mayan/apps/ocr/lang/rus.py +++ /dev/null @@ -1,43 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import unicode_literals - -import re - -from . import BackendBase - - -class LanguageBackend(BackendBase): - def check_word(self, word): - ALL_ALPHANUM = re.compile('([0-9ёйцукенгшщзхъфывапролджэячсмитьбю])', re.I) - NON_ALPHANUM = re.compile('([^0-9ёйцукенгшщзхъфывапролджэячсмитьбю])', re.I) - - TOO_MANY_VOWELS = re.compile('[ёуеыаоэяию]{3}', re.I) - TOO_MANY_CONSONANTS = re.compile('[йцкнгшщзхъфвпрлджчсмтьб{5}', re.I) - SINGLE_LETTER_WORDS = re.compile('^[уквояси]$', re.I) - - # (L) If a string is longer than 25 characters, it is garbage - if len(word) > 25: - return None - - # (A) If a string's ratio of alphanumeric characters to total - # characters is less than 50%, the string is garbage - if len(ALL_ALPHANUM.findall(word)) < len(word) / 2: - return None - - # Remove word if all the letters in the word are non alphanumeric - if len(NON_ALPHANUM.findall(word)) == len(word): - return None - - # Removed words with too many consecutie vowels - if TOO_MANY_VOWELS.findall(word): - return None - - # Removed words with too many consecutie consonants - if TOO_MANY_CONSONANTS.findall(word): - return None - - # Only allow specific single letter words - if len(word) == 1 and not SINGLE_LETTER_WORDS.findall(word): - return None - - return word diff --git a/mayan/apps/ocr/lang/spa.py b/mayan/apps/ocr/lang/spa.py deleted file mode 100644 index c736a69b9a..0000000000 --- a/mayan/apps/ocr/lang/spa.py +++ /dev/null @@ -1,43 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import unicode_literals - -import re - -from . import BackendBase - - -class LanguageBackend(BackendBase): - def check_word(self, word): - ALL_ALPHANUM = re.compile('([0-9a-záéíóúüñ])', re.I) - NON_ALPHANUM = re.compile('([^0-9a-záéíóúüñ])', re.I) - - TOO_MANY_VOWELS = re.compile('[aáeéiíoóuúü]{3}', re.I) - TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnñpqrstvwxyz]{5}', re.I) - SINGLE_LETTER_WORDS = re.compile('^[aeoóuy]$', re.I) - - # (L) If a string is longer than 20 characters, it is garbage - if len(word) > 20: - return None - - # (A) If a string’s ratio of alphanumeric characters to total - # characters is less than 50%, the string is garbage - if len(ALL_ALPHANUM.findall(word)) < len(word) / 2: - return None - - # Remove word if all the letters in the word are non alphanumeric - if len(NON_ALPHANUM.findall(word)) == len(word): - return None - - # Removed words with too many consecutie vowels - if TOO_MANY_VOWELS.findall(word): - return None - - # Removed words with too many consecutie consonants - if TOO_MANY_CONSONANTS.findall(word): - return None - - # Only allow specific single letter words - if len(word) == 1 and not SINGLE_LETTER_WORDS.findall(word): - return None - - return word