From d073685680b696492f00843d1680e87aeefe3c83 Mon Sep 17 00:00:00 2001
From: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
Date: Tue, 9 Jun 2015 03:24:17 -0400
Subject: [PATCH] Delete language processing backends

---
 mayan/apps/ocr/lang/__init__.py |  3 ---
 mayan/apps/ocr/lang/deu.py      | 45 ---------------------------------
 mayan/apps/ocr/lang/eng.py      | 42 ------------------------------
 mayan/apps/ocr/lang/fra.py      | 43 -------------------------------
 mayan/apps/ocr/lang/rus.py      | 43 -------------------------------
 mayan/apps/ocr/lang/spa.py      | 43 -------------------------------
 6 files changed, 219 deletions(-)
 delete mode 100644 mayan/apps/ocr/lang/__init__.py
 delete mode 100644 mayan/apps/ocr/lang/deu.py
 delete mode 100644 mayan/apps/ocr/lang/eng.py
 delete mode 100644 mayan/apps/ocr/lang/fra.py
 delete mode 100644 mayan/apps/ocr/lang/rus.py
 delete mode 100644 mayan/apps/ocr/lang/spa.py

diff --git a/mayan/apps/ocr/lang/__init__.py b/mayan/apps/ocr/lang/__init__.py
deleted file mode 100644
index 9a45b1aaab..0000000000
--- a/mayan/apps/ocr/lang/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-class BackendBase(object):
-    def check_word(self, word):
-        raise NotImplementedError
diff --git a/mayan/apps/ocr/lang/deu.py b/mayan/apps/ocr/lang/deu.py
deleted file mode 100644
index ccff3eba7d..0000000000
--- a/mayan/apps/ocr/lang/deu.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# -*- coding: utf-8 -*-
-from __future__ import unicode_literals
-
-import re
-
-from . import BackendBase
-
-
-class LanguageBackend(BackendBase):
-    def check_word(self, word):
-        ALL_ALPHANUM = re.compile('([0-9a-zäöüß])', re.I)
-        NON_ALPHANUM = re.compile('([^0-9a-zäöüß])', re.I)
-
-        TOO_MANY_VOWELS = re.compile('[aäeioöuü]{4}', re.I)
-        TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnpqrstvwxyz]{4}', re.I)
-        # SINGLE_LETTER_WORDS = re.compile('^$', re.I)
-
-        # (L) If a string is longer than 40 characters, it is considered as garbage
-        # http://www.duden.de/sprachwissen/sprachratgeber/die-laengsten-woerter-im-dudenkorpus
-        # http://www.duden.de/sprachwissen/sprachratgeber/durchschnittliche-laenge-eines-deutschen-wortes
-        if len(word) > 40:
-            return None
-
-        # (A) If a string's ratio of alphanumeric characters to total
-        # characters is less than 50%, the string is garbage
-        if len(ALL_ALPHANUM.findall(word)) < len(word) / 2:
-            return None
-
-        # Remove word if all the letters in the word are non alphanumeric
-        if len(NON_ALPHANUM.findall(word)) == len(word):
-            return None
-
-        # Removed words with too many consecutie vowels
-        if TOO_MANY_VOWELS.findall(word):
-            return None
-
-        # Removed words with too many consecutie consonants
-        if TOO_MANY_CONSONANTS.findall(word):
-            return None
-
-        # No single letter words in German
-        if len(word) == 1:
-            return None
-
-        return word
diff --git a/mayan/apps/ocr/lang/eng.py b/mayan/apps/ocr/lang/eng.py
deleted file mode 100644
index 5025db136d..0000000000
--- a/mayan/apps/ocr/lang/eng.py
+++ /dev/null
@@ -1,42 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-
-from . import BackendBase
-
-
-class LanguageBackend(BackendBase):
-    def check_word(self, word):
-        ALL_ALPHANUM = re.compile('([0-9a-z])', re.I)
-        NON_ALPHANUM = re.compile('([^0-9a-z])', re.I)
-
-        TOO_MANY_VOWELS = re.compile('[aeiou]{3}', re.I)
-        TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnpqrstvwxyz]{5}', re.I)
-        SINGLE_LETTER_WORDS = re.compile('^[ai]$', re.I)
-
-        # (L) If a string is longer than 20 characters, it is garbage
-        if len(word) > 20:
-            return None
-
-        # (A) If a string's ratio of alphanumeric characters to total
-        # characters is less than 50%, the string is garbage
-        if len(ALL_ALPHANUM.findall(word)) < len(word) / 2:
-            return None
-
-        # Remove word if all the letters in the word are non alphanumeric
-        if len(NON_ALPHANUM.findall(word)) == len(word):
-            return None
-
-        # Removed words with too many consecutie vowels
-        if TOO_MANY_VOWELS.findall(word):
-            return None
-
-        # Removed words with too many consecutie consonants
-        if TOO_MANY_CONSONANTS.findall(word):
-            return None
-
-        # Only allow specific single letter words
-        if len(word) == 1 and not SINGLE_LETTER_WORDS.findall(word):
-            return None
-
-        return word
diff --git a/mayan/apps/ocr/lang/fra.py b/mayan/apps/ocr/lang/fra.py
deleted file mode 100644
index eb4d9ead45..0000000000
--- a/mayan/apps/ocr/lang/fra.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# -*- coding: utf-8 -*-
-from __future__ import absolute_import
-
-import re
-
-from . import BackendBase
-
-
-class LanguageBackend(BackendBase):
-    def check_word(self, word):
-        ALL_ALPHANUM = re.compile('([0-9a-záéíóúüñ])', re.I)
-        NON_ALPHANUM = re.compile('([^0-9a-záéíóúüñ])', re.I)
-
-        TOO_MANY_VOWELS = re.compile('[aáeéiíoóuúü]{3}', re.I)
-        TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnñpqrstvwxyz]{5}', re.I)
-        SINGLE_LETTER_WORDS = re.compile('^[aeoóuy]$', re.I)
-
-        # (L) If a string is longer than 20 characters, it is garbage
-        if len(word) > 20:
-            return None
-
-        # (A) If a string’s ratio of alphanumeric characters to total
-        # characters is less than 50%, the string is garbage
-        if len(ALL_ALPHANUM.findall(word)) < len(word) / 2:
-            return None
-
-        # Remove word if all the letters in the word are non alphanumeric
-        if len(NON_ALPHANUM.findall(word)) == len(word):
-            return None
-
-        # Removed words with too many consecutie vowels
-        if TOO_MANY_VOWELS.findall(word):
-            return None
-
-        # Removed words with too many consecutie consonants
-        if TOO_MANY_CONSONANTS.findall(word):
-            return None
-
-        # Only allow specific single letter words
-        if len(word) == 1 and not SINGLE_LETTER_WORDS.findall(word):
-            return None
-
-        return word
diff --git a/mayan/apps/ocr/lang/rus.py b/mayan/apps/ocr/lang/rus.py
deleted file mode 100644
index e7b7588358..0000000000
--- a/mayan/apps/ocr/lang/rus.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# -*- coding: utf-8 -*-
-from __future__ import unicode_literals
-
-import re
-
-from . import BackendBase
-
-
-class LanguageBackend(BackendBase):
-    def check_word(self, word):
-        ALL_ALPHANUM = re.compile('([0-9ёйцукенгшщзхъфывапролджэячсмитьбю])', re.I)
-        NON_ALPHANUM = re.compile('([^0-9ёйцукенгшщзхъфывапролджэячсмитьбю])', re.I)
-
-        TOO_MANY_VOWELS = re.compile('[ёуеыаоэяию]{3}', re.I)
-        TOO_MANY_CONSONANTS = re.compile('[йцкнгшщзхъфвпрлджчсмтьб{5}', re.I)
-        SINGLE_LETTER_WORDS = re.compile('^[уквояси]$', re.I)
-
-        # (L) If a string is longer than 25 characters, it is garbage
-        if len(word) > 25:
-            return None
-
-        # (A) If a string's ratio of alphanumeric characters to total
-        # characters is less than 50%, the string is garbage
-        if len(ALL_ALPHANUM.findall(word)) < len(word) / 2:
-            return None
-
-        # Remove word if all the letters in the word are non alphanumeric
-        if len(NON_ALPHANUM.findall(word)) == len(word):
-            return None
-
-        # Removed words with too many consecutie vowels
-        if TOO_MANY_VOWELS.findall(word):
-            return None
-
-        # Removed words with too many consecutie consonants
-        if TOO_MANY_CONSONANTS.findall(word):
-            return None
-
-        # Only allow specific single letter words
-        if len(word) == 1 and not SINGLE_LETTER_WORDS.findall(word):
-            return None
-
-        return word
diff --git a/mayan/apps/ocr/lang/spa.py b/mayan/apps/ocr/lang/spa.py
deleted file mode 100644
index c736a69b9a..0000000000
--- a/mayan/apps/ocr/lang/spa.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# -*- coding: utf-8 -*-
-from __future__ import unicode_literals
-
-import re
-
-from . import BackendBase
-
-
-class LanguageBackend(BackendBase):
-    def check_word(self, word):
-        ALL_ALPHANUM = re.compile('([0-9a-záéíóúüñ])', re.I)
-        NON_ALPHANUM = re.compile('([^0-9a-záéíóúüñ])', re.I)
-
-        TOO_MANY_VOWELS = re.compile('[aáeéiíoóuúü]{3}', re.I)
-        TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnñpqrstvwxyz]{5}', re.I)
-        SINGLE_LETTER_WORDS = re.compile('^[aeoóuy]$', re.I)
-
-        # (L) If a string is longer than 20 characters, it is garbage
-        if len(word) > 20:
-            return None
-
-        # (A) If a string’s ratio of alphanumeric characters to total
-        # characters is less than 50%, the string is garbage
-        if len(ALL_ALPHANUM.findall(word)) < len(word) / 2:
-            return None
-
-        # Remove word if all the letters in the word are non alphanumeric
-        if len(NON_ALPHANUM.findall(word)) == len(word):
-            return None
-
-        # Removed words with too many consecutie vowels
-        if TOO_MANY_VOWELS.findall(word):
-            return None
-
-        # Removed words with too many consecutie consonants
-        if TOO_MANY_CONSONANTS.findall(word):
-            return None
-
-        # Only allow specific single letter words
-        if len(word) == 1 and not SINGLE_LETTER_WORDS.findall(word):
-            return None
-
-        return word