From 79f03d9907f58cbf143f085502a73554b8467db1 Mon Sep 17 00:00:00 2001 From: Mathias Behrle Date: Mon, 14 Jul 2014 14:48:12 +0200 Subject: [PATCH 1/3] Removing double variable definition. --- mayan/apps/ocr/lang/deu.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mayan/apps/ocr/lang/deu.py b/mayan/apps/ocr/lang/deu.py index 6291843844..be01ed2643 100644 --- a/mayan/apps/ocr/lang/deu.py +++ b/mayan/apps/ocr/lang/deu.py @@ -10,8 +10,6 @@ class LanguageBackend(BackendBase): def check_word(self, word): ALL_ALPHANUM = re.compile('([0-9a-zäöüß])', re.I) NON_ALPHANUM = re.compile('([^0-9a-zäöüß])', re.I) - ALL_ALPHANUM = re.compile('([0-9a-z])', re.I) - NON_ALPHANUM = re.compile('([^0-9a-z])', re.I) TOO_MANY_VOWELS = re.compile('[aäeioöuü]{4}', re.I) TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnpqrstvwxyz]{4}', re.I) From 8bae4648415f9a45ba0afdec055fd2036f06a529 Mon Sep 17 00:00:00 2001 From: Mathias Behrle Date: Mon, 14 Jul 2014 14:49:56 +0200 Subject: [PATCH 2/3] Re-adding migrated french language correction file, --- mayan/apps/ocr/lang/fra.py | 44 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 mayan/apps/ocr/lang/fra.py diff --git a/mayan/apps/ocr/lang/fra.py b/mayan/apps/ocr/lang/fra.py new file mode 100644 index 0000000000..d4bff61c96 --- /dev/null +++ b/mayan/apps/ocr/lang/fra.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import + +import re + +from . import BackendBase + + +class LanguageBackend(BackendBase): + def check_word(self, word): + ALL_ALPHANUM = re.compile('([0-9a-záéíóúüñ])', re.I) + NON_ALPHANUM = re.compile('([^0-9a-záéíóúüñ])', re.I) + + TOO_MANY_VOWELS = re.compile('[aáeéiíoóuúü]{3}', re.I) + TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnñpqrstvwxyz]{5}', re.I) + ALL_ALPHA = re.compile('^[a-z]+$', re.I) + SINGLE_LETTER_WORDS = re.compile('^[aeoóuy]$', re.I) + + #(L) If a string is longer than 20 characters, it is garbage + if len(word) > 20: + return None + + #(A) If a string’s ratio of alphanumeric characters to total + #characters is less than 50%, the string is garbage + if len(ALL_ALPHANUM.findall(word)) < len(word) / 2: + return None + + #Remove word if all the letters in the word are non alphanumeric + if len(NON_ALPHANUM.findall(word)) == len(word): + return None + + #Removed words with too many consecutie vowels + if TOO_MANY_VOWELS.findall(word): + return None + + #Removed words with too many consecutie consonants + if TOO_MANY_CONSONANTS.findall(word): + return None + + #Only allow specific single letter words + if len(word) == 1 and not SINGLE_LETTER_WORDS.findall(word): + return None + + return word From f67176aa9f4b4724c3f9c4c8ec6751aa48503a7d Mon Sep 17 00:00:00 2001 From: Mathias Behrle Date: Mon, 14 Jul 2014 14:57:03 +0200 Subject: [PATCH 3/3] Removing unused variable from language correction files. --- mayan/apps/ocr/lang/deu.py | 1 - mayan/apps/ocr/lang/eng.py | 1 - mayan/apps/ocr/lang/fra.py | 1 - mayan/apps/ocr/lang/rus.py | 1 - mayan/apps/ocr/lang/spa.py | 1 - 5 files changed, 5 deletions(-) diff --git a/mayan/apps/ocr/lang/deu.py b/mayan/apps/ocr/lang/deu.py index be01ed2643..9a9b89549e 100644 --- a/mayan/apps/ocr/lang/deu.py +++ b/mayan/apps/ocr/lang/deu.py @@ -13,7 +13,6 @@ class LanguageBackend(BackendBase): TOO_MANY_VOWELS = re.compile('[aäeioöuü]{4}', re.I) TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnpqrstvwxyz]{4}', re.I) - ALL_ALPHA = re.compile('^[a-z]+$', re.I) # SINGLE_LETTER_WORDS = re.compile('^$', re.I) #(L) If a string is longer than 40 characters, it is considered as garbage diff --git a/mayan/apps/ocr/lang/eng.py b/mayan/apps/ocr/lang/eng.py index c402bba9f9..29dc3384e8 100644 --- a/mayan/apps/ocr/lang/eng.py +++ b/mayan/apps/ocr/lang/eng.py @@ -12,7 +12,6 @@ class LanguageBackend(BackendBase): TOO_MANY_VOWELS = re.compile('[aeiou]{3}', re.I) TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnpqrstvwxyz]{5}', re.I) - ALL_ALPHA = re.compile('^[a-z]+$', re.I) SINGLE_LETTER_WORDS = re.compile('^[ai]$', re.I) # (L) If a string is longer than 20 characters, it is garbage diff --git a/mayan/apps/ocr/lang/fra.py b/mayan/apps/ocr/lang/fra.py index d4bff61c96..937551bc9c 100644 --- a/mayan/apps/ocr/lang/fra.py +++ b/mayan/apps/ocr/lang/fra.py @@ -13,7 +13,6 @@ class LanguageBackend(BackendBase): TOO_MANY_VOWELS = re.compile('[aáeéiíoóuúü]{3}', re.I) TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnñpqrstvwxyz]{5}', re.I) - ALL_ALPHA = re.compile('^[a-z]+$', re.I) SINGLE_LETTER_WORDS = re.compile('^[aeoóuy]$', re.I) #(L) If a string is longer than 20 characters, it is garbage diff --git a/mayan/apps/ocr/lang/rus.py b/mayan/apps/ocr/lang/rus.py index 28b8644062..05ce0e1ab1 100644 --- a/mayan/apps/ocr/lang/rus.py +++ b/mayan/apps/ocr/lang/rus.py @@ -13,7 +13,6 @@ class LanguageBackend(BackendBase): TOO_MANY_VOWELS = re.compile('[ёуеыаоэяию]{3}', re.I) TOO_MANY_CONSONANTS = re.compile('[йцкнгшщзхъфвпрлджчсмтьб{5}', re.I) - ALL_ALPHA = re.compile('^[ёйцукенгшщзхъфывапролджэячсмитьбю]+$', re.I) SINGLE_LETTER_WORDS = re.compile('^[уквояси]$', re.I) # (L) If a string is longer than 25 characters, it is garbage diff --git a/mayan/apps/ocr/lang/spa.py b/mayan/apps/ocr/lang/spa.py index 677aed73f6..eb4d9ead45 100644 --- a/mayan/apps/ocr/lang/spa.py +++ b/mayan/apps/ocr/lang/spa.py @@ -13,7 +13,6 @@ class LanguageBackend(BackendBase): TOO_MANY_VOWELS = re.compile('[aáeéiíoóuúü]{3}', re.I) TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnñpqrstvwxyz]{5}', re.I) - ALL_ALPHA = re.compile('^[a-z]+$', re.I) SINGLE_LETTER_WORDS = re.compile('^[aeoóuy]$', re.I) # (L) If a string is longer than 20 characters, it is garbage