From 8fca5e278924fb2260713d09a13bb179c1f26660 Mon Sep 17 00:00:00 2001
From: Roberto Rosario
Date: Sun, 13 Jul 2014 11:34:08 -0400
Subject: [PATCH] Remove the French OCR cleanup backend for now, a patchset seems to be missing.

---
 mayan/apps/ocr/lang/fra.py | 41 --------------------------------------
 1 file changed, 41 deletions(-)
 delete mode 100644 mayan/apps/ocr/lang/fra.py

diff --git a/mayan/apps/ocr/lang/fra.py b/mayan/apps/ocr/lang/fra.py
deleted file mode 100644
index cb0e37cc67..0000000000
--- a/mayan/apps/ocr/lang/fra.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# -*- coding: utf-8 -*-
-import re
-
-
-def check_word(word):
-    return word
-
-    ALL_ALPHANUM = re.compile('([0-9a-záéíóúüñ])', re.I)
-    NON_ALPHANUM = re.compile('([^0-9a-záéíóúüñ])', re.I)
-
-    TOO_MANY_VOWELS = re.compile('[aáeéiíoóuúü]{3}', re.I)
-    TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnñpqrstvwxyz]{5}', re.I)
-    ALL_ALPHA = re.compile('^[a-z]+$', re.I)
-    SINGLE_LETTER_WORDS = re.compile('^[aeoóuy]$', re.I)
-
-    #(L) If a string is longer than 20 characters, it is garbage
-    if len(word) > 20:
-        return None
-
-    #(A) If a string’s ratio of alphanumeric characters to total
-    #characters is less than 50%, the string is garbage
-    if len(ALL_ALPHANUM.findall(word)) < len(word) / 2:
-        return None
-
-    #Remove word if all the letters in the word are non alphanumeric
-    if len(NON_ALPHANUM.findall(word)) == len(word):
-        return None
-
-    #Removed words with too many consecutie vowels
-    if TOO_MANY_VOWELS.findall(word):
-        return None
-
-    #Removed words with too many consecutie consonants
-    if TOO_MANY_CONSONANTS.findall(word):
-        return None
-
-    #Only allow specific single letter words
-    if len(word) == 1 and not SINGLE_LETTER_WORDS.findall(word):
-        return None
-
-    return word