From 1f3fed2182ef807ffaef99f539df652343c50f14 Mon Sep 17 00:00:00 2001
From: Roberto Rosario
Date: Sun, 13 Jul 2014 11:32:41 -0400
Subject: [PATCH] Update the German OCR cleanup backend to be a class not a function

---
 mayan/apps/ocr/lang/deu.py | 72 ++++++++++++++++++++++----------------
 1 file changed, 41 insertions(+), 31 deletions(-)

diff --git a/mayan/apps/ocr/lang/deu.py b/mayan/apps/ocr/lang/deu.py
index 16e4e359c7..df20573b09 100644
--- a/mayan/apps/ocr/lang/deu.py
+++ b/mayan/apps/ocr/lang/deu.py
@@ -1,43 +1,53 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import
+
 import re
 
+from . import BackendBase
 
 
-def check_word(word):
-    ALL_ALPHANUM = re.compile('([0-9a-zäöüß])', re.I)
-    NON_ALPHANUM = re.compile('([^0-9a-zäöüß])', re.I)
-    ALL_ALPHANUM = re.compile('([0-9a-z])', re.I)
-    NON_ALPHANUM = re.compile('([^0-9a-z])', re.I)
-    TOO_MANY_VOWELS = re.compile('[aäeioöuü]{4}', re.I)
-    TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnpqrstvwxyz]{4}', re.I)
-    ALL_ALPHA = re.compile('^[a-z]+$', re.I)
-    # SINGLE_LETTER_WORDS = re.compile('^$', re.I)
+class LanguageBackend(BackendBase):
+    def check_word(self, word):
+        """
+        Return ``word`` unchanged when it passes every check below, or
+        ``None`` when one of them classifies it as OCR garbage.
+        """
+        ALL_ALPHANUM = re.compile('([0-9a-zäöüß])', re.I)
+        NON_ALPHANUM = re.compile('([^0-9a-zäöüß])', re.I)
+        # NOTE(review): the ASCII-only patterns below override the two above.
+        ALL_ALPHANUM = re.compile('([0-9a-z])', re.I)
+        NON_ALPHANUM = re.compile('([^0-9a-z])', re.I)
 
-    #(L) If a string is longer than 40 characters, it is considered as garbage
-    # http://www.duden.de/sprachwissen/sprachratgeber/die-laengsten-woerter-im-dudenkorpus
-    # http://www.duden.de/sprachwissen/sprachratgeber/durchschnittliche-laenge-eines-deutschen-wortes
-    if len(word) > 40:
-        return None
+        TOO_MANY_VOWELS = re.compile('[aäeioöuü]{4}', re.I)
+        TOO_MANY_CONSONANTS = re.compile('[bcdfghjklmnpqrstvwxyz]{4}', re.I)
+        ALL_ALPHA = re.compile('^[a-z]+$', re.I)
+        # SINGLE_LETTER_WORDS = re.compile('^$', re.I)
 
-    #(A) If a string's ratio of alphanumeric characters to total
-    #characters is less than 50%, the string is garbage
-    if len(ALL_ALPHANUM.findall(word)) < len(word) / 2:
-        return None
+        #(L) If a string is longer than 40 characters, it is considered as garbage
+        # http://www.duden.de/sprachwissen/sprachratgeber/die-laengsten-woerter-im-dudenkorpus
+        # http://www.duden.de/sprachwissen/sprachratgeber/durchschnittliche-laenge-eines-deutschen-wortes
+        if len(word) > 40:
+            return None
 
-    #Remove word if all the letters in the word are non alphanumeric
-    if len(NON_ALPHANUM.findall(word)) == len(word):
-        return None
+        #(A) If a string's ratio of alphanumeric characters to total
+        #characters is less than 50%, the string is garbage
+        if len(ALL_ALPHANUM.findall(word)) < len(word) / 2:
+            return None
 
-    #Removed words with too many consecutie vowels
-    if TOO_MANY_VOWELS.findall(word):
-        return None
+        #Remove word if all the letters in the word are non alphanumeric
+        if len(NON_ALPHANUM.findall(word)) == len(word):
+            return None
 
-    #Removed words with too many consecutie consonants
-    if TOO_MANY_CONSONANTS.findall(word):
-        return None
+        #Removed words with too many consecutive vowels
+        if TOO_MANY_VOWELS.findall(word):
+            return None
 
-    # No single letter words in German
-    if len(word) == 1:
-        return None
+        #Removed words with too many consecutive consonants
+        if TOO_MANY_CONSONANTS.findall(word):
+            return None
 
-    return word
+        # No single letter words in German
+        if len(word) == 1:
+            return None
+
+        return word