From f0c019f6fc1a75831d3bfbc42e45f4307beb639c Mon Sep 17 00:00:00 2001
From: Roberto Rosario
Date: Sun, 6 Nov 2011 01:06:43 -0400
Subject: [PATCH 1/3] Reduce severity of the messages displayed when no OCR
 backend is found for a language

---
 apps/ocr/api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/ocr/api.py b/apps/ocr/api.py
index 86d7bae461..d9d7782b1d 100644
--- a/apps/ocr/api.py
+++ b/apps/ocr/api.py
@@ -31,7 +31,7 @@ def get_language_backend():
     try:
         module = import_module(u'.'.join([u'ocr', u'lang', TESSERACT_LANGUAGE]))
     except ImportError:
-        sys.stderr.write(u'\nError: No OCR app language backend for language: %s\n\n' % TESSERACT_LANGUAGE)
+        sys.stderr.write(u'\nWarning: No OCR app language backend for language: %s\n\n' % TESSERACT_LANGUAGE)
         return None
     return module
 

From 7577f5b0e45ea157a0fb2eada81512ca90c86b8b Mon Sep 17 00:00:00 2001
From: Roberto Rosario
Date: Sun, 6 Nov 2011 01:21:19 -0400
Subject: [PATCH 2/3] =?UTF-8?q?Added=20Russian=20locale=20post=20OCR=20cle?=
 =?UTF-8?q?anup=20backend=20(=D0=A1=D0=B5=D1=80=D0=B3=D0=B5=D0=B9=20=D0=93?=
 =?UTF-8?q?=D0=BB=D0=B8=D1=82=D0=B0=20[Sergei=20Glita])?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Amended in review:
 * closed the unterminated character class in TOO_MANY_CONSONANTS;
   re.compile() raised on it, so check_word() could never return
 * patterns are now unicode literals compiled with re.U, once, at import
 * the 50% ratio test no longer depends on Python 2 integer division
 * dropped the unused ALL_ALPHA pattern, fixed comment typos, added docstring
The blob id on the index line below comes from the original commit and does
not describe the amended content.

 apps/ocr/lang/rus.py | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 apps/ocr/lang/rus.py

diff --git a/apps/ocr/lang/rus.py b/apps/ocr/lang/rus.py
new file mode 100644
index 0000000000..a91d3bf224
--- /dev/null
+++ b/apps/ocr/lang/rus.py
@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+import re
+
+# Unicode patterns compiled with re.U: each Cyrillic letter is then one
+# character (not a run of UTF-8 bytes) and re.I can fold its case.
+FLAGS = re.I | re.U
+ALL_ALPHANUM = re.compile(u'[0-9ёйцукенгшщзхъфывапролджэячсмитьбю]', FLAGS)
+NON_ALPHANUM = re.compile(u'[^0-9ёйцукенгшщзхъфывапролджэячсмитьбю]', FLAGS)
+TOO_MANY_VOWELS = re.compile(u'[ёуеыаоэяию]{3}', FLAGS)
+TOO_MANY_CONSONANTS = re.compile(u'[йцкнгшщзхъфвпрлджчсмтьб]{5}', FLAGS)
+SINGLE_LETTER_WORDS = re.compile(u'^[уквояси]$', FLAGS)
+
+
+def check_word(word):
+    """
+    Return `word` if it looks like a Russian word or a number, or None
+    if it looks like OCR garbage.
+
+    `word` is expected to be a unicode string.
+    """
+    #(L) If a string is longer than 25 characters, it is garbage
+    if len(word) > 25:
+        return None
+
+    #(A) If a string's ratio of alphanumeric characters to total
+    #characters is less than 50%, the string is garbage (compared without
+    #division so the result is the same on Python 2 and 3)
+    if 2 * len(ALL_ALPHANUM.findall(word)) < len(word):
+        return None
+
+    #Remove word if all the letters in the word are non alphanumeric
+    #(this is also what rejects the empty string)
+    if len(NON_ALPHANUM.findall(word)) == len(word):
+        return None
+
+    #Remove words with too many consecutive vowels
+    if TOO_MANY_VOWELS.search(word):
+        return None
+
+    #Remove words with too many consecutive consonants
+    if TOO_MANY_CONSONANTS.search(word):
+        return None
+
+    #Only allow specific single letter words
+    if len(word) == 1 and not SINGLE_LETTER_WORDS.match(word):
+        return None
+
+    return word

From 90123623a5424577fe4396d3077b9857909c76bd Mon Sep 17 00:00:00 2001
From: Roberto Rosario
Date: Sun, 6 Nov 2011 02:31:20 -0400
Subject: [PATCH 3/3] Contributor Sergey Glita name update

---
 docs/contributors.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/contributors.rst b/docs/contributors.rst
index b8f2439112..fdd87cc290 100644
--- a/docs/contributors.rst
+++ b/docs/contributors.rst
@@ -33,4 +33,4 @@ Translations
 ------------
 * Emerson Soares (http://emersonsoares.com)
 * Renata Oliveira (https://twitter.com/#!/rnataoliveira)
-* Sergey Glita (s.v.glita@gmail.com)
+* Сергей Глита [Sergey Glita] (s.v.glita@gmail.com)