From 7577f5b0e45ea157a0fb2eada81512ca90c86b8b Mon Sep 17 00:00:00 2001
From: Roberto Rosario <Roberto.Rosario.Gonzalez@gmail.com>
Date: Sun, 6 Nov 2011 01:21:19 -0400
Subject: [PATCH] =?UTF-8?q?Added=20Russian=20locale=20post=20OCR=20cleanup?=
 =?UTF-8?q?=20backend=20(=D0=A1=D0=B5=D1=80=D0=B3=D0=B5=D0=B9=20=D0=93?=
 =?UTF-8?q?=D0=BB=D0=B8=D1=82=D0=B0=20[Sergei=20Glita])?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 apps/ocr/lang/rus.py | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 apps/ocr/lang/rus.py

diff --git a/apps/ocr/lang/rus.py b/apps/ocr/lang/rus.py
new file mode 100644
index 0000000000..a91d3bf224
--- /dev/null
+++ b/apps/ocr/lang/rus.py
@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+import re
+
+
+def check_word(word):
+    """Return word if it passes the Russian OCR garbage filters, else None."""
+    ALL_ALPHANUM = re.compile('([0-9ёйцукенгшщзхъфывапролджэячсмитьбю])', re.I)
+    NON_ALPHANUM = re.compile('([^0-9ёйцукенгшщзхъфывапролджэячсмитьбю])', re.I)
+
+    TOO_MANY_VOWELS = re.compile('[ёуеыаоэяию]{3}', re.I)
+    # The class must be closed before {5}, otherwise re.compile() raises
+    TOO_MANY_CONSONANTS = re.compile('[йцкнгшщзхъфвпрлджчсмтьб]{5}', re.I)
+    SINGLE_LETTER_WORDS = re.compile('^[уквояси]$', re.I)
+
+    # (L) If a string is longer than 25 characters, it is garbage
+    if len(word) > 25:
+        return None
+
+    # (A) Garbage if fewer than 50% of the characters are alphanumeric
+    if len(ALL_ALPHANUM.findall(word)) < len(word) / 2:
+        return None
+
+    # Remove the word if every character is non alphanumeric (or it is empty)
+    if len(NON_ALPHANUM.findall(word)) == len(word):
+        return None
+
+    # Remove words with three or more consecutive vowels
+    if TOO_MANY_VOWELS.search(word):
+        return None
+
+    # Remove words with five or more consecutive consonants
+    if TOO_MANY_CONSONANTS.search(word):
+        return None
+
+    # Only allow specific single letter words
+    if len(word) == 1 and not SINGLE_LETTER_WORDS.search(word):
+        return None
+
+    return word