Merge branch 'feature/russian_ocr' into development
This commit is contained in:
@@ -31,7 +31,7 @@ def get_language_backend():
|
||||
try:
|
||||
module = import_module(u'.'.join([u'ocr', u'lang', TESSERACT_LANGUAGE]))
|
||||
except ImportError:
|
||||
sys.stderr.write(u'\nError: No OCR app language backend for language: %s\n\n' % TESSERACT_LANGUAGE)
|
||||
sys.stderr.write(u'\nWarning: No OCR app language backend for language: %s\n\n' % TESSERACT_LANGUAGE)
|
||||
return None
|
||||
return module
|
||||
|
||||
|
||||
39
apps/ocr/lang/rus.py
Normal file
39
apps/ocr/lang/rus.py
Normal file
@@ -0,0 +1,39 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
|
||||
|
||||
# Characters counted as "alphanumeric" for Russian OCR output: ASCII digits
# plus the 33 letters of the Russian alphabet (case-insensitive).
_RUS_ALPHANUM = re.compile(
    u'[0-9ёйцукенгшщзхъфывапролджэячсмитьбю]', re.I | re.U
)
# Three vowels in a row.
_RUS_TOO_MANY_VOWELS = re.compile(u'[ёуеыаоэяию]{3}', re.I | re.U)
# Five consonants (including the hard/soft signs) in a row.
_RUS_TOO_MANY_CONSONANTS = re.compile(
    u'[йцкнгшщзхъфвпрлджчсмтьб]{5}', re.I | re.U
)
# The only letters accepted as a one-character word.
_RUS_SINGLE_LETTER_WORDS = re.compile(u'^[уквояси]$', re.I | re.U)

# (L) Strings longer than this many characters are treated as garbage.
_RUS_MAX_WORD_LENGTH = 25


def check_word(word):
    """Return *word* if it looks like a plausible Russian word, else None.

    The heuristics applied, in order:

    * (L) more than 25 characters -> garbage;
    * (A) fewer than 50% of the characters are digits or Russian
      letters -> garbage;
    * no digit or Russian letter at all (this includes the empty
      string) -> garbage;
    * three or more consecutive vowels -> garbage;
    * five or more consecutive consonants -> garbage;
    * a single character that is not one of the accepted one-letter
      words -> garbage.

    Byte strings are decoded as UTF-8 for the checks only, so lengths and
    character classes operate on characters rather than bytes; the value
    returned is always the original *word* object.
    """
    if isinstance(word, bytes):
        # Undecodable bytes become U+FFFD, which counts as non-alphanumeric.
        text = word.decode('utf-8', 'replace')
    else:
        text = word

    length = len(text)

    # (L) If a string is longer than 25 characters, it is garbage
    if length > _RUS_MAX_WORD_LENGTH:
        return None

    alphanum_count = len(_RUS_ALPHANUM.findall(text))

    # (A) If a string's ratio of alphanumeric characters to total
    # characters is less than 50%, the string is garbage.  Integer
    # arithmetic keeps the result independent of the division semantics
    # of the running interpreter.
    if 2 * alphanum_count < length:
        return None

    # Remove the word if none of its characters is alphanumeric; after the
    # ratio check this is only reachable for the empty string.
    if alphanum_count == 0:
        return None

    # Remove words with too many consecutive vowels
    if _RUS_TOO_MANY_VOWELS.search(text):
        return None

    # Remove words with too many consecutive consonants
    if _RUS_TOO_MANY_CONSONANTS.search(text):
        return None

    # Only allow specific single letter words
    if length == 1 and not _RUS_SINGLE_LETTER_WORDS.match(text):
        return None

    return word
|
||||
@@ -33,4 +33,4 @@ Translations
|
||||
------------
|
||||
* Emerson Soares (http://emersonsoares.com)
|
||||
* Renata Oliveira (https://twitter.com/#!/rnataoliveira)
|
||||
* Sergey Glita (s.v.glita@gmail.com)
|
||||
* Сергей Глита [Sergey Glita] (s.v.glita@gmail.com)
|
||||
|
||||
Reference in New Issue
Block a user