Delete language processing backends

This commit is contained in:
Roberto Rosario
2015-06-09 03:24:17 -04:00
parent e9be14f2af
commit d073685680
6 changed files with 0 additions and 219 deletions

View File

@@ -1,3 +0,0 @@
class BackendBase(object):
    """Base class for the language backends.

    Subclasses override ``check_word``; the base implementation only
    raises ``NotImplementedError``.
    """

    def check_word(self, word):
        """Abstract hook: must be overridden by a concrete backend."""
        raise NotImplementedError()

View File

@@ -1,45 +0,0 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import re
from . import BackendBase
class LanguageBackend(BackendBase):
    """German word filter: returns the word, or ``None`` if it looks like garbage."""

    # re.UNICODE is required on Python 2 for re.IGNORECASE to fold the
    # non-ASCII letters (ä/Ä, ö/Ö, ü/Ü); on Python 3 it is already the
    # default for str patterns, so adding it changes nothing there.
    _FLAGS = re.IGNORECASE | re.UNICODE

    # Compiled once at class creation instead of on every check_word() call.
    _ALPHANUM = re.compile('[0-9a-zäöüß]', _FLAGS)
    _VOWEL_RUN = re.compile('[aäeioöuü]{4}', _FLAGS)
    _CONSONANT_RUN = re.compile('[bcdfghjklmnpqrstvwxyz]{4}', _FLAGS)

    # (L) Strings longer than this many characters are considered garbage.
    # http://www.duden.de/sprachwissen/sprachratgeber/die-laengsten-woerter-im-dudenkorpus
    # http://www.duden.de/sprachwissen/sprachratgeber/durchschnittliche-laenge-eines-deutschen-wortes
    _MAX_LENGTH = 40

    def check_word(self, word):
        """Return ``word`` unchanged if it passes every heuristic, else ``None``.

        A word is rejected when it is longer than 40 characters, is a single
        character, contains no alphanumeric character (this includes the
        empty string), is less than 50% alphanumeric, or contains a run of
        four consecutive vowels or four consecutive consonants.
        """
        length = len(word)

        # No single letter words in German.
        if length > self._MAX_LENGTH or length == 1:
            return None

        alphanum_count = len(self._ALPHANUM.findall(word))

        # Reject words without any alphanumeric character; this also covers
        # the empty string.
        if alphanum_count == 0:
            return None

        # (A) Reject words whose alphanumeric ratio is below 50%.  Compared
        # as ``2 * count < length`` so the result does not depend on
        # Python 2 integer division (``len(word) / 2`` floors there and let
        # e.g. 1-of-3 alphanumeric words through).
        if 2 * alphanum_count < length:
            return None

        # Reject words with too many consecutive vowels.
        if self._VOWEL_RUN.search(word):
            return None

        # Reject words with too many consecutive consonants.
        if self._CONSONANT_RUN.search(word):
            return None

        return word

View File

@@ -1,42 +0,0 @@
from __future__ import unicode_literals
import re
from . import BackendBase
class LanguageBackend(BackendBase):
    """English word filter: returns the word, or ``None`` if it looks like garbage."""

    # Compiled once at class creation instead of on every check_word() call.
    _ALPHANUM = re.compile('[0-9a-z]', re.IGNORECASE)
    _VOWEL_RUN = re.compile('[aeiou]{3}', re.IGNORECASE)
    _CONSONANT_RUN = re.compile('[bcdfghjklmnpqrstvwxyz]{5}', re.IGNORECASE)
    _SINGLE_LETTER_WORD = re.compile('^[ai]$', re.IGNORECASE)

    # (L) Strings longer than this many characters are considered garbage.
    _MAX_LENGTH = 20

    def check_word(self, word):
        """Return ``word`` unchanged if it passes every heuristic, else ``None``.

        A word is rejected when it is longer than 20 characters, contains no
        alphanumeric character (this includes the empty string), is less
        than 50% alphanumeric, contains a run of three consecutive vowels or
        five consecutive consonants, or is a single character other than
        "a" / "i" (case-insensitive).
        """
        length = len(word)

        if length > self._MAX_LENGTH:
            return None

        alphanum_count = len(self._ALPHANUM.findall(word))

        # Reject words without any alphanumeric character; this also covers
        # the empty string.
        if alphanum_count == 0:
            return None

        # (A) Reject words whose alphanumeric ratio is below 50%.  Compared
        # as ``2 * count < length`` so the result does not depend on
        # Python 2 integer division (``len(word) / 2`` floors there and let
        # e.g. 1-of-3 alphanumeric words through).
        if 2 * alphanum_count < length:
            return None

        # Reject words with too many consecutive vowels.
        if self._VOWEL_RUN.search(word):
            return None

        # Reject words with too many consecutive consonants.
        if self._CONSONANT_RUN.search(word):
            return None

        # Only allow specific single letter words.
        if length == 1 and not self._SINGLE_LETTER_WORD.match(word):
            return None

        return word

View File

@@ -1,43 +0,0 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import re
from . import BackendBase
class LanguageBackend(BackendBase):
    """Spanish word filter: returns the word, or ``None`` if it looks like garbage."""

    # re.UNICODE is required on Python 2 for re.IGNORECASE to fold the
    # accented letters (á/Á, ñ/Ñ, ...); on Python 3 it is already the
    # default for str patterns.
    _FLAGS = re.IGNORECASE | re.UNICODE

    # The patterns are explicit unicode literals: the only __future__ import
    # visible for this module is absolute_import (not unicode_literals), so
    # on Python 2 plain literals would be byte strings and the accented
    # letters in the character classes would be matched as individual UTF-8
    # bytes.  The ``u`` prefix is a no-op on Python 3.
    # Compiled once at class creation instead of on every check_word() call.
    _ALPHANUM = re.compile(u'[0-9a-záéíóúüñ]', _FLAGS)
    _VOWEL_RUN = re.compile(u'[aáeéiíoóuúü]{3}', _FLAGS)
    _CONSONANT_RUN = re.compile(u'[bcdfghjklmnñpqrstvwxyz]{5}', _FLAGS)
    _SINGLE_LETTER_WORD = re.compile(u'^[aeoóuy]$', _FLAGS)

    # (L) Strings longer than this many characters are considered garbage.
    _MAX_LENGTH = 20

    def check_word(self, word):
        """Return ``word`` unchanged if it passes every heuristic, else ``None``.

        A word is rejected when it is longer than 20 characters, contains no
        alphanumeric character (this includes the empty string), is less
        than 50% alphanumeric, contains a run of three consecutive vowels or
        five consecutive consonants, or is a single character other than
        a, e, o, ó, u, y (case-insensitive).
        """
        length = len(word)

        if length > self._MAX_LENGTH:
            return None

        alphanum_count = len(self._ALPHANUM.findall(word))

        # Reject words without any alphanumeric character; this also covers
        # the empty string.
        if alphanum_count == 0:
            return None

        # (A) Reject words whose alphanumeric ratio is below 50%.  Compared
        # as ``2 * count < length`` so the result does not depend on
        # Python 2 integer division (``len(word) / 2`` floors there and let
        # e.g. 1-of-3 alphanumeric words through).
        if 2 * alphanum_count < length:
            return None

        # Reject words with too many consecutive vowels.
        if self._VOWEL_RUN.search(word):
            return None

        # Reject words with too many consecutive consonants.
        if self._CONSONANT_RUN.search(word):
            return None

        # Only allow specific single letter words.
        if length == 1 and not self._SINGLE_LETTER_WORD.match(word):
            return None

        return word

View File

@@ -1,43 +0,0 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import re
from . import BackendBase
class LanguageBackend(BackendBase):
    """Russian word filter: returns the word, or ``None`` if it looks like garbage."""

    # re.UNICODE is required on Python 2 for re.IGNORECASE to fold Cyrillic
    # letters; on Python 3 it is already the default for str patterns.
    _FLAGS = re.IGNORECASE | re.UNICODE

    # Compiled once at class creation instead of on every check_word() call.
    _ALPHANUM = re.compile('[0-9ёйцукенгшщзхъфывапролджэячсмитьбю]', _FLAGS)
    _VOWEL_RUN = re.compile('[ёуеыаоэяию]{3}', _FLAGS)
    # The original pattern lacked the closing "]" of the character class, so
    # re.compile() raised re.error and check_word() could never succeed.
    _CONSONANT_RUN = re.compile('[йцкнгшщзхъфвпрлджчсмтьб]{5}', _FLAGS)
    _SINGLE_LETTER_WORD = re.compile('^[уквояси]$', _FLAGS)

    # (L) Strings longer than this many characters are considered garbage.
    _MAX_LENGTH = 25

    def check_word(self, word):
        """Return ``word`` unchanged if it passes every heuristic, else ``None``.

        Only digits and Cyrillic letters count as alphanumeric here.  A word
        is rejected when it is longer than 25 characters, contains no
        alphanumeric character (this includes the empty string), is less
        than 50% alphanumeric, contains a run of three consecutive vowels or
        five consecutive consonants, or is a single character other than
        у, к, в, о, я, с, и (case-insensitive).
        """
        length = len(word)

        if length > self._MAX_LENGTH:
            return None

        alphanum_count = len(self._ALPHANUM.findall(word))

        # Reject words without any alphanumeric character; this also covers
        # the empty string.
        if alphanum_count == 0:
            return None

        # (A) Reject words whose alphanumeric ratio is below 50%.  Compared
        # as ``2 * count < length`` so the result does not depend on
        # Python 2 integer division (``len(word) / 2`` floors there and let
        # e.g. 1-of-3 alphanumeric words through).
        if 2 * alphanum_count < length:
            return None

        # Reject words with too many consecutive vowels.
        if self._VOWEL_RUN.search(word):
            return None

        # Reject words with too many consecutive consonants.
        if self._CONSONANT_RUN.search(word):
            return None

        # Only allow specific single letter words.
        if length == 1 and not self._SINGLE_LETTER_WORD.match(word):
            return None

        return word

View File

@@ -1,43 +0,0 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import re
from . import BackendBase
class LanguageBackend(BackendBase):
    """Spanish word filter: returns the word, or ``None`` if it looks like garbage."""

    # re.UNICODE is required on Python 2 for re.IGNORECASE to fold the
    # accented letters (á/Á, ñ/Ñ, ...); on Python 3 it is already the
    # default for str patterns.
    _FLAGS = re.IGNORECASE | re.UNICODE

    # Compiled once at class creation instead of on every check_word() call.
    _ALPHANUM = re.compile('[0-9a-záéíóúüñ]', _FLAGS)
    _VOWEL_RUN = re.compile('[aáeéiíoóuúü]{3}', _FLAGS)
    _CONSONANT_RUN = re.compile('[bcdfghjklmnñpqrstvwxyz]{5}', _FLAGS)
    _SINGLE_LETTER_WORD = re.compile('^[aeoóuy]$', _FLAGS)

    # (L) Strings longer than this many characters are considered garbage.
    _MAX_LENGTH = 20

    def check_word(self, word):
        """Return ``word`` unchanged if it passes every heuristic, else ``None``.

        A word is rejected when it is longer than 20 characters, contains no
        alphanumeric character (this includes the empty string), is less
        than 50% alphanumeric, contains a run of three consecutive vowels or
        five consecutive consonants, or is a single character other than
        a, e, o, ó, u, y (case-insensitive).
        """
        length = len(word)

        if length > self._MAX_LENGTH:
            return None

        alphanum_count = len(self._ALPHANUM.findall(word))

        # Reject words without any alphanumeric character; this also covers
        # the empty string.
        if alphanum_count == 0:
            return None

        # (A) Reject words whose alphanumeric ratio is below 50%.  Compared
        # as ``2 * count < length`` so the result does not depend on
        # Python 2 integer division (``len(word) / 2`` floors there and let
        # e.g. 1-of-3 alphanumeric words through).
        if 2 * alphanum_count < length:
            return None

        # Reject words with too many consecutive vowels.
        if self._VOWEL_RUN.search(word):
            return None

        # Reject words with too many consecutive consonants.
        if self._CONSONANT_RUN.search(word):
            return None

        # Only allow specific single letter words.
        if length == 1 and not self._SINGLE_LETTER_WORD.match(word):
            return None

        return word