-
Notifications
You must be signed in to change notification settings - Fork 5
/
Soundex.py
244 lines (209 loc) · 10.9 KB
/
Soundex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
import re
from abc import ABC, abstractmethod
import editdistance
import pymorphy2
class Soundex(ABC):
    """
    Base class for Soundex-style phonetic encoders.

    Subclasses provide the alphabet-specific data (``_vowels`` and ``_table``)
    and implement :meth:`transform`; the shared encoding pipeline lives in
    :meth:`_apply_soundex_algorithm`.
    """

    # Letters treated as vowels (empty here, overridden by subclasses).
    _vowels = ''
    # Consonant -> digit translation table (identity here, overridden by subclasses).
    _table = str.maketrans('', '')
    # A character followed by one or more repetitions of itself.
    _reduce_regex = re.compile(r'(\w)(\1)+', re.IGNORECASE)
    # Runs of the '0' placeholder that stands for a vowel.
    _vowels_regex = re.compile(r'(0+)', re.IGNORECASE)

    def __init__(self, delete_first_letter=False, delete_first_coded_letter=False,
                 delete_zeros=False, cut_result=False, seq_cutted_len=4):
        """
        Initialization of Soundex object
        :param delete_first_letter: remove the first letter from the result code (A169 -> 169)
        :param delete_first_coded_letter: remove the first coded letter from the result code (A5169 -> A169)
        :param delete_zeros: remove vowels from the result code
        :param cut_result: cut (or zero-pad) the result code to N symbols
        :param seq_cutted_len: length N the coded part is cut/padded to when ``cut_result`` is set;
            applied before ``delete_first_coded_letter`` and not counting the leading letter
        """
        self.delete_first_letter = delete_first_letter
        self.delete_first_coded_letter = delete_first_coded_letter
        self.delete_zeros = delete_zeros
        self.cut_result = cut_result
        self.seq_cutted_len = seq_cutted_len

    def _is_vowel(self, letter):
        """Return True when ``letter`` occurs in the class vowel string."""
        return letter in self._vowels

    def _reduce_seq(self, seq):
        """Collapse every run of one repeated character to a single character."""
        return self._reduce_regex.sub(r'\1', seq)

    def _translate_vowels(self, word):
        """Replace each vowel of ``word`` with the placeholder '0'."""
        return ''.join('0' if self._is_vowel(letter) else letter for letter in word)

    def _remove_vowels_and_paired_sounds(self, seq):
        """Drop the '0' placeholders, then collapse repeats that became adjacent."""
        seq = self._vowels_regex.sub('', seq)
        seq = self._reduce_seq(seq)
        return seq

    def _apply_soundex_algorithm(self, word):
        """
        Encode an already pre-processed word.

        The word is lower-cased, consonants are mapped through ``_table``,
        vowels are replaced via ``_translate_vowels`` and repeated symbols are
        collapsed. The optional steps selected in the constructor are then
        applied in this order: zero removal, cut/pad, removal of the first
        coded symbol. Finally the capitalised first letter of the word is
        prepended unless ``delete_first_letter`` is set.

        :param word: string to encode
        :return: upper-case code; an empty string for an empty ``word``
        """
        # An empty word has no first letter; return an empty code instead of
        # failing with IndexError on word[0].
        if not word:
            return ''
        word = word.lower()
        # The coded part is built from the whole word, first letter included.
        first, last = word[0], word
        last = last.translate(self._table)
        last = self._translate_vowels(last)
        last = self._reduce_seq(last)
        if self.delete_zeros:
            last = self._remove_vowels_and_paired_sounds(last)
        if self.cut_result:
            # Truncate to the requested length, then right-pad with zeros.
            last = last[:self.seq_cutted_len].ljust(self.seq_cutted_len, '0')
        if self.delete_first_coded_letter:
            last = last[1:]
        first_char = '' if self.delete_first_letter else first.capitalize()
        return first_char + last.upper()

    def get_vowels(self):
        """Return the vowel string used by this encoder."""
        return self._vowels

    def is_delete_first_coded_letter(self):
        """Return the ``delete_first_coded_letter`` setting."""
        return self.delete_first_coded_letter

    def is_delete_first_letter(self):
        """Return the ``delete_first_letter`` setting."""
        return self.delete_first_letter

    @abstractmethod
    def transform(self, word):
        """
        Converts a given word to Soundex code
        :param word: string
        :return: Soundex string code
        """
        return None
class EnglishSoundex(Soundex):
    """
    Soundex encoder for English words.

    Every 'h' and 'w' (in any case, including a leading one) is removed
    before the shared Soundex pipeline is applied.
    """

    _vowels = 'aeiouy'
    _table = str.maketrans('bpfvcksgjqxzdtlmnr', '112233344555667889')
    # Letters dropped from the word before encoding.
    _hw_replacement = re.compile(r'[hw]', re.IGNORECASE)

    def transform(self, word):
        """
        Convert an English word to its Soundex code
        :param word: string
        :return: Soundex string code
        """
        return self._apply_soundex_algorithm(self._hw_replacement.sub('', word))
class RussianSoundex(Soundex):
    """
    Soundex encoder for Russian words.

    Before the shared Soundex pipeline runs, the word is normalised by a
    series of regex substitutions (``_replacement_map``), optionally preceded
    by a morphology-driven rewrite of "-его/-ого" endings and followed by
    merging of vowel pairs when vowels are coded as letters.
    """

    _vowels = 'аэиоуыеёюя'
    # Vowel -> group letter table, used only when ``code_vowels`` is set.
    _vowels_table = str.maketrans('аяоыиеёэюу', 'AAABBBBBCC')
    _table = str.maketrans('бпвфгкхдтжшчщзсцлмнр', '11223334455556667889')
    _ego_ogo_endings = re.compile(r'([ео])(г)(о$)', re.IGNORECASE)
    _ia_ending = re.compile(r'[еи][ая]', re.IGNORECASE)
    _ii_ending = re.compile(r'и[еио]', re.IGNORECASE)
    # Substitutions are applied in insertion order; later patterns see the
    # output of earlier ones, so the order of the entries matters.
    _replacement_map = {
        re.compile(r'(^|ъ|ь|' + r'|'.join(_vowels) + r')(я)', re.IGNORECASE): 'jа',
        re.compile(r'(^|ъ|ь|' + r'|'.join(_vowels) + r')(ю)', re.IGNORECASE): 'jу',
        re.compile(r'(^|ъ|ь|' + r'|'.join(_vowels) + r')(е)', re.IGNORECASE): 'jэ',
        re.compile(r'(^|ъ|ь|' + r'|'.join(_vowels) + r')(ё)', re.IGNORECASE): 'jо',
        re.compile(r'й', re.IGNORECASE): 'j',
        re.compile(r'([тсзжцчшщ])([жцчшщ])', re.IGNORECASE): r'\2',
        re.compile(r'(с)(т)([лнц])', re.IGNORECASE): r'\1\3',
        re.compile(r'(н)([тд])(ств)', re.IGNORECASE): r'\1\3',
        re.compile(r'([нс])([тд])(ск)', re.IGNORECASE): r'\1\3',
        re.compile(r'(р)(д)([чц])', re.IGNORECASE): r'\1\3',
        re.compile(r'(з)(д)([нц])', re.IGNORECASE): r'\1\3',
        re.compile(r'(в)(ств)', re.IGNORECASE): r'\2',
        re.compile(r'(л)(нц)', re.IGNORECASE): r'\2',
        re.compile(r'[ъь]', re.IGNORECASE): '',
        re.compile(r'([дт][зсц])', re.IGNORECASE): 'ц'
    }

    def __init__(self, delete_first_letter=False, delete_first_coded_letter=False,
                 delete_zeros=False, cut_result=False, seq_cutted_len=4,
                 code_vowels=False, use_morph_analysis=False):
        """
        Initialization of Russian Soundex object
        :param delete_first_letter: remove the first letter from the result code
        :param delete_first_coded_letter: remove the first coded letter from the result code
        :param delete_zeros: remove vowels from the result code
        :param cut_result: cut (or zero-pad) the result code to N symbols
        :param seq_cutted_len: length of the result code when ``cut_result`` is set
        :param use_morph_analysis: use morphological grammems for phonemes analysis
        :param code_vowels: group and code vowels as ABC letters
        """
        super().__init__(delete_first_letter, delete_first_coded_letter,
                         delete_zeros, cut_result, seq_cutted_len)
        self.code_vowels = code_vowels
        self.use_morph_analysis = use_morph_analysis
        # The analyzer is only needed for morphological analysis, so it is not
        # built when that feature is off; it is created lazily on first use if
        # the flag is enabled later.
        self._moprh = pymorphy2.MorphAnalyzer() if use_morph_analysis else None

    def _translate_vowels(self, word):
        """
        Code vowels as group letters when ``code_vowels`` is set; otherwise
        fall back to the base-class '0' placeholder.
        """
        if self.code_vowels:
            return word.translate(self._vowels_table)
        return super()._translate_vowels(word)

    def _replace_ego_ogo_endings(self, word):
        """Rewrite a final "его"/"ого" as "ево"/"ово"."""
        return self._ego_ogo_endings.sub(r'\1в\3', word)

    def _use_morph_for_phoneme_replace(self, word):
        """
        Rewrite the "-его/-ого" ending when the first morphological parse of
        ``word`` is tagged ADJF, NUMB or NPRO; otherwise return it unchanged.
        """
        if self._moprh is None:
            self._moprh = pymorphy2.MorphAnalyzer()
        parse = self._moprh.parse(word)
        if parse and ('ADJF' in parse[0].tag or 'NUMB' in parse[0].tag or 'NPRO' in parse[0].tag):
            word = self._replace_ego_ogo_endings(word)
        return word

    def _replace_vowels_seq(self, word):
        """Merge the vowel pairs matched by ``_ii_ending`` / ``_ia_ending`` into one vowel."""
        word = self._ii_ending.sub('и', word)
        word = self._ia_ending.sub('я', word)
        return word

    def transform(self, word):
        """
        Convert a Russian word to its Soundex code
        :param word: string
        :return: Soundex string code
        """
        if self.use_morph_analysis:
            word = self._use_morph_for_phoneme_replace(word)
        for replace, result in self._replacement_map.items():
            word = replace.sub(result, word)
        if self.code_vowels:
            word = self._replace_vowels_seq(word)
        return self._apply_soundex_algorithm(word)
class SoundexSimilarity:
    """Distance between two words measured on their Soundex codes."""

    # Named distance functions selectable through the ``metrics`` argument.
    METRICS = {
        'levenstein': editdistance.eval
    }

    def __init__(self, soundex, metrics='levenstein'):
        """
        Init a similarity object
        :param soundex: an object of Soundex class
        :param metrics: similarity function: a key of ``METRICS`` or a callable
            taking two strings; optional, default is Levenstein distance
        :raises ValueError: if ``metrics`` is neither callable nor a known name
        """
        self.soundex_converter = soundex
        if callable(metrics):
            self.metrics = metrics
        else:
            # Fail at construction instead of storing None and breaking later
            # inside similarity() with "'NoneType' object is not callable".
            try:
                self.metrics = self.METRICS[metrics]
            except KeyError:
                raise ValueError(
                    'Unknown metrics {!r}; expected a callable or one of: {}'.format(
                        metrics, ', '.join(sorted(self.METRICS)))
                ) from None

    def similarity(self, word1, word2):
        """
        Compute the similarity between Soundex codes
        :param word1: first original word
        :param word2: second original word
        :return: distance value
        """
        w1 = self.soundex_converter.transform(word1)
        w2 = self.soundex_converter.transform(word2)
        if self.soundex_converter.is_delete_first_letter():
            return self.metrics(w1, w2)
        # The codes still start with the literal first letter of each word;
        # compare only the coded parts.
        return self.metrics(w1[1:], w2[1:])
if __name__ == '__main__':
    # Self-checks executed when the module is run as a script.

    # English codes: zeros removed, cut to length, first coded symbol dropped.
    en_soundex = EnglishSoundex(delete_first_coded_letter=True,
                                cut_result=True, delete_zeros=True)
    en_code = en_soundex.transform
    assert en_code('Robert') == 'R196'
    assert en_code('Rubin') == 'R180'
    assert en_code('Rupert') == en_code('Robert')
    assert en_code('Ashcraft') == 'A926'
    assert en_code('Ashcraft') == en_code('Ashcroft')
    assert en_code('Tymczak') == 'T835'

    # Russian codes with default settings.
    ru_soundex = RussianSoundex()
    ru_code = ru_soundex.transform
    assert ru_code('ёлочка') == 'JJ070530'
    assert ru_code('ёлочка') == ru_code('йолочка')
    assert ru_code('кот') == ru_code('код')
    assert ru_code('медь') == ru_code('меть')
    assert ru_code('девчонка') == ru_code('девчёнка')
    assert ru_code('детский') == ru_code('децкий')
    assert ru_code('двацать') == ru_code('двадцать')
    assert ru_code('сница') == ru_code('сниться')
    assert ru_code('воротца') == ru_code('вороца')
    assert ru_code('гигантский') == ru_code('гиганский')
    assert ru_code('марксистский') == ru_code('марксисский')
    assert ru_code('чувствовать') == ru_code('чуствовать')
    assert ru_code('праздник') == ru_code('празник')
    assert ru_code('шчастье') == ru_code('счастье')
    assert ru_code('том') == ru_code('тон')
    assert ru_code('щастье') == 'Щ5064J0'
    assert ru_code('счастье') == 'Ч5064J0'
    assert ru_code('агенство') == ru_code('агентство')
    assert ru_code('театр') == ru_code('тятр')
    assert ru_code('сонце') == ru_code('солнце')
    assert ru_code('серце') == ru_code('сердце')
    assert ru_code('считать') == 'Ч50404'
    assert ru_code('щитать') == 'Щ50404'

    # Russian codes with morphological analysis and lettered vowel groups.
    ru_soundex = RussianSoundex(use_morph_analysis=True, code_vowels=True)
    ru_code = ru_soundex.transform
    assert ru_code('зелёного') == 'З6B7B8A2A'
    assert ru_code('никого') == 'Н8B3A2A'
    assert ru_code('ничего') == 'Н8B5B2A'
    assert ru_code('много') == 'М8A3A'

    # Distance between codes of word pairs expected to encode identically.
    ru_soundex = RussianSoundex(delete_first_letter=True)
    similarity_checker = SoundexSimilarity(ru_soundex)
    distance = similarity_checker.similarity
    assert distance('щастье', 'счастье') == 0
    assert distance('считать', 'щитать') == 0
    assert distance('зуд', 'суд') == 0
    assert distance('мощь', 'мочь') == 0
    assert distance('ночь', 'мочь') == 0
    assert distance('сахар', 'цукер') == 0
    assert distance('булочная', 'булошная') == 0
    assert distance('булочная', 'булошная') == 0
    assert distance('блеснуть', 'блестнуть') == 0
    assert distance('ненасный', 'ненастный') == 0