metalevenshtein.py
# author: Asitang Mishra
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
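
"""Meta-Levenshtein string similarity, after "Robust Similarity Measures for
Named Entities Matching" (Moreau et al.): each string is split at natural
character-class boundaries and the resulting sub-strings are aligned using a
secondary similarity function (Levenshtein or Jaro-Winkler)."""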
import itertools
import math
import re

import features as feat
# Split a string at transitions from one character class to another
# (upper, lower, numeric, special characters).
def break_natural_boundaries(string):
    stringbreak = []
    if len(string.split(' ')) > 1:
        stringbreak = string.split(' ')
    else:
        # the four character classes: special characters, upper, lower, digits
        spl = r'''[%$^*@!_\-():;'"{}\[\]|]'''
        up = '[A-Z]'
        low = '[a-z]'
        num = r'\d'
        # record the index of every two-character transition between classes
        matchindex = set()
        for first, second in itertools.permutations([up, low, num, spl], 2):
            matchindex.update(m.start() for m in re.finditer(first + second, string))
        matchindex.add(len(string) - 1)
        # cut the string just after each recorded transition index
        start = 0
        for i in sorted(matchindex):
            stringbreak.append(string[start:i + 1])
            start = i + 1
    return stringbreak
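
# Illustrative examples of the splitting above (hand-traced, not from a test
# run): 'abacus1cat' -> ['abacus', '1', 'cat'] at the alpha/digit transitions;
# 'john smith' -> ['john', 'smith'] on the space.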
def meta_levenshtein(string1, string2, Sim='levenshtein', theta=0.5, strict=-1, idf=dict()):
    '''Implements ideas from the paper "Robust Similarity Measures for Named
    Entities Matching" by Moreau et al.
    Sim = 'jaro_winkler' or 'levenshtein': the secondary matching function.
    theta is the secondary similarity threshold: the higher it is set, the
    harder it is for two sub-strings to match.
    strict=-1 tries all permutations of the sub-strings;
    strict=1 keeps the original order (no permutations).
    idf: a dictionary {string (word): float (idf of the word)}. Most useful
    when matching multi-word entities whose word importances matter, e.g.
    'harry potter' vs. 'the wizard harry potter'.
    '''
    # set the secondary similarity function
    if Sim == 'levenshtein':
        simf = lambda x, y: feat.levenshtein_similarity(x, y)
    elif Sim == 'jaro_winkler':
        simf = lambda x, y: feat.jaro_winkler_similarity(x, y)
    # set the idf-weighting and normalization functions
    if len(idf) > 0:
        idf_dict = idf
        idff = lambda x, y: idf_dict[x] * idf_dict[y]
        # cosine-style normalization by the idf weights of the qualifying edges
        norm = lambda x, y, ex, ey: (math.sqrt(sum(idf_dict[i] ** 2 for i in ex))
                                     * math.sqrt(sum(idf_dict[i] ** 2 for i in ey)))
    else:
        idff = lambda x, y: 1
        norm = lambda x, y, ex, ey: max(len(x), len(y))
    # break each string down into sub-strings (if it has no spaces already)
    string1break = break_natural_boundaries(string1)
    string2break = break_natural_boundaries(string2)
    # swap so that the longer one is string2break
    if len(string1break) > len(string2break):
        string1break, string2break = string2break, string1break
    # make permutations of the sub-string lists (materialized as lists so they
    # can be iterated over more than once)
    perm1 = []
    perm2 = []
    if strict == -1:
        perm1 = list(itertools.permutations(string1break))
        perm2 = list(itertools.permutations(string2break))
    elif strict == 1:
        perm1.append(string1break)
        perm2.append(string2break)
    # do secondary matching for each permutation and each (Levenshtein) shift
    # (E1, E2 = edges that qualify, i.e. survive the threshold theta)
    bestscore = 0.0
    best = None
    for st1 in perm1:
        for st2 in perm2:
            for k in range(0, len(st2) - len(st1) + 1):
                permscore = 0.0
                E1 = []
                E2 = []
                for i in range(0, len(st1)):
                    # idf-weighted secondary similarity for this aligned pair
                    simtemp = simf(st1[i], st2[i + k]) * idff(st1[i], st2[i + k])
                    if simtemp >= theta:
                        E1.append(st1[i])
                        E2.append(st2[i + k])
                        permscore += simtemp
                if permscore > bestscore:
                    bestscore = permscore
                    best = (st1, st2, E1, E2)
    if best is None:
        # no alignment cleared the threshold
        return 0.0
    return bestscore / norm(*best)
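
# Note: with strict=-1 the search above tries every permutation of both
# sub-string lists, so the number of candidate alignments grows factorially
# with the number of sub-strings; strict=1 keeps only the original order.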
if __name__ == '__main__':
    # usage example
    print(meta_levenshtein('abacus1cat', 'cat1cus'))
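    # Further illustrative calls (scores depend on features.py's similarity
    # implementations; the idf weights below are made-up values, not real
    # corpus statistics):
    print(meta_levenshtein('potter harry', 'harry potter', Sim='jaro_winkler'))
    print(meta_levenshtein('harry potter', 'the wizard harry potter',
                           idf={'harry': 2.0, 'potter': 2.2, 'the': 0.1, 'wizard': 1.3}))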