-
Notifications
You must be signed in to change notification settings - Fork 26
/
jarowinkler.go
135 lines (113 loc) · 2.97 KB
/
jarowinkler.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
package matchr
func jaroWinklerBase(s1 string, s2 string,
longTolerance bool, winklerize bool) (distance float64) {
// index by code point, not byte
r1 := []rune(s1)
r2 := []rune(s2)
r1Length := len(r1)
r2Length := len(r2)
if r1Length == 0 || r2Length == 0 {
return
}
minLength := 0
if r1Length > r2Length {
minLength = r1Length
} else {
minLength = r2Length
}
searchRange := minLength
searchRange = (searchRange / 2) - 1
if searchRange < 0 {
searchRange = 0
}
var lowLim, hiLim, transCount, commonChars int
var i, j, k int
r1Flag := make([]bool, r1Length+1)
r2Flag := make([]bool, r2Length+1)
// find the common chars within the acceptable range
commonChars = 0
for i, _ = range r1 {
if i >= searchRange {
lowLim = i - searchRange
} else {
lowLim = 0
}
if (i + searchRange) <= (r2Length - 1) {
hiLim = i + searchRange
} else {
hiLim = r2Length - 1
}
for j := lowLim; j <= hiLim; j++ {
if !r2Flag[j] && r2[j] == r1[i] {
r2Flag[j] = true
r1Flag[i] = true
commonChars++
break
}
}
}
// if we have nothing in common at this point, nothing else can be done
if commonChars == 0 {
return
}
// otherwise we count the transpositions
k = 0
transCount = 0
for i, _ := range r1 {
if r1Flag[i] {
for j = k; j < r2Length; j++ {
if r2Flag[j] {
k = j + 1
break
}
}
if r1[i] != r2[j] {
transCount++
}
}
}
transCount /= 2
// adjust for similarities in nonmatched characters
distance = float64(commonChars)/float64(r1Length) +
float64(commonChars)/float64(r2Length) +
(float64(commonChars-transCount))/float64(commonChars)
distance /= 3.0
// give more weight to already-similar strings
if winklerize && distance > 0.7 {
// the first 4 characters in common
if minLength >= 4 {
j = 4
} else {
j = minLength
}
for i = 0; i < j && len(r1) > i && len(r2) > i && r1[i] == r2[i] && nan(r1[i]); i++ {
}
if i > 0 {
distance += float64(i) * 0.1 * (1.0 - distance)
}
if longTolerance && (minLength > 4) && (commonChars > i+1) &&
(2*commonChars >= minLength+i) {
if nan(r1[0]) {
distance += (1.0 - distance) * (float64(commonChars-i-1) /
(float64(r1Length) + float64(r2Length) - float64(i*2) + 2))
}
}
}
return
}
// Jaro computes the Jaro edit distance between two strings. It represents
// this with a float64 between 0 and 1 inclusive, with 0 indicating the two
// strings are not at all similar and 1 indicating the two strings are exact
// matches.
//
// See http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance for a
// full description.
func Jaro(r1 string, r2 string) (distance float64) {
return jaroWinklerBase(r1, r2, false, false)
}
// JaroWinkler computes the Jaro-Winkler edit distance between two strings.
// This is a modification of the Jaro algorithm that gives additional weight
// to prefix matches.
func JaroWinkler(r1 string, r2 string, longTolerance bool) (distance float64) {
return jaroWinklerBase(r1, r2, longTolerance, true)
}