-
Notifications
You must be signed in to change notification settings - Fork 25
/
stopwords_test.go
349 lines (310 loc) · 10.8 KB
/
stopwords_test.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
// Copyright 2015 Benjamin BALET. All rights reserved.
// Use of this source code is governed by the BSD
// license that can be found in the LICENSE file.
package stopwords
import (
"reflect"
"testing"
)
// expected is the result the per-language stop-word tests in this file compare
// CleanString's output against: the two Latin placeholder words, surrounded
// and separated by single spaces.
var expected = " abcdefghijk lmnopqrstuvwxyz "
// TestLangCodesHTML feeds the same one-word input to CleanString and Clean
// under several spellings of a language code (hyphenated, underscored, two-
// and three-letter forms, plus the code "zz") and expects every call to
// return a single space.
//
// Each failure message names the language code being exercised, so a failing
// table entry can be identified from the test output alone.
func TestLangCodesHTML(t *testing.T) {
	langCodes := []string{
		"en-US",
		"en_US",
		"en_GB",
		"en-GB",
		"en",
		"eng",
		"zz",
	}

	const source = "afterwards"
	const expected2 = " "

	for _, langCode := range langCodes {
		// String API.
		if actual := CleanString(source, langCode, false); actual != expected2 {
			t.Errorf("string test failed for langCode %q: expected %q, got %q", langCode, expected2, actual)
		}

		// Byte-slice API; must agree with the string API.
		actualB := Clean([]byte(source), langCode, false)
		if !reflect.DeepEqual(actualB, []byte(expected2)) {
			t.Errorf("bytes test failed for langCode %q: expected %q, got: %q", langCode, []byte(expected2), actualB)
		}
	}
}
func TestRemoveHTML(t *testing.T) {
source := "<a href='resource.htm'>"Fran & Freddie's Diner"</a> <b><[email protected]></b>"
expected2 := "fran freddie's diner tasty example com "
actual := CleanString(source, "en", true)
if actual != expected2 {
t.Errorf("string test failed: expected %q, got %q", expected2, actual)
}
actualB := Clean([]byte(source), "en", true)
if !reflect.DeepEqual(actualB, []byte(expected2)) {
t.Errorf("bytes test failed: expected %q, got: %q", []byte(expected2), actualB)
}
}
// TestRemoveSpaces runs CleanString (language "en", third argument true) over
// a sequence of two-letter tokens and expects the same tokens back, separated
// by single spaces and followed by one trailing space.
func TestRemoveSpaces(t *testing.T) {
	const input = "ab cd ef gh ij kl mn op qr st uv wx yz"
	const want = "ab cd ef gh ij kl mn op qr st uv wx yz "
	if got := CleanString(input, "en", true); got != want {
		t.Errorf("Test failed, got: '%s'", got)
	}
}
// TestToLower passes mixed-case English text to CleanString and expects only
// the word "problematic", in lower case, to remain.
func TestToLower(t *testing.T) {
	const input = "All Problematic ALmoST Always"
	const want = " problematic "
	if got := CleanString(input, "en", true); got != want {
		t.Errorf("Test failed, got: '%s'", got)
	}
}
// TestArabicStopWords calls CleanString with the "ar" language code and
// expects only the two Latin placeholder words to remain.
func TestArabicStopWords(t *testing.T) {
	const input = "كلم abcdefghijk واضافت lmnopqrstuvwxyz اليوم"
	if got := CleanString(input, "ar", false); got != expected {
		t.Errorf("Test failed, got: '%s'", got)
	}
}
// TestBulgarianStopWords calls CleanString with the "bg" language code and
// expects only the two Latin placeholder words to remain.
func TestBulgarianStopWords(t *testing.T) {
	const input = "беше abcdefghijk въпреки lmnopqrstuvwxyz юмрук"
	if got := CleanString(input, "bg", false); got != expected {
		t.Errorf("Test failed, got: '%s'", got)
	}
}
// TestCzechStopWords calls CleanString with the "cs" language code and
// expects only the two Latin placeholder words to remain.
func TestCzechStopWords(t *testing.T) {
	const input = "ačkoli abcdefghijk chceš lmnopqrstuvwxyz neděláš"
	if got := CleanString(input, "cs", false); got != expected {
		t.Errorf("Test failed, got: '%s'", got)
	}
}
// TestDanishStopWords calls CleanString with the "da" language code and
// expects only the two Latin placeholder words to remain.
func TestDanishStopWords(t *testing.T) {
	const input = "før abcdefghijk få lmnopqrstuvwxyz hvornår næste"
	if got := CleanString(input, "da", false); got != expected {
		t.Errorf("Test failed, got: '%s'", got)
	}
}
// TestDutchStopWords calls CleanString with the "nl" language code and
// expects only the two Latin placeholder words to remain.
func TestDutchStopWords(t *testing.T) {
	const input = "aangezien abcdefghijk hierbeneden lmnopqrstuvwxyz ofschoon uitgezonderd"
	if got := CleanString(input, "nl", false); got != expected {
		t.Errorf("Test failed, got: '%s'", got)
	}
}
// TestEnglishStopWords calls CleanString with the "en" language code and
// expects only the two placeholder words to remain.
func TestEnglishStopWords(t *testing.T) {
	const input = "along abcdefghijk another lmnopqrstuvwxyz yet"
	if got := CleanString(input, "en", false); got != expected {
		t.Errorf("Test failed, got: '%s'", got)
	}
}
// TestFrenchStopWords calls CleanString with the "fr" language code and
// expects only the two placeholder words to remain.
func TestFrenchStopWords(t *testing.T) {
	const input = "assez abcdefghijk certaines lmnopqrstuvwxyz vous-mêmes"
	if got := CleanString(input, "fr", false); got != expected {
		t.Errorf("Test failed, got: '%s'", got)
	}
}
// TestFinnishStopWords calls CleanString with the "fi" language code and
// expects only the two placeholder words to remain.
func TestFinnishStopWords(t *testing.T) {
	const input = "ylös abcdefghijk vähintään lmnopqrstuvwxyz täytyvät kyllä"
	if got := CleanString(input, "fi", false); got != expected {
		t.Errorf("Test failed, got: '%s'", got)
	}
}
// TestGermanStopWords calls CleanString with the "de" language code and
// expects only the two placeholder words to remain.
func TestGermanStopWords(t *testing.T) {
	const input = "dafür abcdefghijk dementsprechend lmnopqrstuvwxyz großen zwölf"
	if got := CleanString(input, "de", false); got != expected {
		t.Errorf("Test failed, got: '%s'", got)
	}
}
// TestGreekStopWords calls CleanString with the "el" language code and
// expects only the two Latin placeholder words to remain.
func TestGreekStopWords(t *testing.T) {
	const input = "δαίσ abcdefghijk τοιοῦτοσ lmnopqrstuvwxyz ειστε εκεινουσ"
	if got := CleanString(input, "el", false); got != expected {
		t.Errorf("Test failed, got: '%s'", got)
	}
}
// TestHungarianStopWords calls CleanString with the "hu" language code and
// expects only the two placeholder words to remain.
func TestHungarianStopWords(t *testing.T) {
	const input = "alapján abcdefghijk általában lmnopqrstuvwxyz belőle különbözőbb"
	if got := CleanString(input, "hu", false); got != expected {
		t.Errorf("Test failed, got: '%s'", got)
	}
}
// TestItalianStopWords calls CleanString with the "it" language code and
// expects only the two placeholder words to remain.
func TestItalianStopWords(t *testing.T) {
	const input = "ahimè abcdefghijk quantunque lmnopqrstuvwxyz perchè tuttavia"
	if got := CleanString(input, "it", false); got != expected {
		t.Errorf("Test failed, got: '%s'", got)
	}
}
// TestLatvianStopWords calls CleanString with the "lv" language code and
// expects only the two placeholder words to remain.
func TestLatvianStopWords(t *testing.T) {
	const input = "būsiet abcdefghijk kļūsim lmnopqrstuvwxyz līdzko tiklīdz"
	if got := CleanString(input, "lv", false); got != expected {
		t.Errorf("Test failed, got: '%s'", got)
	}
}
// TestNorwegianStopWords calls CleanString with the "no" language code and
// expects only the two placeholder words to remain.
func TestNorwegianStopWords(t *testing.T) {
	const input = "fÅ abcdefghijk tilstand lmnopqrstuvwxyz vÖre gjÛre"
	if got := CleanString(input, "no", false); got != expected {
		t.Errorf("Test failed, got: '%s'", got)
	}
}
// TestPersianStopWords calls CleanString with the "fa" language code and
// expects only the two Latin placeholder words to remain.
func TestPersianStopWords(t *testing.T) {
	const input = "شده abcdefghijk شوند lmnopqrstuvwxyz خواهند بنابراين"
	if got := CleanString(input, "fa", false); got != expected {
		t.Errorf("Test failed, got: '%s'", got)
	}
}
// TestPolishStopWords calls CleanString with the "pl" language code and
// expects only the two placeholder words to remain.
func TestPolishStopWords(t *testing.T) {
	const input = "daleko abcdefghijk dziś lmnopqrstuvwxyz natychmiast każdy"
	if got := CleanString(input, "pl", false); got != expected {
		t.Errorf("Test failed, got: '%s'", got)
	}
}
// TestPortugueseStopWords calls CleanString with the "pt" language code and
// expects only the two placeholder words to remain.
func TestPortugueseStopWords(t *testing.T) {
	const input = "aí abcdefghijk área lmnopqrstuvwxyz vocês vão"
	if got := CleanString(input, "pt", false); got != expected {
		t.Errorf("Test failed, got: '%s'", got)
	}
}
// TestRomanianStopWords calls CleanString with the "ro" language code and
// expects only the two placeholder words to remain.
func TestRomanianStopWords(t *testing.T) {
	const input = "voastră abcdefghijk ţi lmnopqrstuvwxyz aibă"
	if got := CleanString(input, "ro", false); got != expected {
		t.Errorf("Test failed, got: '%s'", got)
	}
}
// TestRussianStopWords calls CleanString with the "ru" language code and
// expects only the two Latin placeholder words to remain.
func TestRussianStopWords(t *testing.T) {
	const input = "многочисленное abcdefghijk меньше lmnopqrstuvwxyz тринадцать третий"
	if got := CleanString(input, "ru", false); got != expected {
		t.Errorf("Test failed, got: '%s'", got)
	}
}
// TestSpanishStopWords calls CleanString with the "es" language code and
// expects only the two placeholder words to remain.
func TestSpanishStopWords(t *testing.T) {
	const input = "aquél abcdefghijk cómo lmnopqrstuvwxyz día mucho"
	if got := CleanString(input, "es", false); got != expected {
		t.Errorf("Test failed, got: '%s'", got)
	}
}
// TestSlovakStopWords calls CleanString with the "sk" language code and
// expects only the two placeholder words to remain.
func TestSlovakStopWords(t *testing.T) {
	const input = "budú abcdefghijk každý lmnopqrstuvwxyz môže tvojími"
	if got := CleanString(input, "sk", false); got != expected {
		t.Errorf("Test failed, got: '%s'", got)
	}
}
// TestSwedishStopWords calls CleanString with the "sv" language code and
// expects only the two placeholder words to remain.
func TestSwedishStopWords(t *testing.T) {
	const input = "alltså abcdefghijk åttonde lmnopqrstuvwxyz verkligen likställda"
	if got := CleanString(input, "sv", false); got != expected {
		t.Errorf("Test failed, got: '%s'", got)
	}
}
// TestTurkishStopWords calls CleanString with the "tr" language code and
// expects only the two placeholder words to remain.
func TestTurkishStopWords(t *testing.T) {
	const input = "altmış abcdefghijk yaptığı lmnopqrstuvwxyz ancak beş"
	if got := CleanString(input, "tr", false); got != expected {
		t.Errorf("Test failed, got: '%s'", got)
	}
}
// TestIndonesianStopWords calls CleanString with the "id" language code and
// expects only the two placeholder words to remain.
func TestIndonesianStopWords(t *testing.T) {
	const input = "dia abcdefghijk lmnopqrstuvwxyz adalah seorang"
	if got := CleanString(input, "id", false); got != expected {
		t.Errorf("Test failed, got: '%s'", got)
	}
}
// TestUnicodeWordBreakStopWords runs CleanString with the "ja" language code
// over a Japanese sentence and compares the result with expected2, a
// space-separated list of the words that should remain.
//
// NOTE(review): per the comment below, the words in source are meant to be
// separated by zero-width spaces (U+200B), which are invisible in most
// editors - confirm they are still present in the literal after any edit.
// NOTE(review): the word list below marks おける as a stop word and shows
// 住宅地域 as one word, whereas expected2 keeps おける and has "住宅 地域" -
// confirm which is intended.
func TestUnicodeWordBreakStopWords(t *testing.T) {
//If the text has been edited with a modern text processor, the words are broken using ZWSP unicode character, e.g.
//住宅地域における本機の使用は有害な電波妨害を引き起こすことがあり、その場合ユーザーは自己負担で電波妨害の問題を解決しなければなりません。
/* This sentence should be broken into the following words (see http://www.atilika.org/):
(N) 住宅地域 -
(P) に - stop word
(V) おける - stop word
(N) 本機
(P) の - stop word
(N) 使用
(P) は - stop word
(AN) 有害な
(N) 電波妨害
(P) を - stop word
(V) 引き起こす
(N) こと - stop word
(P) が - stop word
(V) あり、
(D) その - stop word
(N) 場合
(N) ユーザー
(P) は - stop word
(N) 自己負担
(P) で - stop word
(N) 電波妨害
(P) の - stop word
(N) 問題
(P) を - stop word
(N) 解決
(V) しなければなりません。
*/
source := "住宅地域における本機の使用は有害な電波妨害を引き起こすことがあり、その場合ユーザーは自己負担で電波妨害の問題を解決しなければなりません。"
expected2 := "住宅 地域 おける 本機 使用 有害な 電波妨害 引き起こす 場合 ユーザー 自己負担 電波妨害 問題 解決 しなければなりません "
actual := CleanString(source, "ja", false)
if actual != expected2 {
t.Errorf("Test failed, got: '%s'", actual)
}
}
// TestThaiStopWords runs CleanString with the "th" language code over a Thai
// phrase and expects only the three words listed in expected2 to remain.
//
// NOTE(review): the source literal is meant to carry zero-width spaces
// (U+200B) or spaces between words, as the comment below states; those
// characters are invisible in most editors - confirm they are still present.
func TestThaiStopWords(t *testing.T) {
//As for TestUnicodeWordBreakStopWords, we assume that the words are properly
//tokenized using ZWSP unicode character or space as a word boundary.
source := "การที่ได้ต้องแสดงว่างานดี"
expected2 := " แสดง งาน ดี "
actual := CleanString(source, "th", false)
if actual != expected2 {
t.Errorf("Test failed, got: '%s'", actual)
}
}
// TestKhmerStopWords runs CleanString with the "km" language code over two
// short Khmer sentences and expects only the two words listed in expected2
// to remain.
//
// NOTE(review): the source literal is meant to carry zero-width spaces
// (U+200B) or spaces between words, as the comment below states; those
// characters are invisible in most editors - confirm they are still present.
func TestKhmerStopWords(t *testing.T) {
//As for TestUnicodeWordBreakStopWords, we assume that the words are properly
//tokenized using ZWSP unicode character or space as a word boundary.
source := "ខ្ញុំទៅផ្សារ។ ទៅអោយបានសុខ។"
expected2 := " អោយ សុខ "
actual := CleanString(source, "km", false)
if actual != expected2 {
t.Errorf("Test failed, got: '%s'", actual)
}
}
// TestDontStripDigits checks that, once DontStripDigits has been called,
// characters of the Unicode category "Number, Decimal Digit" are kept as part
// of words: "6-years-old" must survive in the cleaned output.
//
// NOTE(review): DontStripDigits presumably changes package-level state that
// this test does not restore - verify that later tests do not depend on the
// default behaviour.
func TestDontStripDigits(t *testing.T) {
	const input = "a 6-years-old boy was found the hand in the cookie jar"
	const want = " 6-years-old boy hand cookie jar "

	DontStripDigits()

	if got := CleanString(input, "en", false); got != want {
		t.Errorf("Test failed, got: '%s'", got)
	}
}
// TestOverwriteWordSegmenter checks that the word segmenter can be replaced
// by a custom regular expression: with a letters-only pattern installed, the
// digits in the input are expected to act as word separators.
//
// NOTE(review): OverwriteWordSegmenter presumably changes package-level state
// that this test does not restore - verify that later tests do not depend on
// the default segmenter.
func TestOverwriteWordSegmenter(t *testing.T) {
	const input = "AABC4EEF7DDYTH9"
	const want = "aabc eef ddyth "

	OverwriteWordSegmenter(`[\pL]+`)

	if got := CleanString(input, "en", false); got != want {
		t.Errorf("Test failed, got: '%s'", got)
	}
}