#!/usr/bin/env python
''' A search engine that loads a JSON file and builds an index from
which fast searches can be made.
Requires the nltk and scikit-learn libraries for Python.
Make sure that the following nltk data packages are downloaded:
    punkt
    wordnet
Run the script with:
    $ python search_engine.py
You will be prompted to enter search terms, optionally with
field-specific prefixes. To close the script use Ctrl-C.
'''
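# products.json is expected (inferred from the keys and indexing used
# below) to be a list of objects, each with at least these keys:
# [{"title": "...", "description": "...", "merchant": "..."}, ...]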
import json
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
with open('products.json', 'r') as f:
    prod_json = json.load(f)
descrs = []
titles = []
merchs = []
alltxt = []
# tokenizer that lemmatizes each token before vectorization
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
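# e.g. LemmaTokenizer()('running shoes') -> ['running', 'shoe']
# (WordNet lemmatization defaults to the noun part of speech)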
# tf-idf vectorizer using the lemmatizing tokenizer; English stop
# words are dropped from the vocabulary
vect = TfidfVectorizer(tokenizer=LemmaTokenizer(), stop_words='english')
# collect each field of every product into its own list
for l in prod_json:
    descrs.append(l['description'])
    titles.append(l['title'])
    merchs.append(l['merchant'])
# fit the vectorizer per field; capture the feature names after each
# fit, since the shared vectorizer is refit for the next field
# (newer scikit-learn versions removed get_feature_names(); use
# list(vect.get_feature_names_out()) there)
tfidf_mat_d = vect.fit_transform(descrs)
feature_names_d = vect.get_feature_names()
tfidf_mat_t = vect.fit_transform(titles)
feature_names_t = vect.get_feature_names()
tfidf_mat_m = vect.fit_transform(merchs)
feature_names_m = vect.get_feature_names()
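# each matrix has one row per product and one column per vocabulary
# term; entry (r, c) is the tf-idf weight of term c in product r, so
# a column lists which products contain a term and how strongly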
# loop so multiple queries can be run in one session
while True:
    # get user search terms, or close on Ctrl-C
    try:
        print('Separate terms with spaces. Precede a term with')
        print('d=, t= or m= to search only the description, title or')
        print('merchant field. Ctrl-C to close.')
        search_txt = input('search text: ')
    except KeyboardInterrupt:
        break
    search_txt = search_txt.lower().split()
    wnl = WordNetLemmatizer()
    records = []
    tfidf = []
    # look up each term in the relevant index; report terms not found
    for txt in search_txt:
        if txt[:2] == 'd=':
            try:
                i = feature_names_d.index(wnl.lemmatize(txt[2:]))
            except ValueError:
                print('Search text not in description field.')
            else:
                col = tfidf_mat_d.getcol(i).tocoo()
                records.append(col.row)
                tfidf.append(col.data)
        elif txt[:2] == 't=':
            try:
                i = feature_names_t.index(wnl.lemmatize(txt[2:]))
            except ValueError:
                print('Search text not in title field.')
            else:
                col = tfidf_mat_t.getcol(i).tocoo()
                records.append(col.row)
                tfidf.append(col.data)
        elif txt[:2] == 'm=':
            try:
                i = feature_names_m.index(wnl.lemmatize(txt[2:]))
            except ValueError:
                print('Search text not in merchant field.')
            else:
                col = tfidf_mat_m.getcol(i).tocoo()
                records.append(col.row)
                tfidf.append(col.data)
        else:
            # no prefix: search all three fields
            try:
                i = feature_names_d.index(wnl.lemmatize(txt))
            except ValueError:
                print('Search text not in description field.')
            else:
                col = tfidf_mat_d.getcol(i).tocoo()
                records.append(col.row)
                tfidf.append(col.data)
            try:
                i = feature_names_t.index(wnl.lemmatize(txt))
            except ValueError:
                print('Search text not in title field.')
            else:
                col = tfidf_mat_t.getcol(i).tocoo()
                records.append(col.row)
                tfidf.append(col.data)
            try:
                i = feature_names_m.index(wnl.lemmatize(txt))
            except ValueError:
                print('Search text not in merchant field.')
            else:
                col = tfidf_mat_m.getcol(i).tocoo()
                records.append(col.row)
                tfidf.append(col.data)
    # flatten lists
    records = [item for arr in records for item in arr]
    tfidf = [item for arr in tfidf for item in arr]
    # merge duplicate records, summing their tf-idf scores
    records_dups = []
    tfidf_dups = []
    tally = []
    for i in range(len(records)):
        if i in tally:
            continue
        records_dups.append(records[i])
        tfidf_dups.append(tfidf[i])
        for j in range(i + 1, len(records)):
            if records[i] == records[j]:
                tfidf_dups[-1] += tfidf[j]
                tally.append(j)
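    # worked example of the merge above: records [3, 5, 3] with
    # tfidf [0.2, 0.4, 0.1] become records_dups [3, 5] and
    # tfidf_dups [0.3, 0.4] (the two hits on record 3 are summed)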
    # sort tfidf values in descending order
    sort = sorted(range(len(tfidf_dups)), key=tfidf_dups.__getitem__,
                  reverse=True)
    search_docs = []
    search_index = []
    for s in sort:
        r = records_dups[s]
        search_docs.append(prod_json[r])
        search_index.append(r)
    print('Search result:')
    print(search_docs)
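# Illustrative session (a sketch only; the query and any results
# depend entirely on the contents of products.json):
#   $ python search_engine.py
#   Separate terms with spaces. Precede a term with
#   d=, t= or m= to search only the description, title or
#   merchant field. Ctrl-C to close.
#   search text: t=shoe m=amazon
#   Search result:
#   [{'title': ..., 'description': ..., 'merchant': ...}, ...]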