-
Notifications
You must be signed in to change notification settings - Fork 0
/
search.py
110 lines (90 loc) · 3.61 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import math
import searchdata
def search(phrase, boost):
# lstResults is a list of dictionaries
# dicPage is each entry in lstResults that tracks url, title and (search) score for the page
lstResults =[]
dicPage = {}
# store url and title to lstResults
ioFile = open("pages.txt","r")
strURL = ioFile.readline().strip()
while(strURL != ""):
dicPage["url"] = strURL
dicPage["title"] = strURL[strURL.rfind("/")+1: len(strURL) - 5]
lstResults.append(dicPage)
strURL = ioFile.readline().strip()
dicPage = {}
ioFile.close()
lstSearchWords = phrase.strip().split()
# calculate tf-idf value for the query and store it to a dictionary
# key: value = word: [count, tf-idf of this word in the query]
# e.g., apple: [5, 0.6]
dicQry = calculateQryTFIDF(lstSearchWords)
# calculate the cosine similarity score for all websites
lstResults = calculateSimilarityScore(dicQry, lstResults, boost)
# find and return top 10 similar pages
lstTopResults = findTopResults(lstResults, 10)
return lstTopResults
# calculate tf-idf for the query and return a dictionary that contains tf-idf value for all the words in the query
def calculateQryTFIDF(words):
# dicQry is used to store query words info
# word : [count, tf-idf of this word in the query]
# e.g., apple : [5,0.6]
dicQry = {}
numOfWords = len(words)
for word in words:
if word in dicQry:
dicQry[word][0] += 1
else:
dicQry[word] = [1,1]
# calculate tf-idf for each word and store it to the first entry of its value
for word in dicQry:
tf = dicQry[word][0] / numOfWords
# tfidf = log2(1+tf) * tf
dicQry[word][1] = math.log2(1+tf) * searchdata.get_idf(word)
return dicQry
def calculateSimilarityScore(dict, list, boost):
# for each search word in dict, calculate similarity score and store it to the list (dictionary list that tracks url, title and score)
for index in range(len(list)):
numerator = 0
denominator = 0
sumSqrDocTFIDF = 0
sumSqrQryTFIDF = 0
for word in dict:
docTFIDF = searchdata.get_tf_idf(list[index]["url"], word)
qryTFIDF = dict[word][1]
# calculate the numerator
numerator += docTFIDF * qryTFIDF
# calculate the denominator factors
sumSqrDocTFIDF += docTFIDF**2
sumSqrQryTFIDF += qryTFIDF**2
# calculate the denominator
denominator = math.sqrt(sumSqrDocTFIDF * sumSqrQryTFIDF)
# calculate the final result
if(denominator == 0):
list[index]["score"] = 0
else:
score = numerator/denominator
if boost:
# score multiplied by page rank
score *= searchdata.get_page_rank(list[index]["url"])
list[index]["score"] = score
return list
def findTopResults(list, num):
topResults = []
# make sure the required top num is no more than the length of the unsorted list
if len(list) >= num:
for i in range(num):
score = -1
index = 0
page = {}
for j in range(len(list)):
if (list[j]["score"]) > score:
page = list[j]
score = list[j]["score"]
index = j
topResults.append(page)
list.pop(index)
return topResults
else:
return None