#!/usr/bin/env python
# -*- coding: utf-8 -*-
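"""Search scraped web pages (saved as text files) for weighted keywords and
boost terms, and write the per-page frequencies to a CSV sorted by keyword
frequency and boost score.

Example invocation (folder and keyword file names are illustrative):

    python search.py -f university_pages -k keywords.txt

The glob pattern below expects the saved pages under "<folder>*/" as
"*.*.txt" files, with the page URL on the first line of each file.
"""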

import argparse
import sys
import re
import csv
import logging
import os
import glob
import time
from collections import Counter

import pandas as pd
from tqdm import tqdm

logger = logging.getLogger(__name__)

# current time, used in the name of the log file
curtime = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

# Create a new log file
logging.basicConfig(filename=('_unisearchlog_' + curtime + '.log'),
                    level=logging.DEBUG
                    )


def main():
    current_working_dir = os.getcwd()  # current directory we are standing in

    # with every run, remove any older result CSVs for the folder
    try:
        os.remove(csv_file_name)
        os.remove(sorted_csv_file_name)
    except FileNotFoundError:
        pass

    # given the name of the folder, this gets all the saved page files as
    # a list
    all_txt_files = glob.glob(
        os.path.join(current_working_dir,
                     "{}*/*.*.txt".format(folder_name)),
        recursive=False
    )

    # Not a good sign if the list is empty...
    if not all_txt_files:
        logger.error("{}: Folder is empty or does not exist.".format(folder_name))
        sys.exit()
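
    # NOTE (inferred from get_file_content_as_list and strip_weights below):
    # boost_terms.txt and the keywords file each contain one term per line,
    # optionally weighted as "term|weight", e.g. "scholarship|3".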
    # read boost terms from file into a list
    boost_terms = get_file_content_as_list('boost_terms.txt')
    # make the terms lowercase
    boost_terms = [x.lower() for x in boost_terms]

    # read keywords from file into a list
    keywords = get_file_content_as_list(keywords_file)
    # make the keywords lowercase
    keywords = [x.lower() for x in keywords]

    # make a keywords dictionary with zero frequency as the value
    all_keywords = dict((strip_weights(el)[0], 0) for el in keywords)
    all_keywords_dict = Counter(all_keywords)
    sorted_keywords_list = sorted(all_keywords_dict.items())

    # extract a sorted list of keywords to write as CSV headers
    headers = [str(x) for x, y in sorted_keywords_list]
    # prepend the URL and score columns onto the keywords list
    headers.insert(0, u'url')
    headers.insert(1, u'freq_boost_sum')
    headers.insert(2, u'frequency_sum')
    headers.insert(3, u'boost_sum')

    pbar = tqdm(total=len(all_txt_files))
    tqdm.write("Found {} files to search. Please wait.".format(len(all_txt_files)))

    with open(csv_file_name, 'a+', encoding="utf-8-sig") as f:
        # Using dictionary keys as fieldnames for the CSV file header
        writer = csv.DictWriter(f, headers)
        writer.writeheader()
        logger.info("CSV headers written")

        for idx, txt_file in enumerate(all_txt_files):
            with open(txt_file, "r", encoding="utf-8-sig") as fp:
                visible_text_list = fp.readlines()

            # the first line of each saved page file is the page URL
            current_url = visible_text_list[0].strip()
            num_digits = len(str(len(all_txt_files)))
            tqdm.write("[{0:0{width}d}] {1}".format(idx + 1, current_url, width=num_digits))
            logger.info("Working on: {}".format(current_url))

            visible_text_list = [x.lower() for x in visible_text_list]

            # NOTE: a per-page timeout (the --patience option, see
            # https://stackoverflow.com/questions/25027122/break-the-function-after-certain-time)
            # is not implemented here; every page is searched to completion.

            # count the keywords in the page
            found_count, found_keywords, broad_terms_sum = count_keywords(
                visible_text_list,
                boost_terms,
                keywords
            )

            if broad_terms_sum < 1:
                # if none of the terms exist, don't even bother
                pbar.update(1)
                continue
logger.info("Keywords found: {}".format(found_count))
found_keywords_as_dict = dict((x, y) for x, y in found_keywords)
found_keywords_freq_dict = Counter(found_keywords_as_dict)
all_keywords_dict = Counter(all_keywords)
# combine both dicts to have uniform dictionary for all pages
all_keywords_dict.update(found_keywords_freq_dict)
logger.info("Keywords search results merged!")
# after merging, sort the resulting dictionary based on keys to
# make a tuples list that is always uniform for every page
sorted_keywords_list = sorted(all_keywords_dict.items())
# create a sorted dictionary list
final_csv_dict = []
final_csv_dict.append({x: y for x, y in sorted_keywords_list})
logger.info("Final dictionary appended!")
            # add the score columns and the current URL to the row
            freq_sum = sum(final_csv_dict[0].values())
            boost_multiplied = broad_terms_sum * 10
            final_csv_dict[0]['boost_sum'] = boost_multiplied
            final_csv_dict[0]['frequency_sum'] = freq_sum + boost_multiplied
            final_csv_dict[0]['freq_boost_sum'] = (freq_sum + boost_multiplied) + boost_multiplied
            final_csv_dict[0]['url'] = current_url

            # ignore pages with a zero keyword-frequency sum...
            if freq_sum == 0:
                pbar.update(1)
                continue

            for d in final_csv_dict:
                writer.writerow(d)
                logger.info("Row written successfully!")

            pbar.update(1)

    pbar.close()
    sort_csv(csv_file_name, sorted_csv_file_name)


def sort_csv(csv_input, csv_output):
    """Uses pandas to sort the CSV from the highest frequency
    summation to the lowest.
    """
    df = pd.read_csv(csv_input)
    df = df.sort_values(['freq_boost_sum'], ascending=False)
    # remove duplicate URLs, keeping the first (highest-ranked) row;
    # the DataFrame shape is printed before and after so the drop is visible
    print(df.shape)
    df.drop_duplicates(subset=['url'], keep='first', inplace=True)
    print(df.shape)
    df.to_csv(csv_output, index=False)


def strip_weights(token):
    """Extract the weight from a keyword as read from the file.

    A weighted keyword has the form "keyword|weight"; return the keyword and
    its assigned weight if any, otherwise the default weight of one.
    """
    try:
        weighted_token = token.split("|", 1)[0].strip()
        token_weight = token.split("|", 1)[1]
    except IndexError:  # no "|" separator, so no weight was given
        weighted_token = token.strip()
        token_weight = 1
    return weighted_token, token_weight


def get_file_content_as_list(file_name):
    """Given a filename, open it and read the contents into a list.

    file_name - file to be opened
    return list of words
    """
    with open(file_name, 'r') as file_name_handle:
        return file_name_handle.read().splitlines()


def count_keywords(list_of_tokens, list_of_boost_terms, list_of_target_words):
    """Counts how many instances of the keywords were found.

    list_of_tokens - the list of page lines to search (the haystack)
    list_of_boost_terms - the list of broader terms to check for after the
        keyword search, e.g. if words like "program" or "academic" appear,
        boost this page further
    list_of_target_words - the list of keywords to look for (the needle)

    return the total weighted keyword count, the list of keywords found with
    their weighted counts, and the boost-term count

    Inspiration: http://www.cademuir.eu/blog/2011/10/20/python-searching-for-a-string-within-a-list-list-comprehension/
    https://developmentality.wordpress.com/2011/09/22/python-gotcha-word-boundaries-in-regular-expressions/
    """
    num_target_words = 0
    total_weights_sum = 0
    num_target_terms = 0
    matched_words = []
    for token in list_of_target_words:  # go through the keywords in the list
        weighted_token, token_weight = strip_weights(token)
        # regex = re.compile(".*({}).*".format(token))  # would also match in-word substrings
        regex = re.compile(".*(\\b{}\\b).*".format(weighted_token))  # match strictly whole words only
        # found_what = [m.group(0) for l in list_of_target_words for m in [regex.search(l)] if m]
        found_what = [m.group(1) for l in list_of_tokens for m in [regex.search(l)] if m]
        if len(found_what) > 0:  # the keyword appears somewhere in the page
            keyword_count = len(found_what) * int(token_weight)
            num_target_words += keyword_count
            total_weights_sum = total_weights_sum + int(token_weight)
            matched_words.append((weighted_token, keyword_count))
    if total_weights_sum > len(found_what):  # decide whether to also count the broader boost terms
        num_target_terms, matched_terms = relevancy_boost(list_of_tokens, list_of_boost_terms)
        # print(num_target_terms, matched_terms)
    return num_target_words, matched_words, num_target_terms  # Note that we are returning a tuple (3 values)


def relevancy_boost(list_of_tokens, boost_terms):
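    """Count occurrences of the broader boost terms in the page.

    list_of_tokens - the list of page lines to search
    boost_terms - the list of broader terms, optionally weighted

    return the total number of boost-term matches and the list of
    (term, weighted count) tuples found
    """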
    num_target_words = 0
    total_terms_count = 0
    matched_words = []
    for term in boost_terms:  # go through the boost terms in the list
        weighted_term, term_weight = strip_weights(term)
        # regex = re.compile(".*({}).*".format(term))  # would also match in-word substrings
        regex = re.compile(".*(\\b{}\\b).*".format(weighted_term))  # match strictly whole words only
        # found_what = [m.group(0) for l in list_of_target_words for m in [regex.search(l)] if m]
        found_what = [m.group(1) for l in list_of_tokens for m in [regex.search(l)] if m]
        if len(found_what) > 0:  # the term appears somewhere in the page
            num_target_words = len(found_what) * int(term_weight)
            total_terms_count = total_terms_count + len(found_what)
            matched_words.append((weighted_term, num_target_words))
    return total_terms_count, matched_words  # Note that we are returning a tuple (2 values)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Generate a sorted CSV file with keyword frequencies'
                    ' from scraped web pages.'
    )
    parser.add_argument(
        '-f',
        '--folder',
        dest='folder_name',
        default=None,
        required=True,
        help='Name of directory with scraped pages (mandatory)'
    )
    parser.add_argument(
        '-k',
        '--keywords_file',
        dest='keywords_file',
        default=None,
        required=True,
        help='File with keywords to search for in the directory (mandatory)'
    )
    parser.add_argument(
        '-p',
        '--patience',
        dest='patience',
        default=30,
        required=False,
        help="Number of seconds you can give per page search. Life is too"
             " short to parse unabridged web pages. Default is 30. Bye"
    )

    # these are module-level global variables and can be accessed by any
    # function in this module
    args = parser.parse_args()
    folder_name = args.folder_name
    keywords_file = args.keywords_file
    patience = int(args.patience)

    # the output files of all observed keyword frequencies
    csv_file_name = "{}_results.csv".format(folder_name)
    sorted_csv_file_name = "{}_results_sorted.csv".format(folder_name)

    try:
        main()
    except KeyboardInterrupt:
        # on Ctrl-C, still sort whatever rows have been written so far
        logger.info("Script interrupted by user")
        sort_csv(csv_file_name, sorted_csv_file_name)
        try:
            sys.exit(0)
        except SystemExit:
            os._exit(0)