-
Notifications
You must be signed in to change notification settings - Fork 0
/
ImageSearch.py
334 lines (270 loc) · 12.5 KB
/
ImageSearch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
import json
import math
import os
import re
import urllib

import requests

import config
from utils.media_sender import ImageSender
class EvaluateChoose(object):
    """Evaluate the saved models on validation data and pick the most accurate.

    Evaluation results are cached in ``results_of_models.json`` inside
    ``configuration_path`` so the models are not re-evaluated on every run.
    """

    def __init__(self, models_path, configuration_path):
        """Remember where the models live and where the results cache is kept."""
        self.models_path = models_path
        self.configuration_path = configuration_path

    def evaluation(self, validation_data, validation_labels):
        """Return ``(model_name, accuracy, loss)`` tuples for the saved models.

        If the cache file exists its content is returned and the validation
        arguments are not used. Otherwise the entries of ``models_path`` are
        loaded and evaluated, and the results are written to the cache file.

        Accuracy and loss are always floats, whether freshly computed or read
        from the cache, so callers can compare them numerically.
        """
        results_path = os.path.join(self.configuration_path, 'results_of_models.json')
        if os.path.exists(results_path):
            with open(results_path) as results_file:
                cached = json.load(results_file)['results']
            # float() accepts both numbers and the string form used by older
            # cache files.
            results = [(entry[0], float(entry[1]), float(entry[2])) for entry in cached]
            print('Results were loaded from {}!'.format(self.configuration_path))
        else:
            results = []
            # NOTE(review): the last three directory entries are skipped, but
            # os.listdir() returns entries in arbitrary order - confirm which
            # entries are meant to be excluded.
            for model_file in os.listdir(self.models_path)[:-3]:
                print('{} model is loaded from the {} folder.'.format(model_file, self.models_path))
                model = tensorflow.keras.models.load_model(
                    os.path.join(self.models_path, model_file))
                # Assumes evaluate() yields exactly (loss, accuracy), i.e. the
                # models were compiled with a single metric - TODO confirm.
                loss, acc = model.evaluate(validation_data, validation_labels, verbose=2)
                results.append((model_file, float(acc), float(loss)))
            with open(results_path, 'w') as results_file:
                json.dump({'results': results}, results_file)
            print('Results were saved to {}!'.format(self.configuration_path))
        return results

    def choose_max(self, validation_data, validation_labels):
        """Return the ``(model_name, accuracy, loss)`` tuple with the highest accuracy.

        Raises ``ValueError`` (from ``max``) when there are no results at all.
        """
        results = self.evaluation(validation_data, validation_labels)
        max_result = max(results, key=lambda item: item[1])
        print('The best accuracy was obtained from the {}!'.format(max_result[0]))
        return max_result

    def split_return_aug_types(self, model_name):
        """Extract the augmentation types encoded in a model file name.

        The last three characters of ``model_name`` and the leading
        ``len('model_')`` characters are dropped. What remains is returned as:

        * ``[]`` if it contains ``'without_augmentation'``;
        * a single-element list if it contains ``'together'``;
        * otherwise the list of its ``'_'``-separated parts.
        """
        result = model_name[:-3][len('model_'):]
        if 'together' in result:
            return [result]
        if 'without_augmentation' in result:
            return []
        # str.split returns [result] when there is no '_' to split on, so the
        # single-type case needs no separate branch.
        return result.split('_')
def getwords(doc):
    """Return the distinct words of *doc* as a ``{word: 1}`` dict.

    The text is split on runs of non-word characters. Only pieces longer than
    2 and shorter than 20 characters are kept, and they are lower-cased.
    """
    # '\W+' rather than '\W*': a pattern that can match the empty string makes
    # re.split() cut between every character on Python 3.7+, which left no
    # piece long enough to pass the length filter.
    splitter = re.compile(r'\W+')
    return {
        piece.lower(): 1
        for piece in splitter.split(doc)
        if 2 < len(piece) < 20
    }
class BingViews():
    """Chat view that answers image requests with the first Bing image hit."""

    def __init__(self, interface_layer):
        """Create the image sender and register the message route.

        ``routes`` pairs a regular expression with its handler; the named
        group ``term`` carries the search text passed to the handler.
        """
        self.image_sender = ImageSender(interface_layer)
        self.routes = [
            # Raw string: same pattern text as before, without relying on the
            # invalid '\s' string escape.
            (r"/i(mage)?\s(?P<term>[^$]+)$", self.bing_image_search)
        ]

    def bing_image_search(self, message, match):
        """Search Bing for ``match.group("term")`` and send the first image found.

        The image is sent to ``message.getFrom()``. Nothing is sent when the
        search returns no results.
        """
        # The term comes from a chat message, so it is passed through `params`
        # to be percent-encoded instead of being pasted into the URL, where
        # characters such as '&' or '#' would alter the query.
        response = requests.get(
            "https://api.datamarket.azure.com/Bing/Search/v1/Image?$format=json&$top=1",
            params={"Query": "'{}'".format(match.group("term"))},
            auth=("", config.bing_api),
        )
        results = response.json()['d']['results']
        if not results:
            # No hit for this term: nothing to send.
            return
        image_url = urllib.unquote(results[0]['MediaUrl'].encode('utf-8'))
        self.image_sender.send_by_url(jid=message.getFrom(), file_url=image_url)
def weightedprob(self, f, cat, prf, weight=1.0, ap=0.5):
    """Blend a measured probability with an assumed one.

    ``prf(f, cat)`` supplies the measured probability. It is averaged with
    the assumed probability ``ap``, where ``ap`` counts as ``weight``
    observations and the measured value counts as the number of times ``f``
    was seen across all categories.

    NOTE(review): relies on ``self.fcount`` and ``self.categories``, which are
    not defined in this file - presumably provided by a classifier class.
    """
    measured = prf(f, cat)
    # How often the feature has appeared, summed over every category.
    appearances = 0
    for category in self.categories():
        appearances += self.fcount(f, category)
    return ((weight * ap) + (appearances * measured)) / (weight + appearances)
def sampletrain(cl):
    """Feed a fixed set of five labelled example sentences to ``cl.train``."""
    samples = (
        ('Nobody owns the water.', 'good'),
        ('the quick rabbit jumps fences', 'good'),
        ('buy pharmaceuticals now', 'bad'),
        ('make quick money at the online casino', 'bad'),
        ('the quick brown fox jumps', 'good'),
    )
    for text, label in samples:
        cl.train(text, label)
class naivebayes(classifier):
    """Classifier that picks the category with the highest ``self.prob``.

    NOTE(review): ``classifier``, ``self.categories`` and ``self.prob`` are
    not defined in this file; they are presumably supplied by the base class.
    """

    def __init__(self, getfeatures):
        """Initialise the base classifier and start with no thresholds set."""
        classifier.__init__(self, getfeatures)
        self.thresholds = {}

    def setthreshold(self, cat, t):
        """Set the threshold factor used for category ``cat``."""
        self.thresholds[cat] = t

    def getthreshold(self, cat):
        """Return the threshold for ``cat``, or 1.0 if none was set."""
        return self.thresholds.get(cat, 1.0)

    def classify(self, item, default=None):
        """Return the category with the highest probability for ``item``.

        On ties the category visited last wins. ``default`` is returned when
        there are no categories to choose from.
        """
        # Starting from `default` keeps `best` bound even when the loop body
        # never runs.
        best = default
        best_prob = 0
        for cat in self.categories():
            cat_prob = self.prob(item, cat)
            if cat_prob >= best_prob:
                best_prob = cat_prob
                best = cat
        return best

    def classify_with_thresholds(self, item, default=None):
        """Return the most probable category only if it wins by its threshold.

        The winner's probability must be at least ``getthreshold(winner)``
        times the probability of every other category; otherwise ``default``
        is returned. ``default`` is also returned when no category has a
        probability above zero.
        """
        probs = {}
        best = None
        best_prob = 0.0
        for cat in self.categories():
            probs[cat] = self.prob(item, cat)
            if probs[cat] > best_prob:
                best_prob = probs[cat]
                best = cat
        if best is None:
            # Nothing scored above zero, so there is no winner to validate.
            return default
        # Make sure the winner exceeds threshold * every other probability.
        threshold = self.getthreshold(best)
        for cat in probs:
            if cat == best:
                continue
            if probs[cat] * threshold > probs[best]:
                return default
        return best
class Member:
    """Plain record for a member: name, profile URL and publications."""

    def __init__(self, name, profile_url, publications):
        # Store the three constructor arguments unchanged.
        self.name, self.profile_url, self.publications = (
            name, profile_url, publications)
class IMGpro:
    """Plain record holding a title, a summary and a PI name."""

    def __init__(self, title, summary, PI_name):
        # Store the three constructor arguments unchanged.
        self.title, self.summary, self.PI_name = title, summary, PI_name
# Build the main window, attach the application object and start the GUI loop.
# NOTE(review): `Tk` and `PI_Estimator` are neither imported nor defined in
# this file - presumably tkinter's Tk and a project GUI class; confirm.
root = Tk()
root.geometry("1050x500")  # window size string, "1050x500"
root.title("PI Estimator Tool for SEHIR CS Projects")
# `app` is read as a module-level global by Predictor.fetch_members
# (app.entry_url_people), so it must stay at module scope.
app = PI_Estimator(root)
# Assuming tkinter semantics, this call blocks until the window is closed,
# so the statements after it only run afterwards - TODO confirm.
root.mainloop()
# Load the validation set (requested with an empty augmentation list).
data_validation, labels_validation = rd_validation.load_augmented_dataset([])
validation_dataset = (data_validation, labels_validation)

# Evaluate the saved models, keep the most accurate one and derive the
# augmentation types from its file name.
ev_choose = EvaluateChoose(models_path, configuration_path)
maximum_result = ev_choose.choose_max(data_validation, labels_validation)
augmentation_of_max = ev_choose.split_return_aug_types(maximum_result[0])

# Transfer-learning configurations, each trained for `epochs` epochs.
# Every value is [defrosted, weights, trainable], as passed to TransferModel:
#   pre_trained_frozen   -> frozen pretrained      (False, 'imagenet', False)
#   unfreezed_pretrained -> defrosted pretrained   (True,  'imagenet', True)
#   non_pretrained       -> non pretrained         (False, None,       True)
models = {
    'pre_trained_frozen': [False, 'imagenet', False],
    'unfreezed_pretrained': [True, 'imagenet', True],
    'non_pretrained': [False, None, True],
}
epochs = 100

print('VGG16 will be trained in {} combinations on the following dataset: {}.'.format(len(models), augmentation_of_max))
train_data_raw, train_labels_raw = rd_train.load_augmented_dataset(augmentation_of_max)

for each, model_params in models.items():
    print('Transfer Learning model has chosen as {} version of ResNet'.format(each))
    print('Model parameters will be: Unfreezed: {}, Weights: {}, Trainable: {}'.format(
        model_params[0], model_params[1], model_params[2]))

    transfer_learning_model = TransferModel(
        epochs,
        input_shape,
        models_path,
        defrosted=model_params[0],
        weights=model_params[1],
        trainable=model_params[2],
    )
    history = transfer_learning_model.model(
        train_data_raw, train_labels_raw, validation_dataset, model_name=each)

    # One accuracy plot and one loss plot per configuration.
    path_image_accuracy = os.path.join(results_path, each + '_accuracy.png')
    path_image_loss = os.path.join(results_path, each + '_loss.png')
    plot_images_results(path_image_accuracy, history, 'accuracy')
    plot_images_results(path_image_loss, history, 'loss')
class fisherclassifier(classifier):
    """Classifier that combines per-feature probabilities with Fisher's method.

    NOTE(review): ``classifier``, ``self.fprob``, ``self.categories``,
    ``self.getfeatures`` and ``self.weightedprob`` are not defined on this
    class; they are presumably supplied by the base class.
    """

    def __init__(self, getfeatures):
        """Initialise the base classifier and start with no minimums set."""
        classifier.__init__(self, getfeatures)
        self.minimums = {}

    def cprob(self, f, cat):
        """Return ``fprob(f, cat)`` divided by the sum of ``fprob`` over all categories.

        Returns 0 when the feature's frequency in ``cat`` is 0.
        """
        # The frequency of this feature in this category.
        clf = self.fprob(f, cat)
        if clf == 0:
            return 0
        # The frequency of this feature in all the categories.
        freqsum = sum(self.fprob(f, c) for c in self.categories())
        return clf / freqsum

    def setminimum(self, cat, min):
        """Set the minimum score ``cat`` must exceed to be chosen by ``classify``."""
        self.minimums[cat] = min

    def getminimum(self, cat):
        """Return the minimum for ``cat``, or 0 if none was set."""
        return self.minimums.get(cat, 0)

    def fisherprob(self, item, cat):
        """Return the combined probability score of ``item`` for ``cat``.

        The weighted probabilities of all features are combined as
        ``-2 * ln(product)`` and passed to ``invchi2`` with twice the number
        of features as degrees of freedom.
        """
        features = self.getfeatures(item)
        # Sum the logs instead of taking the log of the product: the product
        # of many probabilities underflows to 0.0, and math.log(0.0) raises
        # ValueError.
        log_product = 0.0
        for f in features:
            log_product += math.log(self.weightedprob(f, cat, self.cprob))
        fscore = -2 * log_product
        # Use the inverse chi2 function to get a probability.
        return self.invchi2(fscore, len(features) * 2)

    def invchi2(self, chi, df):
        """Return the inverse chi-square value for ``chi`` with ``df`` degrees, capped at 1.0."""
        m = chi / 2.0
        total = term = math.exp(-m)
        for i in range(1, df // 2):
            term *= m / i
            total += term
        return min(total, 1.0)

    def classify(self, item, default=None):
        """Return the category with the highest ``fisherprob`` above its minimum.

        ``default`` is returned when no category scores above both its
        minimum and 0.0.
        """
        best = default
        best_prob = 0.0
        for c in self.categories():
            p = self.fisherprob(item, c)
            # Make sure it exceeds its minimum as well as the best so far.
            if p > self.getminimum(c) and p > best_prob:
                best = c
                best_prob = p
        return best
class Predictor:  # Predictor class for collecting data, training and creating classifier and make prediction
    """Collects faculty members and their publications from web pages.

    NOTE(review): ``urllib2`` and ``BeautifulSoup`` are not imported in this
    file, and ``fetch_members`` reads the module-level global ``app``.
    """

    def __init__(self):
        """Start with an empty classifier placeholder and empty databases."""
        self.classifier = ""
        self.faculty_members = {}  # member name -> Member
        self.projects = {}

    @staticmethod
    def _read_url(url):
        """Download ``url`` and return its body, always closing the connection."""
        page = urllib2.urlopen(url)
        try:
            return page.read()
        finally:
            page.close()

    @staticmethod
    def _clean_publication(raw_text):
        """Strip the leading marker, citation suffix and newlines from one entry."""
        # Drop surrounding whitespace, the first four characters and any
        # newlines that follow them.
        entry = raw_text.strip()[4:].lstrip('\n')
        # NOTE(review): 19 and 23 characters are cut, which is more than the
        # length of the matched suffixes - presumably tuned to the page's
        # markup; verify against a live page.
        if entry.endswith("[1\n Citation]"):
            entry = entry[:-19]
        elif entry.endswith("\n \n Citations]"):
            entry = entry[:-23]
        return entry.rstrip('\n')

    def fetch_members(self):  # collects the links to the members profile pages from the first link
        """Return the profile links found on the page named in the GUI entry.

        Every third ``href`` inside elements of class ``member`` is kept and
        prefixed with ``"ht"``.
        """
        url = app.entry_url_people.get()
        soup = BeautifulSoup(self._read_url(url), 'html.parser')
        hrefs = []
        for item in soup.find_all(class_="member"):
            for tag in item.find_all('a'):
                hrefs.append(tag.get('href'))
        return ["ht" + href for href in hrefs[::3]]

    def fetch_publications(self):  # goes to each members profile page and collects all the necessary data
        """Visit every member page and record the member in ``faculty_members``.

        The member name is built from the first and last word of the first
        ``<h3>``. Publications are the cleaned ``<li>`` texts inside elements
        of class ``tab-pane active pubs``. A name that is already present in
        ``faculty_members`` is not overwritten. Returns ``None``.
        """
        for member_url in self.fetch_members():
            soup = BeautifulSoup(self._read_url(member_url), 'html.parser')
            name_parts = soup.find_all('h3')[0].text.split()
            name = name_parts[0] + " " + name_parts[-1]
            publications = []
            for pane in soup.find_all(class_="tab-pane active pubs"):
                for tag in pane.find_all("li"):
                    publications.append(self._clean_publication(tag.text))
            # `Member` is the record class this file defines with exactly this
            # signature; the previously used `FacultyMember` is not defined
            # anywhere in the file.
            member = Member(name, member_url, publications)
            self.faculty_members.setdefault(name, member)  # adding each member to the database