-
Notifications
You must be signed in to change notification settings - Fork 0
/
Agents.py
203 lines (168 loc) · 6.87 KB
/
Agents.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
"""
MABP:- Creating simulated data for Multi Armed Bandit problem (N armed TestBed)
"""
import numpy as np
from collections import defaultdict
from typing import List
from numpy.core.fromnumeric import var
from seaborn.categorical import factorplot
flag = 0
class Bandit:
def __init__(self, action_id, mean):
"""
Constructor for a single bandit
Args:
action_id : unique id of an action
mean: True mean of the reward distribution
"""
self.id = action_id
self.mean = mean
def pull_arm(self):
"""
function returning observed rewards after pressing lever
with mean = self.mean and sigma = 1
"""
return np.random.normal(loc = 0, scale = 1)+self.mean
class Logging:
"""
class for implementing
"""
def __init__(self):
self.total_actions = 0
self.total_rewards = 0
self.all_rewards = []
self.all_actions = []
self.record = defaultdict(lambda: dict(actions=0, reward=0))
def record_action(self, bandit, reward):
self.total_actions += 1
self.total_rewards += reward
self.all_rewards.append(reward)
self.all_actions.append(bandit.id)
self.record[bandit.id]['actions'] += 1
self.record[bandit.id]['reward'] += reward
def __getitem__(self, bandit):
return self.record[bandit.id]
class EpsilonGreedyAgent:
def __init__(self, bandits: List[Bandit], epsilon:float = None):
"""
Constructor for Epsilon Greedy Agent on 10 armed test bed
"""
self.bandits = bandits
self.epsilon = epsilon
# mu_pri and var_pri are hypermaterers for prior distribution
self.mu_pri = np.zeros(len(self.bandits))
self.var_pri = np.ones(len(self.bandits))*5
# mu and var are hyperparameters for posterior distribution
self.mu = self.mu_pri
self.var = self.var_pri
self.var0 = 1 # 1 is taken but any can be taken no prob upto a limit constant of inintial distribution
self.logging = Logging()
def _get_random_bandit(self)-> Bandit:
"""
random choice of bandits
"""
return np.random.choice(self.bandits)
def _get_max_estimated_bandit(self)->Bandit:
"""
this function is used to estimate model based on mean/mode
"""
# print("mus - ", self.mu)
# print("actions - ", np.argmax(self.mu))
unique, counts = np.unique(self.mu, return_counts=True)
lens = counts[np.argmax(unique)]
if lens>1: # if two actions have same argmax
# then return arbitrarily from those max ones
maxs = list(np.array(self.bandits)[self.mu==unique[np.argmax(unique)]])
return np.random.choice(maxs)
# otherwise return the max one
return self.bandits[np.argmax(self.mu)]
def _update(self, bandit):
"""
returning maximum one using sam[ple averaging method
"""
bandit_logs = self.logging[bandit]
bandit = bandit.id
estimate = bandit_logs['reward'] / bandit_logs['actions'] # if not assigned
actions = bandit_logs['actions']
self.mu[bandit] = (self.mu_pri[bandit]/self.var_pri[bandit] + actions*estimate/self.var0)/(actions/self.var0 + 1/self.var_pri[bandit])
self.var[bandit] = 1/(actions/self.var0 + 1/self.var[bandit])
def _choose_bandit(self)->Bandit:
epsilon = self.epsilon
global flag
p = np.random.uniform(0, 1, 1)
if p < epsilon or (flag==0 and epsilon==0):
flag = 1
bandit = self._get_random_bandit()
else:
bandit = self._get_max_estimated_bandit()
return bandit
def action(self):
current_bandit = self._choose_bandit()
reward = current_bandit.pull_arm()
self.logging.record_action(current_bandit, reward)
self._update(current_bandit)
def actions(self, timesteps):
for _ in range(timesteps):
self.action()
return self.logging.all_rewards, self.logging.all_actions
class ThomspsonSamplingAgent:
def __init__(self, bandits: List[Bandit]):
"""
Constructor for Thompson Sampling Agent on 10 armed test bed
"""
self.bandits = bandits
# mu_pri and var_pri are hypermaterers for prior distribution
self.mu_pri = np.zeros(len(self.bandits)) # 5 for each
self.var_pri = np.ones(len(self.bandits))*5
# mu and var are hyperparameters for posterior distribution
self.mu = self.mu_pri
self.var = self.var_pri
self.var0 = 1 # 1 is taken but any can be taken no prob upto a limit constant of inintial distribution
self.logging = Logging()
def _get_max_sampled_bandit(self)->Bandit:
"""
this function is used to estimate model based on mean/mode
"""
estimates = []
for bandit in self.bandits:
estimates.append(np.random.normal(loc =self.mu[bandit.id], scale = self.var[bandit.id]))
return self.bandits[np.argmax(estimates)]
def _update(self, bandit):
"""
returning maximum one using sam[ple averaging method
"""
bandit_logs = self.logging[bandit]
bandit = bandit.id
if not bandit_logs['actions']:
estimate = 0 # if not taken till now then 0 is assigned
actions = 0
else:
estimate = bandit_logs['reward'] / bandit_logs['actions'] # if not assigned
actions = bandit_logs['actions']
self.mu[bandit] = (self.mu_pri[bandit]/self.var_pri[bandit] + actions*estimate/self.var0)/(actions/self.var0 + 1/self.var_pri[bandit])
self.var[bandit] = 1/(actions/self.var0 + 1/self.var[bandit])
def _choose_bandit(self)->Bandit:
bandit = self._get_max_sampled_bandit()
return bandit
def action(self):
current_bandit = self._choose_bandit()
reward = current_bandit.pull_arm()
self.logging.record_action(current_bandit, reward)
self._update(current_bandit)
def actions(self, timesteps):
for _ in range(timesteps):
self.action()
return self.logging.all_rewards, self.logging.all_actions
class OTS(ThomspsonSamplingAgent):
"""Optimistic Thompson Sampling Agent
Subclassing Thompson Sampling Agent only for _changing _get_max_sampled_bandit function"""
def _get_max_sampled_bandit(self)->Bandit:
"""
this function is used to estimate model based on mean/mode
"""
estimates = []
for bandit in self.bandits:
Qth = np.random.normal(loc =self.mu[bandit.id], scale = self.var[bandit.id])
f_hat = self.mu[bandit.id]#computing moving_average here
estimates.append(max(Qth, f_hat))
return self.bandits[np.argmax(estimates)]