# TRPO_A2C.py
# -*- coding: utf-8 -*-
"""
Created on Sat Jul 16 12:55:20 2022
@author: Abhilash
"""
import random
from collections import deque
import numpy as np
import tensorflow as tf
from tensorflow.keras.activations import softmax
from tensorflow.keras.layers import Input, Dense, Concatenate
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam
import threading
from utils import Portfolio
import tensorflow.keras.backend as k
# Tensorflow GPU configuration
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.compat.v1.Session(config=config)
tf.compat.v1.keras.backend.set_session(sess)
tf.compat.v1.disable_eager_execution()


class ActorNetwork:
    def __init__(self, sess, state_size, action_dim, buffer_size, tau, learning_rate, is_eval=False, model_name=""):
        self.sess = sess
        self.tau = tau
        self.learning_rate = learning_rate
        self.action_dim = action_dim
        self.model, self.states = self.create_actor_network(state_size, action_dim)
        self.model_target, self.target_state = self.create_actor_network(state_size, action_dim)
        self.model_target.set_weights(self.model.get_weights())

        # Sampled policy gradient via the chain rule: ∂J/∂θ ≈ (1/buffer_size) · ∂Q(s,a)/∂a · ∂a/∂θ,
        # where ∂Q(s,a)/∂a is fed in through the action_gradient placeholder. The minus sign
        # turns gradient ascent on Q into gradient descent for the optimizer.
        self.action_gradient = tf.compat.v1.placeholder(tf.float32, [None, action_dim])
        self.sampled_policy_grad = tf.gradients(self.model.output / buffer_size,
                                                self.model.trainable_weights,
                                                -self.action_gradient)
        optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate)
        self.update_actor_policy = optimizer.apply_gradients(
            zip(self.sampled_policy_grad, self.model.trainable_weights))
        # Initialize only the optimizer's slot variables; the Keras model weights are
        # handled by the Keras session and must not be re-initialized here.
        self.sess.run(tf.compat.v1.variables_initializer(optimizer.variables()))

    def train(self, states_batch, action_grads_batch):
        # Run the policy-gradient update, feeding the states and the critic's action gradients.
        self.sess.run(self.update_actor_policy,
                      feed_dict={self.states: states_batch,
                                 self.action_gradient: action_grads_batch})

    def transfer_target(self):
        # Soft (Polyak) update of the target actor: θ' ← τ·θ + (1 − τ)·θ'
        actor_weights = self.model.get_weights()
        actor_target_weights = self.model_target.get_weights()
        for i in range(len(actor_weights)):
            actor_target_weights[i] = self.tau * actor_weights[i] + (1 - self.tau) * actor_target_weights[i]
        self.model_target.set_weights(actor_target_weights)

    def create_actor_network(self, state_size, action_dim):
        states = Input(shape=[state_size])
        h0 = Dense(24, activation='relu')(states)
        h1 = Dense(48, activation='relu')(h0)
        h2 = Dense(24, activation='relu')(h1)
        actions = Dense(action_dim, activation='softmax')(h2)
        model = Model(inputs=states, outputs=actions)
        return model, states
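

# Illustrative sketch (not called by the agent): transfer_target() above and
# train_target() in the critic below perform a soft (Polyak) target update,
# theta_target <- tau * theta + (1 - tau) * theta_target. The helper here is a
# hypothetical, self-contained NumPy version of that blend, assuming the weights
# are a list of arrays as returned by Keras get_weights().
def _soft_update_example(online_weights, target_weights, tau=0.001):
    """Return tau-blended copies of target_weights (illustration only)."""
    return [tau * w + (1.0 - tau) * w_t for w, w_t in zip(online_weights, target_weights)]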


class CriticNetwork:
    def __init__(self, sess, state_size, action_dim, tau, learning_rate, is_eval=False, model_name=""):
        self.sess = sess
        self.tau = tau
        self.learning_rate = learning_rate
        self.action_dim = action_dim
        self.model, self.actions, self.states = self.create_critic_network(state_size, action_dim)
        self.model_target, self.target_action, self.target_state = self.create_critic_network(state_size, action_dim)
        # ∂Q(s,a)/∂a, fed back to the actor as the sampled policy gradient.
        self.action_grads = tf.gradients(self.model.output, self.actions)

    def gradients(self, states_batch, actions_batch):
        return self.sess.run(self.action_grads,
                             feed_dict={self.states: states_batch, self.actions: actions_batch})[0]

    def train_target(self):
        # Soft (Polyak) update of the target critic: θ' ← τ·θ + (1 − τ)·θ'
        critic_weights = self.model.get_weights()
        critic_target_weights = self.model_target.get_weights()
        for i in range(len(critic_weights)):
            critic_target_weights[i] = self.tau * critic_weights[i] + (1 - self.tau) * critic_target_weights[i]
        self.model_target.set_weights(critic_target_weights)

    def create_critic_network(self, state_size, action_dim):
        states = Input(shape=[state_size])
        actions = Input(shape=[action_dim])
        h0 = Concatenate()([states, actions])
        h1 = Dense(24, activation='relu')(h0)
        h2 = Dense(48, activation='relu')(h1)
        h3 = Dense(24, activation='relu')(h2)
        Q = Dense(action_dim, activation='relu')(h3)
        model = Model(inputs=[states, actions], outputs=Q)

        def trpo_ppo_clip_loss(y_true, y_pred):
            # PPO-style clipped surrogate: the ratio r of summed predictions to summed
            # targets is clipped to [1 - clip_loss, 1 + clip_loss] before weighting the advantage.
            entropy = 2e-5
            clip_loss = 0.2
            old_log = k.sum(y_true)
            pred_log = k.sum(y_pred)
            r = pred_log / (old_log + 1e-9)
            advantage = pred_log - old_log
            p1 = r * advantage
            p2 = k.clip(r, min_value=1 - clip_loss, max_value=1 + clip_loss) * advantage
            prob = 1e-2
            loss = -k.mean(k.minimum(p1, p2) + entropy * (-(prob * k.log(prob + 1e-10))))
            return loss

        def trpo_ppo_penalty_loss(y_true, y_pred):
            # Penalty variant: the advantage term is replaced by a KL-divergence-style
            # penalty between the summed targets and the summed predictions.
            entropy = 2e-5
            clip_loss = 0.2
            old_log = k.sum(y_true)
            pred_log = k.sum(y_pred)
            r = pred_log / (old_log + 1e-9)
            kl_divergence = k.sum(old_log * k.log(old_log / pred_log))
            advantage = kl_divergence
            p1 = r * advantage
            p2 = k.clip(r, min_value=1 - clip_loss, max_value=1 + clip_loss) * advantage
            prob = 1e-2
            loss = -k.mean(k.minimum(p1, p2) + entropy * (-(prob * k.log(prob + 1e-10))))
            return loss

        model.compile(loss=trpo_ppo_penalty_loss, optimizer=Adam(learning_rate=self.learning_rate, decay=1e-6))
        return model, actions, states
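

# Illustrative sketch (not called by the agent): the clipped-surrogate rule behind
# trpo_ppo_clip_loss above. With a ratio r and an advantage A, PPO-style clipping
# takes min(r * A, clip(r, 1 - eps, 1 + eps) * A), so a single update cannot push the
# ratio outside the trust region [1 - eps, 1 + eps]. This scalar helper is a
# hypothetical NumPy rendering of that rule, not the exact Keras loss used above.
def _clipped_surrogate_example(old_sum, pred_sum, eps=0.2):
    """Scalar version of the clipped surrogate in the critic loss (illustration only)."""
    r = pred_sum / (old_sum + 1e-9)   # probability-ratio proxy used by the loss
    advantage = pred_sum - old_sum    # advantage proxy used by the clip variant
    clipped_r = np.clip(r, 1 - eps, 1 + eps)
    return min(r * advantage, clipped_r * advantage)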


class TRPO_A2C_Agent(Portfolio):
    def __init__(self, state_dim, balance, is_eval=False):
        super().__init__(balance=balance)
        self.model_type = 'TRPO_A2C'
        self.state_dim = state_dim
        self.action_dim = 3  # hold, buy, sell
        self.memory = deque(maxlen=100)
        self.buffer_size = 90
        self.gamma = 0.95  # discount factor
        self.is_eval = is_eval
        tau = 0.001  # soft-update rate for the target networks
        learning_rate_actor = 0.001  # learning rate for the actor network
        learning_rate_critic = 0.001  # learning rate for the critic network
        model_name = "AC"
        self.actor = ActorNetwork(sess, state_dim, self.action_dim, self.buffer_size, tau, learning_rate_actor, is_eval, model_name)
        self.critic = CriticNetwork(sess, state_dim, self.action_dim, tau, learning_rate_critic)
        self.tensorboard = tf.keras.callbacks.TensorBoard(log_dir='./logs/TRPO_A2C_tensorboard', update_freq=90)
        self.tensorboard.set_model(self.critic.model)

    def reset(self):
        self.reset_portfolio()

    def remember(self, state, actions, reward, next_state, done):
        self.memory.append((state, actions, reward, next_state, done))

    def act(self, state, t):
        actions = self.actor.model.predict(state)[0]
        print("Action", actions)
        return actions

    def experience_replay(self):
        # Sample a buffer_size-long mini-batch from memory.
        mini_batch = random.sample(self.memory, self.buffer_size)
        y_batch = []
        for state, actions, reward, next_state, done in mini_batch:
            if not done:
                # Bootstrapped target: r + gamma * Q_target(s', pi_target(s'))
                Q_target_value = self.critic.model_target.predict(
                    [next_state, self.actor.model_target.predict(next_state)])
                y = reward + self.gamma * Q_target_value
            else:
                y = reward * np.ones((1, self.action_dim))
            y_batch.append(y)
        y_batch = np.vstack(y_batch)
        states_batch = np.vstack([tup[0] for tup in mini_batch])   # batch_size * state_dim
        actions_batch = np.vstack([tup[1] for tup in mini_batch])  # batch_size * action_dim

        lock = threading.Lock()
        # lock.acquire()
        # Update the critic by minimizing the TRPO/PPO surrogate loss.
        loss = self.critic.model.train_on_batch([states_batch, actions_batch], y_batch)
        print("Critic Loss", loss)
        # lock.release()

        # Update the actor using the sampled policy gradients ∂Q(s,a)/∂a from the critic.
        action_grads_batch = self.critic.gradients(states_batch, self.actor.model.predict(states_batch))  # batch_size * action_dim
        self.actor.train(states_batch, action_grads_batch)

        # Soft-update the target actor network.
        self.actor.transfer_target()
        print("Transferred weights to the target actor")
        # self.critic.train_target()
        return loss
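

# Minimal usage sketch, guarded so it only runs when this file is executed directly.
# It assumes states are row vectors of shape (1, state_dim) and substitutes random
# numbers for real market features and rewards; the actual project presumably derives
# both from the Portfolio logic in utils.py. The feature size and starting balance
# below are hypothetical.
if __name__ == "__main__":
    state_dim = 10
    agent = TRPO_A2C_Agent(state_dim, balance=50000)
    for t in range(agent.buffer_size + 1):
        state = np.random.rand(1, state_dim).astype(np.float32)
        next_state = np.random.rand(1, state_dim).astype(np.float32)
        actions = agent.act(state, t)
        reward = float(np.random.randn())  # stand-in reward
        agent.remember(state, actions, reward, next_state, False)
    # With at least buffer_size transitions stored, one replay step trains the critic,
    # updates the actor with the sampled policy gradient, and soft-updates the target actor.
    critic_loss = agent.experience_replay()
    print("example critic loss:", critic_loss)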