From 5dc50ebbb5d67299704f2f1e8e810d68dfaa0950 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Mon, 29 Jun 2020 17:28:26 +0000 Subject: [PATCH 01/34] Copy code from Colab --- Gym/DQN/main.swift | 278 +++++++++++++++++++++++++++++++++++++++++++++ Gym/README.md | 1 + Package.swift | 1 + 3 files changed, 280 insertions(+) create mode 100644 Gym/DQN/main.swift diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift new file mode 100644 index 00000000000..150a0ef1954 --- /dev/null +++ b/Gym/DQN/main.swift @@ -0,0 +1,278 @@ +// Copyright 2020 The TensorFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#if canImport(PythonKit) + import PythonKit +#else + import Python +#endif +import TensorFlow + +// Force unwrapping with `!` does not provide source location when unwrapping `nil`, so we instead +// make a utility function for debuggability. +fileprivate extension Optional { + func unwrapped(file: StaticString = #filePath, line: UInt = #line) -> Wrapped { + guard let unwrapped = self else { + fatalError("Value is nil", file: (file), line: line) + } + return unwrapped + } +} + +// Initialize Python. This comment is a hook for internal use, do not remove. + +let np = Python.import("numpy") +let gym = Python.import("gym") + +typealias State = Tensor +typealias Action = Tensor +typealias Reward = Tensor + +class ReplayBuffer { + var states: Tensor + var actions: Tensor + var rewards: Tensor + var nextStates: Tensor + let capacity: Int + var count: Int + var index: Int + + init(capacity: Int) { + self.capacity = capacity + + states = Tensor(numpy: np.zeros([capacity, 4], dtype: np.float32))! + actions = Tensor(numpy: np.zeros([capacity, 1], dtype: np.int32))! + rewards = Tensor(numpy: np.zeros([capacity, 1], dtype: np.float32))! + nextStates = Tensor(numpy: np.zeros([capacity, 4], dtype: np.float32))! + count = 0 + index = 0 + } + + func append(state: Tensor, action: Tensor, reward: Tensor, nextState: Tensor) { + if count < capacity { + count += 1 + } + // Erase oldest SARS if the replay buffer is full + states[index] = state + actions[index] = Tensor(numpy: np.expand_dims(action.makeNumpyArray(), axis: 0))! + rewards[index] = Tensor(numpy: np.expand_dims(reward.makeNumpyArray(), axis: 0))! + nextStates[index] = nextState + index = (index + 1) % capacity + } + + func sample(batchSize: Int) -> (stateBatch: Tensor, actionBatch: Tensor, rewardBatch: Tensor, nextStateBatch: Tensor) { + let randomIndices = Tensor(numpy: np.random.randint(count, size: batchSize, dtype: np.int32))! 
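+    // Batch indices are drawn uniformly at random (with replacement) from the filled region [0, count),
+    // and every storage tensor is gathered with the same indices so the SARS components stay aligned.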
+ + let stateBatch = _Raw.gather(params: states, indices: randomIndices) + let actionBatch = _Raw.gather(params: actions, indices: randomIndices) + let rewardBatch = _Raw.gather(params: rewards, indices: randomIndices) + let nextStateBatch = _Raw.gather(params: nextStates, indices: randomIndices) + + return (stateBatch, actionBatch, rewardBatch, nextStateBatch) + } +} + +struct Net: Layer { + typealias Input = Tensor + typealias Output = Tensor + + var l1, l2: Dense + + init(observationSize: Int, hiddenSize: Int, actionCount: Int) { + l1 = Dense(inputSize: observationSize, outputSize: hiddenSize, activation: relu, weightInitializer: heNormal()) + l2 = Dense(inputSize: hiddenSize, outputSize: actionCount, weightInitializer: heNormal()) + } + + @differentiable + func callAsFunction(_ input: Input) -> Output { + return input.sequenced(through: l1, l2) + } +} + +class Agent { + // Q-network + var qNet: Net + // Target Q-network + var targetQNet: Net + // Optimizer + let optimizer: Adam + // Replay Buffer + let replayBuffer: ReplayBuffer + // Discount Factor + let discount: Float + + init(qNet: Net, targetQNet: Net, optimizer: Adam, replayBuffer: ReplayBuffer, discount: Float) { + self.qNet = qNet + self.targetQNet = targetQNet + self.optimizer = optimizer + self.replayBuffer = replayBuffer + self.discount = discount + } + + func getAction(state: Tensor, epsilon: Float) -> Tensor { + if Float(np.random.uniform()).unwrapped() < epsilon { + // print("getAction | state: \(state)") + // print("getAction | epsilon: \(epsilon)") + let npState = np.random.randint(0, 2, dtype: np.int32) + // print("getAction | npState: \(npState)") + return Tensor(numpy: np.array(npState, dtype: np.int32))! + } + else { + // Neural network input needs to be 2D + let tfState = Tensor(numpy: np.expand_dims(state.makeNumpyArray(), axis: 0))! + let qValues = qNet(tfState) + let leftQValue = Float(qValues[0][0]).unwrapped() + let rightQValue = Float(qValues[0][1]).unwrapped() + return leftQValue < rightQValue ? Tensor(numpy: np.array(1, dtype: np.int32))! : Tensor(numpy: np.array(0, dtype: np.int32))! + } + } + + func train(batchSize: Int) { + // Don't train if replay buffer is too small + if replayBuffer.count >= batchSize { + // print("train | Start training") + let (tfStateBatch, tfActionBatch, tfRewardBatch, tfNextStateBatch) = replayBuffer.sample(batchSize: batchSize) + + // TODO: Find equivalent function of tf.gather_nd in S4TF to parallelize Q-value computation (_Raw.gather_nd does not exist) + // Gradient are accumulated since we calculate every element in the batch individually + var totalGrad = qNet.zeroTangentVector + for i in 0.. Tensor in + + let stateQValueBatch = qNet(tfStateBatch) + let tfAction: Tensor = tfActionBatch[i][0] + let action = Int(tfAction.makeNumpyArray()).unwrapped() + let prediction: Tensor = stateQValueBatch[i][action] + + let nextStateQValueBatch = self.targetQNet(tfNextStateBatch) + let tfReward: Tensor = tfRewardBatch[i][0] + let leftQValue = Float(nextStateQValueBatch[i][0].makeNumpyArray()).unwrapped() + let rightQValue = Float(nextStateQValueBatch[i][1].makeNumpyArray()).unwrapped() + let maxNextStateQValue = leftQValue > rightQValue ? 
leftQValue : rightQValue + let target: Tensor = tfReward + self.discount * maxNextStateQValue + + return squaredDifference(prediction, withoutDerivative(at: target)) + } + totalGrad += 𝛁qNet + } + optimizer.update(&qNet, along: totalGrad) + } + } +} + +func updateTargetQNet(source: Net, target: inout Net) { + target.l1.weight = Tensor(source.l1.weight) + target.l1.bias = Tensor(source.l1.bias) + target.l2.weight = Tensor(source.l2.weight) + target.l2.bias = Tensor(source.l2.bias) +} + +class TensorFlowEnvironmentWrapper { + let originalEnv: PythonObject + let action_space: PythonObject + let observation_space: PythonObject + + init(_ env: PythonObject) { + self.originalEnv = env + self.action_space = env.action_space + self.observation_space = env.observation_space + } + + func reset() -> Tensor { + let state = self.originalEnv.reset() + return Tensor(numpy: np.array(state, dtype: np.float32))! + } + + func step(_ action: Tensor) -> (Tensor, Tensor, PythonObject, PythonObject) { + let npAction = action.makeNumpyArray().item() + let (state, reward, isDone, info) = originalEnv.step(npAction).tuple4 + let tfState = Tensor(numpy: np.array(state, dtype: np.float32))! + let tfReward = Tensor(numpy: np.array(reward, dtype: np.float32))! + return (tfState, tfReward, isDone, info) + } +} + +// Hyperparameters +let discount: Float = 0.99 +let learningRate: Float = 0.01 +let hiddenSize: Int = 64 +let startEpsilon: Float = 0.5 +let maxEpisode: Int = 500 +let replayBufferCapacity: Int = 1000 +let batchSize: Int = 32 +let targetNetUpdateRate: Int = 1 + +// Initialize environment +let env = TensorFlowEnvironmentWrapper(gym.make("CartPole-v0")) + +// Initialize agent +let actionCount = Int(env.action_space.n).unwrapped() +var qNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: actionCount) +var targetQNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: actionCount) +updateTargetQNet(source: qNet, target: &targetQNet) +let optimizer = Adam(for: qNet, learningRate: learningRate) +var replayBuffer: ReplayBuffer = ReplayBuffer(capacity: replayBufferCapacity) +var agent = Agent(qNet: qNet, targetQNet: targetQNet, optimizer: optimizer, replayBuffer: replayBuffer, discount: discount) + +// RL Loop +var stepIndex = 0 +var episodeIndex = 0 +var episodeReturn: Int = 0 +var episodeReturns: Array = [] +var state = env.reset() +while episodeIndex < maxEpisode { + stepIndex += 1 + // print("Step \(stepIndex)") + + // Interact with environment + let action = agent.getAction(state: state, epsilon: startEpsilon * Float(maxEpisode - episodeIndex)) + // print("action: \(action)") + var (nextState, reward, isDone, _) = env.step(action) + // print("state: \(state)") + // print("nextState: \(nextState)") + // print("reward: \(reward)") + // print("isDone: \(isDone)") + episodeReturn += Int(reward.makeNumpyArray().item()).unwrapped() + // print("episodeReturn: \(episodeReturn)") + + // Save interaction to replay buffer + replayBuffer.append(state: state, action: action, reward: reward, nextState: nextState) + // print("Append successful") + + // Train agent + agent.train(batchSize: batchSize) + // print("Train successful") + + // Periodically update Target Net + if stepIndex % targetNetUpdateRate == 0 { + updateTargetQNet(source: qNet, target: &targetQNet) + } + // print("Target net update successful") + + // End-of-episode + if isDone == true { + state = env.reset() + episodeIndex += 1 + print("Episode \(episodeIndex) Return \(episodeReturn)") + if episodeReturn > 199 { + print("Solved in 
\(episodeIndex) episodes with \(stepIndex) steps!") + break + } + episodeReturns.append(episodeReturn) + episodeReturn = 0 + } + + // End-of-step + nextState = state +} diff --git a/Gym/README.md b/Gym/README.md index 97d5bfbf66b..b203de133a2 100644 --- a/Gym/README.md +++ b/Gym/README.md @@ -31,4 +31,5 @@ To build and run the models, run: swift run Gym-CartPole swift run Gym-FrozenLake swift run Gym-Blackjack +swift run Gym-DQN ``` diff --git a/Package.swift b/Package.swift index 75496f2f69c..f6f6c9ce85a 100644 --- a/Package.swift +++ b/Package.swift @@ -47,6 +47,7 @@ let package = Package( .target(name: "Gym-FrozenLake", path: "Gym/FrozenLake"), .target(name: "Gym-CartPole", path: "Gym/CartPole"), .target(name: "Gym-Blackjack", path: "Gym/Blackjack"), + .target(name: "Gym-DQN", path: "Gym/DQN"), .target( name: "VGG-Imagewoof", dependencies: ["ImageClassificationModels", "Datasets"], path: "Examples/VGG-Imagewoof"), From 6268d58d8d951fc2f8ce0fd6f1c1e5b924cf901c Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Tue, 30 Jun 2020 09:30:58 +0000 Subject: [PATCH 02/34] Use .scalarized() to convert TF scalar to Swift --- Gym/DQN/main.swift | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 150a0ef1954..b7a86e561f2 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -152,13 +152,13 @@ class Agent { let stateQValueBatch = qNet(tfStateBatch) let tfAction: Tensor = tfActionBatch[i][0] - let action = Int(tfAction.makeNumpyArray()).unwrapped() + let action = Int(tfAction.scalarized()) let prediction: Tensor = stateQValueBatch[i][action] let nextStateQValueBatch = self.targetQNet(tfNextStateBatch) let tfReward: Tensor = tfRewardBatch[i][0] - let leftQValue = Float(nextStateQValueBatch[i][0].makeNumpyArray()).unwrapped() - let rightQValue = Float(nextStateQValueBatch[i][1].makeNumpyArray()).unwrapped() + let leftQValue = Float(nextStateQValueBatch[i][0].scalarized()) + let rightQValue = Float(nextStateQValueBatch[i][1].scalarized()) let maxNextStateQValue = leftQValue > rightQValue ? leftQValue : rightQValue let target: Tensor = tfReward + self.discount * maxNextStateQValue @@ -208,7 +208,7 @@ let discount: Float = 0.99 let learningRate: Float = 0.01 let hiddenSize: Int = 64 let startEpsilon: Float = 0.5 -let maxEpisode: Int = 500 +let maxEpisode: Int = 100 let replayBufferCapacity: Int = 1000 let batchSize: Int = 32 let targetNetUpdateRate: Int = 1 @@ -243,7 +243,7 @@ while episodeIndex < maxEpisode { // print("nextState: \(nextState)") // print("reward: \(reward)") // print("isDone: \(isDone)") - episodeReturn += Int(reward.makeNumpyArray().item()).unwrapped() + episodeReturn += Int(reward.scalarized()) // print("episodeReturn: \(episodeReturn)") // Save interaction to replay buffer From 51d1fad8f6a487c4c2386a0de7b5d2f369d863b1 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Tue, 30 Jun 2020 09:34:00 +0000 Subject: [PATCH 03/34] Improve code clarity --- Gym/DQN/main.swift | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index b7a86e561f2..57b7e403b64 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -45,8 +45,8 @@ class ReplayBuffer { var rewards: Tensor var nextStates: Tensor let capacity: Int - var count: Int - var index: Int + var count: Int = 0 + var index: Int = 0 init(capacity: Int) { self.capacity = capacity @@ -55,8 +55,6 @@ class ReplayBuffer { actions = Tensor(numpy: np.zeros([capacity, 1], dtype: np.int32))! 
rewards = Tensor(numpy: np.zeros([capacity, 1], dtype: np.float32))! nextStates = Tensor(numpy: np.zeros([capacity, 4], dtype: np.float32))! - count = 0 - index = 0 } func append(state: Tensor, action: Tensor, reward: Tensor, nextState: Tensor) { @@ -194,7 +192,7 @@ class TensorFlowEnvironmentWrapper { return Tensor(numpy: np.array(state, dtype: np.float32))! } - func step(_ action: Tensor) -> (Tensor, Tensor, PythonObject, PythonObject) { + func step(_ action: Tensor) -> (state: Tensor, reward: Tensor, isDone: PythonObject, info: PythonObject) { let npAction = action.makeNumpyArray().item() let (state, reward, isDone, info) = originalEnv.step(npAction).tuple4 let tfState = Tensor(numpy: np.array(state, dtype: np.float32))! From 84df320def2273edb0f8fb2391f1bfd418b11bc3 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Tue, 30 Jun 2020 10:56:56 +0000 Subject: [PATCH 04/34] Save isDone as Tensor --- Gym/DQN/main.swift | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 57b7e403b64..e8acb511186 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -188,16 +188,17 @@ class TensorFlowEnvironmentWrapper { } func reset() -> Tensor { - let state = self.originalEnv.reset() - return Tensor(numpy: np.array(state, dtype: np.float32))! + let state = self.originalEnv.reset() + return Tensor(numpy: np.array(state, dtype: np.float32))! } - func step(_ action: Tensor) -> (state: Tensor, reward: Tensor, isDone: PythonObject, info: PythonObject) { - let npAction = action.makeNumpyArray().item() - let (state, reward, isDone, info) = originalEnv.step(npAction).tuple4 - let tfState = Tensor(numpy: np.array(state, dtype: np.float32))! - let tfReward = Tensor(numpy: np.array(reward, dtype: np.float32))! - return (tfState, tfReward, isDone, info) + func step(_ action: Tensor) -> (state: Tensor, reward: Tensor, isDone: Tensor, info: PythonObject) { + let npAction = action.makeNumpyArray().item() + let (state, reward, isDone, info) = originalEnv.step(npAction).tuple4 + let tfState = Tensor(numpy: np.array(state, dtype: np.float32))! + let tfReward = Tensor(numpy: np.array(reward, dtype: np.float32))! + let tfIsDone = Tensor(numpy: np.array(isDone, dtype: np.bool))! + return (tfState, tfReward, tfIsDone, info) } } @@ -259,7 +260,7 @@ while episodeIndex < maxEpisode { // print("Target net update successful") // End-of-episode - if isDone == true { + if isDone.scalarized() == true { state = env.reset() episodeIndex += 1 print("Episode \(episodeIndex) Return \(episodeReturn)") From 2b9948945568206ebca0995e1ad54c28577eabaa Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Tue, 30 Jun 2020 11:02:38 +0000 Subject: [PATCH 05/34] Save and use isDone for target calculation --- Gym/DQN/main.swift | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index e8acb511186..4687b5a2567 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -44,6 +44,7 @@ class ReplayBuffer { var actions: Tensor var rewards: Tensor var nextStates: Tensor + var isDones: Tensor let capacity: Int var count: Int = 0 var index: Int = 0 @@ -55,9 +56,10 @@ class ReplayBuffer { actions = Tensor(numpy: np.zeros([capacity, 1], dtype: np.int32))! rewards = Tensor(numpy: np.zeros([capacity, 1], dtype: np.float32))! nextStates = Tensor(numpy: np.zeros([capacity, 4], dtype: np.float32))! + isDones = Tensor(numpy: np.zeros([capacity], dtype: np.bool))! 
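+    // isDones records which stored transitions ended an episode, so the target calculation can treat terminal next states specially.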
} - func append(state: Tensor, action: Tensor, reward: Tensor, nextState: Tensor) { + func append(state: Tensor, action: Tensor, reward: Tensor, nextState: Tensor, isDone: Tensor) { if count < capacity { count += 1 } @@ -66,18 +68,20 @@ class ReplayBuffer { actions[index] = Tensor(numpy: np.expand_dims(action.makeNumpyArray(), axis: 0))! rewards[index] = Tensor(numpy: np.expand_dims(reward.makeNumpyArray(), axis: 0))! nextStates[index] = nextState + isDones[index] = isDone index = (index + 1) % capacity } - func sample(batchSize: Int) -> (stateBatch: Tensor, actionBatch: Tensor, rewardBatch: Tensor, nextStateBatch: Tensor) { + func sample(batchSize: Int) -> (stateBatch: Tensor, actionBatch: Tensor, rewardBatch: Tensor, nextStateBatch: Tensor, isDoneBatch: Tensor) { let randomIndices = Tensor(numpy: np.random.randint(count, size: batchSize, dtype: np.int32))! let stateBatch = _Raw.gather(params: states, indices: randomIndices) let actionBatch = _Raw.gather(params: actions, indices: randomIndices) let rewardBatch = _Raw.gather(params: rewards, indices: randomIndices) let nextStateBatch = _Raw.gather(params: nextStates, indices: randomIndices) + let isDoneBatch = _Raw.gather(params: isDones, indices: randomIndices) - return (stateBatch, actionBatch, rewardBatch, nextStateBatch) + return (stateBatch, actionBatch, rewardBatch, nextStateBatch, isDoneBatch) } } @@ -140,7 +144,7 @@ class Agent { // Don't train if replay buffer is too small if replayBuffer.count >= batchSize { // print("train | Start training") - let (tfStateBatch, tfActionBatch, tfRewardBatch, tfNextStateBatch) = replayBuffer.sample(batchSize: batchSize) + let (tfStateBatch, tfActionBatch, tfRewardBatch, tfNextStateBatch, tfIsDoneBatch) = replayBuffer.sample(batchSize: batchSize) // TODO: Find equivalent function of tf.gather_nd in S4TF to parallelize Q-value computation (_Raw.gather_nd does not exist) // Gradient are accumulated since we calculate every element in the batch individually @@ -158,7 +162,7 @@ class Agent { let leftQValue = Float(nextStateQValueBatch[i][0].scalarized()) let rightQValue = Float(nextStateQValueBatch[i][1].scalarized()) let maxNextStateQValue = leftQValue > rightQValue ? 
leftQValue : rightQValue - let target: Tensor = tfReward + self.discount * maxNextStateQValue + let target: Tensor = tfReward + Tensor(tfIsDoneBatch[i]) * self.discount * maxNextStateQValue return squaredDifference(prediction, withoutDerivative(at: target)) } @@ -246,7 +250,7 @@ while episodeIndex < maxEpisode { // print("episodeReturn: \(episodeReturn)") // Save interaction to replay buffer - replayBuffer.append(state: state, action: action, reward: reward, nextState: nextState) + replayBuffer.append(state: state, action: action, reward: reward, nextState: nextState, isDone: isDone) // print("Append successful") // Train agent From 18e629486d8cb9a56460cfc49ce7bc332da26b2b Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Tue, 30 Jun 2020 11:05:20 +0000 Subject: [PATCH 06/34] Add commented parallelized training implementation --- Gym/DQN/main.swift | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 4687b5a2567..5e01a2c741d 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -146,7 +146,6 @@ class Agent { // print("train | Start training") let (tfStateBatch, tfActionBatch, tfRewardBatch, tfNextStateBatch, tfIsDoneBatch) = replayBuffer.sample(batchSize: batchSize) - // TODO: Find equivalent function of tf.gather_nd in S4TF to parallelize Q-value computation (_Raw.gather_nd does not exist) // Gradient are accumulated since we calculate every element in the batch individually var totalGrad = qNet.zeroTangentVector for i in 0.. Tensor in + // // Compute prediction batch + // let npActionBatch = tfActionBatch.makeNumpyArray() + // print("A: \(np.arange(batchSize, dtype: np.int32)))") + // print("B: \(npActionBatch.flatten())") + // let npFullIndices = np.stack([np.arange(batchSize, dtype: np.int32), npActionBatch.flatten()], axis: 1) + // let tfFullIndices = Tensor(numpy: npFullIndices)! 
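+          // (Each row of npFullIndices is a [batchIndex, action] pair, so gatherNd would pick out Q(s_i, a_i) for the whole batch in one op.)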
+ // let stateQValueBatch = qNet(tfStateBatch) + // let predictionBatch = _Raw.gatherNd(params: stateQValueBatch, indices: tfFullIndices) + + // // TODO: Just save rewards as 1D to avoid this extra squeeze operation + // // Compute target batch + // let targetBatch: Tensor = _Raw.squeeze(tfRewardBatch, squeezeDims: [1]) + self.discount * _Raw.max(self.targetQNet(tfNextStateBatch), reductionIndices: Tensor(1)) + + // return squaredDifference(predictionBatch, withoutDerivative(at: targetBatch)) + // } + // optimizer.update(&qNet, along: 𝛁qNet) } } } From 36c1ddf7fca383b28f938e2499a94d1ab56bfe5f Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Tue, 30 Jun 2020 11:18:41 +0000 Subject: [PATCH 07/34] Save learning curve plot --- Gym/DQN/main.swift | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 5e01a2c741d..34558196d8f 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -34,6 +34,7 @@ fileprivate extension Optional { let np = Python.import("numpy") let gym = Python.import("gym") +let plt = Python.import("matplotlib.pyplot") typealias State = Tensor typealias Action = Tensor @@ -298,3 +299,13 @@ while episodeIndex < maxEpisode { // End-of-step nextState = state } + +// Save smoothed learning curve +let runningMeanWindow: Int = 2 +let smoothedEpisodeReturns = np.convolve(episodeReturns, np.ones((runningMeanWindow)) / np.array(runningMeanWindow, dtype: np.int32), mode: "same") + +plt.plot(smoothedEpisodeReturns) +plt.title("Deep Q-Network on CartPole-v0") +plt.xlabel("Episode") +plt.ylabel("Smoothed Episode Return") +plt.savefig("dqnSmoothedEpisodeReturns.png") From da57062f121d980e0a227bd08186a19f09f10218 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Wed, 1 Jul 2020 00:32:16 +0000 Subject: [PATCH 08/34] Use parallelized training with custom gatherNd --- Gym/DQN/main.swift | 102 ++++++++++++++++++++++++++++----------------- 1 file changed, 64 insertions(+), 38 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 34558196d8f..ccb30188155 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -30,6 +30,32 @@ fileprivate extension Optional { } } +extension _Raw { + /// Derivative of `_Raw.gatherNd`. + /// + /// Ported from TensorFlow Python reference implementation: + /// https://github.com/tensorflow/tensorflow/blob/r2.2/tensorflow/python/ops/array_grad.py#L691-L701 + @inlinable + @derivative(of: gatherNd) + public static func vjpGatherNd< + Scalar: TensorFlowFloatingPoint, + Index: TensorFlowIndex + >( + params: Tensor, + indices: Tensor + ) -> ( + value: Tensor, + pullback: (Tensor) -> Tensor + ) { + let shapeTensor = Tensor(params.shapeTensor) + let value = gatherNd(params: params, indices: indices) + return (value, { v in + let dparams = scatterNd(indices: indices, updates: v, shape: shapeTensor) + return dparams + }) + } +} + // Initialize Python. This comment is a hook for internal use, do not remove. let np = Python.import("numpy") @@ -148,47 +174,47 @@ class Agent { let (tfStateBatch, tfActionBatch, tfRewardBatch, tfNextStateBatch, tfIsDoneBatch) = replayBuffer.sample(batchSize: batchSize) // Gradient are accumulated since we calculate every element in the batch individually - var totalGrad = qNet.zeroTangentVector - for i in 0.. 
Tensor in - - let stateQValueBatch = qNet(tfStateBatch) - let tfAction: Tensor = tfActionBatch[i][0] - let action = Int(tfAction.scalarized()) - let prediction: Tensor = stateQValueBatch[i][action] - - let nextStateQValueBatch = self.targetQNet(tfNextStateBatch) - let tfReward: Tensor = tfRewardBatch[i][0] - let leftQValue = Float(nextStateQValueBatch[i][0].scalarized()) - let rightQValue = Float(nextStateQValueBatch[i][1].scalarized()) - let maxNextStateQValue = leftQValue > rightQValue ? leftQValue : rightQValue - let target: Tensor = tfReward + Tensor(tfIsDoneBatch[i]) * self.discount * maxNextStateQValue - - return squaredDifference(prediction, withoutDerivative(at: target)) - } - totalGrad += 𝛁qNet - } - optimizer.update(&qNet, along: totalGrad) + // var totalGrad = qNet.zeroTangentVector + // for i in 0.. Tensor in + + // let stateQValueBatch = qNet(tfStateBatch) + // let tfAction: Tensor = tfActionBatch[i][0] + // let action = Int(tfAction.scalarized()) + // let prediction: Tensor = stateQValueBatch[i][action] + + // let nextStateQValueBatch = self.targetQNet(tfNextStateBatch) + // let tfReward: Tensor = tfRewardBatch[i][0] + // let leftQValue = Float(nextStateQValueBatch[i][0].scalarized()) + // let rightQValue = Float(nextStateQValueBatch[i][1].scalarized()) + // let maxNextStateQValue = leftQValue > rightQValue ? leftQValue : rightQValue + // let target: Tensor = tfReward + Tensor(tfIsDoneBatch[i]) * self.discount * maxNextStateQValue + + // return squaredDifference(prediction, withoutDerivative(at: target)) + // } + // totalGrad += 𝛁qNet + // } + // optimizer.update(&qNet, along: totalGrad) // TODO: Use parallelized methods commented out below // TODO: _Raw.gatherNd() is not differentiable? - // let 𝛁qNet = gradient(at: qNet) { qNet -> Tensor in - // // Compute prediction batch - // let npActionBatch = tfActionBatch.makeNumpyArray() - // print("A: \(np.arange(batchSize, dtype: np.int32)))") - // print("B: \(npActionBatch.flatten())") - // let npFullIndices = np.stack([np.arange(batchSize, dtype: np.int32), npActionBatch.flatten()], axis: 1) - // let tfFullIndices = Tensor(numpy: npFullIndices)! - // let stateQValueBatch = qNet(tfStateBatch) - // let predictionBatch = _Raw.gatherNd(params: stateQValueBatch, indices: tfFullIndices) - - // // TODO: Just save rewards as 1D to avoid this extra squeeze operation - // // Compute target batch - // let targetBatch: Tensor = _Raw.squeeze(tfRewardBatch, squeezeDims: [1]) + self.discount * _Raw.max(self.targetQNet(tfNextStateBatch), reductionIndices: Tensor(1)) - - // return squaredDifference(predictionBatch, withoutDerivative(at: targetBatch)) - // } - // optimizer.update(&qNet, along: 𝛁qNet) + let 𝛁qNet = gradient(at: qNet) { qNet -> Tensor in + // Compute prediction batch + let npActionBatch = tfActionBatch.makeNumpyArray() + // print("A: \(np.arange(batchSize, dtype: np.int32)))") + // print("B: \(npActionBatch.flatten())") + let npFullIndices = np.stack([np.arange(batchSize, dtype: np.int32), npActionBatch.flatten()], axis: 1) + let tfFullIndices = Tensor(numpy: npFullIndices)! 
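+        // gatherNd selects Q(s_i, a_i) for each (batchIndex, action) row of tfFullIndices. It is differentiable
+        // here only because of the vjpGatherNd derivative registered above, whose pullback scatters the incoming
+        // gradient back into the Q-value tensor with scatterNd.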
+ let stateQValueBatch = qNet(tfStateBatch) + let predictionBatch = _Raw.gatherNd(params: stateQValueBatch, indices: tfFullIndices) + + // TODO: Just save rewards as 1D to avoid this extra squeeze operation + // Compute target batch + let targetBatch: Tensor = _Raw.squeeze(tfRewardBatch, squeezeDims: [1]) + Tensor(tfIsDoneBatch) * self.discount * _Raw.max(self.targetQNet(tfNextStateBatch), reductionIndices: Tensor(1)) + + return meanSquaredError(predicted: predictionBatch, expected: withoutDerivative(at: targetBatch)) + } + optimizer.update(&qNet, along: 𝛁qNet) } } } From 2ec956ccbd8aa7abd8d3d8ae96e5b839608a2280 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Wed, 1 Jul 2020 00:48:28 +0000 Subject: [PATCH 09/34] Add minBufferSize parameter --- Gym/DQN/main.swift | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index ccb30188155..c20753e1ccb 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -169,7 +169,7 @@ class Agent { func train(batchSize: Int) { // Don't train if replay buffer is too small - if replayBuffer.count >= batchSize { + if replayBuffer.count >= minBufferSize { // print("train | Start training") let (tfStateBatch, tfActionBatch, tfRewardBatch, tfNextStateBatch, tfIsDoneBatch) = replayBuffer.sample(batchSize: batchSize) @@ -257,10 +257,11 @@ let discount: Float = 0.99 let learningRate: Float = 0.01 let hiddenSize: Int = 64 let startEpsilon: Float = 0.5 -let maxEpisode: Int = 100 -let replayBufferCapacity: Int = 1000 -let batchSize: Int = 32 -let targetNetUpdateRate: Int = 1 +let maxEpisode: Int = 1000 +let replayBufferCapacity: Int = 5000 +let minBufferSize: Int = 1000 +let batchSize: Int = 64 +let targetNetUpdateRate: Int = 32 // Initialize environment let env = TensorFlowEnvironmentWrapper(gym.make("CartPole-v0")) From dab2a3f811b0139050ce9a6cfb0e8da48405a272 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Wed, 1 Jul 2020 02:20:50 +0000 Subject: [PATCH 10/34] Remove comments and refactor code --- Gym/DQN/main.swift | 63 +++++++++++++--------------------------------- 1 file changed, 18 insertions(+), 45 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index c20753e1ccb..f2899d52f43 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -86,7 +86,13 @@ class ReplayBuffer { isDones = Tensor(numpy: np.zeros([capacity], dtype: np.bool))! } - func append(state: Tensor, action: Tensor, reward: Tensor, nextState: Tensor, isDone: Tensor) { + func append( + state: Tensor, + action: Tensor, + reward: Tensor, + nextState: Tensor, + isDone: Tensor + ) { if count < capacity { count += 1 } @@ -99,7 +105,13 @@ class ReplayBuffer { index = (index + 1) % capacity } - func sample(batchSize: Int) -> (stateBatch: Tensor, actionBatch: Tensor, rewardBatch: Tensor, nextStateBatch: Tensor, isDoneBatch: Tensor) { + func sample(batchSize: Int) -> ( + stateBatch: Tensor, + actionBatch: Tensor, + rewardBatch: Tensor, + nextStateBatch: Tensor, + isDoneBatch: Tensor + ) { let randomIndices = Tensor(numpy: np.random.randint(count, size: batchSize, dtype: np.int32))! 
let stateBatch = _Raw.gather(params: states, indices: randomIndices) @@ -151,10 +163,7 @@ class Agent { func getAction(state: Tensor, epsilon: Float) -> Tensor { if Float(np.random.uniform()).unwrapped() < epsilon { - // print("getAction | state: \(state)") - // print("getAction | epsilon: \(epsilon)") let npState = np.random.randint(0, 2, dtype: np.int32) - // print("getAction | npState: \(npState)") return Tensor(numpy: np.array(npState, dtype: np.int32))! } else { @@ -170,39 +179,12 @@ class Agent { func train(batchSize: Int) { // Don't train if replay buffer is too small if replayBuffer.count >= minBufferSize { - // print("train | Start training") let (tfStateBatch, tfActionBatch, tfRewardBatch, tfNextStateBatch, tfIsDoneBatch) = replayBuffer.sample(batchSize: batchSize) - // Gradient are accumulated since we calculate every element in the batch individually - // var totalGrad = qNet.zeroTangentVector - // for i in 0.. Tensor in - - // let stateQValueBatch = qNet(tfStateBatch) - // let tfAction: Tensor = tfActionBatch[i][0] - // let action = Int(tfAction.scalarized()) - // let prediction: Tensor = stateQValueBatch[i][action] - - // let nextStateQValueBatch = self.targetQNet(tfNextStateBatch) - // let tfReward: Tensor = tfRewardBatch[i][0] - // let leftQValue = Float(nextStateQValueBatch[i][0].scalarized()) - // let rightQValue = Float(nextStateQValueBatch[i][1].scalarized()) - // let maxNextStateQValue = leftQValue > rightQValue ? leftQValue : rightQValue - // let target: Tensor = tfReward + Tensor(tfIsDoneBatch[i]) * self.discount * maxNextStateQValue - - // return squaredDifference(prediction, withoutDerivative(at: target)) - // } - // totalGrad += 𝛁qNet - // } - // optimizer.update(&qNet, along: totalGrad) - - // TODO: Use parallelized methods commented out below - // TODO: _Raw.gatherNd() is not differentiable? + // TODO: Check gradient values let 𝛁qNet = gradient(at: qNet) { qNet -> Tensor in // Compute prediction batch let npActionBatch = tfActionBatch.makeNumpyArray() - // print("A: \(np.arange(batchSize, dtype: np.int32)))") - // print("B: \(npActionBatch.flatten())") let npFullIndices = np.stack([np.arange(batchSize, dtype: np.int32), npActionBatch.flatten()], axis: 1) let tfFullIndices = Tensor(numpy: npFullIndices)! 
let stateQValueBatch = qNet(tfStateBatch) @@ -283,38 +265,29 @@ var episodeReturns: Array = [] var state = env.reset() while episodeIndex < maxEpisode { stepIndex += 1 - // print("Step \(stepIndex)") // Interact with environment - let action = agent.getAction(state: state, epsilon: startEpsilon * Float(maxEpisode - episodeIndex)) - // print("action: \(action)") + let epsilon = startEpsilon * Float(maxEpisode - episodeIndex) / Float(maxEpisode) + let action = agent.getAction(state: state, epsilon: epsilon) var (nextState, reward, isDone, _) = env.step(action) - // print("state: \(state)") - // print("nextState: \(nextState)") - // print("reward: \(reward)") - // print("isDone: \(isDone)") episodeReturn += Int(reward.scalarized()) - // print("episodeReturn: \(episodeReturn)") // Save interaction to replay buffer replayBuffer.append(state: state, action: action, reward: reward, nextState: nextState, isDone: isDone) - // print("Append successful") // Train agent agent.train(batchSize: batchSize) - // print("Train successful") // Periodically update Target Net if stepIndex % targetNetUpdateRate == 0 { updateTargetQNet(source: qNet, target: &targetQNet) } - // print("Target net update successful") // End-of-episode if isDone.scalarized() == true { state = env.reset() episodeIndex += 1 - print("Episode \(episodeIndex) Return \(episodeReturn)") + print(String(format: "Episode: %4d | Epsilon: %.03f | Return: %3d", episodeIndex, epsilon, episodeReturn)) if episodeReturn > 199 { print("Solved in \(episodeIndex) episodes with \(stepIndex) steps!") break From 0bc60ca6442b886c9007ee4c5cd41e74787eb1fe Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Wed, 1 Jul 2020 02:34:59 +0000 Subject: [PATCH 11/34] Fix bug where state was updated --- Gym/DQN/main.swift | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index f2899d52f43..c2d35495879 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -269,7 +269,7 @@ while episodeIndex < maxEpisode { // Interact with environment let epsilon = startEpsilon * Float(maxEpisode - episodeIndex) / Float(maxEpisode) let action = agent.getAction(state: state, epsilon: epsilon) - var (nextState, reward, isDone, _) = env.step(action) + let (nextState, reward, isDone, _) = env.step(action) episodeReturn += Int(reward.scalarized()) // Save interaction to replay buffer @@ -297,7 +297,7 @@ while episodeIndex < maxEpisode { } // End-of-step - nextState = state + state = nextState } // Save smoothed learning curve From 01074d9075a0e561a5265edc8ff3267f137b8544 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Wed, 1 Jul 2020 02:46:32 +0000 Subject: [PATCH 12/34] Simplify code --- Gym/DQN/main.swift | 50 +++++++++++++++------------------------------- 1 file changed, 16 insertions(+), 34 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index c2d35495879..f982f9edfe6 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -62,10 +62,6 @@ let np = Python.import("numpy") let gym = Python.import("gym") let plt = Python.import("matplotlib.pyplot") -typealias State = Tensor -typealias Action = Tensor -typealias Reward = Tensor - class ReplayBuffer { var states: Tensor var actions: Tensor @@ -80,8 +76,8 @@ class ReplayBuffer { self.capacity = capacity states = Tensor(numpy: np.zeros([capacity, 4], dtype: np.float32))! - actions = Tensor(numpy: np.zeros([capacity, 1], dtype: np.int32))! - rewards = Tensor(numpy: np.zeros([capacity, 1], dtype: np.float32))! 
+ actions = Tensor(numpy: np.zeros([capacity], dtype: np.int32))! + rewards = Tensor(numpy: np.zeros([capacity], dtype: np.float32))! nextStates = Tensor(numpy: np.zeros([capacity, 4], dtype: np.float32))! isDones = Tensor(numpy: np.zeros([capacity], dtype: np.bool))! } @@ -98,8 +94,8 @@ class ReplayBuffer { } // Erase oldest SARS if the replay buffer is full states[index] = state - actions[index] = Tensor(numpy: np.expand_dims(action.makeNumpyArray(), axis: 0))! - rewards[index] = Tensor(numpy: np.expand_dims(reward.makeNumpyArray(), axis: 0))! + actions[index] = action + rewards[index] = reward nextStates[index] = nextState isDones[index] = isDone index = (index + 1) % capacity @@ -132,7 +128,7 @@ struct Net: Layer { init(observationSize: Int, hiddenSize: Int, actionCount: Int) { l1 = Dense(inputSize: observationSize, outputSize: hiddenSize, activation: relu, weightInitializer: heNormal()) - l2 = Dense(inputSize: hiddenSize, outputSize: actionCount, weightInitializer: heNormal()) + l2 = Dense(inputSize: hiddenSize, outputSize: actionCount, activation: identity, weightInitializer: heNormal()) } @differentiable @@ -142,15 +138,10 @@ struct Net: Layer { } class Agent { - // Q-network var qNet: Net - // Target Q-network var targetQNet: Net - // Optimizer let optimizer: Adam - // Replay Buffer let replayBuffer: ReplayBuffer - // Discount Factor let discount: Float init(qNet: Net, targetQNet: Net, optimizer: Adam, replayBuffer: ReplayBuffer, discount: Float) { @@ -169,10 +160,8 @@ class Agent { else { // Neural network input needs to be 2D let tfState = Tensor(numpy: np.expand_dims(state.makeNumpyArray(), axis: 0))! - let qValues = qNet(tfState) - let leftQValue = Float(qValues[0][0]).unwrapped() - let rightQValue = Float(qValues[0][1]).unwrapped() - return leftQValue < rightQValue ? Tensor(numpy: np.array(1, dtype: np.int32))! : Tensor(numpy: np.array(0, dtype: np.int32))! + let qValues = qNet(tfState)[0] + return qValues[1].scalarized() > qValues[0].scalarized() ? Tensor(1) : Tensor(0) } } @@ -185,14 +174,13 @@ class Agent { let 𝛁qNet = gradient(at: qNet) { qNet -> Tensor in // Compute prediction batch let npActionBatch = tfActionBatch.makeNumpyArray() - let npFullIndices = np.stack([np.arange(batchSize, dtype: np.int32), npActionBatch.flatten()], axis: 1) + let npFullIndices = np.stack([np.arange(batchSize, dtype: np.int32), npActionBatch], axis: 1) let tfFullIndices = Tensor(numpy: npFullIndices)! 
let stateQValueBatch = qNet(tfStateBatch) let predictionBatch = _Raw.gatherNd(params: stateQValueBatch, indices: tfFullIndices) - // TODO: Just save rewards as 1D to avoid this extra squeeze operation // Compute target batch - let targetBatch: Tensor = _Raw.squeeze(tfRewardBatch, squeezeDims: [1]) + Tensor(tfIsDoneBatch) * self.discount * _Raw.max(self.targetQNet(tfNextStateBatch), reductionIndices: Tensor(1)) + let targetBatch: Tensor = tfRewardBatch + Tensor(tfIsDoneBatch) * self.discount * _Raw.max(self.targetQNet(tfNextStateBatch), reductionIndices: Tensor(1)) return meanSquaredError(predicted: predictionBatch, expected: withoutDerivative(at: targetBatch)) } @@ -210,13 +198,9 @@ func updateTargetQNet(source: Net, target: inout Net) { class TensorFlowEnvironmentWrapper { let originalEnv: PythonObject - let action_space: PythonObject - let observation_space: PythonObject init(_ env: PythonObject) { self.originalEnv = env - self.action_space = env.action_space - self.observation_space = env.observation_space } func reset() -> Tensor { @@ -225,8 +209,7 @@ class TensorFlowEnvironmentWrapper { } func step(_ action: Tensor) -> (state: Tensor, reward: Tensor, isDone: Tensor, info: PythonObject) { - let npAction = action.makeNumpyArray().item() - let (state, reward, isDone, info) = originalEnv.step(npAction).tuple4 + let (state, reward, isDone, info) = originalEnv.step(action.scalarized()).tuple4 let tfState = Tensor(numpy: np.array(state, dtype: np.float32))! let tfReward = Tensor(numpy: np.array(reward, dtype: np.float32))! let tfIsDone = Tensor(numpy: np.array(isDone, dtype: np.bool))! @@ -249,9 +232,8 @@ let targetNetUpdateRate: Int = 32 let env = TensorFlowEnvironmentWrapper(gym.make("CartPole-v0")) // Initialize agent -let actionCount = Int(env.action_space.n).unwrapped() -var qNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: actionCount) -var targetQNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: actionCount) +var qNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) +var targetQNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) updateTargetQNet(source: qNet, target: &targetQNet) let optimizer = Adam(for: qNet, learningRate: learningRate) var replayBuffer: ReplayBuffer = ReplayBuffer(capacity: replayBufferCapacity) @@ -260,8 +242,8 @@ var agent = Agent(qNet: qNet, targetQNet: targetQNet, optimizer: optimizer, repl // RL Loop var stepIndex = 0 var episodeIndex = 0 -var episodeReturn: Int = 0 -var episodeReturns: Array = [] +var episodeReturn: Float = 0 +var episodeReturns: Array = [] var state = env.reset() while episodeIndex < maxEpisode { stepIndex += 1 @@ -270,7 +252,7 @@ while episodeIndex < maxEpisode { let epsilon = startEpsilon * Float(maxEpisode - episodeIndex) / Float(maxEpisode) let action = agent.getAction(state: state, epsilon: epsilon) let (nextState, reward, isDone, _) = env.step(action) - episodeReturn += Int(reward.scalarized()) + episodeReturn += reward.scalarized() // Save interaction to replay buffer replayBuffer.append(state: state, action: action, reward: reward, nextState: nextState, isDone: isDone) @@ -287,7 +269,7 @@ while episodeIndex < maxEpisode { if isDone.scalarized() == true { state = env.reset() episodeIndex += 1 - print(String(format: "Episode: %4d | Epsilon: %.03f | Return: %3d", episodeIndex, epsilon, episodeReturn)) + print(String(format: "Episode: %4d | Epsilon: %.03f | Return: %3d", episodeIndex, epsilon, Int(episodeReturn))) if episodeReturn > 199 { print("Solved in 
\(episodeIndex) episodes with \(stepIndex) steps!") break From eca8a920dafc0f9116f4804f513ef4b3562810c7 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Wed, 1 Jul 2020 03:39:50 +0000 Subject: [PATCH 13/34] Save TD loss curve --- Gym/DQN/main.swift | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index f982f9edfe6..f33fd031803 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -165,13 +165,12 @@ class Agent { } } - func train(batchSize: Int) { + func train(batchSize: Int) -> Float { // Don't train if replay buffer is too small if replayBuffer.count >= minBufferSize { let (tfStateBatch, tfActionBatch, tfRewardBatch, tfNextStateBatch, tfIsDoneBatch) = replayBuffer.sample(batchSize: batchSize) - // TODO: Check gradient values - let 𝛁qNet = gradient(at: qNet) { qNet -> Tensor in + let (loss, gradients) = valueWithGradient(at: qNet) { qNet -> Tensor in // Compute prediction batch let npActionBatch = tfActionBatch.makeNumpyArray() let npFullIndices = np.stack([np.arange(batchSize, dtype: np.int32), npActionBatch], axis: 1) @@ -184,8 +183,11 @@ class Agent { return meanSquaredError(predicted: predictionBatch, expected: withoutDerivative(at: targetBatch)) } - optimizer.update(&qNet, along: 𝛁qNet) + optimizer.update(&qNet, along: gradients) + + return loss.scalarized() } + return 0 } } @@ -244,6 +246,7 @@ var stepIndex = 0 var episodeIndex = 0 var episodeReturn: Float = 0 var episodeReturns: Array = [] +var losses: Array = [] var state = env.reset() while episodeIndex < maxEpisode { stepIndex += 1 @@ -258,7 +261,7 @@ while episodeIndex < maxEpisode { replayBuffer.append(state: state, action: action, reward: reward, nextState: nextState, isDone: isDone) // Train agent - agent.train(batchSize: batchSize) + losses.append(agent.train(batchSize: batchSize)) // Periodically update Target Net if stepIndex % targetNetUpdateRate == 0 { @@ -291,3 +294,12 @@ plt.title("Deep Q-Network on CartPole-v0") plt.xlabel("Episode") plt.ylabel("Smoothed Episode Return") plt.savefig("dqnSmoothedEpisodeReturns.png") +plt.clf() + +// Save TD loss curve +plt.plot(losses) +plt.title("Deep Q-Network on CartPole-v0") +plt.xlabel("Step") +plt.ylabel("TD Loss") +plt.savefig("dqnTDLoss.png") +plt.clf() From ae087dd9e7162e7d85d66aea264e715f7266ede9 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Thu, 2 Jul 2020 08:06:15 +0000 Subject: [PATCH 14/34] Purge uses of _Raw operations --- Gym/DQN/main.swift | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index f33fd031803..e48e208a0f5 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -110,11 +110,11 @@ class ReplayBuffer { ) { let randomIndices = Tensor(numpy: np.random.randint(count, size: batchSize, dtype: np.int32))! 
- let stateBatch = _Raw.gather(params: states, indices: randomIndices) - let actionBatch = _Raw.gather(params: actions, indices: randomIndices) - let rewardBatch = _Raw.gather(params: rewards, indices: randomIndices) - let nextStateBatch = _Raw.gather(params: nextStates, indices: randomIndices) - let isDoneBatch = _Raw.gather(params: isDones, indices: randomIndices) + let stateBatch = states.gathering(atIndices: randomIndices, alongAxis: 0) + let actionBatch = actions.gathering(atIndices: randomIndices, alongAxis: 0) + let rewardBatch = rewards.gathering(atIndices: randomIndices, alongAxis: 0) + let nextStateBatch = nextStates.gathering(atIndices: randomIndices, alongAxis: 0) + let isDoneBatch = isDones.gathering(atIndices: randomIndices, alongAxis: 0) return (stateBatch, actionBatch, rewardBatch, nextStateBatch, isDoneBatch) } @@ -179,8 +179,7 @@ class Agent { let predictionBatch = _Raw.gatherNd(params: stateQValueBatch, indices: tfFullIndices) // Compute target batch - let targetBatch: Tensor = tfRewardBatch + Tensor(tfIsDoneBatch) * self.discount * _Raw.max(self.targetQNet(tfNextStateBatch), reductionIndices: Tensor(1)) - + let targetBatch: Tensor = tfRewardBatch + Tensor(tfIsDoneBatch) * self.discount * self.targetQNet(tfNextStateBatch).max(squeezingAxes: Tensor(1)) return meanSquaredError(predicted: predictionBatch, expected: withoutDerivative(at: targetBatch)) } optimizer.update(&qNet, along: gradients) From 4acd6ce200dd49fd4701853a6d2f7ed94e5cc1fb Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Thu, 2 Jul 2020 08:13:32 +0000 Subject: [PATCH 15/34] Use Huber loss instead of MSE --- Gym/DQN/main.swift | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index e48e208a0f5..6b2e5a5f76f 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -180,7 +180,11 @@ class Agent { // Compute target batch let targetBatch: Tensor = tfRewardBatch + Tensor(tfIsDoneBatch) * self.discount * self.targetQNet(tfNextStateBatch).max(squeezingAxes: Tensor(1)) - return meanSquaredError(predicted: predictionBatch, expected: withoutDerivative(at: targetBatch)) + return huberLoss( + predicted: predictionBatch, + expected: withoutDerivative(at: targetBatch), + delta: 1 + ) } optimizer.update(&qNet, along: gradients) From 22aaf75d9b4a56ec0087b3f64ba8d80e3b74f166 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Thu, 2 Jul 2020 10:15:03 +0000 Subject: [PATCH 16/34] Simplify Tensor initialization --- Gym/DQN/main.swift | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 6b2e5a5f76f..2e9edfd26de 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -75,11 +75,11 @@ class ReplayBuffer { init(capacity: Int) { self.capacity = capacity - states = Tensor(numpy: np.zeros([capacity, 4], dtype: np.float32))! - actions = Tensor(numpy: np.zeros([capacity], dtype: np.int32))! - rewards = Tensor(numpy: np.zeros([capacity], dtype: np.float32))! - nextStates = Tensor(numpy: np.zeros([capacity, 4], dtype: np.float32))! - isDones = Tensor(numpy: np.zeros([capacity], dtype: np.bool))! 
+ states = Tensor(zeros: [capacity, 4]) + actions = Tensor(zeros: [capacity]) + rewards = Tensor(zeros: [capacity]) + nextStates = Tensor(zeros: [capacity, 4]) + isDones = Tensor(repeating: false, shape: [capacity]) } func append( @@ -154,8 +154,7 @@ class Agent { func getAction(state: Tensor, epsilon: Float) -> Tensor { if Float(np.random.uniform()).unwrapped() < epsilon { - let npState = np.random.randint(0, 2, dtype: np.int32) - return Tensor(numpy: np.array(npState, dtype: np.int32))! + return Tensor(numpy: np.array(np.random.randint(0, 2), dtype: np.int32))! } else { // Neural network input needs to be 2D From 24392f3aeb6caed312a3017e1f5126373db6b95f Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Thu, 2 Jul 2020 10:28:21 +0000 Subject: [PATCH 17/34] Set device explicitly on Tensor creation --- Gym/DQN/main.swift | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 2e9edfd26de..6e042b7c34f 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -63,23 +63,26 @@ let gym = Python.import("gym") let plt = Python.import("matplotlib.pyplot") class ReplayBuffer { + let capacity: Int + let device: Device + var states: Tensor var actions: Tensor var rewards: Tensor var nextStates: Tensor var isDones: Tensor - let capacity: Int var count: Int = 0 var index: Int = 0 - init(capacity: Int) { + init(capacity: Int, device: Device) { self.capacity = capacity + self.device = device - states = Tensor(zeros: [capacity, 4]) - actions = Tensor(zeros: [capacity]) - rewards = Tensor(zeros: [capacity]) - nextStates = Tensor(zeros: [capacity, 4]) - isDones = Tensor(repeating: false, shape: [capacity]) + states = Tensor(zeros: [capacity, 4], on: device) + actions = Tensor(zeros: [capacity], on: device) + rewards = Tensor(zeros: [capacity], on: device) + nextStates = Tensor(zeros: [capacity, 4], on: device) + isDones = Tensor(repeating: false, shape: [capacity], on: device) } func append( @@ -143,13 +146,15 @@ class Agent { let optimizer: Adam let replayBuffer: ReplayBuffer let discount: Float + let device: Device - init(qNet: Net, targetQNet: Net, optimizer: Adam, replayBuffer: ReplayBuffer, discount: Float) { + init(qNet: Net, targetQNet: Net, optimizer: Adam, replayBuffer: ReplayBuffer, discount: Float, device: Device) { self.qNet = qNet self.targetQNet = targetQNet self.optimizer = optimizer self.replayBuffer = replayBuffer self.discount = discount + self.device = device } func getAction(state: Tensor, epsilon: Float) -> Tensor { @@ -160,7 +165,7 @@ class Agent { // Neural network input needs to be 2D let tfState = Tensor(numpy: np.expand_dims(state.makeNumpyArray(), axis: 0))! let qValues = qNet(tfState)[0] - return qValues[1].scalarized() > qValues[0].scalarized() ? Tensor(1) : Tensor(0) + return Tensor(qValues[1].scalarized() > qValues[0].scalarized() ? 
1 : 0, on: device) } } @@ -231,6 +236,7 @@ let replayBufferCapacity: Int = 5000 let minBufferSize: Int = 1000 let batchSize: Int = 64 let targetNetUpdateRate: Int = 32 +let device: Device = Device.default // Initialize environment let env = TensorFlowEnvironmentWrapper(gym.make("CartPole-v0")) @@ -240,8 +246,8 @@ var qNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) var targetQNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) updateTargetQNet(source: qNet, target: &targetQNet) let optimizer = Adam(for: qNet, learningRate: learningRate) -var replayBuffer: ReplayBuffer = ReplayBuffer(capacity: replayBufferCapacity) -var agent = Agent(qNet: qNet, targetQNet: targetQNet, optimizer: optimizer, replayBuffer: replayBuffer, discount: discount) +var replayBuffer: ReplayBuffer = ReplayBuffer(capacity: replayBufferCapacity, device: device) +var agent = Agent(qNet: qNet, targetQNet: targetQNet, optimizer: optimizer, replayBuffer: replayBuffer, discount: discount, device: device) // RL Loop var stepIndex = 0 From ccfa0873b3c19327e996b5839d097e9eb32f1eae Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Mon, 3 Aug 2020 05:11:47 +0000 Subject: [PATCH 18/34] Add minBufferSize to Agent argument --- Gym/DQN/main.swift | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 6e042b7c34f..6ef6b6f6500 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -146,14 +146,16 @@ class Agent { let optimizer: Adam let replayBuffer: ReplayBuffer let discount: Float + let minBufferSize: Int let device: Device - init(qNet: Net, targetQNet: Net, optimizer: Adam, replayBuffer: ReplayBuffer, discount: Float, device: Device) { + init(qNet: Net, targetQNet: Net, optimizer: Adam, replayBuffer: ReplayBuffer, discount: Float, minBufferSize: Int, device: Device) { self.qNet = qNet self.targetQNet = targetQNet self.optimizer = optimizer self.replayBuffer = replayBuffer self.discount = discount + self.minBufferSize = minBufferSize self.device = device } @@ -247,7 +249,7 @@ var targetQNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) updateTargetQNet(source: qNet, target: &targetQNet) let optimizer = Adam(for: qNet, learningRate: learningRate) var replayBuffer: ReplayBuffer = ReplayBuffer(capacity: replayBufferCapacity, device: device) -var agent = Agent(qNet: qNet, targetQNet: targetQNet, optimizer: optimizer, replayBuffer: replayBuffer, discount: discount, device: device) +var agent = Agent(qNet: qNet, targetQNet: targetQNet, optimizer: optimizer, replayBuffer: replayBuffer, discount: discount, minBufferSize: minBufferSize, device: device) // RL Loop var stepIndex = 0 From 65de04e0af372c5a42d8d72e12aaf027bef10d54 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Mon, 3 Aug 2020 05:35:39 +0000 Subject: [PATCH 19/34] Use soft target updates --- Gym/DQN/main.swift | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 6ef6b6f6500..9ab48aa1bbb 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -200,11 +200,11 @@ class Agent { } } -func updateTargetQNet(source: Net, target: inout Net) { - target.l1.weight = Tensor(source.l1.weight) - target.l1.bias = Tensor(source.l1.bias) - target.l2.weight = Tensor(source.l2.weight) - target.l2.bias = Tensor(source.l2.bias) +func updateTargetQNet(source: Net, target: inout Net, softTargetUpdateRate: Float = 0.001) { + target.l1.weight = softTargetUpdateRate * Tensor(source.l1.weight) + (1 
- softTargetUpdateRate) * target.l1.weight + target.l1.bias = softTargetUpdateRate * Tensor(source.l1.bias) + (1 - softTargetUpdateRate) * target.l1.bias + target.l2.weight = softTargetUpdateRate * Tensor(source.l2.weight) + (1 - softTargetUpdateRate) * target.l2.weight + target.l2.bias = softTargetUpdateRate * Tensor(source.l2.bias) + (1 - softTargetUpdateRate) * target.l2.bias } class TensorFlowEnvironmentWrapper { From bcbb7e269d61403813916fae7a89928b7fae1dcf Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Mon, 3 Aug 2020 09:04:03 +0000 Subject: [PATCH 20/34] Fix bug where isDone was used wrong --- Gym/DQN/main.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 9ab48aa1bbb..a73726ea2be 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -185,7 +185,7 @@ class Agent { let predictionBatch = _Raw.gatherNd(params: stateQValueBatch, indices: tfFullIndices) // Compute target batch - let targetBatch: Tensor = tfRewardBatch + Tensor(tfIsDoneBatch) * self.discount * self.targetQNet(tfNextStateBatch).max(squeezingAxes: Tensor(1)) + let targetBatch: Tensor = tfRewardBatch + (1 - Tensor(tfIsDoneBatch)) * self.discount * self.targetQNet(tfNextStateBatch).max(squeezingAxes: Tensor(1)) return huberLoss( predicted: predictionBatch, expected: withoutDerivative(at: targetBatch), From a20322674a9fb4f77d925d81796da93df06690a2 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Mon, 3 Aug 2020 09:07:00 +0000 Subject: [PATCH 21/34] Fix bug where target net is initialized with soft update --- Gym/DQN/main.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index a73726ea2be..9a01b555181 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -246,7 +246,7 @@ let env = TensorFlowEnvironmentWrapper(gym.make("CartPole-v0")) // Initialize agent var qNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) var targetQNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) -updateTargetQNet(source: qNet, target: &targetQNet) +updateTargetQNet(source: qNet, target: &targetQNet, softTargetUpdateRate: 1) let optimizer = Adam(for: qNet, learningRate: learningRate) var replayBuffer: ReplayBuffer = ReplayBuffer(capacity: replayBufferCapacity, device: device) var agent = Agent(qNet: qNet, targetQNet: targetQNet, optimizer: optimizer, replayBuffer: replayBuffer, discount: discount, minBufferSize: minBufferSize, device: device) From e757c0fdd263b4e03e7450bac4bf2fe073f9f771 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Mon, 3 Aug 2020 10:37:57 +0000 Subject: [PATCH 22/34] Follow hyperparameters in swift-rl --- Gym/DQN/main.swift | 52 +++++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 9a01b555181..988a13f245b 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -130,8 +130,8 @@ struct Net: Layer { var l1, l2: Dense init(observationSize: Int, hiddenSize: Int, actionCount: Int) { - l1 = Dense(inputSize: observationSize, outputSize: hiddenSize, activation: relu, weightInitializer: heNormal()) - l2 = Dense(inputSize: hiddenSize, outputSize: actionCount, activation: identity, weightInitializer: heNormal()) + l1 = Dense(inputSize: observationSize, outputSize: hiddenSize, activation: relu) + l2 = Dense(inputSize: hiddenSize, outputSize: actionCount, activation: identity) } @differentiable @@ -143,13 +143,13 @@ struct Net: Layer { class Agent 
{ var qNet: Net var targetQNet: Net - let optimizer: Adam + let optimizer: AMSGrad let replayBuffer: ReplayBuffer let discount: Float let minBufferSize: Int let device: Device - init(qNet: Net, targetQNet: Net, optimizer: Adam, replayBuffer: ReplayBuffer, discount: Float, minBufferSize: Int, device: Device) { + init(qNet: Net, targetQNet: Net, optimizer: AMSGrad, replayBuffer: ReplayBuffer, discount: Float, minBufferSize: Int, device: Device) { self.qNet = qNet self.targetQNet = targetQNet self.optimizer = optimizer @@ -185,10 +185,12 @@ class Agent { let predictionBatch = _Raw.gatherNd(params: stateQValueBatch, indices: tfFullIndices) // Compute target batch - let targetBatch: Tensor = tfRewardBatch + (1 - Tensor(tfIsDoneBatch)) * self.discount * self.targetQNet(tfNextStateBatch).max(squeezingAxes: Tensor(1)) + let nextStateQValueBatch = self.targetQNet(tfNextStateBatch).max(squeezingAxes: Tensor(1)) + let targetBatch: Tensor = tfRewardBatch + self.discount * (1 - Tensor(tfIsDoneBatch)) * nextStateQValueBatch + return huberLoss( predicted: predictionBatch, - expected: withoutDerivative(at: targetBatch), + expected: targetBatch, delta: 1 ) } @@ -200,7 +202,7 @@ class Agent { } } -func updateTargetQNet(source: Net, target: inout Net, softTargetUpdateRate: Float = 0.001) { +func updateTargetQNet(source: Net, target: inout Net, softTargetUpdateRate: Float) { target.l1.weight = softTargetUpdateRate * Tensor(source.l1.weight) + (1 - softTargetUpdateRate) * target.l1.weight target.l1.bias = softTargetUpdateRate * Tensor(source.l1.bias) + (1 - softTargetUpdateRate) * target.l1.bias target.l2.weight = softTargetUpdateRate * Tensor(source.l2.weight) + (1 - softTargetUpdateRate) * target.l2.weight @@ -230,14 +232,15 @@ class TensorFlowEnvironmentWrapper { // Hyperparameters let discount: Float = 0.99 -let learningRate: Float = 0.01 -let hiddenSize: Int = 64 -let startEpsilon: Float = 0.5 +let learningRate: Float = 0.001 +let hiddenSize: Int = 100 +let startEpsilon: Float = 0.5 // TODO(seungjaeryanlee): Ignored right now let maxEpisode: Int = 1000 -let replayBufferCapacity: Int = 5000 -let minBufferSize: Int = 1000 -let batchSize: Int = 64 -let targetNetUpdateRate: Int = 32 +let replayBufferCapacity: Int = 1000 +let minBufferSize: Int = 32 +let batchSize: Int = 32 +let targetNetUpdateRate: Int = 5 +let softTargetUpdateRate: Float = 0.05 let device: Device = Device.default // Initialize environment @@ -247,7 +250,7 @@ let env = TensorFlowEnvironmentWrapper(gym.make("CartPole-v0")) var qNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) var targetQNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) updateTargetQNet(source: qNet, target: &targetQNet, softTargetUpdateRate: 1) -let optimizer = Adam(for: qNet, learningRate: learningRate) +let optimizer = AMSGrad(for: qNet, learningRate: learningRate) var replayBuffer: ReplayBuffer = ReplayBuffer(capacity: replayBufferCapacity, device: device) var agent = Agent(qNet: qNet, targetQNet: targetQNet, optimizer: optimizer, replayBuffer: replayBuffer, discount: discount, minBufferSize: minBufferSize, device: device) @@ -258,11 +261,17 @@ var episodeReturn: Float = 0 var episodeReturns: Array = [] var losses: Array = [] var state = env.reset() +var bestReturn: Float = 0 while episodeIndex < maxEpisode { stepIndex += 1 // Interact with environment - let epsilon = startEpsilon * Float(maxEpisode - episodeIndex) / Float(maxEpisode) + // let epsilon = startEpsilon * Float(maxEpisode - episodeIndex) / Float(maxEpisode) + let 
epsilon: Float = 0.1 + // let epsilon_start: Float = 0.9 + // let epsilon_end: Float = 0.05 + // let epsilon_decay: Int = 200 + // let epsilon: Float = epsilon_end + (epsilon_start - epsilon_end) * Float(np.exp(-1 * stepIndex / epsilon_decay, dtype: np.float32))! let action = agent.getAction(state: state, epsilon: epsilon) let (nextState, reward, isDone, _) = env.step(action) episodeReturn += reward.scalarized() @@ -275,14 +284,19 @@ while episodeIndex < maxEpisode { // Periodically update Target Net if stepIndex % targetNetUpdateRate == 0 { - updateTargetQNet(source: qNet, target: &targetQNet) + updateTargetQNet(source: qNet, target: &targetQNet, softTargetUpdateRate: softTargetUpdateRate) } // End-of-episode if isDone.scalarized() == true { state = env.reset() episodeIndex += 1 - print(String(format: "Episode: %4d | Epsilon: %.03f | Return: %3d", episodeIndex, epsilon, Int(episodeReturn))) + print(String(format: "Episode: %4d | Step %6d | Epsilon: %.03f | Return: %3d", episodeIndex, stepIndex, epsilon, Int(episodeReturn))) + if episodeReturn > bestReturn { + // print(String(format: "Episode: %4d | Step %6d | Epsilon: %.03f | Return: %3d", episodeIndex, stepIndex, epsilon, Int(episodeReturn))) + // print("New best return of \(episodeReturn)") + bestReturn = episodeReturn + } if episodeReturn > 199 { print("Solved in \(episodeIndex) episodes with \(stepIndex) steps!") break @@ -296,7 +310,7 @@ while episodeIndex < maxEpisode { } // Save smoothed learning curve -let runningMeanWindow: Int = 2 +let runningMeanWindow: Int = 1 let smoothedEpisodeReturns = np.convolve(episodeReturns, np.ones((runningMeanWindow)) / np.array(runningMeanWindow, dtype: np.int32), mode: "same") plt.plot(smoothedEpisodeReturns) From d2be5bd5dd3f66199a3cb048b6a931c1e8d86488 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Mon, 3 Aug 2020 13:51:34 +0000 Subject: [PATCH 23/34] Run evaluation episode for every training episode --- Gym/DQN/main.swift | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 988a13f245b..7bf0e0042b6 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -230,6 +230,21 @@ class TensorFlowEnvironmentWrapper { } } +func eval(agent: Agent) -> Float { + let evalEnv = TensorFlowEnvironmentWrapper(gym.make("CartPole-v0")) + var evalEpisodeReturn: Float = 0 + var state: Tensor = evalEnv.reset() + var reward: Tensor + var evalIsDone: Tensor = Tensor(false) + while evalIsDone.scalarized() == false { + let action = agent.getAction(state: state, epsilon: 0) + (state, reward, evalIsDone, _) = evalEnv.step(action) + evalEpisodeReturn += reward.scalarized() + } + + return evalEpisodeReturn +} + // Hyperparameters let discount: Float = 0.99 let learningRate: Float = 0.001 @@ -289,19 +304,20 @@ while episodeIndex < maxEpisode { // End-of-episode if isDone.scalarized() == true { + let evalEpisodeReturn = eval(agent: agent) state = env.reset() episodeIndex += 1 - print(String(format: "Episode: %4d | Step %6d | Epsilon: %.03f | Return: %3d", episodeIndex, stepIndex, epsilon, Int(episodeReturn))) - if episodeReturn > bestReturn { - // print(String(format: "Episode: %4d | Step %6d | Epsilon: %.03f | Return: %3d", episodeIndex, stepIndex, epsilon, Int(episodeReturn))) + // print(String(format: "Episode: %4d | Step %6d | Epsilon: %.03f | Return: %3d", episodeIndex, stepIndex, epsilon, Int(episodeReturn))) + if evalEpisodeReturn > bestReturn { + print(String(format: "Episode: %4d | Step %6d | Epsilon: %.03f | Return: %3d 
| Eval : %3d", episodeIndex, stepIndex, epsilon, Int(episodeReturn), Int(evalEpisodeReturn))) // print("New best return of \(episodeReturn)") - bestReturn = episodeReturn + bestReturn = evalEpisodeReturn } - if episodeReturn > 199 { + if evalEpisodeReturn > 199 { print("Solved in \(episodeIndex) episodes with \(stepIndex) steps!") break } - episodeReturns.append(episodeReturn) + episodeReturns.append(evalEpisodeReturn) episodeReturn = 0 } From 6a118ab0ddc3e8be51df01a42e340586f5ed2a06 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Tue, 4 Aug 2020 01:55:23 +0000 Subject: [PATCH 24/34] Implement combined experience replay --- Gym/DQN/main.swift | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 7bf0e0042b6..b7c0dcca9b3 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -111,7 +111,11 @@ class ReplayBuffer { nextStateBatch: Tensor, isDoneBatch: Tensor ) { - let randomIndices = Tensor(numpy: np.random.randint(count, size: batchSize, dtype: np.int32))! + // Vanilla + // let randomIndices = Tensor(numpy: np.random.randint(count, size: batchSize, dtype: np.int32))! + // Combined Experience Replay + let sampledIndices = np.random.randint(count, size: batchSize-1, dtype: np.int32) + let randomIndices = Tensor(numpy: np.append(sampledIndices, np.array([(index + capacity - 1) % capacity], dtype: np.int32)))! let stateBatch = states.gathering(atIndices: randomIndices, alongAxis: 0) let actionBatch = actions.gathering(atIndices: randomIndices, alongAxis: 0) From ce539e564f15ec670969e560cad863340b5767a2 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Tue, 4 Aug 2020 01:55:40 +0000 Subject: [PATCH 25/34] Implement double DQN --- Gym/DQN/main.swift | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index b7c0dcca9b3..dbf9eeb59d9 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -189,7 +189,13 @@ class Agent { let predictionBatch = _Raw.gatherNd(params: stateQValueBatch, indices: tfFullIndices) // Compute target batch - let nextStateQValueBatch = self.targetQNet(tfNextStateBatch).max(squeezingAxes: Tensor(1)) + // DQN + // let nextStateQValueBatch = self.targetQNet(tfNextStateBatch).max(squeezingAxes: 1) + // DDQN + let npNextStateActionBatch = self.qNet(tfNextStateBatch).argmax(squeezingAxis: 1).makeNumpyArray() + let npNextStateFullIndices = np.stack([np.arange(batchSize, dtype: np.int32), npNextStateActionBatch], axis: 1) + let tfNextStateFullIndices = Tensor(numpy: npNextStateFullIndices)! 
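+        // Double DQN: the online qNet selects the greedy next action (the argmax
+        // above), while targetQNet evaluates that action in the gather below.
+        // Decoupling selection from evaluation mitigates Q-value overestimation.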
+ let nextStateQValueBatch = _Raw.gatherNd(params: self.targetQNet(tfNextStateBatch), indices: tfNextStateFullIndices) let targetBatch: Tensor = tfRewardBatch + self.discount * (1 - Tensor(tfIsDoneBatch)) * nextStateQValueBatch return huberLoss( From cf7b96a4753409edf5c1f7b8cb5db57f37db7830 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Tue, 4 Aug 2020 02:07:34 +0000 Subject: [PATCH 26/34] Add options to toggle CER and DDQN --- Gym/DQN/main.swift | 70 +++++++++++++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 22 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index dbf9eeb59d9..504630e3ddc 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -64,6 +64,7 @@ let plt = Python.import("matplotlib.pyplot") class ReplayBuffer { let capacity: Int + let combined: Bool let device: Device var states: Tensor @@ -74,8 +75,9 @@ class ReplayBuffer { var count: Int = 0 var index: Int = 0 - init(capacity: Int, device: Device) { + init(capacity: Int, combined: Bool, device: Device) { self.capacity = capacity + self.combined = combined self.device = device states = Tensor(zeros: [capacity, 4], on: device) @@ -111,17 +113,23 @@ class ReplayBuffer { nextStateBatch: Tensor, isDoneBatch: Tensor ) { - // Vanilla - // let randomIndices = Tensor(numpy: np.random.randint(count, size: batchSize, dtype: np.int32))! - // Combined Experience Replay - let sampledIndices = np.random.randint(count, size: batchSize-1, dtype: np.int32) - let randomIndices = Tensor(numpy: np.append(sampledIndices, np.array([(index + capacity - 1) % capacity], dtype: np.int32)))! - - let stateBatch = states.gathering(atIndices: randomIndices, alongAxis: 0) - let actionBatch = actions.gathering(atIndices: randomIndices, alongAxis: 0) - let rewardBatch = rewards.gathering(atIndices: randomIndices, alongAxis: 0) - let nextStateBatch = nextStates.gathering(atIndices: randomIndices, alongAxis: 0) - let isDoneBatch = isDones.gathering(atIndices: randomIndices, alongAxis: 0) + let indices: Tensor + if self.combined == true { + // Combined Experience Replay + let sampledIndices = np.random.randint(count, size: batchSize - 1, dtype: np.int32) + let lastIndex = np.array([(index + capacity - 1) % capacity], dtype: np.int32) + indices = Tensor(numpy: np.append(sampledIndices, lastIndex))! + } + else { + // Vanilla Experience Replay + indices = Tensor(numpy: np.random.randint(count, size: batchSize, dtype: np.int32))! 
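+      // Indices are drawn uniformly, with replacement, from the `count` filled slots.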
+ } + + let stateBatch = states.gathering(atIndices: indices, alongAxis: 0) + let actionBatch = actions.gathering(atIndices: indices, alongAxis: 0) + let rewardBatch = rewards.gathering(atIndices: indices, alongAxis: 0) + let nextStateBatch = nextStates.gathering(atIndices: indices, alongAxis: 0) + let isDoneBatch = isDones.gathering(atIndices: indices, alongAxis: 0) return (stateBatch, actionBatch, rewardBatch, nextStateBatch, isDoneBatch) } @@ -151,15 +159,26 @@ class Agent { let replayBuffer: ReplayBuffer let discount: Float let minBufferSize: Int + let doubleDQN: Bool let device: Device - init(qNet: Net, targetQNet: Net, optimizer: AMSGrad, replayBuffer: ReplayBuffer, discount: Float, minBufferSize: Int, device: Device) { + init( + qNet: Net, + targetQNet: Net, + optimizer: AMSGrad, + replayBuffer: ReplayBuffer, + discount: Float, + minBufferSize: Int, + doubleDQN: Bool, + device: Device + ) { self.qNet = qNet self.targetQNet = targetQNet self.optimizer = optimizer self.replayBuffer = replayBuffer self.discount = discount self.minBufferSize = minBufferSize + self.doubleDQN = doubleDQN self.device = device } @@ -189,13 +208,18 @@ class Agent { let predictionBatch = _Raw.gatherNd(params: stateQValueBatch, indices: tfFullIndices) // Compute target batch - // DQN - // let nextStateQValueBatch = self.targetQNet(tfNextStateBatch).max(squeezingAxes: 1) - // DDQN - let npNextStateActionBatch = self.qNet(tfNextStateBatch).argmax(squeezingAxis: 1).makeNumpyArray() - let npNextStateFullIndices = np.stack([np.arange(batchSize, dtype: np.int32), npNextStateActionBatch], axis: 1) - let tfNextStateFullIndices = Tensor(numpy: npNextStateFullIndices)! - let nextStateQValueBatch = _Raw.gatherNd(params: self.targetQNet(tfNextStateBatch), indices: tfNextStateFullIndices) + let nextStateQValueBatch: Tensor + if self.doubleDQN == true { + // Double DQN + let npNextStateActionBatch = self.qNet(tfNextStateBatch).argmax(squeezingAxis: 1).makeNumpyArray() + let npNextStateFullIndices = np.stack([np.arange(batchSize, dtype: np.int32), npNextStateActionBatch], axis: 1) + let tfNextStateFullIndices = Tensor(numpy: npNextStateFullIndices)! 
+ nextStateQValueBatch = _Raw.gatherNd(params: self.targetQNet(tfNextStateBatch), indices: tfNextStateFullIndices) + } + else { + // DQN + nextStateQValueBatch = self.targetQNet(tfNextStateBatch).max(squeezingAxes: 1) + } let targetBatch: Tensor = tfRewardBatch + self.discount * (1 - Tensor(tfIsDoneBatch)) * nextStateQValueBatch return huberLoss( @@ -262,6 +286,8 @@ let hiddenSize: Int = 100 let startEpsilon: Float = 0.5 // TODO(seungjaeryanlee): Ignored right now let maxEpisode: Int = 1000 let replayBufferCapacity: Int = 1000 +let useCombinedExperienceReplay: Bool = true +let useDoubleDQN: Bool = true let minBufferSize: Int = 32 let batchSize: Int = 32 let targetNetUpdateRate: Int = 5 @@ -276,8 +302,8 @@ var qNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) var targetQNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) updateTargetQNet(source: qNet, target: &targetQNet, softTargetUpdateRate: 1) let optimizer = AMSGrad(for: qNet, learningRate: learningRate) -var replayBuffer: ReplayBuffer = ReplayBuffer(capacity: replayBufferCapacity, device: device) -var agent = Agent(qNet: qNet, targetQNet: targetQNet, optimizer: optimizer, replayBuffer: replayBuffer, discount: discount, minBufferSize: minBufferSize, device: device) +var replayBuffer: ReplayBuffer = ReplayBuffer(capacity: replayBufferCapacity, combined: useCombinedExperienceReplay, device: device) +var agent = Agent(qNet: qNet, targetQNet: targetQNet, optimizer: optimizer, replayBuffer: replayBuffer, discount: discount, minBufferSize: minBufferSize, doubleDQN: useDoubleDQN, device: device) // RL Loop var stepIndex = 0 From 98b4647107a377c15eb5d958cefdb7683a17fb45 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Tue, 4 Aug 2020 02:37:07 +0000 Subject: [PATCH 27/34] Refactor code --- Gym/DQN/main.swift | 72 +++++++++++++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 27 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 504630e3ddc..6a4d5ed8589 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -236,12 +236,7 @@ class Agent { } } -func updateTargetQNet(source: Net, target: inout Net, softTargetUpdateRate: Float) { - target.l1.weight = softTargetUpdateRate * Tensor(source.l1.weight) + (1 - softTargetUpdateRate) * target.l1.weight - target.l1.bias = softTargetUpdateRate * Tensor(source.l1.bias) + (1 - softTargetUpdateRate) * target.l1.bias - target.l2.weight = softTargetUpdateRate * Tensor(source.l2.weight) + (1 - softTargetUpdateRate) * target.l2.weight - target.l2.bias = softTargetUpdateRate * Tensor(source.l2.bias) + (1 - softTargetUpdateRate) * target.l2.bias -} + class TensorFlowEnvironmentWrapper { let originalEnv: PythonObject @@ -280,18 +275,27 @@ func eval(agent: Agent) -> Float { } // Hyperparameters -let discount: Float = 0.99 -let learningRate: Float = 0.001 +// - Network Hyperparameters let hiddenSize: Int = 100 -let startEpsilon: Float = 0.5 // TODO(seungjaeryanlee): Ignored right now +// - Agent-Env Interaction Hyperparameters let maxEpisode: Int = 1000 -let replayBufferCapacity: Int = 1000 -let useCombinedExperienceReplay: Bool = true +let epsilonStart: Float = 0.1 +let epsilonEnd: Float = 0.1 +let epsilonDecay: Float = 10000 +// - Update Hyperparameters +let learningRate: Float = 0.001 +let discount: Float = 0.99 let useDoubleDQN: Bool = true +// - Replay Buffer Hyperparameters +let replayBufferCapacity: Int = 1000 let minBufferSize: Int = 32 let batchSize: Int = 32 +let useCombinedExperienceReplay: Bool = true +// - Target Network 
Hyperparameters let targetNetUpdateRate: Int = 5 let softTargetUpdateRate: Float = 0.05 + +// Setup device let device: Device = Device.default // Initialize environment @@ -302,8 +306,21 @@ var qNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) var targetQNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) updateTargetQNet(source: qNet, target: &targetQNet, softTargetUpdateRate: 1) let optimizer = AMSGrad(for: qNet, learningRate: learningRate) -var replayBuffer: ReplayBuffer = ReplayBuffer(capacity: replayBufferCapacity, combined: useCombinedExperienceReplay, device: device) -var agent = Agent(qNet: qNet, targetQNet: targetQNet, optimizer: optimizer, replayBuffer: replayBuffer, discount: discount, minBufferSize: minBufferSize, doubleDQN: useDoubleDQN, device: device) +var replayBuffer = ReplayBuffer( + capacity: replayBufferCapacity, + combined: useCombinedExperienceReplay, + device: device +) +var agent = Agent( + qNet: qNet, + targetQNet: targetQNet, + optimizer: optimizer, + replayBuffer: replayBuffer, + discount: discount, + minBufferSize: minBufferSize, + doubleDQN: useDoubleDQN, + device: device +) // RL Loop var stepIndex = 0 @@ -317,12 +334,7 @@ while episodeIndex < maxEpisode { stepIndex += 1 // Interact with environment - // let epsilon = startEpsilon * Float(maxEpisode - episodeIndex) / Float(maxEpisode) - let epsilon: Float = 0.1 - // let epsilon_start: Float = 0.9 - // let epsilon_end: Float = 0.05 - // let epsilon_decay: Int = 200 - // let epsilon: Float = epsilon_end + (epsilon_start - epsilon_end) * Float(np.exp(-1 * stepIndex / epsilon_decay, dtype: np.float32))! + let epsilon: Float = epsilonEnd + (epsilonStart - epsilonEnd) * Float(np.exp(-1.0 * Float(stepIndex) / epsilonDecay))! let action = agent.getAction(state: state, epsilon: epsilon) let (nextState, reward, isDone, _) = env.step(action) episodeReturn += reward.scalarized() @@ -340,20 +352,18 @@ while episodeIndex < maxEpisode { // End-of-episode if isDone.scalarized() == true { - let evalEpisodeReturn = eval(agent: agent) state = env.reset() episodeIndex += 1 - // print(String(format: "Episode: %4d | Step %6d | Epsilon: %.03f | Return: %3d", episodeIndex, stepIndex, epsilon, Int(episodeReturn))) + let evalEpisodeReturn = eval(agent: agent) + episodeReturns.append(evalEpisodeReturn) if evalEpisodeReturn > bestReturn { - print(String(format: "Episode: %4d | Step %6d | Epsilon: %.03f | Return: %3d | Eval : %3d", episodeIndex, stepIndex, epsilon, Int(episodeReturn), Int(evalEpisodeReturn))) - // print("New best return of \(episodeReturn)") + print(String(format: "Episode: %4d | Step %6d | Epsilon: %.03f | Train: %3d | Eval: %3d", episodeIndex, stepIndex, epsilon, Int(episodeReturn), Int(evalEpisodeReturn))) bestReturn = evalEpisodeReturn } if evalEpisodeReturn > 199 { print("Solved in \(episodeIndex) episodes with \(stepIndex) steps!") break } - episodeReturns.append(evalEpisodeReturn) episodeReturn = 0 } @@ -361,18 +371,26 @@ while episodeIndex < maxEpisode { state = nextState } +// Save learning curve +plt.plot(episodeReturns) +plt.title("Deep Q-Network on CartPole-v0") +plt.xlabel("Episode") +plt.ylabel("Episode Return") +plt.savefig("dqnEpisodeReturns.png") +plt.clf() + // Save smoothed learning curve -let runningMeanWindow: Int = 1 +let runningMeanWindow: Int = 10 let smoothedEpisodeReturns = np.convolve(episodeReturns, np.ones((runningMeanWindow)) / np.array(runningMeanWindow, dtype: np.int32), mode: "same") -plt.plot(smoothedEpisodeReturns) +plt.plot(episodeReturns) 
plt.title("Deep Q-Network on CartPole-v0") plt.xlabel("Episode") plt.ylabel("Smoothed Episode Return") plt.savefig("dqnSmoothedEpisodeReturns.png") plt.clf() -// Save TD loss curve +// // Save TD loss curve plt.plot(losses) plt.title("Deep Q-Network on CartPole-v0") plt.xlabel("Step") From e00901adbf7633e807cfeda16a054578e5e46eea Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Tue, 4 Aug 2020 02:40:47 +0000 Subject: [PATCH 28/34] Add updateTargetQNet to Agent class --- Gym/DQN/main.swift | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 6a4d5ed8589..2786fadc578 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -180,6 +180,9 @@ class Agent { self.minBufferSize = minBufferSize self.doubleDQN = doubleDQN self.device = device + + // Copy Q-network to Target Q-network before training + updateTargetQNet(tau: 1) } func getAction(state: Tensor, epsilon: Float) -> Tensor { @@ -234,9 +237,14 @@ class Agent { } return 0 } -} - + func updateTargetQNet(tau: Float) { + self.targetQNet.l1.weight = tau * Tensor(self.qNet.l1.weight) + (1 - tau) * self.targetQNet.l1.weight + self.targetQNet.l1.bias = tau * Tensor(self.qNet.l1.bias) + (1 - tau) * self.targetQNet.l1.bias + self.targetQNet.l2.weight = tau * Tensor(self.qNet.l2.weight) + (1 - tau) * self.targetQNet.l2.weight + self.targetQNet.l2.bias = tau * Tensor(self.qNet.l2.bias) + (1 - tau) * self.targetQNet.l2.bias + } +} class TensorFlowEnvironmentWrapper { let originalEnv: PythonObject @@ -304,7 +312,6 @@ let env = TensorFlowEnvironmentWrapper(gym.make("CartPole-v0")) // Initialize agent var qNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) var targetQNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) -updateTargetQNet(source: qNet, target: &targetQNet, softTargetUpdateRate: 1) let optimizer = AMSGrad(for: qNet, learningRate: learningRate) var replayBuffer = ReplayBuffer( capacity: replayBufferCapacity, @@ -347,7 +354,7 @@ while episodeIndex < maxEpisode { // Periodically update Target Net if stepIndex % targetNetUpdateRate == 0 { - updateTargetQNet(source: qNet, target: &targetQNet, softTargetUpdateRate: softTargetUpdateRate) + agent.updateTargetQNet(tau: softTargetUpdateRate) } // End-of-episode From bca2614ad5e4c4e4dc875074de390d69fffedfa4 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Tue, 4 Aug 2020 03:00:33 +0000 Subject: [PATCH 29/34] Use TF-Agents hyperparameters --- Gym/DQN/main.swift | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 2786fadc578..cdba07a1993 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -155,7 +155,7 @@ struct Net: Layer { class Agent { var qNet: Net var targetQNet: Net - let optimizer: AMSGrad + let optimizer: Adam let replayBuffer: ReplayBuffer let discount: Float let minBufferSize: Int @@ -165,7 +165,7 @@ class Agent { init( qNet: Net, targetQNet: Net, - optimizer: AMSGrad, + optimizer: Adam, replayBuffer: ReplayBuffer, discount: Float, minBufferSize: Int, @@ -295,9 +295,9 @@ let learningRate: Float = 0.001 let discount: Float = 0.99 let useDoubleDQN: Bool = true // - Replay Buffer Hyperparameters -let replayBufferCapacity: Int = 1000 -let minBufferSize: Int = 32 -let batchSize: Int = 32 +let replayBufferCapacity: Int = 100000 +let minBufferSize: Int = 64 +let batchSize: Int = 64 let useCombinedExperienceReplay: Bool = true // - Target Network Hyperparameters let targetNetUpdateRate: Int = 5 @@ 
-312,7 +312,7 @@ let env = TensorFlowEnvironmentWrapper(gym.make("CartPole-v0")) // Initialize agent var qNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) var targetQNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) -let optimizer = AMSGrad(for: qNet, learningRate: learningRate) +let optimizer = Adam(for: qNet, learningRate: learningRate) var replayBuffer = ReplayBuffer( capacity: replayBufferCapacity, combined: useCombinedExperienceReplay, From 45b880ef4d943447bf955b8a5c6d1fa131e9765e Mon Sep 17 00:00:00 2001 From: Brad Larson Date: Wed, 5 Aug 2020 16:15:16 -0500 Subject: [PATCH 30/34] Changed ReplayBuffer to play better with GPU eager mode, restructured to four files, added Tensor extension, formatted via swift-format. --- Gym/DQN/Agent.swift | 145 ++++++++++++++ Gym/DQN/Gathering.swift | 45 +++++ Gym/DQN/ReplayBuffer.swift | 81 ++++++++ Gym/DQN/main.swift | 400 +++++++++---------------------------- 4 files changed, 363 insertions(+), 308 deletions(-) create mode 100644 Gym/DQN/Agent.swift create mode 100644 Gym/DQN/Gathering.swift create mode 100644 Gym/DQN/ReplayBuffer.swift diff --git a/Gym/DQN/Agent.swift b/Gym/DQN/Agent.swift new file mode 100644 index 00000000000..4a534062257 --- /dev/null +++ b/Gym/DQN/Agent.swift @@ -0,0 +1,145 @@ +// Copyright 2020 The TensorFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import TensorFlow + +// Force unwrapping with `!` does not provide source location when unwrapping `nil`, so we instead +// make a utility function for debuggability. 
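+// For example, `Float(np.random.uniform()).unwrapped()` reports the caller's
+// file and line on failure rather than a location inside this helper.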
+extension Optional { + fileprivate func unwrapped(file: StaticString = #filePath, line: UInt = #line) -> Wrapped { + guard let unwrapped = self else { + fatalError("Value is nil", file: (file), line: line) + } + return unwrapped + } +} + +struct Net: Layer { + typealias Input = Tensor + typealias Output = Tensor + + var l1, l2: Dense + + init(observationSize: Int, hiddenSize: Int, actionCount: Int) { + l1 = Dense(inputSize: observationSize, outputSize: hiddenSize, activation: relu) + l2 = Dense(inputSize: hiddenSize, outputSize: actionCount, activation: identity) + } + + @differentiable + func callAsFunction(_ input: Input) -> Output { + return input.sequenced(through: l1, l2) + } +} + +class Agent { + var qNet: Net + var targetQNet: Net + let optimizer: Adam + let replayBuffer: ReplayBuffer + let discount: Float + let minBufferSize: Int + let doubleDQN: Bool + let device: Device + + init( + qNet: Net, + targetQNet: Net, + optimizer: Adam, + replayBuffer: ReplayBuffer, + discount: Float, + minBufferSize: Int, + doubleDQN: Bool, + device: Device + ) { + self.qNet = qNet + self.targetQNet = targetQNet + self.optimizer = optimizer + self.replayBuffer = replayBuffer + self.discount = discount + self.minBufferSize = minBufferSize + self.doubleDQN = doubleDQN + self.device = device + + // Copy Q-network to Target Q-network before training + updateTargetQNet(tau: 1) + } + + func getAction(state: Tensor, epsilon: Float) -> Tensor { + if Float(np.random.uniform()).unwrapped() < epsilon { + return Tensor(numpy: np.array(np.random.randint(0, 2), dtype: np.int32))! + } else { + // Neural network input needs to be 2D + let tfState = Tensor(numpy: np.expand_dims(state.makeNumpyArray(), axis: 0))! + let qValues = qNet(tfState)[0] + return Tensor(qValues[1].scalarized() > qValues[0].scalarized() ? 1 : 0, on: device) + } + } + + func train(batchSize: Int) -> Float { + // Don't train if replay buffer is too small + if replayBuffer.count >= minBufferSize { + let (tfStateBatch, tfActionBatch, tfRewardBatch, tfNextStateBatch, tfIsDoneBatch) = + replayBuffer.sample(batchSize: batchSize) + + let (loss, gradients) = valueWithGradient(at: qNet) { qNet -> Tensor in + // Compute prediction batch + let npActionBatch = tfActionBatch.makeNumpyArray() + let npFullIndices = np.stack( + [np.arange(batchSize, dtype: np.int32), npActionBatch], axis: 1) + let tfFullIndices = Tensor(numpy: npFullIndices)! + let stateQValueBatch = qNet(tfStateBatch) + let predictionBatch = stateQValueBatch.dimensionGathering(atIndices: tfFullIndices) + + // Compute target batch + let nextStateQValueBatch: Tensor + if self.doubleDQN == true { + // Double DQN + let npNextStateActionBatch = self.qNet(tfNextStateBatch).argmax(squeezingAxis: 1) + .makeNumpyArray() + let npNextStateFullIndices = np.stack( + [np.arange(batchSize, dtype: np.int32), npNextStateActionBatch], axis: 1) + let tfNextStateFullIndices = Tensor(numpy: npNextStateFullIndices)! 
+ nextStateQValueBatch = self.targetQNet(tfNextStateBatch).dimensionGathering( + atIndices: tfNextStateFullIndices) + } else { + // DQN + nextStateQValueBatch = self.targetQNet(tfNextStateBatch).max(squeezingAxes: 1) + } + let targetBatch: Tensor = + tfRewardBatch + self.discount * (1 - Tensor(tfIsDoneBatch)) * nextStateQValueBatch + + return huberLoss( + predicted: predictionBatch, + expected: targetBatch, + delta: 1 + ) + } + optimizer.update(&qNet, along: gradients) + + return loss.scalarized() + } + return 0 + } + + func updateTargetQNet(tau: Float) { + self.targetQNet.l1.weight = + tau * Tensor(self.qNet.l1.weight) + (1 - tau) * self.targetQNet.l1.weight + self.targetQNet.l1.bias = + tau * Tensor(self.qNet.l1.bias) + (1 - tau) * self.targetQNet.l1.bias + self.targetQNet.l2.weight = + tau * Tensor(self.qNet.l2.weight) + (1 - tau) * self.targetQNet.l2.weight + self.targetQNet.l2.bias = + tau * Tensor(self.qNet.l2.bias) + (1 - tau) * self.targetQNet.l2.bias + } +} diff --git a/Gym/DQN/Gathering.swift b/Gym/DQN/Gathering.swift new file mode 100644 index 00000000000..40392c5a4c6 --- /dev/null +++ b/Gym/DQN/Gathering.swift @@ -0,0 +1,45 @@ +// Copyright 2020 The TensorFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import TensorFlow + +extension Tensor where Scalar: TensorFlowFloatingPoint { + @inlinable + @differentiable(wrt: self) + public func dimensionGathering( + atIndices indices: Tensor + ) -> Tensor { + return _Raw.gatherNd(params: self, indices: indices) + } + + /// Derivative of `_Raw.gatherNd`. + /// + /// Ported from TensorFlow Python reference implementation: + /// https://github.com/tensorflow/tensorflow/blob/r2.2/tensorflow/python/ops/array_grad.py#L691-L701 + @inlinable + @derivative(of: dimensionGathering) + func _vjpDimensionGathering( + atIndices indices: Tensor + ) -> (value: Tensor, pullback: (Tensor) -> Tensor) { + let shapeTensor = Tensor(self.shapeTensor) + let value = _Raw.gatherNd(params: self, indices: indices) + return ( + value, + { v in + let dparams = _Raw.scatterNd(indices: indices, updates: v, shape: shapeTensor) + return dparams + } + ) + } +} diff --git a/Gym/DQN/ReplayBuffer.swift b/Gym/DQN/ReplayBuffer.swift new file mode 100644 index 00000000000..01b64ed4ff3 --- /dev/null +++ b/Gym/DQN/ReplayBuffer.swift @@ -0,0 +1,81 @@ +// Copyright 2020 The TensorFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
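+
+// Array-backed replay buffer: when full, the oldest transition is dropped via
+// removeFirst(), and sample(batchSize:) stacks the stored tensors into batches.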
+ +import TensorFlow + +struct ReplayBuffer { + let capacity: Int + let combined: Bool + + @noDerivative var states: [Tensor] = [] + @noDerivative var actions: [Tensor] = [] + @noDerivative var rewards: [Tensor] = [] + @noDerivative var nextStates: [Tensor] = [] + @noDerivative var isDones: [Tensor] = [] + var count: Int { return states.count } + + init(capacity: Int, combined: Bool) { + self.capacity = capacity + self.combined = combined + } + + mutating func append( + state: Tensor, + action: Tensor, + reward: Tensor, + nextState: Tensor, + isDone: Tensor + ) { + if count >= capacity { + // Erase oldest SARS if the replay buffer is full + states.removeFirst() + actions.removeFirst() + rewards.removeFirst() + nextStates.removeFirst() + isDones.removeFirst() + } + states.append(state) + actions.append(action) + rewards.append(reward) + nextStates.append(nextState) + isDones.append(isDone) + } + + func sample(batchSize: Int) -> ( + stateBatch: Tensor, + actionBatch: Tensor, + rewardBatch: Tensor, + nextStateBatch: Tensor, + isDoneBatch: Tensor + ) { + let indices: Tensor + if self.combined == true { + // Combined Experience Replay + let sampledIndices = (0..(shape: [batchSize], scalars: sampledIndices + [Int32(count) - 1]) + } else { + // Vanilla Experience Replay + let sampledIndices = (0..(shape: [batchSize], scalars: sampledIndices) + } + + let stateBatch = Tensor(stacking: states).gathering(atIndices: indices, alongAxis: 0) + let actionBatch = Tensor(stacking: actions).gathering(atIndices: indices, alongAxis: 0) + let rewardBatch = Tensor(stacking: rewards).gathering(atIndices: indices, alongAxis: 0) + let nextStateBatch = Tensor(stacking: nextStates).gathering(atIndices: indices, alongAxis: 0) + let isDoneBatch = Tensor(stacking: isDones).gathering(atIndices: indices, alongAxis: 0) + + return (stateBatch, actionBatch, rewardBatch, nextStateBatch, isDoneBatch) + } +} diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index cdba07a1993..4006ff432e6 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -12,274 +12,52 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if canImport(PythonKit) - import PythonKit -#else - import Python -#endif +import Foundation +import PythonKit import TensorFlow -// Force unwrapping with `!` does not provide source location when unwrapping `nil`, so we instead -// make a utility function for debuggability. -fileprivate extension Optional { - func unwrapped(file: StaticString = #filePath, line: UInt = #line) -> Wrapped { - guard let unwrapped = self else { - fatalError("Value is nil", file: (file), line: line) - } - return unwrapped - } -} - -extension _Raw { - /// Derivative of `_Raw.gatherNd`. - /// - /// Ported from TensorFlow Python reference implementation: - /// https://github.com/tensorflow/tensorflow/blob/r2.2/tensorflow/python/ops/array_grad.py#L691-L701 - @inlinable - @derivative(of: gatherNd) - public static func vjpGatherNd< - Scalar: TensorFlowFloatingPoint, - Index: TensorFlowIndex - >( - params: Tensor, - indices: Tensor - ) -> ( - value: Tensor, - pullback: (Tensor) -> Tensor - ) { - let shapeTensor = Tensor(params.shapeTensor) - let value = gatherNd(params: params, indices: indices) - return (value, { v in - let dparams = scatterNd(indices: indices, updates: v, shape: shapeTensor) - return dparams - }) - } -} - // Initialize Python. This comment is a hook for internal use, do not remove. 
let np = Python.import("numpy") let gym = Python.import("gym") let plt = Python.import("matplotlib.pyplot") -class ReplayBuffer { - let capacity: Int - let combined: Bool - let device: Device - - var states: Tensor - var actions: Tensor - var rewards: Tensor - var nextStates: Tensor - var isDones: Tensor - var count: Int = 0 - var index: Int = 0 - - init(capacity: Int, combined: Bool, device: Device) { - self.capacity = capacity - self.combined = combined - self.device = device - - states = Tensor(zeros: [capacity, 4], on: device) - actions = Tensor(zeros: [capacity], on: device) - rewards = Tensor(zeros: [capacity], on: device) - nextStates = Tensor(zeros: [capacity, 4], on: device) - isDones = Tensor(repeating: false, shape: [capacity], on: device) - } - - func append( - state: Tensor, - action: Tensor, - reward: Tensor, - nextState: Tensor, - isDone: Tensor - ) { - if count < capacity { - count += 1 - } - // Erase oldest SARS if the replay buffer is full - states[index] = state - actions[index] = action - rewards[index] = reward - nextStates[index] = nextState - isDones[index] = isDone - index = (index + 1) % capacity - } - - func sample(batchSize: Int) -> ( - stateBatch: Tensor, - actionBatch: Tensor, - rewardBatch: Tensor, - nextStateBatch: Tensor, - isDoneBatch: Tensor - ) { - let indices: Tensor - if self.combined == true { - // Combined Experience Replay - let sampledIndices = np.random.randint(count, size: batchSize - 1, dtype: np.int32) - let lastIndex = np.array([(index + capacity - 1) % capacity], dtype: np.int32) - indices = Tensor(numpy: np.append(sampledIndices, lastIndex))! - } - else { - // Vanilla Experience Replay - indices = Tensor(numpy: np.random.randint(count, size: batchSize, dtype: np.int32))! - } - - let stateBatch = states.gathering(atIndices: indices, alongAxis: 0) - let actionBatch = actions.gathering(atIndices: indices, alongAxis: 0) - let rewardBatch = rewards.gathering(atIndices: indices, alongAxis: 0) - let nextStateBatch = nextStates.gathering(atIndices: indices, alongAxis: 0) - let isDoneBatch = isDones.gathering(atIndices: indices, alongAxis: 0) - - return (stateBatch, actionBatch, rewardBatch, nextStateBatch, isDoneBatch) - } -} - -struct Net: Layer { - typealias Input = Tensor - typealias Output = Tensor - - var l1, l2: Dense - - init(observationSize: Int, hiddenSize: Int, actionCount: Int) { - l1 = Dense(inputSize: observationSize, outputSize: hiddenSize, activation: relu) - l2 = Dense(inputSize: hiddenSize, outputSize: actionCount, activation: identity) - } - - @differentiable - func callAsFunction(_ input: Input) -> Output { - return input.sequenced(through: l1, l2) - } -} - -class Agent { - var qNet: Net - var targetQNet: Net - let optimizer: Adam - let replayBuffer: ReplayBuffer - let discount: Float - let minBufferSize: Int - let doubleDQN: Bool - let device: Device - - init( - qNet: Net, - targetQNet: Net, - optimizer: Adam, - replayBuffer: ReplayBuffer, - discount: Float, - minBufferSize: Int, - doubleDQN: Bool, - device: Device - ) { - self.qNet = qNet - self.targetQNet = targetQNet - self.optimizer = optimizer - self.replayBuffer = replayBuffer - self.discount = discount - self.minBufferSize = minBufferSize - self.doubleDQN = doubleDQN - self.device = device - - // Copy Q-network to Target Q-network before training - updateTargetQNet(tau: 1) - } - - func getAction(state: Tensor, epsilon: Float) -> Tensor { - if Float(np.random.uniform()).unwrapped() < epsilon { - return Tensor(numpy: np.array(np.random.randint(0, 2), dtype: np.int32))! 
- } - else { - // Neural network input needs to be 2D - let tfState = Tensor(numpy: np.expand_dims(state.makeNumpyArray(), axis: 0))! - let qValues = qNet(tfState)[0] - return Tensor(qValues[1].scalarized() > qValues[0].scalarized() ? 1 : 0, on: device) - } - } - - func train(batchSize: Int) -> Float { - // Don't train if replay buffer is too small - if replayBuffer.count >= minBufferSize { - let (tfStateBatch, tfActionBatch, tfRewardBatch, tfNextStateBatch, tfIsDoneBatch) = replayBuffer.sample(batchSize: batchSize) - - let (loss, gradients) = valueWithGradient(at: qNet) { qNet -> Tensor in - // Compute prediction batch - let npActionBatch = tfActionBatch.makeNumpyArray() - let npFullIndices = np.stack([np.arange(batchSize, dtype: np.int32), npActionBatch], axis: 1) - let tfFullIndices = Tensor(numpy: npFullIndices)! - let stateQValueBatch = qNet(tfStateBatch) - let predictionBatch = _Raw.gatherNd(params: stateQValueBatch, indices: tfFullIndices) - - // Compute target batch - let nextStateQValueBatch: Tensor - if self.doubleDQN == true { - // Double DQN - let npNextStateActionBatch = self.qNet(tfNextStateBatch).argmax(squeezingAxis: 1).makeNumpyArray() - let npNextStateFullIndices = np.stack([np.arange(batchSize, dtype: np.int32), npNextStateActionBatch], axis: 1) - let tfNextStateFullIndices = Tensor(numpy: npNextStateFullIndices)! - nextStateQValueBatch = _Raw.gatherNd(params: self.targetQNet(tfNextStateBatch), indices: tfNextStateFullIndices) - } - else { - // DQN - nextStateQValueBatch = self.targetQNet(tfNextStateBatch).max(squeezingAxes: 1) - } - let targetBatch: Tensor = tfRewardBatch + self.discount * (1 - Tensor(tfIsDoneBatch)) * nextStateQValueBatch - - return huberLoss( - predicted: predictionBatch, - expected: targetBatch, - delta: 1 - ) - } - optimizer.update(&qNet, along: gradients) - - return loss.scalarized() - } - return 0 - } - - func updateTargetQNet(tau: Float) { - self.targetQNet.l1.weight = tau * Tensor(self.qNet.l1.weight) + (1 - tau) * self.targetQNet.l1.weight - self.targetQNet.l1.bias = tau * Tensor(self.qNet.l1.bias) + (1 - tau) * self.targetQNet.l1.bias - self.targetQNet.l2.weight = tau * Tensor(self.qNet.l2.weight) + (1 - tau) * self.targetQNet.l2.weight - self.targetQNet.l2.bias = tau * Tensor(self.qNet.l2.bias) + (1 - tau) * self.targetQNet.l2.bias - } -} - class TensorFlowEnvironmentWrapper { - let originalEnv: PythonObject - - init(_ env: PythonObject) { - self.originalEnv = env - } - - func reset() -> Tensor { - let state = self.originalEnv.reset() - return Tensor(numpy: np.array(state, dtype: np.float32))! - } - - func step(_ action: Tensor) -> (state: Tensor, reward: Tensor, isDone: Tensor, info: PythonObject) { - let (state, reward, isDone, info) = originalEnv.step(action.scalarized()).tuple4 - let tfState = Tensor(numpy: np.array(state, dtype: np.float32))! - let tfReward = Tensor(numpy: np.array(reward, dtype: np.float32))! - let tfIsDone = Tensor(numpy: np.array(isDone, dtype: np.bool))! - return (tfState, tfReward, tfIsDone, info) - } + let originalEnv: PythonObject + + init(_ env: PythonObject) { + self.originalEnv = env + } + + func reset() -> Tensor { + let state = self.originalEnv.reset() + return Tensor(numpy: np.array(state, dtype: np.float32))! + } + + func step(_ action: Tensor) -> ( + state: Tensor, reward: Tensor, isDone: Tensor, info: PythonObject + ) { + let (state, reward, isDone, info) = originalEnv.step(action.scalarized()).tuple4 + let tfState = Tensor(numpy: np.array(state, dtype: np.float32))! 
+ let tfReward = Tensor(numpy: np.array(reward, dtype: np.float32))! + let tfIsDone = Tensor(numpy: np.array(isDone, dtype: np.bool))! + return (tfState, tfReward, tfIsDone, info) + } } func eval(agent: Agent) -> Float { - let evalEnv = TensorFlowEnvironmentWrapper(gym.make("CartPole-v0")) - var evalEpisodeReturn: Float = 0 - var state: Tensor = evalEnv.reset() - var reward: Tensor - var evalIsDone: Tensor = Tensor(false) - while evalIsDone.scalarized() == false { - let action = agent.getAction(state: state, epsilon: 0) - (state, reward, evalIsDone, _) = evalEnv.step(action) - evalEpisodeReturn += reward.scalarized() - } - - return evalEpisodeReturn + let evalEnv = TensorFlowEnvironmentWrapper(gym.make("CartPole-v0")) + var evalEpisodeReturn: Float = 0 + var state: Tensor = evalEnv.reset() + var reward: Tensor + var evalIsDone: Tensor = Tensor(false) + while evalIsDone.scalarized() == false { + let action = agent.getAction(state: state, epsilon: 0) + (state, reward, evalIsDone, _) = evalEnv.step(action) + evalEpisodeReturn += reward.scalarized() + } + + return evalEpisodeReturn } // Hyperparameters @@ -314,68 +92,72 @@ var qNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) var targetQNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) let optimizer = Adam(for: qNet, learningRate: learningRate) var replayBuffer = ReplayBuffer( - capacity: replayBufferCapacity, - combined: useCombinedExperienceReplay, - device: device + capacity: replayBufferCapacity, + combined: useCombinedExperienceReplay ) var agent = Agent( - qNet: qNet, - targetQNet: targetQNet, - optimizer: optimizer, - replayBuffer: replayBuffer, - discount: discount, - minBufferSize: minBufferSize, - doubleDQN: useDoubleDQN, - device: device + qNet: qNet, + targetQNet: targetQNet, + optimizer: optimizer, + replayBuffer: replayBuffer, + discount: discount, + minBufferSize: minBufferSize, + doubleDQN: useDoubleDQN, + device: device ) // RL Loop var stepIndex = 0 var episodeIndex = 0 var episodeReturn: Float = 0 -var episodeReturns: Array = [] -var losses: Array = [] +var episodeReturns: [Float] = [] +var losses: [Float] = [] var state = env.reset() var bestReturn: Float = 0 while episodeIndex < maxEpisode { - stepIndex += 1 - - // Interact with environment - let epsilon: Float = epsilonEnd + (epsilonStart - epsilonEnd) * Float(np.exp(-1.0 * Float(stepIndex) / epsilonDecay))! 
- let action = agent.getAction(state: state, epsilon: epsilon) - let (nextState, reward, isDone, _) = env.step(action) - episodeReturn += reward.scalarized() - - // Save interaction to replay buffer - replayBuffer.append(state: state, action: action, reward: reward, nextState: nextState, isDone: isDone) - - // Train agent - losses.append(agent.train(batchSize: batchSize)) - - // Periodically update Target Net - if stepIndex % targetNetUpdateRate == 0 { - agent.updateTargetQNet(tau: softTargetUpdateRate) - } - - // End-of-episode - if isDone.scalarized() == true { - state = env.reset() - episodeIndex += 1 - let evalEpisodeReturn = eval(agent: agent) - episodeReturns.append(evalEpisodeReturn) - if evalEpisodeReturn > bestReturn { - print(String(format: "Episode: %4d | Step %6d | Epsilon: %.03f | Train: %3d | Eval: %3d", episodeIndex, stepIndex, epsilon, Int(episodeReturn), Int(evalEpisodeReturn))) - bestReturn = evalEpisodeReturn - } - if evalEpisodeReturn > 199 { - print("Solved in \(episodeIndex) episodes with \(stepIndex) steps!") - break - } - episodeReturn = 0 - } - - // End-of-step - state = nextState + stepIndex += 1 + + // Interact with environment + let epsilon: Float = + epsilonEnd + (epsilonStart - epsilonEnd) * exp(-1.0 * Float(stepIndex) / epsilonDecay) + let action = agent.getAction(state: state, epsilon: epsilon) + let (nextState, reward, isDone, _) = env.step(action) + episodeReturn += reward.scalarized() + + // Save interaction to replay buffer + replayBuffer.append( + state: state, action: action, reward: reward, nextState: nextState, isDone: isDone) + + // Train agent + losses.append(agent.train(batchSize: batchSize)) + + // Periodically update Target Net + if stepIndex % targetNetUpdateRate == 0 { + agent.updateTargetQNet(tau: softTargetUpdateRate) + } + + // End-of-episode + if isDone.scalarized() == true { + state = env.reset() + episodeIndex += 1 + let evalEpisodeReturn = eval(agent: agent) + episodeReturns.append(evalEpisodeReturn) + if evalEpisodeReturn > bestReturn { + print( + String( + format: "Episode: %4d | Step %6d | Epsilon: %.03f | Train: %3d | Eval: %3d", episodeIndex, + stepIndex, epsilon, Int(episodeReturn), Int(evalEpisodeReturn))) + bestReturn = evalEpisodeReturn + } + if evalEpisodeReturn > 199 { + print("Solved in \(episodeIndex) episodes with \(stepIndex) steps!") + break + } + episodeReturn = 0 + } + + // End-of-step + state = nextState } // Save learning curve @@ -388,7 +170,9 @@ plt.clf() // Save smoothed learning curve let runningMeanWindow: Int = 10 -let smoothedEpisodeReturns = np.convolve(episodeReturns, np.ones((runningMeanWindow)) / np.array(runningMeanWindow, dtype: np.int32), mode: "same") +let smoothedEpisodeReturns = np.convolve( + episodeReturns, np.ones((runningMeanWindow)) / np.array(runningMeanWindow, dtype: np.int32), + mode: "same") plt.plot(episodeReturns) plt.title("Deep Q-Network on CartPole-v0") From 356c989b382835ab5a4ac455cac36be742010951 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Thu, 6 Aug 2020 00:51:13 +0000 Subject: [PATCH 31/34] Fix ReplayBuffer pass-by-value bug --- Gym/DQN/ReplayBuffer.swift | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Gym/DQN/ReplayBuffer.swift b/Gym/DQN/ReplayBuffer.swift index 01b64ed4ff3..31126dc4a1d 100644 --- a/Gym/DQN/ReplayBuffer.swift +++ b/Gym/DQN/ReplayBuffer.swift @@ -14,7 +14,7 @@ import TensorFlow -struct ReplayBuffer { +class ReplayBuffer { let capacity: Int let combined: Bool @@ -30,7 +30,7 @@ struct ReplayBuffer { self.combined = combined } - 
mutating func append( + func append( state: Tensor, action: Tensor, reward: Tensor, From d774fad85e30064ca2dd06a7f9ecc36b563cef26 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Thu, 6 Aug 2020 00:58:44 +0000 Subject: [PATCH 32/34] Use epsilon decay for more consistent performance --- Gym/DQN/main.swift | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 4006ff432e6..365a8c483e1 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -65,9 +65,9 @@ func eval(agent: Agent) -> Float { let hiddenSize: Int = 100 // - Agent-Env Interaction Hyperparameters let maxEpisode: Int = 1000 -let epsilonStart: Float = 0.1 -let epsilonEnd: Float = 0.1 -let epsilonDecay: Float = 10000 +let epsilonStart: Float = 1 +let epsilonEnd: Float = 0.01 +let epsilonDecay: Float = 1000 // - Update Hyperparameters let learningRate: Float = 0.001 let discount: Float = 0.99 From a10f20112fefd641fe1d77d08484d96ab1f0957b Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Fri, 7 Aug 2020 10:43:26 +0000 Subject: [PATCH 33/34] Add documentation and improve names --- Gym/DQN/Agent.swift | 27 ++++++++++++----- Gym/DQN/ReplayBuffer.swift | 7 +++++ Gym/DQN/main.swift | 60 +++++++++++++++++++++++++++++--------- 3 files changed, 73 insertions(+), 21 deletions(-) diff --git a/Gym/DQN/Agent.swift b/Gym/DQN/Agent.swift index 4a534062257..52f25904f87 100644 --- a/Gym/DQN/Agent.swift +++ b/Gym/DQN/Agent.swift @@ -25,7 +25,12 @@ extension Optional { } } -struct Net: Layer { +/// A Deep Q-Network. +/// +/// A Q-network is a neural network that receives the observation (state) as input and estimates +/// the action values (Q values) of each action. For more information, check Human-level control +/// through deep reinforcement learning (Mnih et al., 2015). +struct DeepQNetwork: Layer { typealias Input = Tensor typealias Output = Tensor @@ -42,10 +47,16 @@ struct Net: Layer { } } -class Agent { - var qNet: Net - var targetQNet: Net - let optimizer: Adam +/// Agent that uses the Deep Q-Network. +/// +/// Deep Q-Network is an algorithm that trains a Q-network that estimates the action values of +/// each action given an observation (state). The Q-network is trained iteratively using the +/// Bellman equation. For more information, check Human-level control through deep reinforcement +/// learning (Mnih et al., 2015). +class DeepQNetworkAgent { + var qNet: DeepQNetwork + var targetQNet: DeepQNetwork + let optimizer: Adam let replayBuffer: ReplayBuffer let discount: Float let minBufferSize: Int @@ -53,9 +64,9 @@ class Agent { let device: Device init( - qNet: Net, - targetQNet: Net, - optimizer: Adam, + qNet: DeepQNetwork, + targetQNet: DeepQNetwork, + optimizer: Adam, replayBuffer: ReplayBuffer, discount: Float, minBufferSize: Int, diff --git a/Gym/DQN/ReplayBuffer.swift b/Gym/DQN/ReplayBuffer.swift index 31126dc4a1d..c32e31c4d39 100644 --- a/Gym/DQN/ReplayBuffer.swift +++ b/Gym/DQN/ReplayBuffer.swift @@ -14,6 +14,13 @@ import TensorFlow +/// Replay buffer to store the agent's experiences. +/// +/// Vanilla Q-learning only trains on the latest experience. Deep Q-network uses +/// a technique called "experience replay", where all experience is stored into +/// a replay buffer. By storing experience, the agent can reuse the experiences +/// and also train in batches. For more information, check Human-level control +/// through deep reinforcement learning (Mnih et al., 2015). 
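+///
+/// A minimal usage sketch: append one transition per environment step with
+/// `append(state:action:reward:nextState:isDone:)`, then draw a training batch
+/// with `sample(batchSize:)` once `count` has reached the training batch size.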
class ReplayBuffer { let capacity: Int let combined: Bool diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 365a8c483e1..0dfe7b996d9 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -45,7 +45,7 @@ class TensorFlowEnvironmentWrapper { } } -func eval(agent: Agent) -> Float { +func evaluate(_ agent: DeepQNetworkAgent) -> Float { let evalEnv = TensorFlowEnvironmentWrapper(gym.make("CartPole-v0")) var evalEpisodeReturn: Float = 0 var state: Tensor = evalEnv.reset() @@ -61,24 +61,58 @@ func eval(agent: Agent) -> Float { } // Hyperparameters -// - Network Hyperparameters +/// The size of the hidden layer of the 2-layer Q-network. The network has the +/// shape observationSize - hiddenSize - actionCount. let hiddenSize: Int = 100 -// - Agent-Env Interaction Hyperparameters +/// Maximum number of episodes to train the agent. The training is terminated +/// early if maximum score is achieved during evaluation. let maxEpisode: Int = 1000 +/// The initial epsilon value. With probability epsilon, the agent chooses a +/// random action instead of the action that it thinks is the best. let epsilonStart: Float = 1 +/// The terminal epsilon value. let epsilonEnd: Float = 0.01 +/// The decay rate of epsilon. let epsilonDecay: Float = 1000 -// - Update Hyperparameters +/// The learning rate for the Q-network. let learningRate: Float = 0.001 +/// The discount factor. This measures how much to "discount" the future rewards +/// that the agent will receive. The discount factor must be from 0 to 1 +/// (inclusive). Discount factor of 0 means that the agent only considers the +/// immediate reward and disregards all future rewards. Discount factor of 1 +/// means that the agent values all rewards equally, no matter how distant +/// in the future they may be. let discount: Float = 0.99 +/// If enabled, uses the Double DQN update equation instead of the original DQN +/// equation. This mitigates the overestimation problem of DQN. For more +/// information about Double DQN, check Deep Reinforcement Learning with Double +/// Q-learning (Hasselt, Guez, and Silver, 2015). let useDoubleDQN: Bool = true -// - Replay Buffer Hyperparameters +/// The maximum size of the replay buffer. If the replay buffer is full, the new +/// element replaces the oldest element. let replayBufferCapacity: Int = 100000 +/// The minimum replay buffer size before the training starts. Must be at least +/// the training batch size. let minBufferSize: Int = 64 +/// The training batch size. let batchSize: Int = 64 +/// If enabled, uses Combined Experience Replay (CER) sampling instead of the +/// uniform random sampling in the original DQN paper. Original DQN samples +/// batch uniformly randomly in the replay buffer. CER always includes the most +/// recent element and samples the rest of the batch uniformly randomly. This +/// makes the agent more robust to different replay buffer capacities. For more +/// information about Combined Experience Replay, check A Deeper Look at +/// Experience Replay (Zhang and Sutton, 2017). let useCombinedExperienceReplay: Bool = true -// - Target Network Hyperparameters +/// The number of steps between target network updates. The target network is +/// a copy of the Q-network that is updated less frequently to stabilize the +/// training process. let targetNetUpdateRate: Int = 5 +/// The update rate for target network. In the original DQN paper, the target +/// network is updated to be the same as the Q-network. 
+/// The update rate for the target network. In the original DQN paper, the target
+/// network is updated to be the same as the Q-network. A soft target network
+/// update instead moves the target network only slightly toward the Q-network.
+/// A softTargetUpdateRate of 0 means that the target network is not updated at
+/// all, and 1 means that the soft update is effectively disabled (the target
+/// network is copied from the Q-network directly).
 let softTargetUpdateRate: Float = 0.05
 
 // Setup device
@@ -88,14 +122,14 @@
 let device: Device = Device.default
 
 let env = TensorFlowEnvironmentWrapper(gym.make("CartPole-v0"))
 
 // Initialize agent
-var qNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2)
-var targetQNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2)
+var qNet = DeepQNetwork(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2)
+var targetQNet = DeepQNetwork(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2)
 let optimizer = Adam(for: qNet, learningRate: learningRate)
 var replayBuffer = ReplayBuffer(
   capacity: replayBufferCapacity,
   combined: useCombinedExperienceReplay
 )
-var agent = Agent(
+var agent = DeepQNetworkAgent(
   qNet: qNet,
   targetQNet: targetQNet,
   optimizer: optimizer,
@@ -140,7 +174,7 @@
   if isDone.scalarized() == true {
     state = env.reset()
     episodeIndex += 1
-    let evalEpisodeReturn = eval(agent: agent)
+    let evalEpisodeReturn = evaluate(agent)
     episodeReturns.append(evalEpisodeReturn)
     if evalEpisodeReturn > bestReturn {
       print(
@@ -165,7 +199,7 @@
 plt.plot(episodeReturns)
 plt.title("Deep Q-Network on CartPole-v0")
 plt.xlabel("Episode")
 plt.ylabel("Episode Return")
-plt.savefig("dqnEpisodeReturns.png")
+plt.savefig("/tmp/dqnEpisodeReturns.png")
 plt.clf()
 
 // Save smoothed learning curve
@@ -178,7 +212,7 @@
 plt.plot(episodeReturns)
 plt.title("Deep Q-Network on CartPole-v0")
 plt.xlabel("Episode")
 plt.ylabel("Smoothed Episode Return")
-plt.savefig("dqnSmoothedEpisodeReturns.png")
+plt.savefig("/tmp/dqnSmoothedEpisodeReturns.png")
 plt.clf()
 
 // Save TD loss curve
@@ -186,5 +220,5 @@
 plt.plot(losses)
 plt.title("Deep Q-Network on CartPole-v0")
 plt.xlabel("Step")
 plt.ylabel("TD Loss")
-plt.savefig("dqnTDLoss.png")
+plt.savefig("/tmp/dqnTDLoss.png")
 plt.clf()

From 4aa929640d23ea4a69b1921cc013cabc43c0c019 Mon Sep 17 00:00:00 2001
From: Seungjae Ryan Lee
Date: Fri, 7 Aug 2020 10:58:56 +0000
Subject: [PATCH 34/34] Document Agent and ReplayBuffer parameters

---
 Gym/DQN/Agent.swift        | 14 ++++++++++++++
 Gym/DQN/ReplayBuffer.swift | 17 +++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/Gym/DQN/Agent.swift b/Gym/DQN/Agent.swift
index 52f25904f87..8522f685f8d 100644
--- a/Gym/DQN/Agent.swift
+++ b/Gym/DQN/Agent.swift
@@ -54,12 +54,26 @@ struct DeepQNetwork: Layer {
 /// Bellman equation. For more information, check Human-level control through deep reinforcement
 /// learning (Mnih et al., 2015).
 class DeepQNetworkAgent {
+  /// The Q-network used to estimate the action values.
   var qNet: DeepQNetwork
+  /// The copy of the Q-network that is updated less frequently to stabilize the
+  /// training process.
   var targetQNet: DeepQNetwork
+  /// The optimizer used to train the Q-network.
   let optimizer: Adam<DeepQNetwork>
+  /// The replay buffer that stores experiences of the interactions between the
+  /// agent and the environment. The Q-network is trained from experiences
+  /// sampled from the replay buffer.
   let replayBuffer: ReplayBuffer
+  /// The discount factor that measures how much weight to give to future
+  /// rewards when calculating the action value.
   let discount: Float
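// The targetQNet documented above is refreshed from qNet every
// targetNetUpdateRate steps, and softTargetUpdateRate controls how far it moves
// toward the Q-network. A minimal sketch of that Polyak-style rule on plain
// arrays of weights follows; the real update touches every parameter of
// DeepQNetwork, and the helper name here is illustrative, not the patch's code.

/// Moves `target` a fraction `rate` of the way toward `online`.
/// A rate of 0 leaves the target unchanged; a rate of 1 copies the online weights.
func softUpdate(target: [Float], online: [Float], rate: Float) -> [Float] {
  precondition(target.count == online.count, "Weight vectors must have equal size")
  return zip(target, online).map { pair in (1 - rate) * pair.0 + rate * pair.1 }
}

// With rate = 0.05 the target weights close only 5% of the gap per update,
// which keeps the TD targets from chasing a fast-moving estimate.
// softUpdate(target: [0.0, 1.0], online: [1.0, 3.0], rate: 0.05) == [0.05, 1.1]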
+  /// The minimum replay buffer size before the training starts.
   let minBufferSize: Int
+  /// If enabled, uses the Double DQN update equation instead of the original
+  /// DQN equation. This mitigates the overestimation problem of DQN. For more
+  /// information about Double DQN, check Deep Reinforcement Learning with
+  /// Double Q-learning (van Hasselt, Guez, and Silver, 2015).
   let doubleDQN: Bool
   let device: Device
 
diff --git a/Gym/DQN/ReplayBuffer.swift b/Gym/DQN/ReplayBuffer.swift
index c32e31c4d39..f9e6ddf1c48 100644
--- a/Gym/DQN/ReplayBuffer.swift
+++ b/Gym/DQN/ReplayBuffer.swift
@@ -22,14 +22,31 @@ import TensorFlow
 /// and also train in batches. For more information, check Human-level control
 /// through deep reinforcement learning (Mnih et al., 2015).
 class ReplayBuffer {
+  /// The maximum size of the replay buffer. When the replay buffer is full,
+  /// new elements replace the oldest element in the replay buffer.
   let capacity: Int
+  /// If enabled, uses Combined Experience Replay (CER) sampling instead of the
+  /// uniform random sampling in the original DQN paper. The original DQN samples
+  /// the batch uniformly at random from the replay buffer. CER always includes
+  /// the most recent element and samples the rest of the batch uniformly at
+  /// random. This makes the agent more robust to different replay buffer
+  /// capacities. For more information about Combined Experience Replay, check
+  /// A Deeper Look at Experience Replay (Zhang and Sutton, 2017).
   let combined: Bool
+  /// The states that the agent observed.
   @noDerivative var states: [Tensor<Float>] = []
+  /// The actions that the agent took.
   @noDerivative var actions: [Tensor<Int32>] = []
+  /// The rewards that the agent received from the environment after taking
+  /// an action.
   @noDerivative var rewards: [Tensor<Float>] = []
+  /// The next states that the agent received from the environment after taking
+  /// an action.
   @noDerivative var nextStates: [Tensor<Float>] = []
+  /// The episode-terminal flag that the agent received after taking an action.
   @noDerivative var isDones: [Tensor<Bool>] = []
+  /// The current size of the replay buffer.
   var count: Int { return states.count }
 
   init(capacity: Int, combined: Bool) {
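    // The `combined` flag documented above selects the sampling strategy. A
    // minimal sketch of Combined Experience Replay over index arrays follows;
    // the buffer's real sample method gathers from the stored tensors, and the
    // helper below is illustrative, not part of this class's actual implementation.
    func combinedSampleIndices(count: Int, batchSize: Int) -> [Int] {
      // CER always includes the most recent experience (index count - 1) and
      // fills the rest of the batch with uniformly random indices.
      var indices = [count - 1]
      for _ in 1..<batchSize {
        indices.append(Int.random(in: 0..<count))
      }
      return indices
    }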