From 5dc50ebbb5d67299704f2f1e8e810d68dfaa0950 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Mon, 29 Jun 2020 17:28:26 +0000 Subject: [PATCH 01/34] Copy code from Colab --- Gym/DQN/main.swift | 278 +++++++++++++++++++++++++++++++++++++++++++++ Gym/README.md | 1 + Package.swift | 1 + 3 files changed, 280 insertions(+) create mode 100644 Gym/DQN/main.swift diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift new file mode 100644 index 00000000000..150a0ef1954 --- /dev/null +++ b/Gym/DQN/main.swift @@ -0,0 +1,278 @@ +// Copyright 2020 The TensorFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#if canImport(PythonKit) + import PythonKit +#else + import Python +#endif +import TensorFlow + +// Force unwrapping with `!` does not provide source location when unwrapping `nil`, so we instead +// make a utility function for debuggability. +fileprivate extension Optional { + func unwrapped(file: StaticString = #filePath, line: UInt = #line) -> Wrapped { + guard let unwrapped = self else { + fatalError("Value is nil", file: (file), line: line) + } + return unwrapped + } +} + +// Initialize Python. This comment is a hook for internal use, do not remove. + +let np = Python.import("numpy") +let gym = Python.import("gym") + +typealias State = Tensor +typealias Action = Tensor +typealias Reward = Tensor + +class ReplayBuffer { + var states: Tensor + var actions: Tensor + var rewards: Tensor + var nextStates: Tensor + let capacity: Int + var count: Int + var index: Int + + init(capacity: Int) { + self.capacity = capacity + + states = Tensor(numpy: np.zeros([capacity, 4], dtype: np.float32))! + actions = Tensor(numpy: np.zeros([capacity, 1], dtype: np.int32))! + rewards = Tensor(numpy: np.zeros([capacity, 1], dtype: np.float32))! + nextStates = Tensor(numpy: np.zeros([capacity, 4], dtype: np.float32))! + count = 0 + index = 0 + } + + func append(state: Tensor, action: Tensor, reward: Tensor, nextState: Tensor) { + if count < capacity { + count += 1 + } + // Erase oldest SARS if the replay buffer is full + states[index] = state + actions[index] = Tensor(numpy: np.expand_dims(action.makeNumpyArray(), axis: 0))! + rewards[index] = Tensor(numpy: np.expand_dims(reward.makeNumpyArray(), axis: 0))! + nextStates[index] = nextState + index = (index + 1) % capacity + } + + func sample(batchSize: Int) -> (stateBatch: Tensor, actionBatch: Tensor, rewardBatch: Tensor, nextStateBatch: Tensor) { + let randomIndices = Tensor(numpy: np.random.randint(count, size: batchSize, dtype: np.int32))! 
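+    // Batch indices are drawn uniformly at random (with replacement) from the filled region [0, count),
+    // and every storage tensor is gathered with the same indices so the SARS components stay aligned.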
+ + let stateBatch = _Raw.gather(params: states, indices: randomIndices) + let actionBatch = _Raw.gather(params: actions, indices: randomIndices) + let rewardBatch = _Raw.gather(params: rewards, indices: randomIndices) + let nextStateBatch = _Raw.gather(params: nextStates, indices: randomIndices) + + return (stateBatch, actionBatch, rewardBatch, nextStateBatch) + } +} + +struct Net: Layer { + typealias Input = Tensor + typealias Output = Tensor + + var l1, l2: Dense + + init(observationSize: Int, hiddenSize: Int, actionCount: Int) { + l1 = Dense(inputSize: observationSize, outputSize: hiddenSize, activation: relu, weightInitializer: heNormal()) + l2 = Dense(inputSize: hiddenSize, outputSize: actionCount, weightInitializer: heNormal()) + } + + @differentiable + func callAsFunction(_ input: Input) -> Output { + return input.sequenced(through: l1, l2) + } +} + +class Agent { + // Q-network + var qNet: Net + // Target Q-network + var targetQNet: Net + // Optimizer + let optimizer: Adam + // Replay Buffer + let replayBuffer: ReplayBuffer + // Discount Factor + let discount: Float + + init(qNet: Net, targetQNet: Net, optimizer: Adam, replayBuffer: ReplayBuffer, discount: Float) { + self.qNet = qNet + self.targetQNet = targetQNet + self.optimizer = optimizer + self.replayBuffer = replayBuffer + self.discount = discount + } + + func getAction(state: Tensor, epsilon: Float) -> Tensor { + if Float(np.random.uniform()).unwrapped() < epsilon { + // print("getAction | state: \(state)") + // print("getAction | epsilon: \(epsilon)") + let npState = np.random.randint(0, 2, dtype: np.int32) + // print("getAction | npState: \(npState)") + return Tensor(numpy: np.array(npState, dtype: np.int32))! + } + else { + // Neural network input needs to be 2D + let tfState = Tensor(numpy: np.expand_dims(state.makeNumpyArray(), axis: 0))! + let qValues = qNet(tfState) + let leftQValue = Float(qValues[0][0]).unwrapped() + let rightQValue = Float(qValues[0][1]).unwrapped() + return leftQValue < rightQValue ? Tensor(numpy: np.array(1, dtype: np.int32))! : Tensor(numpy: np.array(0, dtype: np.int32))! + } + } + + func train(batchSize: Int) { + // Don't train if replay buffer is too small + if replayBuffer.count >= batchSize { + // print("train | Start training") + let (tfStateBatch, tfActionBatch, tfRewardBatch, tfNextStateBatch) = replayBuffer.sample(batchSize: batchSize) + + // TODO: Find equivalent function of tf.gather_nd in S4TF to parallelize Q-value computation (_Raw.gather_nd does not exist) + // Gradient are accumulated since we calculate every element in the batch individually + var totalGrad = qNet.zeroTangentVector + for i in 0.. Tensor in + + let stateQValueBatch = qNet(tfStateBatch) + let tfAction: Tensor = tfActionBatch[i][0] + let action = Int(tfAction.makeNumpyArray()).unwrapped() + let prediction: Tensor = stateQValueBatch[i][action] + + let nextStateQValueBatch = self.targetQNet(tfNextStateBatch) + let tfReward: Tensor = tfRewardBatch[i][0] + let leftQValue = Float(nextStateQValueBatch[i][0].makeNumpyArray()).unwrapped() + let rightQValue = Float(nextStateQValueBatch[i][1].makeNumpyArray()).unwrapped() + let maxNextStateQValue = leftQValue > rightQValue ? 
leftQValue : rightQValue + let target: Tensor = tfReward + self.discount * maxNextStateQValue + + return squaredDifference(prediction, withoutDerivative(at: target)) + } + totalGrad += 𝛁qNet + } + optimizer.update(&qNet, along: totalGrad) + } + } +} + +func updateTargetQNet(source: Net, target: inout Net) { + target.l1.weight = Tensor(source.l1.weight) + target.l1.bias = Tensor(source.l1.bias) + target.l2.weight = Tensor(source.l2.weight) + target.l2.bias = Tensor(source.l2.bias) +} + +class TensorFlowEnvironmentWrapper { + let originalEnv: PythonObject + let action_space: PythonObject + let observation_space: PythonObject + + init(_ env: PythonObject) { + self.originalEnv = env + self.action_space = env.action_space + self.observation_space = env.observation_space + } + + func reset() -> Tensor { + let state = self.originalEnv.reset() + return Tensor(numpy: np.array(state, dtype: np.float32))! + } + + func step(_ action: Tensor) -> (Tensor, Tensor, PythonObject, PythonObject) { + let npAction = action.makeNumpyArray().item() + let (state, reward, isDone, info) = originalEnv.step(npAction).tuple4 + let tfState = Tensor(numpy: np.array(state, dtype: np.float32))! + let tfReward = Tensor(numpy: np.array(reward, dtype: np.float32))! + return (tfState, tfReward, isDone, info) + } +} + +// Hyperparameters +let discount: Float = 0.99 +let learningRate: Float = 0.01 +let hiddenSize: Int = 64 +let startEpsilon: Float = 0.5 +let maxEpisode: Int = 500 +let replayBufferCapacity: Int = 1000 +let batchSize: Int = 32 +let targetNetUpdateRate: Int = 1 + +// Initialize environment +let env = TensorFlowEnvironmentWrapper(gym.make("CartPole-v0")) + +// Initialize agent +let actionCount = Int(env.action_space.n).unwrapped() +var qNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: actionCount) +var targetQNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: actionCount) +updateTargetQNet(source: qNet, target: &targetQNet) +let optimizer = Adam(for: qNet, learningRate: learningRate) +var replayBuffer: ReplayBuffer = ReplayBuffer(capacity: replayBufferCapacity) +var agent = Agent(qNet: qNet, targetQNet: targetQNet, optimizer: optimizer, replayBuffer: replayBuffer, discount: discount) + +// RL Loop +var stepIndex = 0 +var episodeIndex = 0 +var episodeReturn: Int = 0 +var episodeReturns: Array = [] +var state = env.reset() +while episodeIndex < maxEpisode { + stepIndex += 1 + // print("Step \(stepIndex)") + + // Interact with environment + let action = agent.getAction(state: state, epsilon: startEpsilon * Float(maxEpisode - episodeIndex)) + // print("action: \(action)") + var (nextState, reward, isDone, _) = env.step(action) + // print("state: \(state)") + // print("nextState: \(nextState)") + // print("reward: \(reward)") + // print("isDone: \(isDone)") + episodeReturn += Int(reward.makeNumpyArray().item()).unwrapped() + // print("episodeReturn: \(episodeReturn)") + + // Save interaction to replay buffer + replayBuffer.append(state: state, action: action, reward: reward, nextState: nextState) + // print("Append successful") + + // Train agent + agent.train(batchSize: batchSize) + // print("Train successful") + + // Periodically update Target Net + if stepIndex % targetNetUpdateRate == 0 { + updateTargetQNet(source: qNet, target: &targetQNet) + } + // print("Target net update successful") + + // End-of-episode + if isDone == true { + state = env.reset() + episodeIndex += 1 + print("Episode \(episodeIndex) Return \(episodeReturn)") + if episodeReturn > 199 { + print("Solved in 
\(episodeIndex) episodes with \(stepIndex) steps!") + break + } + episodeReturns.append(episodeReturn) + episodeReturn = 0 + } + + // End-of-step + nextState = state +} diff --git a/Gym/README.md b/Gym/README.md index 97d5bfbf66b..b203de133a2 100644 --- a/Gym/README.md +++ b/Gym/README.md @@ -31,4 +31,5 @@ To build and run the models, run: swift run Gym-CartPole swift run Gym-FrozenLake swift run Gym-Blackjack +swift run Gym-DQN ``` diff --git a/Package.swift b/Package.swift index 75496f2f69c..f6f6c9ce85a 100644 --- a/Package.swift +++ b/Package.swift @@ -47,6 +47,7 @@ let package = Package( .target(name: "Gym-FrozenLake", path: "Gym/FrozenLake"), .target(name: "Gym-CartPole", path: "Gym/CartPole"), .target(name: "Gym-Blackjack", path: "Gym/Blackjack"), + .target(name: "Gym-DQN", path: "Gym/DQN"), .target( name: "VGG-Imagewoof", dependencies: ["ImageClassificationModels", "Datasets"], path: "Examples/VGG-Imagewoof"), From 6268d58d8d951fc2f8ce0fd6f1c1e5b924cf901c Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Tue, 30 Jun 2020 09:30:58 +0000 Subject: [PATCH 02/34] Use .scalarized() to convert TF scalar to Swift --- Gym/DQN/main.swift | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 150a0ef1954..b7a86e561f2 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -152,13 +152,13 @@ class Agent { let stateQValueBatch = qNet(tfStateBatch) let tfAction: Tensor = tfActionBatch[i][0] - let action = Int(tfAction.makeNumpyArray()).unwrapped() + let action = Int(tfAction.scalarized()) let prediction: Tensor = stateQValueBatch[i][action] let nextStateQValueBatch = self.targetQNet(tfNextStateBatch) let tfReward: Tensor = tfRewardBatch[i][0] - let leftQValue = Float(nextStateQValueBatch[i][0].makeNumpyArray()).unwrapped() - let rightQValue = Float(nextStateQValueBatch[i][1].makeNumpyArray()).unwrapped() + let leftQValue = Float(nextStateQValueBatch[i][0].scalarized()) + let rightQValue = Float(nextStateQValueBatch[i][1].scalarized()) let maxNextStateQValue = leftQValue > rightQValue ? leftQValue : rightQValue let target: Tensor = tfReward + self.discount * maxNextStateQValue @@ -208,7 +208,7 @@ let discount: Float = 0.99 let learningRate: Float = 0.01 let hiddenSize: Int = 64 let startEpsilon: Float = 0.5 -let maxEpisode: Int = 500 +let maxEpisode: Int = 100 let replayBufferCapacity: Int = 1000 let batchSize: Int = 32 let targetNetUpdateRate: Int = 1 @@ -243,7 +243,7 @@ while episodeIndex < maxEpisode { // print("nextState: \(nextState)") // print("reward: \(reward)") // print("isDone: \(isDone)") - episodeReturn += Int(reward.makeNumpyArray().item()).unwrapped() + episodeReturn += Int(reward.scalarized()) // print("episodeReturn: \(episodeReturn)") // Save interaction to replay buffer From 51d1fad8f6a487c4c2386a0de7b5d2f369d863b1 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Tue, 30 Jun 2020 09:34:00 +0000 Subject: [PATCH 03/34] Improve code clarity --- Gym/DQN/main.swift | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index b7a86e561f2..57b7e403b64 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -45,8 +45,8 @@ class ReplayBuffer { var rewards: Tensor var nextStates: Tensor let capacity: Int - var count: Int - var index: Int + var count: Int = 0 + var index: Int = 0 init(capacity: Int) { self.capacity = capacity @@ -55,8 +55,6 @@ class ReplayBuffer { actions = Tensor(numpy: np.zeros([capacity, 1], dtype: np.int32))! 
rewards = Tensor(numpy: np.zeros([capacity, 1], dtype: np.float32))! nextStates = Tensor(numpy: np.zeros([capacity, 4], dtype: np.float32))! - count = 0 - index = 0 } func append(state: Tensor, action: Tensor, reward: Tensor, nextState: Tensor) { @@ -194,7 +192,7 @@ class TensorFlowEnvironmentWrapper { return Tensor(numpy: np.array(state, dtype: np.float32))! } - func step(_ action: Tensor) -> (Tensor, Tensor, PythonObject, PythonObject) { + func step(_ action: Tensor) -> (state: Tensor, reward: Tensor, isDone: PythonObject, info: PythonObject) { let npAction = action.makeNumpyArray().item() let (state, reward, isDone, info) = originalEnv.step(npAction).tuple4 let tfState = Tensor(numpy: np.array(state, dtype: np.float32))! From 84df320def2273edb0f8fb2391f1bfd418b11bc3 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Tue, 30 Jun 2020 10:56:56 +0000 Subject: [PATCH 04/34] Save isDone as Tensor --- Gym/DQN/main.swift | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 57b7e403b64..e8acb511186 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -188,16 +188,17 @@ class TensorFlowEnvironmentWrapper { } func reset() -> Tensor { - let state = self.originalEnv.reset() - return Tensor(numpy: np.array(state, dtype: np.float32))! + let state = self.originalEnv.reset() + return Tensor(numpy: np.array(state, dtype: np.float32))! } - func step(_ action: Tensor) -> (state: Tensor, reward: Tensor, isDone: PythonObject, info: PythonObject) { - let npAction = action.makeNumpyArray().item() - let (state, reward, isDone, info) = originalEnv.step(npAction).tuple4 - let tfState = Tensor(numpy: np.array(state, dtype: np.float32))! - let tfReward = Tensor(numpy: np.array(reward, dtype: np.float32))! - return (tfState, tfReward, isDone, info) + func step(_ action: Tensor) -> (state: Tensor, reward: Tensor, isDone: Tensor, info: PythonObject) { + let npAction = action.makeNumpyArray().item() + let (state, reward, isDone, info) = originalEnv.step(npAction).tuple4 + let tfState = Tensor(numpy: np.array(state, dtype: np.float32))! + let tfReward = Tensor(numpy: np.array(reward, dtype: np.float32))! + let tfIsDone = Tensor(numpy: np.array(isDone, dtype: np.bool))! + return (tfState, tfReward, tfIsDone, info) } } @@ -259,7 +260,7 @@ while episodeIndex < maxEpisode { // print("Target net update successful") // End-of-episode - if isDone == true { + if isDone.scalarized() == true { state = env.reset() episodeIndex += 1 print("Episode \(episodeIndex) Return \(episodeReturn)") From 2b9948945568206ebca0995e1ad54c28577eabaa Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Tue, 30 Jun 2020 11:02:38 +0000 Subject: [PATCH 05/34] Save and use isDone for target calculation --- Gym/DQN/main.swift | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index e8acb511186..4687b5a2567 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -44,6 +44,7 @@ class ReplayBuffer { var actions: Tensor var rewards: Tensor var nextStates: Tensor + var isDones: Tensor let capacity: Int var count: Int = 0 var index: Int = 0 @@ -55,9 +56,10 @@ class ReplayBuffer { actions = Tensor(numpy: np.zeros([capacity, 1], dtype: np.int32))! rewards = Tensor(numpy: np.zeros([capacity, 1], dtype: np.float32))! nextStates = Tensor(numpy: np.zeros([capacity, 4], dtype: np.float32))! + isDones = Tensor(numpy: np.zeros([capacity], dtype: np.bool))! 
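+    // isDones records which stored transitions ended an episode, so the target calculation can treat terminal next states specially.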
} - func append(state: Tensor, action: Tensor, reward: Tensor, nextState: Tensor) { + func append(state: Tensor, action: Tensor, reward: Tensor, nextState: Tensor, isDone: Tensor) { if count < capacity { count += 1 } @@ -66,18 +68,20 @@ class ReplayBuffer { actions[index] = Tensor(numpy: np.expand_dims(action.makeNumpyArray(), axis: 0))! rewards[index] = Tensor(numpy: np.expand_dims(reward.makeNumpyArray(), axis: 0))! nextStates[index] = nextState + isDones[index] = isDone index = (index + 1) % capacity } - func sample(batchSize: Int) -> (stateBatch: Tensor, actionBatch: Tensor, rewardBatch: Tensor, nextStateBatch: Tensor) { + func sample(batchSize: Int) -> (stateBatch: Tensor, actionBatch: Tensor, rewardBatch: Tensor, nextStateBatch: Tensor, isDoneBatch: Tensor) { let randomIndices = Tensor(numpy: np.random.randint(count, size: batchSize, dtype: np.int32))! let stateBatch = _Raw.gather(params: states, indices: randomIndices) let actionBatch = _Raw.gather(params: actions, indices: randomIndices) let rewardBatch = _Raw.gather(params: rewards, indices: randomIndices) let nextStateBatch = _Raw.gather(params: nextStates, indices: randomIndices) + let isDoneBatch = _Raw.gather(params: isDones, indices: randomIndices) - return (stateBatch, actionBatch, rewardBatch, nextStateBatch) + return (stateBatch, actionBatch, rewardBatch, nextStateBatch, isDoneBatch) } } @@ -140,7 +144,7 @@ class Agent { // Don't train if replay buffer is too small if replayBuffer.count >= batchSize { // print("train | Start training") - let (tfStateBatch, tfActionBatch, tfRewardBatch, tfNextStateBatch) = replayBuffer.sample(batchSize: batchSize) + let (tfStateBatch, tfActionBatch, tfRewardBatch, tfNextStateBatch, tfIsDoneBatch) = replayBuffer.sample(batchSize: batchSize) // TODO: Find equivalent function of tf.gather_nd in S4TF to parallelize Q-value computation (_Raw.gather_nd does not exist) // Gradient are accumulated since we calculate every element in the batch individually @@ -158,7 +162,7 @@ class Agent { let leftQValue = Float(nextStateQValueBatch[i][0].scalarized()) let rightQValue = Float(nextStateQValueBatch[i][1].scalarized()) let maxNextStateQValue = leftQValue > rightQValue ? 
leftQValue : rightQValue - let target: Tensor = tfReward + self.discount * maxNextStateQValue + let target: Tensor = tfReward + Tensor(tfIsDoneBatch[i]) * self.discount * maxNextStateQValue return squaredDifference(prediction, withoutDerivative(at: target)) } @@ -246,7 +250,7 @@ while episodeIndex < maxEpisode { // print("episodeReturn: \(episodeReturn)") // Save interaction to replay buffer - replayBuffer.append(state: state, action: action, reward: reward, nextState: nextState) + replayBuffer.append(state: state, action: action, reward: reward, nextState: nextState, isDone: isDone) // print("Append successful") // Train agent From 18e629486d8cb9a56460cfc49ce7bc332da26b2b Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Tue, 30 Jun 2020 11:05:20 +0000 Subject: [PATCH 06/34] Add commented parallelized training implementation --- Gym/DQN/main.swift | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 4687b5a2567..5e01a2c741d 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -146,7 +146,6 @@ class Agent { // print("train | Start training") let (tfStateBatch, tfActionBatch, tfRewardBatch, tfNextStateBatch, tfIsDoneBatch) = replayBuffer.sample(batchSize: batchSize) - // TODO: Find equivalent function of tf.gather_nd in S4TF to parallelize Q-value computation (_Raw.gather_nd does not exist) // Gradient are accumulated since we calculate every element in the batch individually var totalGrad = qNet.zeroTangentVector for i in 0.. Tensor in + // // Compute prediction batch + // let npActionBatch = tfActionBatch.makeNumpyArray() + // print("A: \(np.arange(batchSize, dtype: np.int32)))") + // print("B: \(npActionBatch.flatten())") + // let npFullIndices = np.stack([np.arange(batchSize, dtype: np.int32), npActionBatch.flatten()], axis: 1) + // let tfFullIndices = Tensor(numpy: npFullIndices)! 
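+          // (Each row of npFullIndices is a [batchIndex, action] pair, so gatherNd would pick out Q(s_i, a_i) for the whole batch in one op.)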
+ // let stateQValueBatch = qNet(tfStateBatch) + // let predictionBatch = _Raw.gatherNd(params: stateQValueBatch, indices: tfFullIndices) + + // // TODO: Just save rewards as 1D to avoid this extra squeeze operation + // // Compute target batch + // let targetBatch: Tensor = _Raw.squeeze(tfRewardBatch, squeezeDims: [1]) + self.discount * _Raw.max(self.targetQNet(tfNextStateBatch), reductionIndices: Tensor(1)) + + // return squaredDifference(predictionBatch, withoutDerivative(at: targetBatch)) + // } + // optimizer.update(&qNet, along: 𝛁qNet) } } } From 36c1ddf7fca383b28f938e2499a94d1ab56bfe5f Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Tue, 30 Jun 2020 11:18:41 +0000 Subject: [PATCH 07/34] Save learning curve plot --- Gym/DQN/main.swift | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 5e01a2c741d..34558196d8f 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -34,6 +34,7 @@ fileprivate extension Optional { let np = Python.import("numpy") let gym = Python.import("gym") +let plt = Python.import("matplotlib.pyplot") typealias State = Tensor typealias Action = Tensor @@ -298,3 +299,13 @@ while episodeIndex < maxEpisode { // End-of-step nextState = state } + +// Save smoothed learning curve +let runningMeanWindow: Int = 2 +let smoothedEpisodeReturns = np.convolve(episodeReturns, np.ones((runningMeanWindow)) / np.array(runningMeanWindow, dtype: np.int32), mode: "same") + +plt.plot(smoothedEpisodeReturns) +plt.title("Deep Q-Network on CartPole-v0") +plt.xlabel("Episode") +plt.ylabel("Smoothed Episode Return") +plt.savefig("dqnSmoothedEpisodeReturns.png") From da57062f121d980e0a227bd08186a19f09f10218 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Wed, 1 Jul 2020 00:32:16 +0000 Subject: [PATCH 08/34] Use parallelized training with custom gatherNd --- Gym/DQN/main.swift | 102 ++++++++++++++++++++++++++++----------------- 1 file changed, 64 insertions(+), 38 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 34558196d8f..ccb30188155 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -30,6 +30,32 @@ fileprivate extension Optional { } } +extension _Raw { + /// Derivative of `_Raw.gatherNd`. + /// + /// Ported from TensorFlow Python reference implementation: + /// https://github.com/tensorflow/tensorflow/blob/r2.2/tensorflow/python/ops/array_grad.py#L691-L701 + @inlinable + @derivative(of: gatherNd) + public static func vjpGatherNd< + Scalar: TensorFlowFloatingPoint, + Index: TensorFlowIndex + >( + params: Tensor, + indices: Tensor + ) -> ( + value: Tensor, + pullback: (Tensor) -> Tensor + ) { + let shapeTensor = Tensor(params.shapeTensor) + let value = gatherNd(params: params, indices: indices) + return (value, { v in + let dparams = scatterNd(indices: indices, updates: v, shape: shapeTensor) + return dparams + }) + } +} + // Initialize Python. This comment is a hook for internal use, do not remove. let np = Python.import("numpy") @@ -148,47 +174,47 @@ class Agent { let (tfStateBatch, tfActionBatch, tfRewardBatch, tfNextStateBatch, tfIsDoneBatch) = replayBuffer.sample(batchSize: batchSize) // Gradient are accumulated since we calculate every element in the batch individually - var totalGrad = qNet.zeroTangentVector - for i in 0.. 
Tensor in - - let stateQValueBatch = qNet(tfStateBatch) - let tfAction: Tensor = tfActionBatch[i][0] - let action = Int(tfAction.scalarized()) - let prediction: Tensor = stateQValueBatch[i][action] - - let nextStateQValueBatch = self.targetQNet(tfNextStateBatch) - let tfReward: Tensor = tfRewardBatch[i][0] - let leftQValue = Float(nextStateQValueBatch[i][0].scalarized()) - let rightQValue = Float(nextStateQValueBatch[i][1].scalarized()) - let maxNextStateQValue = leftQValue > rightQValue ? leftQValue : rightQValue - let target: Tensor = tfReward + Tensor(tfIsDoneBatch[i]) * self.discount * maxNextStateQValue - - return squaredDifference(prediction, withoutDerivative(at: target)) - } - totalGrad += 𝛁qNet - } - optimizer.update(&qNet, along: totalGrad) + // var totalGrad = qNet.zeroTangentVector + // for i in 0.. Tensor in + + // let stateQValueBatch = qNet(tfStateBatch) + // let tfAction: Tensor = tfActionBatch[i][0] + // let action = Int(tfAction.scalarized()) + // let prediction: Tensor = stateQValueBatch[i][action] + + // let nextStateQValueBatch = self.targetQNet(tfNextStateBatch) + // let tfReward: Tensor = tfRewardBatch[i][0] + // let leftQValue = Float(nextStateQValueBatch[i][0].scalarized()) + // let rightQValue = Float(nextStateQValueBatch[i][1].scalarized()) + // let maxNextStateQValue = leftQValue > rightQValue ? leftQValue : rightQValue + // let target: Tensor = tfReward + Tensor(tfIsDoneBatch[i]) * self.discount * maxNextStateQValue + + // return squaredDifference(prediction, withoutDerivative(at: target)) + // } + // totalGrad += 𝛁qNet + // } + // optimizer.update(&qNet, along: totalGrad) // TODO: Use parallelized methods commented out below // TODO: _Raw.gatherNd() is not differentiable? - // let 𝛁qNet = gradient(at: qNet) { qNet -> Tensor in - // // Compute prediction batch - // let npActionBatch = tfActionBatch.makeNumpyArray() - // print("A: \(np.arange(batchSize, dtype: np.int32)))") - // print("B: \(npActionBatch.flatten())") - // let npFullIndices = np.stack([np.arange(batchSize, dtype: np.int32), npActionBatch.flatten()], axis: 1) - // let tfFullIndices = Tensor(numpy: npFullIndices)! - // let stateQValueBatch = qNet(tfStateBatch) - // let predictionBatch = _Raw.gatherNd(params: stateQValueBatch, indices: tfFullIndices) - - // // TODO: Just save rewards as 1D to avoid this extra squeeze operation - // // Compute target batch - // let targetBatch: Tensor = _Raw.squeeze(tfRewardBatch, squeezeDims: [1]) + self.discount * _Raw.max(self.targetQNet(tfNextStateBatch), reductionIndices: Tensor(1)) - - // return squaredDifference(predictionBatch, withoutDerivative(at: targetBatch)) - // } - // optimizer.update(&qNet, along: 𝛁qNet) + let 𝛁qNet = gradient(at: qNet) { qNet -> Tensor in + // Compute prediction batch + let npActionBatch = tfActionBatch.makeNumpyArray() + // print("A: \(np.arange(batchSize, dtype: np.int32)))") + // print("B: \(npActionBatch.flatten())") + let npFullIndices = np.stack([np.arange(batchSize, dtype: np.int32), npActionBatch.flatten()], axis: 1) + let tfFullIndices = Tensor(numpy: npFullIndices)! 
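+        // gatherNd selects Q(s_i, a_i) for each (batchIndex, action) row of tfFullIndices. It is differentiable
+        // here only because of the vjpGatherNd derivative registered above, whose pullback scatters the incoming
+        // gradient back into the Q-value tensor with scatterNd.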
+ let stateQValueBatch = qNet(tfStateBatch) + let predictionBatch = _Raw.gatherNd(params: stateQValueBatch, indices: tfFullIndices) + + // TODO: Just save rewards as 1D to avoid this extra squeeze operation + // Compute target batch + let targetBatch: Tensor = _Raw.squeeze(tfRewardBatch, squeezeDims: [1]) + Tensor(tfIsDoneBatch) * self.discount * _Raw.max(self.targetQNet(tfNextStateBatch), reductionIndices: Tensor(1)) + + return meanSquaredError(predicted: predictionBatch, expected: withoutDerivative(at: targetBatch)) + } + optimizer.update(&qNet, along: 𝛁qNet) } } } From 2ec956ccbd8aa7abd8d3d8ae96e5b839608a2280 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Wed, 1 Jul 2020 00:48:28 +0000 Subject: [PATCH 09/34] Add minBufferSize parameter --- Gym/DQN/main.swift | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index ccb30188155..c20753e1ccb 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -169,7 +169,7 @@ class Agent { func train(batchSize: Int) { // Don't train if replay buffer is too small - if replayBuffer.count >= batchSize { + if replayBuffer.count >= minBufferSize { // print("train | Start training") let (tfStateBatch, tfActionBatch, tfRewardBatch, tfNextStateBatch, tfIsDoneBatch) = replayBuffer.sample(batchSize: batchSize) @@ -257,10 +257,11 @@ let discount: Float = 0.99 let learningRate: Float = 0.01 let hiddenSize: Int = 64 let startEpsilon: Float = 0.5 -let maxEpisode: Int = 100 -let replayBufferCapacity: Int = 1000 -let batchSize: Int = 32 -let targetNetUpdateRate: Int = 1 +let maxEpisode: Int = 1000 +let replayBufferCapacity: Int = 5000 +let minBufferSize: Int = 1000 +let batchSize: Int = 64 +let targetNetUpdateRate: Int = 32 // Initialize environment let env = TensorFlowEnvironmentWrapper(gym.make("CartPole-v0")) From dab2a3f811b0139050ce9a6cfb0e8da48405a272 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Wed, 1 Jul 2020 02:20:50 +0000 Subject: [PATCH 10/34] Remove comments and refactor code --- Gym/DQN/main.swift | 63 +++++++++++++--------------------------------- 1 file changed, 18 insertions(+), 45 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index c20753e1ccb..f2899d52f43 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -86,7 +86,13 @@ class ReplayBuffer { isDones = Tensor(numpy: np.zeros([capacity], dtype: np.bool))! } - func append(state: Tensor, action: Tensor, reward: Tensor, nextState: Tensor, isDone: Tensor) { + func append( + state: Tensor, + action: Tensor, + reward: Tensor, + nextState: Tensor, + isDone: Tensor + ) { if count < capacity { count += 1 } @@ -99,7 +105,13 @@ class ReplayBuffer { index = (index + 1) % capacity } - func sample(batchSize: Int) -> (stateBatch: Tensor, actionBatch: Tensor, rewardBatch: Tensor, nextStateBatch: Tensor, isDoneBatch: Tensor) { + func sample(batchSize: Int) -> ( + stateBatch: Tensor, + actionBatch: Tensor, + rewardBatch: Tensor, + nextStateBatch: Tensor, + isDoneBatch: Tensor + ) { let randomIndices = Tensor(numpy: np.random.randint(count, size: batchSize, dtype: np.int32))! 
let stateBatch = _Raw.gather(params: states, indices: randomIndices) @@ -151,10 +163,7 @@ class Agent { func getAction(state: Tensor, epsilon: Float) -> Tensor { if Float(np.random.uniform()).unwrapped() < epsilon { - // print("getAction | state: \(state)") - // print("getAction | epsilon: \(epsilon)") let npState = np.random.randint(0, 2, dtype: np.int32) - // print("getAction | npState: \(npState)") return Tensor(numpy: np.array(npState, dtype: np.int32))! } else { @@ -170,39 +179,12 @@ class Agent { func train(batchSize: Int) { // Don't train if replay buffer is too small if replayBuffer.count >= minBufferSize { - // print("train | Start training") let (tfStateBatch, tfActionBatch, tfRewardBatch, tfNextStateBatch, tfIsDoneBatch) = replayBuffer.sample(batchSize: batchSize) - // Gradient are accumulated since we calculate every element in the batch individually - // var totalGrad = qNet.zeroTangentVector - // for i in 0.. Tensor in - - // let stateQValueBatch = qNet(tfStateBatch) - // let tfAction: Tensor = tfActionBatch[i][0] - // let action = Int(tfAction.scalarized()) - // let prediction: Tensor = stateQValueBatch[i][action] - - // let nextStateQValueBatch = self.targetQNet(tfNextStateBatch) - // let tfReward: Tensor = tfRewardBatch[i][0] - // let leftQValue = Float(nextStateQValueBatch[i][0].scalarized()) - // let rightQValue = Float(nextStateQValueBatch[i][1].scalarized()) - // let maxNextStateQValue = leftQValue > rightQValue ? leftQValue : rightQValue - // let target: Tensor = tfReward + Tensor(tfIsDoneBatch[i]) * self.discount * maxNextStateQValue - - // return squaredDifference(prediction, withoutDerivative(at: target)) - // } - // totalGrad += 𝛁qNet - // } - // optimizer.update(&qNet, along: totalGrad) - - // TODO: Use parallelized methods commented out below - // TODO: _Raw.gatherNd() is not differentiable? + // TODO: Check gradient values let 𝛁qNet = gradient(at: qNet) { qNet -> Tensor in // Compute prediction batch let npActionBatch = tfActionBatch.makeNumpyArray() - // print("A: \(np.arange(batchSize, dtype: np.int32)))") - // print("B: \(npActionBatch.flatten())") let npFullIndices = np.stack([np.arange(batchSize, dtype: np.int32), npActionBatch.flatten()], axis: 1) let tfFullIndices = Tensor(numpy: npFullIndices)! 
let stateQValueBatch = qNet(tfStateBatch) @@ -283,38 +265,29 @@ var episodeReturns: Array = [] var state = env.reset() while episodeIndex < maxEpisode { stepIndex += 1 - // print("Step \(stepIndex)") // Interact with environment - let action = agent.getAction(state: state, epsilon: startEpsilon * Float(maxEpisode - episodeIndex)) - // print("action: \(action)") + let epsilon = startEpsilon * Float(maxEpisode - episodeIndex) / Float(maxEpisode) + let action = agent.getAction(state: state, epsilon: epsilon) var (nextState, reward, isDone, _) = env.step(action) - // print("state: \(state)") - // print("nextState: \(nextState)") - // print("reward: \(reward)") - // print("isDone: \(isDone)") episodeReturn += Int(reward.scalarized()) - // print("episodeReturn: \(episodeReturn)") // Save interaction to replay buffer replayBuffer.append(state: state, action: action, reward: reward, nextState: nextState, isDone: isDone) - // print("Append successful") // Train agent agent.train(batchSize: batchSize) - // print("Train successful") // Periodically update Target Net if stepIndex % targetNetUpdateRate == 0 { updateTargetQNet(source: qNet, target: &targetQNet) } - // print("Target net update successful") // End-of-episode if isDone.scalarized() == true { state = env.reset() episodeIndex += 1 - print("Episode \(episodeIndex) Return \(episodeReturn)") + print(String(format: "Episode: %4d | Epsilon: %.03f | Return: %3d", episodeIndex, epsilon, episodeReturn)) if episodeReturn > 199 { print("Solved in \(episodeIndex) episodes with \(stepIndex) steps!") break From 0bc60ca6442b886c9007ee4c5cd41e74787eb1fe Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Wed, 1 Jul 2020 02:34:59 +0000 Subject: [PATCH 11/34] Fix bug where state was updated --- Gym/DQN/main.swift | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index f2899d52f43..c2d35495879 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -269,7 +269,7 @@ while episodeIndex < maxEpisode { // Interact with environment let epsilon = startEpsilon * Float(maxEpisode - episodeIndex) / Float(maxEpisode) let action = agent.getAction(state: state, epsilon: epsilon) - var (nextState, reward, isDone, _) = env.step(action) + let (nextState, reward, isDone, _) = env.step(action) episodeReturn += Int(reward.scalarized()) // Save interaction to replay buffer @@ -297,7 +297,7 @@ while episodeIndex < maxEpisode { } // End-of-step - nextState = state + state = nextState } // Save smoothed learning curve From 01074d9075a0e561a5265edc8ff3267f137b8544 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Wed, 1 Jul 2020 02:46:32 +0000 Subject: [PATCH 12/34] Simplify code --- Gym/DQN/main.swift | 50 +++++++++++++++------------------------------- 1 file changed, 16 insertions(+), 34 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index c2d35495879..f982f9edfe6 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -62,10 +62,6 @@ let np = Python.import("numpy") let gym = Python.import("gym") let plt = Python.import("matplotlib.pyplot") -typealias State = Tensor -typealias Action = Tensor -typealias Reward = Tensor - class ReplayBuffer { var states: Tensor var actions: Tensor @@ -80,8 +76,8 @@ class ReplayBuffer { self.capacity = capacity states = Tensor(numpy: np.zeros([capacity, 4], dtype: np.float32))! - actions = Tensor(numpy: np.zeros([capacity, 1], dtype: np.int32))! - rewards = Tensor(numpy: np.zeros([capacity, 1], dtype: np.float32))! 
+ actions = Tensor(numpy: np.zeros([capacity], dtype: np.int32))! + rewards = Tensor(numpy: np.zeros([capacity], dtype: np.float32))! nextStates = Tensor(numpy: np.zeros([capacity, 4], dtype: np.float32))! isDones = Tensor(numpy: np.zeros([capacity], dtype: np.bool))! } @@ -98,8 +94,8 @@ class ReplayBuffer { } // Erase oldest SARS if the replay buffer is full states[index] = state - actions[index] = Tensor(numpy: np.expand_dims(action.makeNumpyArray(), axis: 0))! - rewards[index] = Tensor(numpy: np.expand_dims(reward.makeNumpyArray(), axis: 0))! + actions[index] = action + rewards[index] = reward nextStates[index] = nextState isDones[index] = isDone index = (index + 1) % capacity @@ -132,7 +128,7 @@ struct Net: Layer { init(observationSize: Int, hiddenSize: Int, actionCount: Int) { l1 = Dense(inputSize: observationSize, outputSize: hiddenSize, activation: relu, weightInitializer: heNormal()) - l2 = Dense(inputSize: hiddenSize, outputSize: actionCount, weightInitializer: heNormal()) + l2 = Dense(inputSize: hiddenSize, outputSize: actionCount, activation: identity, weightInitializer: heNormal()) } @differentiable @@ -142,15 +138,10 @@ struct Net: Layer { } class Agent { - // Q-network var qNet: Net - // Target Q-network var targetQNet: Net - // Optimizer let optimizer: Adam - // Replay Buffer let replayBuffer: ReplayBuffer - // Discount Factor let discount: Float init(qNet: Net, targetQNet: Net, optimizer: Adam, replayBuffer: ReplayBuffer, discount: Float) { @@ -169,10 +160,8 @@ class Agent { else { // Neural network input needs to be 2D let tfState = Tensor(numpy: np.expand_dims(state.makeNumpyArray(), axis: 0))! - let qValues = qNet(tfState) - let leftQValue = Float(qValues[0][0]).unwrapped() - let rightQValue = Float(qValues[0][1]).unwrapped() - return leftQValue < rightQValue ? Tensor(numpy: np.array(1, dtype: np.int32))! : Tensor(numpy: np.array(0, dtype: np.int32))! + let qValues = qNet(tfState)[0] + return qValues[1].scalarized() > qValues[0].scalarized() ? Tensor(1) : Tensor(0) } } @@ -185,14 +174,13 @@ class Agent { let 𝛁qNet = gradient(at: qNet) { qNet -> Tensor in // Compute prediction batch let npActionBatch = tfActionBatch.makeNumpyArray() - let npFullIndices = np.stack([np.arange(batchSize, dtype: np.int32), npActionBatch.flatten()], axis: 1) + let npFullIndices = np.stack([np.arange(batchSize, dtype: np.int32), npActionBatch], axis: 1) let tfFullIndices = Tensor(numpy: npFullIndices)! 
let stateQValueBatch = qNet(tfStateBatch) let predictionBatch = _Raw.gatherNd(params: stateQValueBatch, indices: tfFullIndices) - // TODO: Just save rewards as 1D to avoid this extra squeeze operation // Compute target batch - let targetBatch: Tensor = _Raw.squeeze(tfRewardBatch, squeezeDims: [1]) + Tensor(tfIsDoneBatch) * self.discount * _Raw.max(self.targetQNet(tfNextStateBatch), reductionIndices: Tensor(1)) + let targetBatch: Tensor = tfRewardBatch + Tensor(tfIsDoneBatch) * self.discount * _Raw.max(self.targetQNet(tfNextStateBatch), reductionIndices: Tensor(1)) return meanSquaredError(predicted: predictionBatch, expected: withoutDerivative(at: targetBatch)) } @@ -210,13 +198,9 @@ func updateTargetQNet(source: Net, target: inout Net) { class TensorFlowEnvironmentWrapper { let originalEnv: PythonObject - let action_space: PythonObject - let observation_space: PythonObject init(_ env: PythonObject) { self.originalEnv = env - self.action_space = env.action_space - self.observation_space = env.observation_space } func reset() -> Tensor { @@ -225,8 +209,7 @@ class TensorFlowEnvironmentWrapper { } func step(_ action: Tensor) -> (state: Tensor, reward: Tensor, isDone: Tensor, info: PythonObject) { - let npAction = action.makeNumpyArray().item() - let (state, reward, isDone, info) = originalEnv.step(npAction).tuple4 + let (state, reward, isDone, info) = originalEnv.step(action.scalarized()).tuple4 let tfState = Tensor(numpy: np.array(state, dtype: np.float32))! let tfReward = Tensor(numpy: np.array(reward, dtype: np.float32))! let tfIsDone = Tensor(numpy: np.array(isDone, dtype: np.bool))! @@ -249,9 +232,8 @@ let targetNetUpdateRate: Int = 32 let env = TensorFlowEnvironmentWrapper(gym.make("CartPole-v0")) // Initialize agent -let actionCount = Int(env.action_space.n).unwrapped() -var qNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: actionCount) -var targetQNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: actionCount) +var qNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) +var targetQNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) updateTargetQNet(source: qNet, target: &targetQNet) let optimizer = Adam(for: qNet, learningRate: learningRate) var replayBuffer: ReplayBuffer = ReplayBuffer(capacity: replayBufferCapacity) @@ -260,8 +242,8 @@ var agent = Agent(qNet: qNet, targetQNet: targetQNet, optimizer: optimizer, repl // RL Loop var stepIndex = 0 var episodeIndex = 0 -var episodeReturn: Int = 0 -var episodeReturns: Array = [] +var episodeReturn: Float = 0 +var episodeReturns: Array = [] var state = env.reset() while episodeIndex < maxEpisode { stepIndex += 1 @@ -270,7 +252,7 @@ while episodeIndex < maxEpisode { let epsilon = startEpsilon * Float(maxEpisode - episodeIndex) / Float(maxEpisode) let action = agent.getAction(state: state, epsilon: epsilon) let (nextState, reward, isDone, _) = env.step(action) - episodeReturn += Int(reward.scalarized()) + episodeReturn += reward.scalarized() // Save interaction to replay buffer replayBuffer.append(state: state, action: action, reward: reward, nextState: nextState, isDone: isDone) @@ -287,7 +269,7 @@ while episodeIndex < maxEpisode { if isDone.scalarized() == true { state = env.reset() episodeIndex += 1 - print(String(format: "Episode: %4d | Epsilon: %.03f | Return: %3d", episodeIndex, epsilon, episodeReturn)) + print(String(format: "Episode: %4d | Epsilon: %.03f | Return: %3d", episodeIndex, epsilon, Int(episodeReturn))) if episodeReturn > 199 { print("Solved in 
\(episodeIndex) episodes with \(stepIndex) steps!") break From eca8a920dafc0f9116f4804f513ef4b3562810c7 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Wed, 1 Jul 2020 03:39:50 +0000 Subject: [PATCH 13/34] Save TD loss curve --- Gym/DQN/main.swift | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index f982f9edfe6..f33fd031803 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -165,13 +165,12 @@ class Agent { } } - func train(batchSize: Int) { + func train(batchSize: Int) -> Float { // Don't train if replay buffer is too small if replayBuffer.count >= minBufferSize { let (tfStateBatch, tfActionBatch, tfRewardBatch, tfNextStateBatch, tfIsDoneBatch) = replayBuffer.sample(batchSize: batchSize) - // TODO: Check gradient values - let 𝛁qNet = gradient(at: qNet) { qNet -> Tensor in + let (loss, gradients) = valueWithGradient(at: qNet) { qNet -> Tensor in // Compute prediction batch let npActionBatch = tfActionBatch.makeNumpyArray() let npFullIndices = np.stack([np.arange(batchSize, dtype: np.int32), npActionBatch], axis: 1) @@ -184,8 +183,11 @@ class Agent { return meanSquaredError(predicted: predictionBatch, expected: withoutDerivative(at: targetBatch)) } - optimizer.update(&qNet, along: 𝛁qNet) + optimizer.update(&qNet, along: gradients) + + return loss.scalarized() } + return 0 } } @@ -244,6 +246,7 @@ var stepIndex = 0 var episodeIndex = 0 var episodeReturn: Float = 0 var episodeReturns: Array = [] +var losses: Array = [] var state = env.reset() while episodeIndex < maxEpisode { stepIndex += 1 @@ -258,7 +261,7 @@ while episodeIndex < maxEpisode { replayBuffer.append(state: state, action: action, reward: reward, nextState: nextState, isDone: isDone) // Train agent - agent.train(batchSize: batchSize) + losses.append(agent.train(batchSize: batchSize)) // Periodically update Target Net if stepIndex % targetNetUpdateRate == 0 { @@ -291,3 +294,12 @@ plt.title("Deep Q-Network on CartPole-v0") plt.xlabel("Episode") plt.ylabel("Smoothed Episode Return") plt.savefig("dqnSmoothedEpisodeReturns.png") +plt.clf() + +// Save TD loss curve +plt.plot(losses) +plt.title("Deep Q-Network on CartPole-v0") +plt.xlabel("Step") +plt.ylabel("TD Loss") +plt.savefig("dqnTDLoss.png") +plt.clf() From ae087dd9e7162e7d85d66aea264e715f7266ede9 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Thu, 2 Jul 2020 08:06:15 +0000 Subject: [PATCH 14/34] Purge uses of _Raw operations --- Gym/DQN/main.swift | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index f33fd031803..e48e208a0f5 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -110,11 +110,11 @@ class ReplayBuffer { ) { let randomIndices = Tensor(numpy: np.random.randint(count, size: batchSize, dtype: np.int32))! 
- let stateBatch = _Raw.gather(params: states, indices: randomIndices) - let actionBatch = _Raw.gather(params: actions, indices: randomIndices) - let rewardBatch = _Raw.gather(params: rewards, indices: randomIndices) - let nextStateBatch = _Raw.gather(params: nextStates, indices: randomIndices) - let isDoneBatch = _Raw.gather(params: isDones, indices: randomIndices) + let stateBatch = states.gathering(atIndices: randomIndices, alongAxis: 0) + let actionBatch = actions.gathering(atIndices: randomIndices, alongAxis: 0) + let rewardBatch = rewards.gathering(atIndices: randomIndices, alongAxis: 0) + let nextStateBatch = nextStates.gathering(atIndices: randomIndices, alongAxis: 0) + let isDoneBatch = isDones.gathering(atIndices: randomIndices, alongAxis: 0) return (stateBatch, actionBatch, rewardBatch, nextStateBatch, isDoneBatch) } @@ -179,8 +179,7 @@ class Agent { let predictionBatch = _Raw.gatherNd(params: stateQValueBatch, indices: tfFullIndices) // Compute target batch - let targetBatch: Tensor = tfRewardBatch + Tensor(tfIsDoneBatch) * self.discount * _Raw.max(self.targetQNet(tfNextStateBatch), reductionIndices: Tensor(1)) - + let targetBatch: Tensor = tfRewardBatch + Tensor(tfIsDoneBatch) * self.discount * self.targetQNet(tfNextStateBatch).max(squeezingAxes: Tensor(1)) return meanSquaredError(predicted: predictionBatch, expected: withoutDerivative(at: targetBatch)) } optimizer.update(&qNet, along: gradients) From 4acd6ce200dd49fd4701853a6d2f7ed94e5cc1fb Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Thu, 2 Jul 2020 08:13:32 +0000 Subject: [PATCH 15/34] Use Huber loss instead of MSE --- Gym/DQN/main.swift | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index e48e208a0f5..6b2e5a5f76f 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -180,7 +180,11 @@ class Agent { // Compute target batch let targetBatch: Tensor = tfRewardBatch + Tensor(tfIsDoneBatch) * self.discount * self.targetQNet(tfNextStateBatch).max(squeezingAxes: Tensor(1)) - return meanSquaredError(predicted: predictionBatch, expected: withoutDerivative(at: targetBatch)) + return huberLoss( + predicted: predictionBatch, + expected: withoutDerivative(at: targetBatch), + delta: 1 + ) } optimizer.update(&qNet, along: gradients) From 22aaf75d9b4a56ec0087b3f64ba8d80e3b74f166 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Thu, 2 Jul 2020 10:15:03 +0000 Subject: [PATCH 16/34] Simplify Tensor initialization --- Gym/DQN/main.swift | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 6b2e5a5f76f..2e9edfd26de 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -75,11 +75,11 @@ class ReplayBuffer { init(capacity: Int) { self.capacity = capacity - states = Tensor(numpy: np.zeros([capacity, 4], dtype: np.float32))! - actions = Tensor(numpy: np.zeros([capacity], dtype: np.int32))! - rewards = Tensor(numpy: np.zeros([capacity], dtype: np.float32))! - nextStates = Tensor(numpy: np.zeros([capacity, 4], dtype: np.float32))! - isDones = Tensor(numpy: np.zeros([capacity], dtype: np.bool))! 
+ states = Tensor(zeros: [capacity, 4]) + actions = Tensor(zeros: [capacity]) + rewards = Tensor(zeros: [capacity]) + nextStates = Tensor(zeros: [capacity, 4]) + isDones = Tensor(repeating: false, shape: [capacity]) } func append( @@ -154,8 +154,7 @@ class Agent { func getAction(state: Tensor, epsilon: Float) -> Tensor { if Float(np.random.uniform()).unwrapped() < epsilon { - let npState = np.random.randint(0, 2, dtype: np.int32) - return Tensor(numpy: np.array(npState, dtype: np.int32))! + return Tensor(numpy: np.array(np.random.randint(0, 2), dtype: np.int32))! } else { // Neural network input needs to be 2D From 24392f3aeb6caed312a3017e1f5126373db6b95f Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Thu, 2 Jul 2020 10:28:21 +0000 Subject: [PATCH 17/34] Set device explicitly on Tensor creation --- Gym/DQN/main.swift | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 2e9edfd26de..6e042b7c34f 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -63,23 +63,26 @@ let gym = Python.import("gym") let plt = Python.import("matplotlib.pyplot") class ReplayBuffer { + let capacity: Int + let device: Device + var states: Tensor var actions: Tensor var rewards: Tensor var nextStates: Tensor var isDones: Tensor - let capacity: Int var count: Int = 0 var index: Int = 0 - init(capacity: Int) { + init(capacity: Int, device: Device) { self.capacity = capacity + self.device = device - states = Tensor(zeros: [capacity, 4]) - actions = Tensor(zeros: [capacity]) - rewards = Tensor(zeros: [capacity]) - nextStates = Tensor(zeros: [capacity, 4]) - isDones = Tensor(repeating: false, shape: [capacity]) + states = Tensor(zeros: [capacity, 4], on: device) + actions = Tensor(zeros: [capacity], on: device) + rewards = Tensor(zeros: [capacity], on: device) + nextStates = Tensor(zeros: [capacity, 4], on: device) + isDones = Tensor(repeating: false, shape: [capacity], on: device) } func append( @@ -143,13 +146,15 @@ class Agent { let optimizer: Adam let replayBuffer: ReplayBuffer let discount: Float + let device: Device - init(qNet: Net, targetQNet: Net, optimizer: Adam, replayBuffer: ReplayBuffer, discount: Float) { + init(qNet: Net, targetQNet: Net, optimizer: Adam, replayBuffer: ReplayBuffer, discount: Float, device: Device) { self.qNet = qNet self.targetQNet = targetQNet self.optimizer = optimizer self.replayBuffer = replayBuffer self.discount = discount + self.device = device } func getAction(state: Tensor, epsilon: Float) -> Tensor { @@ -160,7 +165,7 @@ class Agent { // Neural network input needs to be 2D let tfState = Tensor(numpy: np.expand_dims(state.makeNumpyArray(), axis: 0))! let qValues = qNet(tfState)[0] - return qValues[1].scalarized() > qValues[0].scalarized() ? Tensor(1) : Tensor(0) + return Tensor(qValues[1].scalarized() > qValues[0].scalarized() ? 
1 : 0, on: device) } } @@ -231,6 +236,7 @@ let replayBufferCapacity: Int = 5000 let minBufferSize: Int = 1000 let batchSize: Int = 64 let targetNetUpdateRate: Int = 32 +let device: Device = Device.default // Initialize environment let env = TensorFlowEnvironmentWrapper(gym.make("CartPole-v0")) @@ -240,8 +246,8 @@ var qNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) var targetQNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) updateTargetQNet(source: qNet, target: &targetQNet) let optimizer = Adam(for: qNet, learningRate: learningRate) -var replayBuffer: ReplayBuffer = ReplayBuffer(capacity: replayBufferCapacity) -var agent = Agent(qNet: qNet, targetQNet: targetQNet, optimizer: optimizer, replayBuffer: replayBuffer, discount: discount) +var replayBuffer: ReplayBuffer = ReplayBuffer(capacity: replayBufferCapacity, device: device) +var agent = Agent(qNet: qNet, targetQNet: targetQNet, optimizer: optimizer, replayBuffer: replayBuffer, discount: discount, device: device) // RL Loop var stepIndex = 0 From ccfa0873b3c19327e996b5839d097e9eb32f1eae Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Mon, 3 Aug 2020 05:11:47 +0000 Subject: [PATCH 18/34] Add minBufferSize to Agent argument --- Gym/DQN/main.swift | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 6e042b7c34f..6ef6b6f6500 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -146,14 +146,16 @@ class Agent { let optimizer: Adam let replayBuffer: ReplayBuffer let discount: Float + let minBufferSize: Int let device: Device - init(qNet: Net, targetQNet: Net, optimizer: Adam, replayBuffer: ReplayBuffer, discount: Float, device: Device) { + init(qNet: Net, targetQNet: Net, optimizer: Adam, replayBuffer: ReplayBuffer, discount: Float, minBufferSize: Int, device: Device) { self.qNet = qNet self.targetQNet = targetQNet self.optimizer = optimizer self.replayBuffer = replayBuffer self.discount = discount + self.minBufferSize = minBufferSize self.device = device } @@ -247,7 +249,7 @@ var targetQNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) updateTargetQNet(source: qNet, target: &targetQNet) let optimizer = Adam(for: qNet, learningRate: learningRate) var replayBuffer: ReplayBuffer = ReplayBuffer(capacity: replayBufferCapacity, device: device) -var agent = Agent(qNet: qNet, targetQNet: targetQNet, optimizer: optimizer, replayBuffer: replayBuffer, discount: discount, device: device) +var agent = Agent(qNet: qNet, targetQNet: targetQNet, optimizer: optimizer, replayBuffer: replayBuffer, discount: discount, minBufferSize: minBufferSize, device: device) // RL Loop var stepIndex = 0 From 65de04e0af372c5a42d8d72e12aaf027bef10d54 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Mon, 3 Aug 2020 05:35:39 +0000 Subject: [PATCH 19/34] Use soft target updates --- Gym/DQN/main.swift | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 6ef6b6f6500..9ab48aa1bbb 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -200,11 +200,11 @@ class Agent { } } -func updateTargetQNet(source: Net, target: inout Net) { - target.l1.weight = Tensor(source.l1.weight) - target.l1.bias = Tensor(source.l1.bias) - target.l2.weight = Tensor(source.l2.weight) - target.l2.bias = Tensor(source.l2.bias) +func updateTargetQNet(source: Net, target: inout Net, softTargetUpdateRate: Float = 0.001) { + target.l1.weight = softTargetUpdateRate * Tensor(source.l1.weight) + (1 
- softTargetUpdateRate) * target.l1.weight + target.l1.bias = softTargetUpdateRate * Tensor(source.l1.bias) + (1 - softTargetUpdateRate) * target.l1.bias + target.l2.weight = softTargetUpdateRate * Tensor(source.l2.weight) + (1 - softTargetUpdateRate) * target.l2.weight + target.l2.bias = softTargetUpdateRate * Tensor(source.l2.bias) + (1 - softTargetUpdateRate) * target.l2.bias } class TensorFlowEnvironmentWrapper { From bcbb7e269d61403813916fae7a89928b7fae1dcf Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Mon, 3 Aug 2020 09:04:03 +0000 Subject: [PATCH 20/34] Fix bug where isDone was used wrong --- Gym/DQN/main.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 9ab48aa1bbb..a73726ea2be 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -185,7 +185,7 @@ class Agent { let predictionBatch = _Raw.gatherNd(params: stateQValueBatch, indices: tfFullIndices) // Compute target batch - let targetBatch: Tensor = tfRewardBatch + Tensor(tfIsDoneBatch) * self.discount * self.targetQNet(tfNextStateBatch).max(squeezingAxes: Tensor(1)) + let targetBatch: Tensor = tfRewardBatch + (1 - Tensor(tfIsDoneBatch)) * self.discount * self.targetQNet(tfNextStateBatch).max(squeezingAxes: Tensor(1)) return huberLoss( predicted: predictionBatch, expected: withoutDerivative(at: targetBatch), From a20322674a9fb4f77d925d81796da93df06690a2 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Mon, 3 Aug 2020 09:07:00 +0000 Subject: [PATCH 21/34] Fix bug where target net is initialized with soft update --- Gym/DQN/main.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index a73726ea2be..9a01b555181 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -246,7 +246,7 @@ let env = TensorFlowEnvironmentWrapper(gym.make("CartPole-v0")) // Initialize agent var qNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) var targetQNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) -updateTargetQNet(source: qNet, target: &targetQNet) +updateTargetQNet(source: qNet, target: &targetQNet, softTargetUpdateRate: 1) let optimizer = Adam(for: qNet, learningRate: learningRate) var replayBuffer: ReplayBuffer = ReplayBuffer(capacity: replayBufferCapacity, device: device) var agent = Agent(qNet: qNet, targetQNet: targetQNet, optimizer: optimizer, replayBuffer: replayBuffer, discount: discount, minBufferSize: minBufferSize, device: device) From e757c0fdd263b4e03e7450bac4bf2fe073f9f771 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Mon, 3 Aug 2020 10:37:57 +0000 Subject: [PATCH 22/34] Follow hyperparameters in swift-rl --- Gym/DQN/main.swift | 52 +++++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 9a01b555181..988a13f245b 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -130,8 +130,8 @@ struct Net: Layer { var l1, l2: Dense init(observationSize: Int, hiddenSize: Int, actionCount: Int) { - l1 = Dense(inputSize: observationSize, outputSize: hiddenSize, activation: relu, weightInitializer: heNormal()) - l2 = Dense(inputSize: hiddenSize, outputSize: actionCount, activation: identity, weightInitializer: heNormal()) + l1 = Dense(inputSize: observationSize, outputSize: hiddenSize, activation: relu) + l2 = Dense(inputSize: hiddenSize, outputSize: actionCount, activation: identity) } @differentiable @@ -143,13 +143,13 @@ struct Net: Layer { class Agent 
{ var qNet: Net var targetQNet: Net - let optimizer: Adam + let optimizer: AMSGrad let replayBuffer: ReplayBuffer let discount: Float let minBufferSize: Int let device: Device - init(qNet: Net, targetQNet: Net, optimizer: Adam, replayBuffer: ReplayBuffer, discount: Float, minBufferSize: Int, device: Device) { + init(qNet: Net, targetQNet: Net, optimizer: AMSGrad, replayBuffer: ReplayBuffer, discount: Float, minBufferSize: Int, device: Device) { self.qNet = qNet self.targetQNet = targetQNet self.optimizer = optimizer @@ -185,10 +185,12 @@ class Agent { let predictionBatch = _Raw.gatherNd(params: stateQValueBatch, indices: tfFullIndices) // Compute target batch - let targetBatch: Tensor = tfRewardBatch + (1 - Tensor(tfIsDoneBatch)) * self.discount * self.targetQNet(tfNextStateBatch).max(squeezingAxes: Tensor(1)) + let nextStateQValueBatch = self.targetQNet(tfNextStateBatch).max(squeezingAxes: Tensor(1)) + let targetBatch: Tensor = tfRewardBatch + self.discount * (1 - Tensor(tfIsDoneBatch)) * nextStateQValueBatch + return huberLoss( predicted: predictionBatch, - expected: withoutDerivative(at: targetBatch), + expected: targetBatch, delta: 1 ) } @@ -200,7 +202,7 @@ class Agent { } } -func updateTargetQNet(source: Net, target: inout Net, softTargetUpdateRate: Float = 0.001) { +func updateTargetQNet(source: Net, target: inout Net, softTargetUpdateRate: Float) { target.l1.weight = softTargetUpdateRate * Tensor(source.l1.weight) + (1 - softTargetUpdateRate) * target.l1.weight target.l1.bias = softTargetUpdateRate * Tensor(source.l1.bias) + (1 - softTargetUpdateRate) * target.l1.bias target.l2.weight = softTargetUpdateRate * Tensor(source.l2.weight) + (1 - softTargetUpdateRate) * target.l2.weight @@ -230,14 +232,15 @@ class TensorFlowEnvironmentWrapper { // Hyperparameters let discount: Float = 0.99 -let learningRate: Float = 0.01 -let hiddenSize: Int = 64 -let startEpsilon: Float = 0.5 +let learningRate: Float = 0.001 +let hiddenSize: Int = 100 +let startEpsilon: Float = 0.5 // TODO(seungjaeryanlee): Ignored right now let maxEpisode: Int = 1000 -let replayBufferCapacity: Int = 5000 -let minBufferSize: Int = 1000 -let batchSize: Int = 64 -let targetNetUpdateRate: Int = 32 +let replayBufferCapacity: Int = 1000 +let minBufferSize: Int = 32 +let batchSize: Int = 32 +let targetNetUpdateRate: Int = 5 +let softTargetUpdateRate: Float = 0.05 let device: Device = Device.default // Initialize environment @@ -247,7 +250,7 @@ let env = TensorFlowEnvironmentWrapper(gym.make("CartPole-v0")) var qNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) var targetQNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) updateTargetQNet(source: qNet, target: &targetQNet, softTargetUpdateRate: 1) -let optimizer = Adam(for: qNet, learningRate: learningRate) +let optimizer = AMSGrad(for: qNet, learningRate: learningRate) var replayBuffer: ReplayBuffer = ReplayBuffer(capacity: replayBufferCapacity, device: device) var agent = Agent(qNet: qNet, targetQNet: targetQNet, optimizer: optimizer, replayBuffer: replayBuffer, discount: discount, minBufferSize: minBufferSize, device: device) @@ -258,11 +261,17 @@ var episodeReturn: Float = 0 var episodeReturns: Array = [] var losses: Array = [] var state = env.reset() +var bestReturn: Float = 0 while episodeIndex < maxEpisode { stepIndex += 1 // Interact with environment - let epsilon = startEpsilon * Float(maxEpisode - episodeIndex) / Float(maxEpisode) + // let epsilon = startEpsilon * Float(maxEpisode - episodeIndex) / Float(maxEpisode) + let 
epsilon: Float = 0.1 + // let epsilon_start: Float = 0.9 + // let epsilon_end: Float = 0.05 + // let epsilon_decay: Int = 200 + // let epsilon: Float = epsilon_end + (epsilon_start - epsilon_end) * Float(np.exp(-1 * stepIndex / epsilon_decay, dtype: np.float32))! let action = agent.getAction(state: state, epsilon: epsilon) let (nextState, reward, isDone, _) = env.step(action) episodeReturn += reward.scalarized() @@ -275,14 +284,19 @@ while episodeIndex < maxEpisode { // Periodically update Target Net if stepIndex % targetNetUpdateRate == 0 { - updateTargetQNet(source: qNet, target: &targetQNet) + updateTargetQNet(source: qNet, target: &targetQNet, softTargetUpdateRate: softTargetUpdateRate) } // End-of-episode if isDone.scalarized() == true { state = env.reset() episodeIndex += 1 - print(String(format: "Episode: %4d | Epsilon: %.03f | Return: %3d", episodeIndex, epsilon, Int(episodeReturn))) + print(String(format: "Episode: %4d | Step %6d | Epsilon: %.03f | Return: %3d", episodeIndex, stepIndex, epsilon, Int(episodeReturn))) + if episodeReturn > bestReturn { + // print(String(format: "Episode: %4d | Step %6d | Epsilon: %.03f | Return: %3d", episodeIndex, stepIndex, epsilon, Int(episodeReturn))) + // print("New best return of \(episodeReturn)") + bestReturn = episodeReturn + } if episodeReturn > 199 { print("Solved in \(episodeIndex) episodes with \(stepIndex) steps!") break @@ -296,7 +310,7 @@ while episodeIndex < maxEpisode { } // Save smoothed learning curve -let runningMeanWindow: Int = 2 +let runningMeanWindow: Int = 1 let smoothedEpisodeReturns = np.convolve(episodeReturns, np.ones((runningMeanWindow)) / np.array(runningMeanWindow, dtype: np.int32), mode: "same") plt.plot(smoothedEpisodeReturns) From d2be5bd5dd3f66199a3cb048b6a931c1e8d86488 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Mon, 3 Aug 2020 13:51:34 +0000 Subject: [PATCH 23/34] Run evaluation episode for every training episode --- Gym/DQN/main.swift | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 988a13f245b..7bf0e0042b6 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -230,6 +230,21 @@ class TensorFlowEnvironmentWrapper { } } +func eval(agent: Agent) -> Float { + let evalEnv = TensorFlowEnvironmentWrapper(gym.make("CartPole-v0")) + var evalEpisodeReturn: Float = 0 + var state: Tensor = evalEnv.reset() + var reward: Tensor + var evalIsDone: Tensor = Tensor(false) + while evalIsDone.scalarized() == false { + let action = agent.getAction(state: state, epsilon: 0) + (state, reward, evalIsDone, _) = evalEnv.step(action) + evalEpisodeReturn += reward.scalarized() + } + + return evalEpisodeReturn +} + // Hyperparameters let discount: Float = 0.99 let learningRate: Float = 0.001 @@ -289,19 +304,20 @@ while episodeIndex < maxEpisode { // End-of-episode if isDone.scalarized() == true { + let evalEpisodeReturn = eval(agent: agent) state = env.reset() episodeIndex += 1 - print(String(format: "Episode: %4d | Step %6d | Epsilon: %.03f | Return: %3d", episodeIndex, stepIndex, epsilon, Int(episodeReturn))) - if episodeReturn > bestReturn { - // print(String(format: "Episode: %4d | Step %6d | Epsilon: %.03f | Return: %3d", episodeIndex, stepIndex, epsilon, Int(episodeReturn))) + // print(String(format: "Episode: %4d | Step %6d | Epsilon: %.03f | Return: %3d", episodeIndex, stepIndex, epsilon, Int(episodeReturn))) + if evalEpisodeReturn > bestReturn { + print(String(format: "Episode: %4d | Step %6d | Epsilon: %.03f | Return: %3d 
| Eval : %3d", episodeIndex, stepIndex, epsilon, Int(episodeReturn), Int(evalEpisodeReturn))) // print("New best return of \(episodeReturn)") - bestReturn = episodeReturn + bestReturn = evalEpisodeReturn } - if episodeReturn > 199 { + if evalEpisodeReturn > 199 { print("Solved in \(episodeIndex) episodes with \(stepIndex) steps!") break } - episodeReturns.append(episodeReturn) + episodeReturns.append(evalEpisodeReturn) episodeReturn = 0 } From 6a118ab0ddc3e8be51df01a42e340586f5ed2a06 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Tue, 4 Aug 2020 01:55:23 +0000 Subject: [PATCH 24/34] Implement combined experience replay --- Gym/DQN/main.swift | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 7bf0e0042b6..b7c0dcca9b3 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -111,7 +111,11 @@ class ReplayBuffer { nextStateBatch: Tensor, isDoneBatch: Tensor ) { - let randomIndices = Tensor(numpy: np.random.randint(count, size: batchSize, dtype: np.int32))! + // Vanilla + // let randomIndices = Tensor(numpy: np.random.randint(count, size: batchSize, dtype: np.int32))! + // Combined Experience Replay + let sampledIndices = np.random.randint(count, size: batchSize-1, dtype: np.int32) + let randomIndices = Tensor(numpy: np.append(sampledIndices, np.array([(index + capacity - 1) % capacity], dtype: np.int32)))! let stateBatch = states.gathering(atIndices: randomIndices, alongAxis: 0) let actionBatch = actions.gathering(atIndices: randomIndices, alongAxis: 0) From ce539e564f15ec670969e560cad863340b5767a2 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Tue, 4 Aug 2020 01:55:40 +0000 Subject: [PATCH 25/34] Implement double DQN --- Gym/DQN/main.swift | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index b7c0dcca9b3..dbf9eeb59d9 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -189,7 +189,13 @@ class Agent { let predictionBatch = _Raw.gatherNd(params: stateQValueBatch, indices: tfFullIndices) // Compute target batch - let nextStateQValueBatch = self.targetQNet(tfNextStateBatch).max(squeezingAxes: Tensor(1)) + // DQN + // let nextStateQValueBatch = self.targetQNet(tfNextStateBatch).max(squeezingAxes: 1) + // DDQN + let npNextStateActionBatch = self.qNet(tfNextStateBatch).argmax(squeezingAxis: 1).makeNumpyArray() + let npNextStateFullIndices = np.stack([np.arange(batchSize, dtype: np.int32), npNextStateActionBatch], axis: 1) + let tfNextStateFullIndices = Tensor(numpy: npNextStateFullIndices)! 
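+        // Double DQN: the online qNet selects the greedy next action (the argmax
+        // above), while targetQNet evaluates that action in the gather below.
+        // Decoupling selection from evaluation mitigates Q-value overestimation.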
+ let nextStateQValueBatch = _Raw.gatherNd(params: self.targetQNet(tfNextStateBatch), indices: tfNextStateFullIndices) let targetBatch: Tensor = tfRewardBatch + self.discount * (1 - Tensor(tfIsDoneBatch)) * nextStateQValueBatch return huberLoss( From cf7b96a4753409edf5c1f7b8cb5db57f37db7830 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Tue, 4 Aug 2020 02:07:34 +0000 Subject: [PATCH 26/34] Add options to toggle CER and DDQN --- Gym/DQN/main.swift | 70 +++++++++++++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 22 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index dbf9eeb59d9..504630e3ddc 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -64,6 +64,7 @@ let plt = Python.import("matplotlib.pyplot") class ReplayBuffer { let capacity: Int + let combined: Bool let device: Device var states: Tensor @@ -74,8 +75,9 @@ class ReplayBuffer { var count: Int = 0 var index: Int = 0 - init(capacity: Int, device: Device) { + init(capacity: Int, combined: Bool, device: Device) { self.capacity = capacity + self.combined = combined self.device = device states = Tensor(zeros: [capacity, 4], on: device) @@ -111,17 +113,23 @@ class ReplayBuffer { nextStateBatch: Tensor, isDoneBatch: Tensor ) { - // Vanilla - // let randomIndices = Tensor(numpy: np.random.randint(count, size: batchSize, dtype: np.int32))! - // Combined Experience Replay - let sampledIndices = np.random.randint(count, size: batchSize-1, dtype: np.int32) - let randomIndices = Tensor(numpy: np.append(sampledIndices, np.array([(index + capacity - 1) % capacity], dtype: np.int32)))! - - let stateBatch = states.gathering(atIndices: randomIndices, alongAxis: 0) - let actionBatch = actions.gathering(atIndices: randomIndices, alongAxis: 0) - let rewardBatch = rewards.gathering(atIndices: randomIndices, alongAxis: 0) - let nextStateBatch = nextStates.gathering(atIndices: randomIndices, alongAxis: 0) - let isDoneBatch = isDones.gathering(atIndices: randomIndices, alongAxis: 0) + let indices: Tensor + if self.combined == true { + // Combined Experience Replay + let sampledIndices = np.random.randint(count, size: batchSize - 1, dtype: np.int32) + let lastIndex = np.array([(index + capacity - 1) % capacity], dtype: np.int32) + indices = Tensor(numpy: np.append(sampledIndices, lastIndex))! + } + else { + // Vanilla Experience Replay + indices = Tensor(numpy: np.random.randint(count, size: batchSize, dtype: np.int32))! 
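+      // Indices are drawn uniformly, with replacement, from the `count` filled slots.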
+ } + + let stateBatch = states.gathering(atIndices: indices, alongAxis: 0) + let actionBatch = actions.gathering(atIndices: indices, alongAxis: 0) + let rewardBatch = rewards.gathering(atIndices: indices, alongAxis: 0) + let nextStateBatch = nextStates.gathering(atIndices: indices, alongAxis: 0) + let isDoneBatch = isDones.gathering(atIndices: indices, alongAxis: 0) return (stateBatch, actionBatch, rewardBatch, nextStateBatch, isDoneBatch) } @@ -151,15 +159,26 @@ class Agent { let replayBuffer: ReplayBuffer let discount: Float let minBufferSize: Int + let doubleDQN: Bool let device: Device - init(qNet: Net, targetQNet: Net, optimizer: AMSGrad, replayBuffer: ReplayBuffer, discount: Float, minBufferSize: Int, device: Device) { + init( + qNet: Net, + targetQNet: Net, + optimizer: AMSGrad, + replayBuffer: ReplayBuffer, + discount: Float, + minBufferSize: Int, + doubleDQN: Bool, + device: Device + ) { self.qNet = qNet self.targetQNet = targetQNet self.optimizer = optimizer self.replayBuffer = replayBuffer self.discount = discount self.minBufferSize = minBufferSize + self.doubleDQN = doubleDQN self.device = device } @@ -189,13 +208,18 @@ class Agent { let predictionBatch = _Raw.gatherNd(params: stateQValueBatch, indices: tfFullIndices) // Compute target batch - // DQN - // let nextStateQValueBatch = self.targetQNet(tfNextStateBatch).max(squeezingAxes: 1) - // DDQN - let npNextStateActionBatch = self.qNet(tfNextStateBatch).argmax(squeezingAxis: 1).makeNumpyArray() - let npNextStateFullIndices = np.stack([np.arange(batchSize, dtype: np.int32), npNextStateActionBatch], axis: 1) - let tfNextStateFullIndices = Tensor(numpy: npNextStateFullIndices)! - let nextStateQValueBatch = _Raw.gatherNd(params: self.targetQNet(tfNextStateBatch), indices: tfNextStateFullIndices) + let nextStateQValueBatch: Tensor + if self.doubleDQN == true { + // Double DQN + let npNextStateActionBatch = self.qNet(tfNextStateBatch).argmax(squeezingAxis: 1).makeNumpyArray() + let npNextStateFullIndices = np.stack([np.arange(batchSize, dtype: np.int32), npNextStateActionBatch], axis: 1) + let tfNextStateFullIndices = Tensor(numpy: npNextStateFullIndices)! 
+ nextStateQValueBatch = _Raw.gatherNd(params: self.targetQNet(tfNextStateBatch), indices: tfNextStateFullIndices) + } + else { + // DQN + nextStateQValueBatch = self.targetQNet(tfNextStateBatch).max(squeezingAxes: 1) + } let targetBatch: Tensor = tfRewardBatch + self.discount * (1 - Tensor(tfIsDoneBatch)) * nextStateQValueBatch return huberLoss( @@ -262,6 +286,8 @@ let hiddenSize: Int = 100 let startEpsilon: Float = 0.5 // TODO(seungjaeryanlee): Ignored right now let maxEpisode: Int = 1000 let replayBufferCapacity: Int = 1000 +let useCombinedExperienceReplay: Bool = true +let useDoubleDQN: Bool = true let minBufferSize: Int = 32 let batchSize: Int = 32 let targetNetUpdateRate: Int = 5 @@ -276,8 +302,8 @@ var qNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) var targetQNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) updateTargetQNet(source: qNet, target: &targetQNet, softTargetUpdateRate: 1) let optimizer = AMSGrad(for: qNet, learningRate: learningRate) -var replayBuffer: ReplayBuffer = ReplayBuffer(capacity: replayBufferCapacity, device: device) -var agent = Agent(qNet: qNet, targetQNet: targetQNet, optimizer: optimizer, replayBuffer: replayBuffer, discount: discount, minBufferSize: minBufferSize, device: device) +var replayBuffer: ReplayBuffer = ReplayBuffer(capacity: replayBufferCapacity, combined: useCombinedExperienceReplay, device: device) +var agent = Agent(qNet: qNet, targetQNet: targetQNet, optimizer: optimizer, replayBuffer: replayBuffer, discount: discount, minBufferSize: minBufferSize, doubleDQN: useDoubleDQN, device: device) // RL Loop var stepIndex = 0 From 98b4647107a377c15eb5d958cefdb7683a17fb45 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Tue, 4 Aug 2020 02:37:07 +0000 Subject: [PATCH 27/34] Refactor code --- Gym/DQN/main.swift | 72 +++++++++++++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 27 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 504630e3ddc..6a4d5ed8589 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -236,12 +236,7 @@ class Agent { } } -func updateTargetQNet(source: Net, target: inout Net, softTargetUpdateRate: Float) { - target.l1.weight = softTargetUpdateRate * Tensor(source.l1.weight) + (1 - softTargetUpdateRate) * target.l1.weight - target.l1.bias = softTargetUpdateRate * Tensor(source.l1.bias) + (1 - softTargetUpdateRate) * target.l1.bias - target.l2.weight = softTargetUpdateRate * Tensor(source.l2.weight) + (1 - softTargetUpdateRate) * target.l2.weight - target.l2.bias = softTargetUpdateRate * Tensor(source.l2.bias) + (1 - softTargetUpdateRate) * target.l2.bias -} + class TensorFlowEnvironmentWrapper { let originalEnv: PythonObject @@ -280,18 +275,27 @@ func eval(agent: Agent) -> Float { } // Hyperparameters -let discount: Float = 0.99 -let learningRate: Float = 0.001 +// - Network Hyperparameters let hiddenSize: Int = 100 -let startEpsilon: Float = 0.5 // TODO(seungjaeryanlee): Ignored right now +// - Agent-Env Interaction Hyperparameters let maxEpisode: Int = 1000 -let replayBufferCapacity: Int = 1000 -let useCombinedExperienceReplay: Bool = true +let epsilonStart: Float = 0.1 +let epsilonEnd: Float = 0.1 +let epsilonDecay: Float = 10000 +// - Update Hyperparameters +let learningRate: Float = 0.001 +let discount: Float = 0.99 let useDoubleDQN: Bool = true +// - Replay Buffer Hyperparameters +let replayBufferCapacity: Int = 1000 let minBufferSize: Int = 32 let batchSize: Int = 32 +let useCombinedExperienceReplay: Bool = true +// - Target Network 
Hyperparameters let targetNetUpdateRate: Int = 5 let softTargetUpdateRate: Float = 0.05 + +// Setup device let device: Device = Device.default // Initialize environment @@ -302,8 +306,21 @@ var qNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) var targetQNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) updateTargetQNet(source: qNet, target: &targetQNet, softTargetUpdateRate: 1) let optimizer = AMSGrad(for: qNet, learningRate: learningRate) -var replayBuffer: ReplayBuffer = ReplayBuffer(capacity: replayBufferCapacity, combined: useCombinedExperienceReplay, device: device) -var agent = Agent(qNet: qNet, targetQNet: targetQNet, optimizer: optimizer, replayBuffer: replayBuffer, discount: discount, minBufferSize: minBufferSize, doubleDQN: useDoubleDQN, device: device) +var replayBuffer = ReplayBuffer( + capacity: replayBufferCapacity, + combined: useCombinedExperienceReplay, + device: device +) +var agent = Agent( + qNet: qNet, + targetQNet: targetQNet, + optimizer: optimizer, + replayBuffer: replayBuffer, + discount: discount, + minBufferSize: minBufferSize, + doubleDQN: useDoubleDQN, + device: device +) // RL Loop var stepIndex = 0 @@ -317,12 +334,7 @@ while episodeIndex < maxEpisode { stepIndex += 1 // Interact with environment - // let epsilon = startEpsilon * Float(maxEpisode - episodeIndex) / Float(maxEpisode) - let epsilon: Float = 0.1 - // let epsilon_start: Float = 0.9 - // let epsilon_end: Float = 0.05 - // let epsilon_decay: Int = 200 - // let epsilon: Float = epsilon_end + (epsilon_start - epsilon_end) * Float(np.exp(-1 * stepIndex / epsilon_decay, dtype: np.float32))! + let epsilon: Float = epsilonEnd + (epsilonStart - epsilonEnd) * Float(np.exp(-1.0 * Float(stepIndex) / epsilonDecay))! let action = agent.getAction(state: state, epsilon: epsilon) let (nextState, reward, isDone, _) = env.step(action) episodeReturn += reward.scalarized() @@ -340,20 +352,18 @@ while episodeIndex < maxEpisode { // End-of-episode if isDone.scalarized() == true { - let evalEpisodeReturn = eval(agent: agent) state = env.reset() episodeIndex += 1 - // print(String(format: "Episode: %4d | Step %6d | Epsilon: %.03f | Return: %3d", episodeIndex, stepIndex, epsilon, Int(episodeReturn))) + let evalEpisodeReturn = eval(agent: agent) + episodeReturns.append(evalEpisodeReturn) if evalEpisodeReturn > bestReturn { - print(String(format: "Episode: %4d | Step %6d | Epsilon: %.03f | Return: %3d | Eval : %3d", episodeIndex, stepIndex, epsilon, Int(episodeReturn), Int(evalEpisodeReturn))) - // print("New best return of \(episodeReturn)") + print(String(format: "Episode: %4d | Step %6d | Epsilon: %.03f | Train: %3d | Eval: %3d", episodeIndex, stepIndex, epsilon, Int(episodeReturn), Int(evalEpisodeReturn))) bestReturn = evalEpisodeReturn } if evalEpisodeReturn > 199 { print("Solved in \(episodeIndex) episodes with \(stepIndex) steps!") break } - episodeReturns.append(evalEpisodeReturn) episodeReturn = 0 } @@ -361,18 +371,26 @@ while episodeIndex < maxEpisode { state = nextState } +// Save learning curve +plt.plot(episodeReturns) +plt.title("Deep Q-Network on CartPole-v0") +plt.xlabel("Episode") +plt.ylabel("Episode Return") +plt.savefig("dqnEpisodeReturns.png") +plt.clf() + // Save smoothed learning curve -let runningMeanWindow: Int = 1 +let runningMeanWindow: Int = 10 let smoothedEpisodeReturns = np.convolve(episodeReturns, np.ones((runningMeanWindow)) / np.array(runningMeanWindow, dtype: np.int32), mode: "same") -plt.plot(smoothedEpisodeReturns) +plt.plot(episodeReturns) 
plt.title("Deep Q-Network on CartPole-v0") plt.xlabel("Episode") plt.ylabel("Smoothed Episode Return") plt.savefig("dqnSmoothedEpisodeReturns.png") plt.clf() -// Save TD loss curve +// // Save TD loss curve plt.plot(losses) plt.title("Deep Q-Network on CartPole-v0") plt.xlabel("Step") From e00901adbf7633e807cfeda16a054578e5e46eea Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Tue, 4 Aug 2020 02:40:47 +0000 Subject: [PATCH 28/34] Add updateTargetQNet to Agent class --- Gym/DQN/main.swift | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 6a4d5ed8589..2786fadc578 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -180,6 +180,9 @@ class Agent { self.minBufferSize = minBufferSize self.doubleDQN = doubleDQN self.device = device + + // Copy Q-network to Target Q-network before training + updateTargetQNet(tau: 1) } func getAction(state: Tensor, epsilon: Float) -> Tensor { @@ -234,9 +237,14 @@ class Agent { } return 0 } -} - + func updateTargetQNet(tau: Float) { + self.targetQNet.l1.weight = tau * Tensor(self.qNet.l1.weight) + (1 - tau) * self.targetQNet.l1.weight + self.targetQNet.l1.bias = tau * Tensor(self.qNet.l1.bias) + (1 - tau) * self.targetQNet.l1.bias + self.targetQNet.l2.weight = tau * Tensor(self.qNet.l2.weight) + (1 - tau) * self.targetQNet.l2.weight + self.targetQNet.l2.bias = tau * Tensor(self.qNet.l2.bias) + (1 - tau) * self.targetQNet.l2.bias + } +} class TensorFlowEnvironmentWrapper { let originalEnv: PythonObject @@ -304,7 +312,6 @@ let env = TensorFlowEnvironmentWrapper(gym.make("CartPole-v0")) // Initialize agent var qNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) var targetQNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) -updateTargetQNet(source: qNet, target: &targetQNet, softTargetUpdateRate: 1) let optimizer = AMSGrad(for: qNet, learningRate: learningRate) var replayBuffer = ReplayBuffer( capacity: replayBufferCapacity, @@ -347,7 +354,7 @@ while episodeIndex < maxEpisode { // Periodically update Target Net if stepIndex % targetNetUpdateRate == 0 { - updateTargetQNet(source: qNet, target: &targetQNet, softTargetUpdateRate: softTargetUpdateRate) + agent.updateTargetQNet(tau: softTargetUpdateRate) } // End-of-episode From bca2614ad5e4c4e4dc875074de390d69fffedfa4 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Tue, 4 Aug 2020 03:00:33 +0000 Subject: [PATCH 29/34] Use TF-Agents hyperparameters --- Gym/DQN/main.swift | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 2786fadc578..cdba07a1993 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -155,7 +155,7 @@ struct Net: Layer { class Agent { var qNet: Net var targetQNet: Net - let optimizer: AMSGrad + let optimizer: Adam let replayBuffer: ReplayBuffer let discount: Float let minBufferSize: Int @@ -165,7 +165,7 @@ class Agent { init( qNet: Net, targetQNet: Net, - optimizer: AMSGrad, + optimizer: Adam, replayBuffer: ReplayBuffer, discount: Float, minBufferSize: Int, @@ -295,9 +295,9 @@ let learningRate: Float = 0.001 let discount: Float = 0.99 let useDoubleDQN: Bool = true // - Replay Buffer Hyperparameters -let replayBufferCapacity: Int = 1000 -let minBufferSize: Int = 32 -let batchSize: Int = 32 +let replayBufferCapacity: Int = 100000 +let minBufferSize: Int = 64 +let batchSize: Int = 64 let useCombinedExperienceReplay: Bool = true // - Target Network Hyperparameters let targetNetUpdateRate: Int = 5 @@ 
-312,7 +312,7 @@ let env = TensorFlowEnvironmentWrapper(gym.make("CartPole-v0")) // Initialize agent var qNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) var targetQNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) -let optimizer = AMSGrad(for: qNet, learningRate: learningRate) +let optimizer = Adam(for: qNet, learningRate: learningRate) var replayBuffer = ReplayBuffer( capacity: replayBufferCapacity, combined: useCombinedExperienceReplay, From 45b880ef4d943447bf955b8a5c6d1fa131e9765e Mon Sep 17 00:00:00 2001 From: Brad Larson Date: Wed, 5 Aug 2020 16:15:16 -0500 Subject: [PATCH 30/34] Changed ReplayBuffer to play better with GPU eager mode, restructured to four files, added Tensor extension, formatted via swift-format. --- Gym/DQN/Agent.swift | 145 ++++++++++++++ Gym/DQN/Gathering.swift | 45 +++++ Gym/DQN/ReplayBuffer.swift | 81 ++++++++ Gym/DQN/main.swift | 400 +++++++++---------------------------- 4 files changed, 363 insertions(+), 308 deletions(-) create mode 100644 Gym/DQN/Agent.swift create mode 100644 Gym/DQN/Gathering.swift create mode 100644 Gym/DQN/ReplayBuffer.swift diff --git a/Gym/DQN/Agent.swift b/Gym/DQN/Agent.swift new file mode 100644 index 00000000000..4a534062257 --- /dev/null +++ b/Gym/DQN/Agent.swift @@ -0,0 +1,145 @@ +// Copyright 2020 The TensorFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import TensorFlow + +// Force unwrapping with `!` does not provide source location when unwrapping `nil`, so we instead +// make a utility function for debuggability. 
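+// For example, `Float(np.random.uniform()).unwrapped()` reports the caller's
+// file and line on failure rather than a location inside this helper.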
+extension Optional { + fileprivate func unwrapped(file: StaticString = #filePath, line: UInt = #line) -> Wrapped { + guard let unwrapped = self else { + fatalError("Value is nil", file: (file), line: line) + } + return unwrapped + } +} + +struct Net: Layer { + typealias Input = Tensor + typealias Output = Tensor + + var l1, l2: Dense + + init(observationSize: Int, hiddenSize: Int, actionCount: Int) { + l1 = Dense(inputSize: observationSize, outputSize: hiddenSize, activation: relu) + l2 = Dense(inputSize: hiddenSize, outputSize: actionCount, activation: identity) + } + + @differentiable + func callAsFunction(_ input: Input) -> Output { + return input.sequenced(through: l1, l2) + } +} + +class Agent { + var qNet: Net + var targetQNet: Net + let optimizer: Adam + let replayBuffer: ReplayBuffer + let discount: Float + let minBufferSize: Int + let doubleDQN: Bool + let device: Device + + init( + qNet: Net, + targetQNet: Net, + optimizer: Adam, + replayBuffer: ReplayBuffer, + discount: Float, + minBufferSize: Int, + doubleDQN: Bool, + device: Device + ) { + self.qNet = qNet + self.targetQNet = targetQNet + self.optimizer = optimizer + self.replayBuffer = replayBuffer + self.discount = discount + self.minBufferSize = minBufferSize + self.doubleDQN = doubleDQN + self.device = device + + // Copy Q-network to Target Q-network before training + updateTargetQNet(tau: 1) + } + + func getAction(state: Tensor, epsilon: Float) -> Tensor { + if Float(np.random.uniform()).unwrapped() < epsilon { + return Tensor(numpy: np.array(np.random.randint(0, 2), dtype: np.int32))! + } else { + // Neural network input needs to be 2D + let tfState = Tensor(numpy: np.expand_dims(state.makeNumpyArray(), axis: 0))! + let qValues = qNet(tfState)[0] + return Tensor(qValues[1].scalarized() > qValues[0].scalarized() ? 1 : 0, on: device) + } + } + + func train(batchSize: Int) -> Float { + // Don't train if replay buffer is too small + if replayBuffer.count >= minBufferSize { + let (tfStateBatch, tfActionBatch, tfRewardBatch, tfNextStateBatch, tfIsDoneBatch) = + replayBuffer.sample(batchSize: batchSize) + + let (loss, gradients) = valueWithGradient(at: qNet) { qNet -> Tensor in + // Compute prediction batch + let npActionBatch = tfActionBatch.makeNumpyArray() + let npFullIndices = np.stack( + [np.arange(batchSize, dtype: np.int32), npActionBatch], axis: 1) + let tfFullIndices = Tensor(numpy: npFullIndices)! + let stateQValueBatch = qNet(tfStateBatch) + let predictionBatch = stateQValueBatch.dimensionGathering(atIndices: tfFullIndices) + + // Compute target batch + let nextStateQValueBatch: Tensor + if self.doubleDQN == true { + // Double DQN + let npNextStateActionBatch = self.qNet(tfNextStateBatch).argmax(squeezingAxis: 1) + .makeNumpyArray() + let npNextStateFullIndices = np.stack( + [np.arange(batchSize, dtype: np.int32), npNextStateActionBatch], axis: 1) + let tfNextStateFullIndices = Tensor(numpy: npNextStateFullIndices)! 
+ nextStateQValueBatch = self.targetQNet(tfNextStateBatch).dimensionGathering( + atIndices: tfNextStateFullIndices) + } else { + // DQN + nextStateQValueBatch = self.targetQNet(tfNextStateBatch).max(squeezingAxes: 1) + } + let targetBatch: Tensor = + tfRewardBatch + self.discount * (1 - Tensor(tfIsDoneBatch)) * nextStateQValueBatch + + return huberLoss( + predicted: predictionBatch, + expected: targetBatch, + delta: 1 + ) + } + optimizer.update(&qNet, along: gradients) + + return loss.scalarized() + } + return 0 + } + + func updateTargetQNet(tau: Float) { + self.targetQNet.l1.weight = + tau * Tensor(self.qNet.l1.weight) + (1 - tau) * self.targetQNet.l1.weight + self.targetQNet.l1.bias = + tau * Tensor(self.qNet.l1.bias) + (1 - tau) * self.targetQNet.l1.bias + self.targetQNet.l2.weight = + tau * Tensor(self.qNet.l2.weight) + (1 - tau) * self.targetQNet.l2.weight + self.targetQNet.l2.bias = + tau * Tensor(self.qNet.l2.bias) + (1 - tau) * self.targetQNet.l2.bias + } +} diff --git a/Gym/DQN/Gathering.swift b/Gym/DQN/Gathering.swift new file mode 100644 index 00000000000..40392c5a4c6 --- /dev/null +++ b/Gym/DQN/Gathering.swift @@ -0,0 +1,45 @@ +// Copyright 2020 The TensorFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import TensorFlow + +extension Tensor where Scalar: TensorFlowFloatingPoint { + @inlinable + @differentiable(wrt: self) + public func dimensionGathering( + atIndices indices: Tensor + ) -> Tensor { + return _Raw.gatherNd(params: self, indices: indices) + } + + /// Derivative of `_Raw.gatherNd`. + /// + /// Ported from TensorFlow Python reference implementation: + /// https://github.com/tensorflow/tensorflow/blob/r2.2/tensorflow/python/ops/array_grad.py#L691-L701 + @inlinable + @derivative(of: dimensionGathering) + func _vjpDimensionGathering( + atIndices indices: Tensor + ) -> (value: Tensor, pullback: (Tensor) -> Tensor) { + let shapeTensor = Tensor(self.shapeTensor) + let value = _Raw.gatherNd(params: self, indices: indices) + return ( + value, + { v in + let dparams = _Raw.scatterNd(indices: indices, updates: v, shape: shapeTensor) + return dparams + } + ) + } +} diff --git a/Gym/DQN/ReplayBuffer.swift b/Gym/DQN/ReplayBuffer.swift new file mode 100644 index 00000000000..01b64ed4ff3 --- /dev/null +++ b/Gym/DQN/ReplayBuffer.swift @@ -0,0 +1,81 @@ +// Copyright 2020 The TensorFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
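+
+// Array-backed replay buffer: when full, the oldest transition is dropped via
+// removeFirst(), and sample(batchSize:) stacks the stored tensors into batches.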
+ +import TensorFlow + +struct ReplayBuffer { + let capacity: Int + let combined: Bool + + @noDerivative var states: [Tensor] = [] + @noDerivative var actions: [Tensor] = [] + @noDerivative var rewards: [Tensor] = [] + @noDerivative var nextStates: [Tensor] = [] + @noDerivative var isDones: [Tensor] = [] + var count: Int { return states.count } + + init(capacity: Int, combined: Bool) { + self.capacity = capacity + self.combined = combined + } + + mutating func append( + state: Tensor, + action: Tensor, + reward: Tensor, + nextState: Tensor, + isDone: Tensor + ) { + if count >= capacity { + // Erase oldest SARS if the replay buffer is full + states.removeFirst() + actions.removeFirst() + rewards.removeFirst() + nextStates.removeFirst() + isDones.removeFirst() + } + states.append(state) + actions.append(action) + rewards.append(reward) + nextStates.append(nextState) + isDones.append(isDone) + } + + func sample(batchSize: Int) -> ( + stateBatch: Tensor, + actionBatch: Tensor, + rewardBatch: Tensor, + nextStateBatch: Tensor, + isDoneBatch: Tensor + ) { + let indices: Tensor + if self.combined == true { + // Combined Experience Replay + let sampledIndices = (0..(shape: [batchSize], scalars: sampledIndices + [Int32(count) - 1]) + } else { + // Vanilla Experience Replay + let sampledIndices = (0..(shape: [batchSize], scalars: sampledIndices) + } + + let stateBatch = Tensor(stacking: states).gathering(atIndices: indices, alongAxis: 0) + let actionBatch = Tensor(stacking: actions).gathering(atIndices: indices, alongAxis: 0) + let rewardBatch = Tensor(stacking: rewards).gathering(atIndices: indices, alongAxis: 0) + let nextStateBatch = Tensor(stacking: nextStates).gathering(atIndices: indices, alongAxis: 0) + let isDoneBatch = Tensor(stacking: isDones).gathering(atIndices: indices, alongAxis: 0) + + return (stateBatch, actionBatch, rewardBatch, nextStateBatch, isDoneBatch) + } +} diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index cdba07a1993..4006ff432e6 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -12,274 +12,52 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if canImport(PythonKit) - import PythonKit -#else - import Python -#endif +import Foundation +import PythonKit import TensorFlow -// Force unwrapping with `!` does not provide source location when unwrapping `nil`, so we instead -// make a utility function for debuggability. -fileprivate extension Optional { - func unwrapped(file: StaticString = #filePath, line: UInt = #line) -> Wrapped { - guard let unwrapped = self else { - fatalError("Value is nil", file: (file), line: line) - } - return unwrapped - } -} - -extension _Raw { - /// Derivative of `_Raw.gatherNd`. - /// - /// Ported from TensorFlow Python reference implementation: - /// https://github.com/tensorflow/tensorflow/blob/r2.2/tensorflow/python/ops/array_grad.py#L691-L701 - @inlinable - @derivative(of: gatherNd) - public static func vjpGatherNd< - Scalar: TensorFlowFloatingPoint, - Index: TensorFlowIndex - >( - params: Tensor, - indices: Tensor - ) -> ( - value: Tensor, - pullback: (Tensor) -> Tensor - ) { - let shapeTensor = Tensor(params.shapeTensor) - let value = gatherNd(params: params, indices: indices) - return (value, { v in - let dparams = scatterNd(indices: indices, updates: v, shape: shapeTensor) - return dparams - }) - } -} - // Initialize Python. This comment is a hook for internal use, do not remove. 
let np = Python.import("numpy") let gym = Python.import("gym") let plt = Python.import("matplotlib.pyplot") -class ReplayBuffer { - let capacity: Int - let combined: Bool - let device: Device - - var states: Tensor - var actions: Tensor - var rewards: Tensor - var nextStates: Tensor - var isDones: Tensor - var count: Int = 0 - var index: Int = 0 - - init(capacity: Int, combined: Bool, device: Device) { - self.capacity = capacity - self.combined = combined - self.device = device - - states = Tensor(zeros: [capacity, 4], on: device) - actions = Tensor(zeros: [capacity], on: device) - rewards = Tensor(zeros: [capacity], on: device) - nextStates = Tensor(zeros: [capacity, 4], on: device) - isDones = Tensor(repeating: false, shape: [capacity], on: device) - } - - func append( - state: Tensor, - action: Tensor, - reward: Tensor, - nextState: Tensor, - isDone: Tensor - ) { - if count < capacity { - count += 1 - } - // Erase oldest SARS if the replay buffer is full - states[index] = state - actions[index] = action - rewards[index] = reward - nextStates[index] = nextState - isDones[index] = isDone - index = (index + 1) % capacity - } - - func sample(batchSize: Int) -> ( - stateBatch: Tensor, - actionBatch: Tensor, - rewardBatch: Tensor, - nextStateBatch: Tensor, - isDoneBatch: Tensor - ) { - let indices: Tensor - if self.combined == true { - // Combined Experience Replay - let sampledIndices = np.random.randint(count, size: batchSize - 1, dtype: np.int32) - let lastIndex = np.array([(index + capacity - 1) % capacity], dtype: np.int32) - indices = Tensor(numpy: np.append(sampledIndices, lastIndex))! - } - else { - // Vanilla Experience Replay - indices = Tensor(numpy: np.random.randint(count, size: batchSize, dtype: np.int32))! - } - - let stateBatch = states.gathering(atIndices: indices, alongAxis: 0) - let actionBatch = actions.gathering(atIndices: indices, alongAxis: 0) - let rewardBatch = rewards.gathering(atIndices: indices, alongAxis: 0) - let nextStateBatch = nextStates.gathering(atIndices: indices, alongAxis: 0) - let isDoneBatch = isDones.gathering(atIndices: indices, alongAxis: 0) - - return (stateBatch, actionBatch, rewardBatch, nextStateBatch, isDoneBatch) - } -} - -struct Net: Layer { - typealias Input = Tensor - typealias Output = Tensor - - var l1, l2: Dense - - init(observationSize: Int, hiddenSize: Int, actionCount: Int) { - l1 = Dense(inputSize: observationSize, outputSize: hiddenSize, activation: relu) - l2 = Dense(inputSize: hiddenSize, outputSize: actionCount, activation: identity) - } - - @differentiable - func callAsFunction(_ input: Input) -> Output { - return input.sequenced(through: l1, l2) - } -} - -class Agent { - var qNet: Net - var targetQNet: Net - let optimizer: Adam - let replayBuffer: ReplayBuffer - let discount: Float - let minBufferSize: Int - let doubleDQN: Bool - let device: Device - - init( - qNet: Net, - targetQNet: Net, - optimizer: Adam, - replayBuffer: ReplayBuffer, - discount: Float, - minBufferSize: Int, - doubleDQN: Bool, - device: Device - ) { - self.qNet = qNet - self.targetQNet = targetQNet - self.optimizer = optimizer - self.replayBuffer = replayBuffer - self.discount = discount - self.minBufferSize = minBufferSize - self.doubleDQN = doubleDQN - self.device = device - - // Copy Q-network to Target Q-network before training - updateTargetQNet(tau: 1) - } - - func getAction(state: Tensor, epsilon: Float) -> Tensor { - if Float(np.random.uniform()).unwrapped() < epsilon { - return Tensor(numpy: np.array(np.random.randint(0, 2), dtype: np.int32))! 
- } - else { - // Neural network input needs to be 2D - let tfState = Tensor(numpy: np.expand_dims(state.makeNumpyArray(), axis: 0))! - let qValues = qNet(tfState)[0] - return Tensor(qValues[1].scalarized() > qValues[0].scalarized() ? 1 : 0, on: device) - } - } - - func train(batchSize: Int) -> Float { - // Don't train if replay buffer is too small - if replayBuffer.count >= minBufferSize { - let (tfStateBatch, tfActionBatch, tfRewardBatch, tfNextStateBatch, tfIsDoneBatch) = replayBuffer.sample(batchSize: batchSize) - - let (loss, gradients) = valueWithGradient(at: qNet) { qNet -> Tensor in - // Compute prediction batch - let npActionBatch = tfActionBatch.makeNumpyArray() - let npFullIndices = np.stack([np.arange(batchSize, dtype: np.int32), npActionBatch], axis: 1) - let tfFullIndices = Tensor(numpy: npFullIndices)! - let stateQValueBatch = qNet(tfStateBatch) - let predictionBatch = _Raw.gatherNd(params: stateQValueBatch, indices: tfFullIndices) - - // Compute target batch - let nextStateQValueBatch: Tensor - if self.doubleDQN == true { - // Double DQN - let npNextStateActionBatch = self.qNet(tfNextStateBatch).argmax(squeezingAxis: 1).makeNumpyArray() - let npNextStateFullIndices = np.stack([np.arange(batchSize, dtype: np.int32), npNextStateActionBatch], axis: 1) - let tfNextStateFullIndices = Tensor(numpy: npNextStateFullIndices)! - nextStateQValueBatch = _Raw.gatherNd(params: self.targetQNet(tfNextStateBatch), indices: tfNextStateFullIndices) - } - else { - // DQN - nextStateQValueBatch = self.targetQNet(tfNextStateBatch).max(squeezingAxes: 1) - } - let targetBatch: Tensor = tfRewardBatch + self.discount * (1 - Tensor(tfIsDoneBatch)) * nextStateQValueBatch - - return huberLoss( - predicted: predictionBatch, - expected: targetBatch, - delta: 1 - ) - } - optimizer.update(&qNet, along: gradients) - - return loss.scalarized() - } - return 0 - } - - func updateTargetQNet(tau: Float) { - self.targetQNet.l1.weight = tau * Tensor(self.qNet.l1.weight) + (1 - tau) * self.targetQNet.l1.weight - self.targetQNet.l1.bias = tau * Tensor(self.qNet.l1.bias) + (1 - tau) * self.targetQNet.l1.bias - self.targetQNet.l2.weight = tau * Tensor(self.qNet.l2.weight) + (1 - tau) * self.targetQNet.l2.weight - self.targetQNet.l2.bias = tau * Tensor(self.qNet.l2.bias) + (1 - tau) * self.targetQNet.l2.bias - } -} - class TensorFlowEnvironmentWrapper { - let originalEnv: PythonObject - - init(_ env: PythonObject) { - self.originalEnv = env - } - - func reset() -> Tensor { - let state = self.originalEnv.reset() - return Tensor(numpy: np.array(state, dtype: np.float32))! - } - - func step(_ action: Tensor) -> (state: Tensor, reward: Tensor, isDone: Tensor, info: PythonObject) { - let (state, reward, isDone, info) = originalEnv.step(action.scalarized()).tuple4 - let tfState = Tensor(numpy: np.array(state, dtype: np.float32))! - let tfReward = Tensor(numpy: np.array(reward, dtype: np.float32))! - let tfIsDone = Tensor(numpy: np.array(isDone, dtype: np.bool))! - return (tfState, tfReward, tfIsDone, info) - } + let originalEnv: PythonObject + + init(_ env: PythonObject) { + self.originalEnv = env + } + + func reset() -> Tensor { + let state = self.originalEnv.reset() + return Tensor(numpy: np.array(state, dtype: np.float32))! + } + + func step(_ action: Tensor) -> ( + state: Tensor, reward: Tensor, isDone: Tensor, info: PythonObject + ) { + let (state, reward, isDone, info) = originalEnv.step(action.scalarized()).tuple4 + let tfState = Tensor(numpy: np.array(state, dtype: np.float32))! 
+ let tfReward = Tensor(numpy: np.array(reward, dtype: np.float32))! + let tfIsDone = Tensor(numpy: np.array(isDone, dtype: np.bool))! + return (tfState, tfReward, tfIsDone, info) + } } func eval(agent: Agent) -> Float { - let evalEnv = TensorFlowEnvironmentWrapper(gym.make("CartPole-v0")) - var evalEpisodeReturn: Float = 0 - var state: Tensor = evalEnv.reset() - var reward: Tensor - var evalIsDone: Tensor = Tensor(false) - while evalIsDone.scalarized() == false { - let action = agent.getAction(state: state, epsilon: 0) - (state, reward, evalIsDone, _) = evalEnv.step(action) - evalEpisodeReturn += reward.scalarized() - } - - return evalEpisodeReturn + let evalEnv = TensorFlowEnvironmentWrapper(gym.make("CartPole-v0")) + var evalEpisodeReturn: Float = 0 + var state: Tensor = evalEnv.reset() + var reward: Tensor + var evalIsDone: Tensor = Tensor(false) + while evalIsDone.scalarized() == false { + let action = agent.getAction(state: state, epsilon: 0) + (state, reward, evalIsDone, _) = evalEnv.step(action) + evalEpisodeReturn += reward.scalarized() + } + + return evalEpisodeReturn } // Hyperparameters @@ -314,68 +92,72 @@ var qNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) var targetQNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2) let optimizer = Adam(for: qNet, learningRate: learningRate) var replayBuffer = ReplayBuffer( - capacity: replayBufferCapacity, - combined: useCombinedExperienceReplay, - device: device + capacity: replayBufferCapacity, + combined: useCombinedExperienceReplay ) var agent = Agent( - qNet: qNet, - targetQNet: targetQNet, - optimizer: optimizer, - replayBuffer: replayBuffer, - discount: discount, - minBufferSize: minBufferSize, - doubleDQN: useDoubleDQN, - device: device + qNet: qNet, + targetQNet: targetQNet, + optimizer: optimizer, + replayBuffer: replayBuffer, + discount: discount, + minBufferSize: minBufferSize, + doubleDQN: useDoubleDQN, + device: device ) // RL Loop var stepIndex = 0 var episodeIndex = 0 var episodeReturn: Float = 0 -var episodeReturns: Array = [] -var losses: Array = [] +var episodeReturns: [Float] = [] +var losses: [Float] = [] var state = env.reset() var bestReturn: Float = 0 while episodeIndex < maxEpisode { - stepIndex += 1 - - // Interact with environment - let epsilon: Float = epsilonEnd + (epsilonStart - epsilonEnd) * Float(np.exp(-1.0 * Float(stepIndex) / epsilonDecay))! 
- let action = agent.getAction(state: state, epsilon: epsilon) - let (nextState, reward, isDone, _) = env.step(action) - episodeReturn += reward.scalarized() - - // Save interaction to replay buffer - replayBuffer.append(state: state, action: action, reward: reward, nextState: nextState, isDone: isDone) - - // Train agent - losses.append(agent.train(batchSize: batchSize)) - - // Periodically update Target Net - if stepIndex % targetNetUpdateRate == 0 { - agent.updateTargetQNet(tau: softTargetUpdateRate) - } - - // End-of-episode - if isDone.scalarized() == true { - state = env.reset() - episodeIndex += 1 - let evalEpisodeReturn = eval(agent: agent) - episodeReturns.append(evalEpisodeReturn) - if evalEpisodeReturn > bestReturn { - print(String(format: "Episode: %4d | Step %6d | Epsilon: %.03f | Train: %3d | Eval: %3d", episodeIndex, stepIndex, epsilon, Int(episodeReturn), Int(evalEpisodeReturn))) - bestReturn = evalEpisodeReturn - } - if evalEpisodeReturn > 199 { - print("Solved in \(episodeIndex) episodes with \(stepIndex) steps!") - break - } - episodeReturn = 0 - } - - // End-of-step - state = nextState + stepIndex += 1 + + // Interact with environment + let epsilon: Float = + epsilonEnd + (epsilonStart - epsilonEnd) * exp(-1.0 * Float(stepIndex) / epsilonDecay) + let action = agent.getAction(state: state, epsilon: epsilon) + let (nextState, reward, isDone, _) = env.step(action) + episodeReturn += reward.scalarized() + + // Save interaction to replay buffer + replayBuffer.append( + state: state, action: action, reward: reward, nextState: nextState, isDone: isDone) + + // Train agent + losses.append(agent.train(batchSize: batchSize)) + + // Periodically update Target Net + if stepIndex % targetNetUpdateRate == 0 { + agent.updateTargetQNet(tau: softTargetUpdateRate) + } + + // End-of-episode + if isDone.scalarized() == true { + state = env.reset() + episodeIndex += 1 + let evalEpisodeReturn = eval(agent: agent) + episodeReturns.append(evalEpisodeReturn) + if evalEpisodeReturn > bestReturn { + print( + String( + format: "Episode: %4d | Step %6d | Epsilon: %.03f | Train: %3d | Eval: %3d", episodeIndex, + stepIndex, epsilon, Int(episodeReturn), Int(evalEpisodeReturn))) + bestReturn = evalEpisodeReturn + } + if evalEpisodeReturn > 199 { + print("Solved in \(episodeIndex) episodes with \(stepIndex) steps!") + break + } + episodeReturn = 0 + } + + // End-of-step + state = nextState } // Save learning curve @@ -388,7 +170,9 @@ plt.clf() // Save smoothed learning curve let runningMeanWindow: Int = 10 -let smoothedEpisodeReturns = np.convolve(episodeReturns, np.ones((runningMeanWindow)) / np.array(runningMeanWindow, dtype: np.int32), mode: "same") +let smoothedEpisodeReturns = np.convolve( + episodeReturns, np.ones((runningMeanWindow)) / np.array(runningMeanWindow, dtype: np.int32), + mode: "same") plt.plot(episodeReturns) plt.title("Deep Q-Network on CartPole-v0") From 356c989b382835ab5a4ac455cac36be742010951 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Thu, 6 Aug 2020 00:51:13 +0000 Subject: [PATCH 31/34] Fix ReplayBuffer pass-by-value bug --- Gym/DQN/ReplayBuffer.swift | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Gym/DQN/ReplayBuffer.swift b/Gym/DQN/ReplayBuffer.swift index 01b64ed4ff3..31126dc4a1d 100644 --- a/Gym/DQN/ReplayBuffer.swift +++ b/Gym/DQN/ReplayBuffer.swift @@ -14,7 +14,7 @@ import TensorFlow -struct ReplayBuffer { +class ReplayBuffer { let capacity: Int let combined: Bool @@ -30,7 +30,7 @@ struct ReplayBuffer { self.combined = combined } - 
mutating func append( + func append( state: Tensor, action: Tensor, reward: Tensor, From d774fad85e30064ca2dd06a7f9ecc36b563cef26 Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Thu, 6 Aug 2020 00:58:44 +0000 Subject: [PATCH 32/34] Use epsilon decay for more consistent performance --- Gym/DQN/main.swift | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 4006ff432e6..365a8c483e1 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -65,9 +65,9 @@ func eval(agent: Agent) -> Float { let hiddenSize: Int = 100 // - Agent-Env Interaction Hyperparameters let maxEpisode: Int = 1000 -let epsilonStart: Float = 0.1 -let epsilonEnd: Float = 0.1 -let epsilonDecay: Float = 10000 +let epsilonStart: Float = 1 +let epsilonEnd: Float = 0.01 +let epsilonDecay: Float = 1000 // - Update Hyperparameters let learningRate: Float = 0.001 let discount: Float = 0.99 From a10f20112fefd641fe1d77d08484d96ab1f0957b Mon Sep 17 00:00:00 2001 From: Seungjae Ryan Lee Date: Fri, 7 Aug 2020 10:43:26 +0000 Subject: [PATCH 33/34] Add documentation and improve names --- Gym/DQN/Agent.swift | 27 ++++++++++++----- Gym/DQN/ReplayBuffer.swift | 7 +++++ Gym/DQN/main.swift | 60 +++++++++++++++++++++++++++++--------- 3 files changed, 73 insertions(+), 21 deletions(-) diff --git a/Gym/DQN/Agent.swift b/Gym/DQN/Agent.swift index 4a534062257..52f25904f87 100644 --- a/Gym/DQN/Agent.swift +++ b/Gym/DQN/Agent.swift @@ -25,7 +25,12 @@ extension Optional { } } -struct Net: Layer { +/// A Deep Q-Network. +/// +/// A Q-network is a neural network that receives the observation (state) as input and estimates +/// the action values (Q values) of each action. For more information, check Human-level control +/// through deep reinforcement learning (Mnih et al., 2015). +struct DeepQNetwork: Layer { typealias Input = Tensor typealias Output = Tensor @@ -42,10 +47,16 @@ struct Net: Layer { } } -class Agent { - var qNet: Net - var targetQNet: Net - let optimizer: Adam +/// Agent that uses the Deep Q-Network. +/// +/// Deep Q-Network is an algorithm that trains a Q-network that estimates the action values of +/// each action given an observation (state). The Q-network is trained iteratively using the +/// Bellman equation. For more information, check Human-level control through deep reinforcement +/// learning (Mnih et al., 2015). +class DeepQNetworkAgent { + var qNet: DeepQNetwork + var targetQNet: DeepQNetwork + let optimizer: Adam let replayBuffer: ReplayBuffer let discount: Float let minBufferSize: Int @@ -53,9 +64,9 @@ class Agent { let device: Device init( - qNet: Net, - targetQNet: Net, - optimizer: Adam, + qNet: DeepQNetwork, + targetQNet: DeepQNetwork, + optimizer: Adam, replayBuffer: ReplayBuffer, discount: Float, minBufferSize: Int, diff --git a/Gym/DQN/ReplayBuffer.swift b/Gym/DQN/ReplayBuffer.swift index 31126dc4a1d..c32e31c4d39 100644 --- a/Gym/DQN/ReplayBuffer.swift +++ b/Gym/DQN/ReplayBuffer.swift @@ -14,6 +14,13 @@ import TensorFlow +/// Replay buffer to store the agent's experiences. +/// +/// Vanilla Q-learning only trains on the latest experience. Deep Q-network uses +/// a technique called "experience replay", where all experience is stored into +/// a replay buffer. By storing experience, the agent can reuse the experiences +/// and also train in batches. For more information, check Human-level control +/// through deep reinforcement learning (Mnih et al., 2015). 
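+///
+/// A minimal usage sketch: append one transition per environment step with
+/// `append(state:action:reward:nextState:isDone:)`, then draw a training batch
+/// with `sample(batchSize:)` once `count` has reached the training batch size.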
class ReplayBuffer { let capacity: Int let combined: Bool diff --git a/Gym/DQN/main.swift b/Gym/DQN/main.swift index 365a8c483e1..0dfe7b996d9 100644 --- a/Gym/DQN/main.swift +++ b/Gym/DQN/main.swift @@ -45,7 +45,7 @@ class TensorFlowEnvironmentWrapper { } } -func eval(agent: Agent) -> Float { +func evaluate(_ agent: DeepQNetworkAgent) -> Float { let evalEnv = TensorFlowEnvironmentWrapper(gym.make("CartPole-v0")) var evalEpisodeReturn: Float = 0 var state: Tensor = evalEnv.reset() @@ -61,24 +61,58 @@ func eval(agent: Agent) -> Float { } // Hyperparameters -// - Network Hyperparameters +/// The size of the hidden layer of the 2-layer Q-network. The network has the +/// shape observationSize - hiddenSize - actionCount. let hiddenSize: Int = 100 -// - Agent-Env Interaction Hyperparameters +/// Maximum number of episodes to train the agent. The training is terminated +/// early if maximum score is achieved during evaluation. let maxEpisode: Int = 1000 +/// The initial epsilon value. With probability epsilon, the agent chooses a +/// random action instead of the action that it thinks is the best. let epsilonStart: Float = 1 +/// The terminal epsilon value. let epsilonEnd: Float = 0.01 +/// The decay rate of epsilon. let epsilonDecay: Float = 1000 -// - Update Hyperparameters +/// The learning rate for the Q-network. let learningRate: Float = 0.001 +/// The discount factor. This measures how much to "discount" the future rewards +/// that the agent will receive. The discount factor must be from 0 to 1 +/// (inclusive). Discount factor of 0 means that the agent only considers the +/// immediate reward and disregards all future rewards. Discount factor of 1 +/// means that the agent values all rewards equally, no matter how distant +/// in the future they may be. let discount: Float = 0.99 +/// If enabled, uses the Double DQN update equation instead of the original DQN +/// equation. This mitigates the overestimation problem of DQN. For more +/// information about Double DQN, check Deep Reinforcement Learning with Double +/// Q-learning (Hasselt, Guez, and Silver, 2015). let useDoubleDQN: Bool = true -// - Replay Buffer Hyperparameters +/// The maximum size of the replay buffer. If the replay buffer is full, the new +/// element replaces the oldest element. let replayBufferCapacity: Int = 100000 +/// The minimum replay buffer size before the training starts. Must be at least +/// the training batch size. let minBufferSize: Int = 64 +/// The training batch size. let batchSize: Int = 64 +/// If enabled, uses Combined Experience Replay (CER) sampling instead of the +/// uniform random sampling in the original DQN paper. Original DQN samples +/// batch uniformly randomly in the replay buffer. CER always includes the most +/// recent element and samples the rest of the batch uniformly randomly. This +/// makes the agent more robust to different replay buffer capacities. For more +/// information about Combined Experience Replay, check A Deeper Look at +/// Experience Replay (Zhang and Sutton, 2017). let useCombinedExperienceReplay: Bool = true -// - Target Network Hyperparameters +/// The number of steps between target network updates. The target network is +/// a copy of the Q-network that is updated less frequently to stabilize the +/// training process. let targetNetUpdateRate: Int = 5 +/// The update rate for target network. In the original DQN paper, the target +/// network is updated to be the same as the Q-network. 
+/// The update rate for the target network. In the original DQN paper, the target
+/// network is updated to be the same as the Q-network. A soft target network
+/// update instead moves the target network only slightly toward the Q-network.
+/// A softTargetUpdateRate of 0 means that the target network is not updated at
+/// all, and 1 means that the soft update is effectively disabled (the target
+/// network is copied from the Q-network directly).
 let softTargetUpdateRate: Float = 0.05
 
 // Setup device
@@ -88,14 +122,14 @@
 let device: Device = Device.default
 
 let env = TensorFlowEnvironmentWrapper(gym.make("CartPole-v0"))
 
 // Initialize agent
-var qNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2)
-var targetQNet = Net(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2)
+var qNet = DeepQNetwork(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2)
+var targetQNet = DeepQNetwork(observationSize: 4, hiddenSize: hiddenSize, actionCount: 2)
 let optimizer = Adam(for: qNet, learningRate: learningRate)
 var replayBuffer = ReplayBuffer(
   capacity: replayBufferCapacity,
   combined: useCombinedExperienceReplay
 )
-var agent = Agent(
+var agent = DeepQNetworkAgent(
   qNet: qNet,
   targetQNet: targetQNet,
   optimizer: optimizer,
@@ -140,7 +174,7 @@
   if isDone.scalarized() == true {
     state = env.reset()
     episodeIndex += 1
-    let evalEpisodeReturn = eval(agent: agent)
+    let evalEpisodeReturn = evaluate(agent)
     episodeReturns.append(evalEpisodeReturn)
     if evalEpisodeReturn > bestReturn {
       print(
@@ -165,7 +199,7 @@
 plt.plot(episodeReturns)
 plt.title("Deep Q-Network on CartPole-v0")
 plt.xlabel("Episode")
 plt.ylabel("Episode Return")
-plt.savefig("dqnEpisodeReturns.png")
+plt.savefig("/tmp/dqnEpisodeReturns.png")
 plt.clf()
 
 // Save smoothed learning curve
@@ -178,7 +212,7 @@
 plt.plot(episodeReturns)
 plt.title("Deep Q-Network on CartPole-v0")
 plt.xlabel("Episode")
 plt.ylabel("Smoothed Episode Return")
-plt.savefig("dqnSmoothedEpisodeReturns.png")
+plt.savefig("/tmp/dqnSmoothedEpisodeReturns.png")
 plt.clf()
 
 // Save TD loss curve
@@ -186,5 +220,5 @@
 plt.plot(losses)
 plt.title("Deep Q-Network on CartPole-v0")
 plt.xlabel("Step")
 plt.ylabel("TD Loss")
-plt.savefig("dqnTDLoss.png")
+plt.savefig("/tmp/dqnTDLoss.png")
 plt.clf()

From 4aa929640d23ea4a69b1921cc013cabc43c0c019 Mon Sep 17 00:00:00 2001
From: Seungjae Ryan Lee
Date: Fri, 7 Aug 2020 10:58:56 +0000
Subject: [PATCH 34/34] Document Agent and ReplayBuffer parameters

---
 Gym/DQN/Agent.swift        | 14 ++++++++++++++
 Gym/DQN/ReplayBuffer.swift | 17 +++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/Gym/DQN/Agent.swift b/Gym/DQN/Agent.swift
index 52f25904f87..8522f685f8d 100644
--- a/Gym/DQN/Agent.swift
+++ b/Gym/DQN/Agent.swift
@@ -54,12 +54,26 @@ struct DeepQNetwork: Layer {
 /// Bellman equation. For more information, check Human-level control through deep reinforcement
 /// learning (Mnih et al., 2015).
 class DeepQNetworkAgent {
+  /// The Q-network used to estimate the action values.
   var qNet: DeepQNetwork
+  /// The copy of the Q-network that is updated less frequently to stabilize the
+  /// training process.
   var targetQNet: DeepQNetwork
+  /// The optimizer used to train the Q-network.
   let optimizer: Adam<DeepQNetwork>
+  /// The replay buffer that stores experiences of the interactions between the
+  /// agent and the environment. The Q-network is trained from experiences
+  /// sampled from the replay buffer.
   let replayBuffer: ReplayBuffer
+  /// The discount factor that measures how much weight to give to future
+  /// rewards when calculating the action value.
   let discount: Float
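// The targetQNet documented above is refreshed from qNet every
// targetNetUpdateRate steps, and softTargetUpdateRate controls how far it moves
// toward the Q-network. A minimal sketch of that Polyak-style rule on plain
// arrays of weights follows; the real update touches every parameter of
// DeepQNetwork, and the helper name here is illustrative, not the patch's code.

/// Moves `target` a fraction `rate` of the way toward `online`.
/// A rate of 0 leaves the target unchanged; a rate of 1 copies the online weights.
func softUpdate(target: [Float], online: [Float], rate: Float) -> [Float] {
  precondition(target.count == online.count, "Weight vectors must have equal size")
  return zip(target, online).map { pair in (1 - rate) * pair.0 + rate * pair.1 }
}

// With rate = 0.05 the target weights close only 5% of the gap per update,
// which keeps the TD targets from chasing a fast-moving estimate.
// softUpdate(target: [0.0, 1.0], online: [1.0, 3.0], rate: 0.05) == [0.05, 1.1]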
+  /// The minimum replay buffer size before the training starts.
   let minBufferSize: Int
+  /// If enabled, uses the Double DQN update equation instead of the original
+  /// DQN equation. This mitigates the overestimation problem of DQN. For more
+  /// information about Double DQN, check Deep Reinforcement Learning with
+  /// Double Q-learning (van Hasselt, Guez, and Silver, 2015).
   let doubleDQN: Bool
   let device: Device
 
diff --git a/Gym/DQN/ReplayBuffer.swift b/Gym/DQN/ReplayBuffer.swift
index c32e31c4d39..f9e6ddf1c48 100644
--- a/Gym/DQN/ReplayBuffer.swift
+++ b/Gym/DQN/ReplayBuffer.swift
@@ -22,14 +22,31 @@ import TensorFlow
 /// and also train in batches. For more information, check Human-level control
 /// through deep reinforcement learning (Mnih et al., 2015).
 class ReplayBuffer {
+  /// The maximum size of the replay buffer. When the replay buffer is full,
+  /// new elements replace the oldest element in the replay buffer.
   let capacity: Int
+  /// If enabled, uses Combined Experience Replay (CER) sampling instead of the
+  /// uniform random sampling in the original DQN paper. The original DQN samples
+  /// the batch uniformly at random from the replay buffer. CER always includes
+  /// the most recent element and samples the rest of the batch uniformly at
+  /// random. This makes the agent more robust to different replay buffer
+  /// capacities. For more information about Combined Experience Replay, check
+  /// A Deeper Look at Experience Replay (Zhang and Sutton, 2017).
   let combined: Bool
+  /// The states that the agent observed.
   @noDerivative var states: [Tensor<Float>] = []
+  /// The actions that the agent took.
   @noDerivative var actions: [Tensor<Int32>] = []
+  /// The rewards that the agent received from the environment after taking
+  /// an action.
   @noDerivative var rewards: [Tensor<Float>] = []
+  /// The next states that the agent received from the environment after taking
+  /// an action.
   @noDerivative var nextStates: [Tensor<Float>] = []
+  /// The episode-terminal flag that the agent received after taking an action.
   @noDerivative var isDones: [Tensor<Bool>] = []
+  /// The current size of the replay buffer.
   var count: Int { return states.count }
 
   init(capacity: Int, combined: Bool) {
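    // The `combined` flag documented above selects the sampling strategy. A
    // minimal sketch of Combined Experience Replay over index arrays follows;
    // the buffer's real sample method gathers from the stored tensors, and the
    // helper below is illustrative, not part of this class's actual implementation.
    func combinedSampleIndices(count: Int, batchSize: Int) -> [Int] {
      // CER always includes the most recent experience (index count - 1) and
      // fills the rest of the batch with uniformly random indices.
      var indices = [count - 1]
      for _ in 1..<batchSize {
        indices.append(Int.random(in: 0..<count))
      }
      return indices
    }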