From 5e6f09dc04b409a6234fb77a244eb7e4ba4d42f3 Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Fri, 2 Jun 2023 15:18:13 +0200 Subject: [PATCH 01/24] NFQ before refactor --- .../src/algorithms/offline_rl/NFQ.jl | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 src/ReinforcementLearningZoo/src/algorithms/offline_rl/NFQ.jl diff --git a/src/ReinforcementLearningZoo/src/algorithms/offline_rl/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/offline_rl/NFQ.jl new file mode 100644 index 000000000..3336520bd --- /dev/null +++ b/src/ReinforcementLearningZoo/src/algorithms/offline_rl/NFQ.jl @@ -0,0 +1,77 @@ +""" + NFQ{A<:AbstractApproximator, F, R} <: AbstractLearner + NFQ(approximator::A, num_iterations::Integer epochs::Integer, loss_function::F, batch_size::Integer, rng::R, γ::Float32) where {A<:AbstractApproximator, F, R} +Neural Fitted Q-iteration as implemented in [1] + +# Keyword arguments +- `approximator::AbstractApproximator` neural network +- `num_iterations::Integer` number of value iteration iterations in FQI loop (i.e. the outer loop) +- `epochs` number of epochs to train neural network per iteration +- `loss_function::F` loss function of the NN +- `sampler::BatchSampler{SARTS}` data sampler +- `rng::R` random number generator +- `γ::Float32` discount rate + +# References +[1] Riedmiller, M. (2005). Neural Fitted Q Iteration – First Experiences with a Data Efficient Neural Reinforcement Learning Method. In: Gama, J., Camacho, R., Brazdil, P.B., Jorge, A.M., Torgo, L. (eds) Machine Learning: ECML 2005. ECML 2005. Lecture Notes in Computer Science(), vol 3720. Springer, Berlin, Heidelberg. https://doi.org/10.1007/11564096_32 +""" +Base.@kwdef struct NFQ{A<:AbstractApproximator, F, R} <: AbstractLearner + approximator::A + num_iterations::Integer = 20 + epochs::Integer = 100 + loss_function::F = mse + sampler::BatchSampler{SARTS} = BatchSampler(32) + rng::R = Random.GLOBAL_RNG + γ::Float32 = 0.9f0 +end + +function NFQ(; + approximator::A, + num_iterations::Integer = 20, + epochs::Integer = 1000, + loss_function::F = mse, + batch_size::Integer=32, + rng=Random.GLOBAL_RNG, + γ::Float32 = 0.9f0, + ) where {A<:AbstractApproximator, F} + NFQ(approximator, num_iterations, epochs, loss_function, BatchSampler{SARTS}(batch_size), rng, γ) +end + +# Copied from BasicDQN but sure whether it's appropriate +Flux.functor(x::NFQ) = (Q = x.approximator,), y -> begin + x = @set x.approximator = y.Q + x +end + +function (learner::NFQ)(env) + as = action_space(env) + return vcat(repeat(state(env), inner=(1, length(as))), transpose(as)) |> x -> send_to_device(device(learner.approximator), x) |> learner.approximator |> send_to_host |> vec +end + +function RLBase.update!(learner::NFQ, traj::AbstractTrajectory, ::AbstractEnv, ::PreExperimentStage) end +function RLBase.update!(learner::NFQ, traj::AbstractTrajectory, ::AbstractEnv, ::PreActStage) end +function RLBase.update!(learner::NFQ, traj::AbstractTrajectory, ::AbstractEnv, ::PreEpisodeStage) end +function RLBase.update!(learner::NFQ, traj::AbstractTrajectory, env::AbstractEnv, ::PostEpisodeStage) + isempty(traj) && return + inds, batch = sample(learner.rng, traj, learner.sampler) + update!(learner, batch, env) +end + +function RLBase.update!(learner::NFQ, batch::NamedTuple{SARTS}, env::AbstractEnv) + Q = learner.approximator + γ = learner.γ + loss_func = learner.loss_function + as = action_space(env) + las = length(as) + + (s, a, r, ss) = batch[[:state, :action, :reward, :next_state]] + a = Float32.(a) + s, a, r, ss = 
map(x->send_to_device(device(Q), x), (s, a, r, ss)) + for i = 1:learner.num_iterations + # Make an input x samples x |action space| array -- Q --> samples x |action space| -- max --> samples + G = r .+ γ .* (cat(repeat(ss, inner=(1, 1, las)), reshape(repeat(as, outer=(1, size(ss, 2))), (1, size(ss, 2), las)), dims=1) |> Q |> x -> maximum(x, dims=3) |> vec) + for e = 1:learner.epochs + Flux.train!((x, y) -> loss_func(Q(x), y), params(Q.model), [(vcat(s, transpose(a)), transpose(G))], Q.optimizer) + end + end +end From c42304894a70fa07644022dc0efad1a11712a5b9 Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Fri, 2 Jun 2023 15:25:38 +0200 Subject: [PATCH 02/24] NFQ after refactor --- .../src/algorithms/offline_rl/NFQ.jl | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/src/ReinforcementLearningZoo/src/algorithms/offline_rl/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/offline_rl/NFQ.jl index 3336520bd..145f611e9 100644 --- a/src/ReinforcementLearningZoo/src/algorithms/offline_rl/NFQ.jl +++ b/src/ReinforcementLearningZoo/src/algorithms/offline_rl/NFQ.jl @@ -15,12 +15,11 @@ Neural Fitted Q-iteration as implemented in [1] # References [1] Riedmiller, M. (2005). Neural Fitted Q Iteration – First Experiences with a Data Efficient Neural Reinforcement Learning Method. In: Gama, J., Camacho, R., Brazdil, P.B., Jorge, A.M., Torgo, L. (eds) Machine Learning: ECML 2005. ECML 2005. Lecture Notes in Computer Science(), vol 3720. Springer, Berlin, Heidelberg. https://doi.org/10.1007/11564096_32 """ -Base.@kwdef struct NFQ{A<:AbstractApproximator, F, R} <: AbstractLearner +Base.@kwdef struct NFQ{A<:NeuralNetworkApproximator, F, R} <: AbstractLearner approximator::A num_iterations::Integer = 20 epochs::Integer = 100 loss_function::F = mse - sampler::BatchSampler{SARTS} = BatchSampler(32) rng::R = Random.GLOBAL_RNG γ::Float32 = 0.9f0 end @@ -30,11 +29,10 @@ function NFQ(; num_iterations::Integer = 20, epochs::Integer = 1000, loss_function::F = mse, - batch_size::Integer=32, rng=Random.GLOBAL_RNG, γ::Float32 = 0.9f0, - ) where {A<:AbstractApproximator, F} - NFQ(approximator, num_iterations, epochs, loss_function, BatchSampler{SARTS}(batch_size), rng, γ) + ) where {A<:NeuralNetworkApproximator, F} + NFQ(approximator, num_iterations, epochs, loss_function, rng, γ) end # Copied from BasicDQN but sure whether it's appropriate @@ -43,27 +41,30 @@ Flux.functor(x::NFQ) = (Q = x.approximator,), y -> begin x end -function (learner::NFQ)(env) +function RLBase.plan!(learner::NFQ, env::AbstractEnv) as = action_space(env) return vcat(repeat(state(env), inner=(1, length(as))), transpose(as)) |> x -> send_to_device(device(learner.approximator), x) |> learner.approximator |> send_to_host |> vec end -function RLBase.update!(learner::NFQ, traj::AbstractTrajectory, ::AbstractEnv, ::PreExperimentStage) end -function RLBase.update!(learner::NFQ, traj::AbstractTrajectory, ::AbstractEnv, ::PreActStage) end -function RLBase.update!(learner::NFQ, traj::AbstractTrajectory, ::AbstractEnv, ::PreEpisodeStage) end -function RLBase.update!(learner::NFQ, traj::AbstractTrajectory, env::AbstractEnv, ::PostEpisodeStage) - isempty(traj) && return - inds, batch = sample(learner.rng, traj, learner.sampler) - update!(learner, batch, env) +# Avoid optimisation in the middle of an episode +function RLBase.optimise!(::NFQ, ::NamedTuple) end + +# Instead do optimisation at the end of an episode +function Base.push!(agent::Agent{<:QBasedPolicy{<:NFQ}}, ::PostEpisodeStage, env::AbstractEnv) + for batch in 
agent.trajectory + _optimise!(agent.policy.learner, batch, env) + end end -function RLBase.update!(learner::NFQ, batch::NamedTuple{SARTS}, env::AbstractEnv) +function _optimise!(learner::NFQ, batch::NamedTuple, env::AbstractEnv) Q = learner.approximator γ = learner.γ loss_func = learner.loss_function + as = action_space(env) las = length(as) + (s, a, r, ss) = batch[[:state, :action, :reward, :next_state]] a = Float32.(a) s, a, r, ss = map(x->send_to_device(device(Q), x), (s, a, r, ss)) From c88559e7f8a44269f11e8c5e64a8f7ca7cc405a9 Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Wed, 7 Jun 2023 17:49:36 +0200 Subject: [PATCH 03/24] Move to dqns --- .../src/algorithms/{offline_rl => dqns}/NFQ.jl | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/ReinforcementLearningZoo/src/algorithms/{offline_rl => dqns}/NFQ.jl (100%) diff --git a/src/ReinforcementLearningZoo/src/algorithms/offline_rl/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl similarity index 100% rename from src/ReinforcementLearningZoo/src/algorithms/offline_rl/NFQ.jl rename to src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl From 1560ad1b8acbed34a3bfcad1f1cb12f062dad6ed Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Thu, 8 Jun 2023 09:59:02 +0200 Subject: [PATCH 04/24] Refactor --- .../src/algorithms/dqns/NFQ.jl | 33 ++++++++++++------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl index 145f611e9..176697e7e 100644 --- a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl +++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl @@ -1,3 +1,8 @@ +export NFQ + +using Flux +using Functors: @functor + """ NFQ{A<:AbstractApproximator, F, R} <: AbstractLearner NFQ(approximator::A, num_iterations::Integer epochs::Integer, loss_function::F, batch_size::Integer, rng::R, γ::Float32) where {A<:AbstractApproximator, F, R} @@ -15,11 +20,11 @@ Neural Fitted Q-iteration as implemented in [1] # References [1] Riedmiller, M. (2005). Neural Fitted Q Iteration – First Experiences with a Data Efficient Neural Reinforcement Learning Method. In: Gama, J., Camacho, R., Brazdil, P.B., Jorge, A.M., Torgo, L. (eds) Machine Learning: ECML 2005. ECML 2005. Lecture Notes in Computer Science(), vol 3720. Springer, Berlin, Heidelberg. 
https://doi.org/10.1007/11564096_32 """ -Base.@kwdef struct NFQ{A<:NeuralNetworkApproximator, F, R} <: AbstractLearner +Base.@kwdef struct NFQ{A, R} <: AbstractLearner approximator::A num_iterations::Integer = 20 epochs::Integer = 100 - loss_function::F = mse + loss_function::Any = mse rng::R = Random.GLOBAL_RNG γ::Float32 = 0.9f0 end @@ -28,22 +33,26 @@ function NFQ(; approximator::A, num_iterations::Integer = 20, epochs::Integer = 1000, - loss_function::F = mse, + loss_function::Any = mse, rng=Random.GLOBAL_RNG, γ::Float32 = 0.9f0, - ) where {A<:NeuralNetworkApproximator, F} + ) where {A} NFQ(approximator, num_iterations, epochs, loss_function, rng, γ) end # Copied from BasicDQN but sure whether it's appropriate -Flux.functor(x::NFQ) = (Q = x.approximator,), y -> begin - x = @set x.approximator = y.Q - x -end +@functor NFQ (approximator,) + +RLCore.forward(L::NFQ, s::AbstractArray) = RLCore.forward(L.approximator, s) + +# Flux.functor(x::NFQ) = (Q = x.approximator,), y -> begin +# x = @set x.approximator = y.Q +# x +# end -function RLBase.plan!(learner::NFQ, env::AbstractEnv) +function RLCore.forward(learner::NFQ, env::AbstractEnv) as = action_space(env) - return vcat(repeat(state(env), inner=(1, length(as))), transpose(as)) |> x -> send_to_device(device(learner.approximator), x) |> learner.approximator |> send_to_host |> vec + return vcat(repeat(state(env), inner=(1, length(as))), transpose(as)) |> x -> send_to_device(device(learner.approximator), x) |> x->RLCore.forward(learner, x) |> send_to_host |> vec end # Avoid optimisation in the middle of an episode @@ -70,9 +79,9 @@ function _optimise!(learner::NFQ, batch::NamedTuple, env::AbstractEnv) s, a, r, ss = map(x->send_to_device(device(Q), x), (s, a, r, ss)) for i = 1:learner.num_iterations # Make an input x samples x |action space| array -- Q --> samples x |action space| -- max --> samples - G = r .+ γ .* (cat(repeat(ss, inner=(1, 1, las)), reshape(repeat(as, outer=(1, size(ss, 2))), (1, size(ss, 2), las)), dims=1) |> Q |> x -> maximum(x, dims=3) |> vec) + G = r .+ γ .* (cat(repeat(ss, inner=(1, 1, las)), reshape(repeat(as, outer=(1, size(ss, 2))), (1, size(ss, 2), las)), dims=1) |> x -> maximum(RLCore.forward(Q, x), dims=3) |> vec) for e = 1:learner.epochs - Flux.train!((x, y) -> loss_func(Q(x), y), params(Q.model), [(vcat(s, transpose(a)), transpose(G))], Q.optimizer) + Flux.train!((x, y) -> loss_func(RLCore.forward(Q, x), y), params(Q.model), [(vcat(s, transpose(a)), transpose(G))], Q.optimiser) end end end From 02ab01b111bd036f6c78b7d0e75d3f25defe3627 Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Wed, 7 Jun 2023 19:23:57 +0200 Subject: [PATCH 05/24] Add NFQ to RLZoo --- src/ReinforcementLearningZoo/src/algorithms/dqns/dqns.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/dqns.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/dqns.jl index b20517abb..49f88f54a 100644 --- a/src/ReinforcementLearningZoo/src/algorithms/dqns/dqns.jl +++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/dqns.jl @@ -1,8 +1,9 @@ include("basic_dqn.jl") +include("NFQ.jl") include("dqn.jl") include("prioritized_dqn.jl") include("qr_dqn.jl") include("rem_dqn.jl") include("iqn.jl") include("rainbow.jl") -# include("common.jl") \ No newline at end of file +# include("common.jl") From 093f1f4a642412be2db8902b4a0f1ecd405e2f6e Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Wed, 7 Jun 2023 19:23:07 +0200 Subject: [PATCH 06/24] Set up experiment --- .../experiments/DQN/JuliaRL_NFQ_CartPole.jl 
| 92 +++++++++++++++++++ .../experiments/experiments/DQN/config.json | 1 + .../src/ReinforcementLearningExperiments.jl | 1 + .../test/runtests.jl | 1 + 4 files changed, 95 insertions(+) create mode 100644 src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl diff --git a/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl b/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl new file mode 100644 index 000000000..a53576080 --- /dev/null +++ b/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl @@ -0,0 +1,92 @@ +# --- +# title: JuliaRL\_NFQ\_PendulumDiscrete +# cover: assets/JuliaRL_BasicDQN_CartPole.png +# description: NFQ applied to discrete Pendulum +# date: 2023-06 +# author: "[Lucas Bex](https://github.com/CasBex)" +# --- + +#+ tangle=true +using ReinforcementLearningCore, ReinforcementLearningBase, ReinforcementLearningZoo, ReinforcementLearningZoo +using ReinforcementLearningEnvironments +using Flux +using Flux: glorot_uniform + +using StableRNGs: StableRNG +using Flux.Losses: huber_loss + +function RLCore.Experiment( + ::Val{:JuliaRL}, + ::Val{:NFQ}, + ::Val{:CartPole}, + seed = 123, +) + rng = StableRNG(seed) + env = CartPoleEnv(; T=Float32, rng=rng) + ns, na = length(state(env)), length(first(action_space(env))) + + agent = Agent( + policy=QBasedPolicy( + learner=NFQ( + approximator=Approximator( + model=Chain( + Dense(ns, 128, relu; init=glorot_uniform(rng)), + Dense(128, 128, relu; init=glorot_uniform(rng)), + Dense(128, na; init=glorot_uniform(rng)), + ) |> gpu, + optimiser=RMSProp() + ), + loss_func=huber_loss, + epochs=500, + num_iterations=10 + ), + explorer=EpsilonGreedyExplorer( + kind=:exp, + ϵ_stable=0.01, + decay_steps=500, + rng=rng, + ), + ), + trajectory=Trajectory( + container=CircularArraySARTTraces( + capacity=1000, + state=Float32 => (ns,), + ), + sampler=BatchSampler{SS′ART}( + batch_size=1000, + rng=rng + ), + controller=InsertSampleRatioController( + threshold=100, + n_inserted=-1 + ) + ) + ) + + stop_condition = StopAfterStep(10_000, is_show_progress=!haskey(ENV, "CI")) + hook = TotalRewardPerEpisode() + Experiment(agent, env, stop_condition, hook) + end + +#+ tangle=false +using Plots +pyplot() # hide +ex = E`JuliaRL_NFQ_CartPole` +run(ex) +plot(ex.hook.rewards) +savefig("assets/JuliaRL_NFQ_CartPole.png") #hide + +#= +## Watch a demo episode with the trained agent + +```julia +demo = Experiment(ex.policy, + CartPoleEnv(), + StopWhenDone(), + RolloutHook(plot, closeall), + "DQN <-> Demo") +run(demo) +``` +=# + +# ![](assets/JuliaRL_NFQ_CartPole.png) diff --git a/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/config.json b/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/config.json index e6568ee09..65db0d95d 100644 --- a/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/config.json +++ b/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/config.json @@ -1,6 +1,7 @@ { "description": "DQN related experiments.", "order": [ + "JuliaRL_NFQ_CartPole.jl", "JuliaRL_BasicDQN_CartPole.jl", "JuliaRL_BasicDQN_MountainCar.jl", "JuliaRL_BasicDQN_PendulumDiscrete.jl", diff --git a/src/ReinforcementLearningExperiments/src/ReinforcementLearningExperiments.jl b/src/ReinforcementLearningExperiments/src/ReinforcementLearningExperiments.jl index a280e7746..ca27a1031 100644 --- 
a/src/ReinforcementLearningExperiments/src/ReinforcementLearningExperiments.jl +++ b/src/ReinforcementLearningExperiments/src/ReinforcementLearningExperiments.jl @@ -8,6 +8,7 @@ const EXPERIMENTS_DIR = joinpath(@__DIR__, "experiments") # for f in readdir(EXPERIMENTS_DIR) # include(joinpath(EXPERIMENTS_DIR, f)) # end +include(joinpath(EXPERIMENTS_DIR, "JuliaRL_NFQ_CartPole.jl")) include(joinpath(EXPERIMENTS_DIR, "JuliaRL_BasicDQN_CartPole.jl")) include(joinpath(EXPERIMENTS_DIR, "JuliaRL_DQN_CartPole.jl")) include(joinpath(EXPERIMENTS_DIR, "JuliaRL_PrioritizedDQN_CartPole.jl")) diff --git a/src/ReinforcementLearningExperiments/test/runtests.jl b/src/ReinforcementLearningExperiments/test/runtests.jl index 723a5dbd3..1086313e0 100644 --- a/src/ReinforcementLearningExperiments/test/runtests.jl +++ b/src/ReinforcementLearningExperiments/test/runtests.jl @@ -3,6 +3,7 @@ using CUDA CUDA.allowscalar(false) +run(E`JuliaRL_NFQ_CartPole`) run(E`JuliaRL_BasicDQN_CartPole`) run(E`JuliaRL_DQN_CartPole`) run(E`JuliaRL_PrioritizedDQN_CartPole`) From c1e49da5eac7b5cfef9f97cb6485d119d0881961 Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Tue, 13 Jun 2023 10:18:24 +0200 Subject: [PATCH 07/24] Update algorithm for refactor --- .../experiments/DQN/JuliaRL_NFQ_CartPole.jl | 9 +++++---- .../src/algorithms/dqns/NFQ.jl | 16 +++++----------- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl b/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl index a53576080..eb9e8395b 100644 --- a/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl +++ b/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl @@ -28,15 +28,16 @@ function RLCore.Experiment( agent = Agent( policy=QBasedPolicy( learner=NFQ( + action_space=action_space(env), approximator=Approximator( model=Chain( - Dense(ns, 128, relu; init=glorot_uniform(rng)), - Dense(128, 128, relu; init=glorot_uniform(rng)), - Dense(128, na; init=glorot_uniform(rng)), + Dense(ns+na, 64, relu; init=glorot_uniform(rng)), + Dense(64, 64, relu; init=glorot_uniform(rng)), + Dense(64, 1; init=glorot_uniform(rng)), ) |> gpu, optimiser=RMSProp() ), - loss_func=huber_loss, + loss_function=huber_loss, epochs=500, num_iterations=10 ), diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl index 176697e7e..a6fdd9c23 100644 --- a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl +++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl @@ -21,6 +21,7 @@ Neural Fitted Q-iteration as implemented in [1] [1] Riedmiller, M. (2005). Neural Fitted Q Iteration – First Experiences with a Data Efficient Neural Reinforcement Learning Method. In: Gama, J., Camacho, R., Brazdil, P.B., Jorge, A.M., Torgo, L. (eds) Machine Learning: ECML 2005. ECML 2005. Lecture Notes in Computer Science(), vol 3720. Springer, Berlin, Heidelberg. 
https://doi.org/10.1007/11564096_32 """ Base.@kwdef struct NFQ{A, R} <: AbstractLearner + action_space::AbstractVector approximator::A num_iterations::Integer = 20 epochs::Integer = 100 @@ -30,6 +31,7 @@ Base.@kwdef struct NFQ{A, R} <: AbstractLearner end function NFQ(; + action_space::AbstractVector, approximator::A, num_iterations::Integer = 20, epochs::Integer = 1000, @@ -37,7 +39,7 @@ function NFQ(; rng=Random.GLOBAL_RNG, γ::Float32 = 0.9f0, ) where {A} - NFQ(approximator, num_iterations, epochs, loss_function, rng, γ) + NFQ(action_space, approximator, num_iterations, epochs, loss_function, rng, γ) end # Copied from BasicDQN but sure whether it's appropriate @@ -58,19 +60,11 @@ end # Avoid optimisation in the middle of an episode function RLBase.optimise!(::NFQ, ::NamedTuple) end -# Instead do optimisation at the end of an episode -function Base.push!(agent::Agent{<:QBasedPolicy{<:NFQ}}, ::PostEpisodeStage, env::AbstractEnv) - for batch in agent.trajectory - _optimise!(agent.policy.learner, batch, env) - end -end - -function _optimise!(learner::NFQ, batch::NamedTuple, env::AbstractEnv) +function RLBase.optimise!(learner::NFQ, ::PostEpisodeStage, batch::NamedTuple) Q = learner.approximator γ = learner.γ loss_func = learner.loss_function - - as = action_space(env) + as = learner.action_space las = length(as) From 0edd287181f7abe3b56ffef578166e59ed158373 Mon Sep 17 00:00:00 2001 From: Henri Dehaybe <47037088+HenriDeh@users.noreply.github.com> Date: Thu, 15 Jun 2023 10:09:12 +0200 Subject: [PATCH 08/24] rng and loss type --- src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl index a6fdd9c23..6ecc09b6f 100644 --- a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl +++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl @@ -20,13 +20,13 @@ Neural Fitted Q-iteration as implemented in [1] # References [1] Riedmiller, M. (2005). Neural Fitted Q Iteration – First Experiences with a Data Efficient Neural Reinforcement Learning Method. In: Gama, J., Camacho, R., Brazdil, P.B., Jorge, A.M., Torgo, L. (eds) Machine Learning: ECML 2005. ECML 2005. Lecture Notes in Computer Science(), vol 3720. Springer, Berlin, Heidelberg. 
https://doi.org/10.1007/11564096_32 """ -Base.@kwdef struct NFQ{A, R} <: AbstractLearner +Base.@kwdef struct NFQ{A, R, F} <: AbstractLearner action_space::AbstractVector approximator::A num_iterations::Integer = 20 epochs::Integer = 100 - loss_function::Any = mse - rng::R = Random.GLOBAL_RNG + loss_function::F = mse + rng::R = Random.default_rng() γ::Float32 = 0.9f0 end @@ -36,7 +36,7 @@ function NFQ(; num_iterations::Integer = 20, epochs::Integer = 1000, loss_function::Any = mse, - rng=Random.GLOBAL_RNG, + rng=Random.default_rng(), γ::Float32 = 0.9f0, ) where {A} NFQ(action_space, approximator, num_iterations, epochs, loss_function, rng, γ) From 8461c159bb15769f9c2eba2246a37fea14a42768 Mon Sep 17 00:00:00 2001 From: Henri Dehaybe <47037088+HenriDeh@users.noreply.github.com> Date: Thu, 15 Jun 2023 10:10:24 +0200 Subject: [PATCH 09/24] remove duplicate --- .../deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl b/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl index eb9e8395b..d2ed6e724 100644 --- a/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl +++ b/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl @@ -7,7 +7,7 @@ # --- #+ tangle=true -using ReinforcementLearningCore, ReinforcementLearningBase, ReinforcementLearningZoo, ReinforcementLearningZoo +using ReinforcementLearningCore, ReinforcementLearningBase, ReinforcementLearningZoo using ReinforcementLearningEnvironments using Flux using Flux: glorot_uniform From b89f67e5eadff840ddffd12e80fd4aaa666629fd Mon Sep 17 00:00:00 2001 From: Henri Dehaybe <47037088+HenriDeh@users.noreply.github.com> Date: Thu, 15 Jun 2023 11:10:06 +0200 Subject: [PATCH 10/24] dispatch on trajectory --- src/ReinforcementLearningCore/src/policies/q_based_policy.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ReinforcementLearningCore/src/policies/q_based_policy.jl b/src/ReinforcementLearningCore/src/policies/q_based_policy.jl index cb3376038..91d97d8ca 100644 --- a/src/ReinforcementLearningCore/src/policies/q_based_policy.jl +++ b/src/ReinforcementLearningCore/src/policies/q_based_policy.jl @@ -37,4 +37,4 @@ end RLBase.prob(p::QBasedPolicy{L,Ex}, env::AbstractEnv) where {L<:AbstractLearner,Ex<:AbstractExplorer} = prob(p.explorer, forward(p.learner, env), legal_action_space_mask(env)) -RLBase.optimise!(p::QBasedPolicy{L,Ex}, stage::S, x::NamedTuple) where {L<:AbstractLearner,Ex<:AbstractExplorer, S<:AbstractStage} = optimise!(p.learner, x) +RLBase.optimise!(p::QBasedPolicy{L,Ex}, stage::S, t::Trajectory) where {L<:AbstractLearner,Ex<:AbstractExplorer, S<:AbstractStage} = optimise!(p.learner, stage, t) From 19e0a977258b06480d457cdc25eb43d18f4e2999 Mon Sep 17 00:00:00 2001 From: Henri Dehaybe <47037088+HenriDeh@users.noreply.github.com> Date: Thu, 15 Jun 2023 11:10:27 +0200 Subject: [PATCH 11/24] optimise is dummy by default --- src/ReinforcementLearningCore/src/policies/agent/base.jl | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/ReinforcementLearningCore/src/policies/agent/base.jl b/src/ReinforcementLearningCore/src/policies/agent/base.jl index dc7798dc2..798442f8b 100644 --- a/src/ReinforcementLearningCore/src/policies/agent/base.jl +++ b/src/ReinforcementLearningCore/src/policies/agent/base.jl @@ -46,11 +46,8 @@ 
RLBase.optimise!(::SyncTrajectoryStyle, agent::Agent, stage::S) where {S<:Abstra # already spawn a task to optimise inner policy when initializing the agent RLBase.optimise!(::AsyncTrajectoryStyle, agent::Agent, stage::S) where {S<:AbstractStage} = nothing -function RLBase.optimise!(policy::AbstractPolicy, stage::S, trajectory::Trajectory) where {S<:AbstractStage} - for batch in trajectory - optimise!(policy, stage, batch) - end -end +#by default, optimise does nothing at all stage +function RLBase.optimise!(policy::AbstractPolicy, stage::AbstractStage, trajectory::Trajectory) end @functor Agent (policy,) From 98444e42fafd3a9c17b6a671fc71f46766fabe40 Mon Sep 17 00:00:00 2001 From: Henri Dehaybe <47037088+HenriDeh@users.noreply.github.com> Date: Thu, 15 Jun 2023 11:11:06 +0200 Subject: [PATCH 12/24] optimise! is dispatched on traj and loops it --- .../src/algorithms/dqns/NFQ.jl | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl index 6ecc09b6f..00c620ad6 100644 --- a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl +++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl @@ -58,24 +58,24 @@ function RLCore.forward(learner::NFQ, env::AbstractEnv) end # Avoid optimisation in the middle of an episode -function RLBase.optimise!(::NFQ, ::NamedTuple) end +function RLBase.optimise!(::NFQ, ::AbstractStage, ::Trajectory) end -function RLBase.optimise!(learner::NFQ, ::PostEpisodeStage, batch::NamedTuple) +function RLBase.optimise!(learner::NFQ, ::PostEpisodeStage, trajectory::Trajectory) Q = learner.approximator γ = learner.γ loss_func = learner.loss_function as = learner.action_space las = length(as) - - - (s, a, r, ss) = batch[[:state, :action, :reward, :next_state]] - a = Float32.(a) - s, a, r, ss = map(x->send_to_device(device(Q), x), (s, a, r, ss)) - for i = 1:learner.num_iterations - # Make an input x samples x |action space| array -- Q --> samples x |action space| -- max --> samples - G = r .+ γ .* (cat(repeat(ss, inner=(1, 1, las)), reshape(repeat(as, outer=(1, size(ss, 2))), (1, size(ss, 2), las)), dims=1) |> x -> maximum(RLCore.forward(Q, x), dims=3) |> vec) - for e = 1:learner.epochs - Flux.train!((x, y) -> loss_func(RLCore.forward(Q, x), y), params(Q.model), [(vcat(s, transpose(a)), transpose(G))], Q.optimiser) + for batch in trajectory + (s, a, r, ss) = batch[[:state, :action, :reward, :next_state]] + a = Float32.(a) + s, a, r, ss = map(x->send_to_device(device(Q), x), (s, a, r, ss)) + for i = 1:learner.num_iterations + # Make an input x samples x |action space| array -- Q --> samples x |action space| -- max --> samples + G = r .+ γ .* (cat(repeat(ss, inner=(1, 1, las)), reshape(repeat(as, outer=(1, size(ss, 2))), (1, size(ss, 2), las)), dims=1) |> x -> maximum(RLCore.forward(Q, x), dims=3) |> vec) + for e = 1:learner.epochs + Flux.train!((x, y) -> loss_func(RLCore.forward(Q, x), y), params(Q.model), [(vcat(s, transpose(a)), transpose(G))], Q.optimiser) + end end end end From 6be2450a0f5d802be341d4b3accb00ac20a5e217 Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Fri, 16 Jun 2023 14:59:52 +0200 Subject: [PATCH 13/24] Fix precompilation warnings --- .../src/algorithms/dqns/NFQ.jl | 27 ++++++++----------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl index 00c620ad6..ceb686c72 100644 --- 
a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl +++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl @@ -30,28 +30,23 @@ Base.@kwdef struct NFQ{A, R, F} <: AbstractLearner γ::Float32 = 0.9f0 end -function NFQ(; - action_space::AbstractVector, - approximator::A, - num_iterations::Integer = 20, - epochs::Integer = 1000, - loss_function::Any = mse, - rng=Random.default_rng(), - γ::Float32 = 0.9f0, - ) where {A} - NFQ(action_space, approximator, num_iterations, epochs, loss_function, rng, γ) -end +# function NFQ(; +# action_space::AbstractVector, +# approximator::A, +# num_iterations::Integer = 20, +# epochs::Integer = 1000, +# loss_function::Any = mse, +# rng=Random.default_rng(), +# γ::Float32 = 0.9f0, +# ) where {A} +# NFQ(action_space, approximator, num_iterations, epochs, loss_function, rng, γ) +# end # Copied from BasicDQN but sure whether it's appropriate @functor NFQ (approximator,) RLCore.forward(L::NFQ, s::AbstractArray) = RLCore.forward(L.approximator, s) -# Flux.functor(x::NFQ) = (Q = x.approximator,), y -> begin -# x = @set x.approximator = y.Q -# x -# end - function RLCore.forward(learner::NFQ, env::AbstractEnv) as = action_space(env) return vcat(repeat(state(env), inner=(1, length(as))), transpose(as)) |> x -> send_to_device(device(learner.approximator), x) |> x->RLCore.forward(learner, x) |> send_to_host |> vec From 2ed5ffbba34915fcca9d8c3b2e26d7eb6f146784 Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Fri, 16 Jun 2023 15:00:05 +0200 Subject: [PATCH 14/24] Avoid running post episode optimise! multiple times --- .../src/algorithms/dqns/NFQ.jl | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl index ceb686c72..fb5ed80b1 100644 --- a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl +++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl @@ -61,16 +61,20 @@ function RLBase.optimise!(learner::NFQ, ::PostEpisodeStage, trajectory::Trajecto loss_func = learner.loss_function as = learner.action_space las = length(as) - for batch in trajectory - (s, a, r, ss) = batch[[:state, :action, :reward, :next_state]] - a = Float32.(a) - s, a, r, ss = map(x->send_to_device(device(Q), x), (s, a, r, ss)) - for i = 1:learner.num_iterations - # Make an input x samples x |action space| array -- Q --> samples x |action space| -- max --> samples - G = r .+ γ .* (cat(repeat(ss, inner=(1, 1, las)), reshape(repeat(as, outer=(1, size(ss, 2))), (1, size(ss, 2), las)), dims=1) |> x -> maximum(RLCore.forward(Q, x), dims=3) |> vec) - for e = 1:learner.epochs - Flux.train!((x, y) -> loss_func(RLCore.forward(Q, x), y), params(Q.model), [(vcat(s, transpose(a)), transpose(G))], Q.optimiser) - end + batch = nothing + for b in trajectory + batch = b + end + batch === nothing && return + + (s, a, r, ss) = batch[[:state, :action, :reward, :next_state]] + a = Float32.(a) + s, a, r, ss = map(x->send_to_device(device(Q), x), (s, a, r, ss)) + for i = 1:learner.num_iterations + # Make an input x samples x |action space| array -- Q --> samples x |action space| -- max --> samples + G = r .+ γ .* (cat(repeat(ss, inner=(1, 1, las)), reshape(repeat(as, outer=(1, size(ss, 2))), (1, size(ss, 2), las)), dims=1) |> x -> maximum(RLCore.forward(Q, x), dims=3) |> vec) + for e = 1:learner.epochs + Flux.train!((x, y) -> loss_func(RLCore.forward(Q, x), y), params(Q.model), [(vcat(s, a), transpose(G))], Q.optimiser) end end end From 
da384b206929d56ad6163047f502954d6364055a Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Fri, 16 Jun 2023 15:27:25 +0200 Subject: [PATCH 15/24] Tune experiment --- .../experiments/DQN/JuliaRL_NFQ_CartPole.jl | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl b/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl index d2ed6e724..e313004d6 100644 --- a/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl +++ b/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl @@ -1,7 +1,7 @@ # --- -# title: JuliaRL\_NFQ\_PendulumDiscrete +# title: JuliaRL\_NFQ\_CartPole # cover: assets/JuliaRL_BasicDQN_CartPole.png -# description: NFQ applied to discrete Pendulum +# description: NFQ applied to the cartpole environment # date: 2023-06 # author: "[Lucas Bex](https://github.com/CasBex)" # --- @@ -13,7 +13,7 @@ using Flux using Flux: glorot_uniform using StableRNGs: StableRNG -using Flux.Losses: huber_loss +using Flux.Losses: mse function RLCore.Experiment( ::Val{:JuliaRL}, @@ -31,30 +31,32 @@ function RLCore.Experiment( action_space=action_space(env), approximator=Approximator( model=Chain( - Dense(ns+na, 64, relu; init=glorot_uniform(rng)), - Dense(64, 64, relu; init=glorot_uniform(rng)), - Dense(64, 1; init=glorot_uniform(rng)), + Dense(ns+na, 5, σ; init=glorot_uniform(rng)), + Dense(5, 5, σ; init=glorot_uniform(rng)), + Dense(5, 1; init=glorot_uniform(rng)), ) |> gpu, optimiser=RMSProp() ), - loss_function=huber_loss, - epochs=500, - num_iterations=10 + loss_function=mse, + epochs=100, + num_iterations=10, + γ = 0.95f0 ), explorer=EpsilonGreedyExplorer( kind=:exp, - ϵ_stable=0.01, - decay_steps=500, + ϵ_stable=0.001, + warmup_steps=500, rng=rng, ), ), trajectory=Trajectory( container=CircularArraySARTTraces( - capacity=1000, + capacity=10_000, state=Float32 => (ns,), + action=Float32 => (na,), ), sampler=BatchSampler{SS′ART}( - batch_size=1000, + batch_size=10_000, rng=rng ), controller=InsertSampleRatioController( @@ -71,7 +73,7 @@ function RLCore.Experiment( #+ tangle=false using Plots -pyplot() # hide +# pyplot() # hide ex = E`JuliaRL_NFQ_CartPole` run(ex) plot(ex.hook.rewards) From 5ab7a1c850e5755a7969d5448d008e144ccf8f32 Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Mon, 19 Jun 2023 09:36:48 +0200 Subject: [PATCH 16/24] Remove commented code --- .../src/algorithms/dqns/NFQ.jl | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl index fb5ed80b1..a132ec7d1 100644 --- a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl +++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl @@ -30,19 +30,6 @@ Base.@kwdef struct NFQ{A, R, F} <: AbstractLearner γ::Float32 = 0.9f0 end -# function NFQ(; -# action_space::AbstractVector, -# approximator::A, -# num_iterations::Integer = 20, -# epochs::Integer = 1000, -# loss_function::Any = mse, -# rng=Random.default_rng(), -# γ::Float32 = 0.9f0, -# ) where {A} -# NFQ(action_space, approximator, num_iterations, epochs, loss_function, rng, γ) -# end - -# Copied from BasicDQN but sure whether it's appropriate @functor NFQ (approximator,) RLCore.forward(L::NFQ, s::AbstractArray) = RLCore.forward(L.approximator, s) From afc21b6522b393cd5519967305c0e689d99c22ce Mon Sep 17 00:00:00 
2001 From: CasBex <123587431+CasBex@users.noreply.github.com> Date: Mon, 19 Jun 2023 09:39:06 +0200 Subject: [PATCH 17/24] Drop gpu call Co-authored-by: Henri Dehaybe <47037088+HenriDeh@users.noreply.github.com> --- .../deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl b/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl index e313004d6..07a48ecef 100644 --- a/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl +++ b/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl @@ -34,7 +34,7 @@ function RLCore.Experiment( Dense(ns+na, 5, σ; init=glorot_uniform(rng)), Dense(5, 5, σ; init=glorot_uniform(rng)), Dense(5, 1; init=glorot_uniform(rng)), - ) |> gpu, + ), optimiser=RMSProp() ), loss_function=mse, From 033dcdf3a59878d2438bb168a8136dd3e3885efe Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Mon, 19 Jun 2023 10:59:37 +0200 Subject: [PATCH 18/24] Use `sample` to get batch from trajectory --- src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl index a132ec7d1..f4de8b127 100644 --- a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl +++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl @@ -48,11 +48,7 @@ function RLBase.optimise!(learner::NFQ, ::PostEpisodeStage, trajectory::Trajecto loss_func = learner.loss_function as = learner.action_space las = length(as) - batch = nothing - for b in trajectory - batch = b - end - batch === nothing && return + batch = ReinforcementLearningTrajectories.sample(trajectory) (s, a, r, ss) = batch[[:state, :action, :reward, :next_state]] a = Float32.(a) From 31b55b5f85dcb98bd5c757bc2454b1ccb60b6bf6 Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Mon, 19 Jun 2023 13:38:15 +0200 Subject: [PATCH 19/24] optimise! 
for AbstractLearner --- src/ReinforcementLearningCore/src/policies/learners.jl | 2 ++ src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/ReinforcementLearningCore/src/policies/learners.jl b/src/ReinforcementLearningCore/src/policies/learners.jl index 0a0197a4d..13f505e24 100644 --- a/src/ReinforcementLearningCore/src/policies/learners.jl +++ b/src/ReinforcementLearningCore/src/policies/learners.jl @@ -10,6 +10,8 @@ Base.show(io::IO, m::MIME"text/plain", L::AbstractLearner) = show(io, m, convert # Take Learner and Environment, get state, send to RLCore.forward(Learner, State) forward(L::Le, env::E) where {Le <: AbstractLearner, E <: AbstractEnv} = env |> state |> send_to_device(L.approximator) |> x -> forward(L, x) |> send_to_device(env) +function RLBase.optimise!(::AbstractLearner, ::AbstractStage, ::Trajectory) end + Base.@kwdef mutable struct Approximator{M,O} model::M optimiser::O diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl index f4de8b127..8dbbaa339 100644 --- a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl +++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl @@ -39,9 +39,6 @@ function RLCore.forward(learner::NFQ, env::AbstractEnv) return vcat(repeat(state(env), inner=(1, length(as))), transpose(as)) |> x -> send_to_device(device(learner.approximator), x) |> x->RLCore.forward(learner, x) |> send_to_host |> vec end -# Avoid optimisation in the middle of an episode -function RLBase.optimise!(::NFQ, ::AbstractStage, ::Trajectory) end - function RLBase.optimise!(learner::NFQ, ::PostEpisodeStage, trajectory::Trajectory) Q = learner.approximator γ = learner.γ From b53c96b4cefd347e07943e793ce6b9a16494bcbb Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Mon, 19 Jun 2023 14:07:06 +0200 Subject: [PATCH 20/24] NFQ optimise! 
calls at the correct time --- src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl index 8dbbaa339..16990fc84 100644 --- a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl +++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl @@ -39,6 +39,12 @@ function RLCore.forward(learner::NFQ, env::AbstractEnv) return vcat(repeat(state(env), inner=(1, length(as))), transpose(as)) |> x -> send_to_device(device(learner.approximator), x) |> x->RLCore.forward(learner, x) |> send_to_host |> vec end +function RLBase.optimise!(::NFQ, ::NamedTuple) end + +function RLBase.optimise!(p::QBasedPolicy{L,Ex}, s::PostEpisodeStage, trajectory::Trajectory) where {L<:NFQ,Ex<:AbstractExplorer} + RLBase.optimise!(p.learner, s, trajectory) +end + function RLBase.optimise!(learner::NFQ, ::PostEpisodeStage, trajectory::Trajectory) Q = learner.approximator γ = learner.γ From f77e198efae830091373893e926ba4afec56ee76 Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Fri, 23 Jun 2023 11:51:42 +0200 Subject: [PATCH 21/24] Remove superfluous function due to main merge --- src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl index 16990fc84..8dbbaa339 100644 --- a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl +++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl @@ -39,12 +39,6 @@ function RLCore.forward(learner::NFQ, env::AbstractEnv) return vcat(repeat(state(env), inner=(1, length(as))), transpose(as)) |> x -> send_to_device(device(learner.approximator), x) |> x->RLCore.forward(learner, x) |> send_to_host |> vec end -function RLBase.optimise!(::NFQ, ::NamedTuple) end - -function RLBase.optimise!(p::QBasedPolicy{L,Ex}, s::PostEpisodeStage, trajectory::Trajectory) where {L<:NFQ,Ex<:AbstractExplorer} - RLBase.optimise!(p.learner, s, trajectory) -end - function RLBase.optimise!(learner::NFQ, ::PostEpisodeStage, trajectory::Trajectory) Q = learner.approximator γ = learner.γ From 66ea89bf4c7d6af2f58a937258ebd401ce1c6aef Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Fri, 23 Jun 2023 11:52:01 +0200 Subject: [PATCH 22/24] Anonymous loop variable --- src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl index 8dbbaa339..30442bdbb 100644 --- a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl +++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl @@ -53,7 +53,7 @@ function RLBase.optimise!(learner::NFQ, ::PostEpisodeStage, trajectory::Trajecto for i = 1:learner.num_iterations # Make an input x samples x |action space| array -- Q --> samples x |action space| -- max --> samples G = r .+ γ .* (cat(repeat(ss, inner=(1, 1, las)), reshape(repeat(as, outer=(1, size(ss, 2))), (1, size(ss, 2), las)), dims=1) |> x -> maximum(RLCore.forward(Q, x), dims=3) |> vec) - for e = 1:learner.epochs + for _ = 1:learner.epochs Flux.train!((x, y) -> loss_func(RLCore.forward(Q, x), y), params(Q.model), [(vcat(s, a), transpose(G))], Q.optimiser) end end From 37be2a6353a862702bfc4b8bec11b35af982b14a Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Fri, 23 Jun 2023 11:54:34 +0200 Subject: [PATCH 23/24] Update NFQ 
docs --- src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl index 30442bdbb..a14d36c5b 100644 --- a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl +++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl @@ -5,15 +5,15 @@ using Functors: @functor """ NFQ{A<:AbstractApproximator, F, R} <: AbstractLearner - NFQ(approximator::A, num_iterations::Integer epochs::Integer, loss_function::F, batch_size::Integer, rng::R, γ::Float32) where {A<:AbstractApproximator, F, R} + NFQ(action_space::AbstractVector, approximator::A, num_iterations::Integer epochs::Integer, loss_function::F, rng::R, γ::Float32) where {A, F, R} Neural Fitted Q-iteration as implemented in [1] # Keyword arguments -- `approximator::AbstractApproximator` neural network +- `action_space::AbstractVector` Action space of the environment (necessary in the optimise! step) +- `approximator::A` Q-function approximator (typically a neural network) - `num_iterations::Integer` number of value iteration iterations in FQI loop (i.e. the outer loop) -- `epochs` number of epochs to train neural network per iteration +- `epochs::Integer` number of epochs to train neural network per iteration - `loss_function::F` loss function of the NN -- `sampler::BatchSampler{SARTS}` data sampler - `rng::R` random number generator - `γ::Float32` discount rate From c43f37ab60a634960020bdbbb7c993e6bd888bc7 Mon Sep 17 00:00:00 2001 From: Henri Dehaybe <47037088+HenriDeh@users.noreply.github.com> Date: Mon, 26 Jun 2023 14:46:00 +0200 Subject: [PATCH 24/24] Update julia_words.txt --- .cspell/julia_words.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.cspell/julia_words.txt b/.cspell/julia_words.txt index 08196fd32..eb3b29b18 100644 --- a/.cspell/julia_words.txt +++ b/.cspell/julia_words.txt @@ -5294,4 +5294,5 @@ sqmahal logdpf devmode logpdfs -kldivs \ No newline at end of file +kldivs +Riedmiller
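
Note on the action-scoring input built in `RLCore.forward(learner::NFQ, env)` (patches 04 and 13 above): the state vector is tiled once per discrete action and the action value is appended as an extra input row, so a single network call returns Q(s, a) for every action. Below is a minimal standalone sketch of that construction, assuming a plain `Flux.Chain` in place of the package's `Approximator` wrapper; the state values, action values, and layer sizes are made-up examples, not taken from the patches.

```julia
using Flux

s  = Float32[0.1, -0.2, 0.03, 0.0]           # made-up CartPole-like state (ns = 4)
as = Float32[1, 2]                            # discrete action values
model = Chain(Dense(5, 5, σ), Dense(5, 5, σ), Dense(5, 1))   # ns + 1 inputs → one Q-value

x = vcat(repeat(s, inner=(1, length(as))), transpose(as))    # (ns + 1) × |A| batch; column j = [s; as[j]]
q = vec(model(x))                             # Q(s, a) for every action, length |A|
```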
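
Note on the fitted-Q update in `RLBase.optimise!(learner::NFQ, ::PostEpisodeStage, trajectory)` (patches 12, 14, and 18 above): each outer iteration rebuilds the Bellman target G = r + γ·maxₐ Q(s′, a) with the current network and then regresses Q(s, a) onto G for a fixed number of epochs. The sketch below re-expresses that logic as explicit loops, using Flux's explicit-gradient API rather than the `params`/`Flux.train!` call in the patch; `model`, `opt_state`, and the argument names are assumptions for illustration, not the package API.

```julia
using Flux

# Illustrative re-expression of the NFQ optimise! loop; not the package code.
function fitted_q_sketch!(model, opt_state, s, a, r, s′, as; γ=0.95f0, iters=10, epochs=100)
    B = size(s, 2)                                   # number of sampled transitions
    for _ in 1:iters
        # Bellman backup: max over candidate actions of Q(s′, a′)
        qmax = fill(-Inf32, B)
        for a′ in as
            q = vec(model(vcat(s′, fill(Float32(a′), 1, B))))
            qmax .= max.(qmax, q)
        end
        G = r .+ γ .* qmax                           # regression targets, length B
        x = vcat(s, reshape(Float32.(a), 1, B))      # (ns + 1) × B state-action inputs
        y = reshape(G, 1, B)
        for _ in 1:epochs                            # inner supervised fit on fixed targets
            grads = Flux.gradient(m -> Flux.Losses.mse(m(x), y), model)
            Flux.update!(opt_state, model, grads[1])
        end
    end
    return model
end
```

In this explicit style, `opt_state = Flux.setup(RMSProp(), model)` would be created once up front; the patch instead keeps the optimiser inside the `Approximator` and drives the fit with the implicit `params`-based `Flux.train!`.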