From 5e6f09dc04b409a6234fb77a244eb7e4ba4d42f3 Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Fri, 2 Jun 2023 15:18:13 +0200 Subject: [PATCH 01/24] NFQ before refactor --- .../src/algorithms/offline_rl/NFQ.jl | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 src/ReinforcementLearningZoo/src/algorithms/offline_rl/NFQ.jl diff --git a/src/ReinforcementLearningZoo/src/algorithms/offline_rl/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/offline_rl/NFQ.jl new file mode 100644 index 000000000..3336520bd --- /dev/null +++ b/src/ReinforcementLearningZoo/src/algorithms/offline_rl/NFQ.jl @@ -0,0 +1,77 @@ +""" + NFQ{A<:AbstractApproximator, F, R} <: AbstractLearner + NFQ(approximator::A, num_iterations::Integer epochs::Integer, loss_function::F, batch_size::Integer, rng::R, γ::Float32) where {A<:AbstractApproximator, F, R} +Neural Fitted Q-iteration as implemented in [1] + +# Keyword arguments +- `approximator::AbstractApproximator` neural network +- `num_iterations::Integer` number of value iteration iterations in FQI loop (i.e. the outer loop) +- `epochs` number of epochs to train neural network per iteration +- `loss_function::F` loss function of the NN +- `sampler::BatchSampler{SARTS}` data sampler +- `rng::R` random number generator +- `γ::Float32` discount rate + +# References +[1] Riedmiller, M. (2005). Neural Fitted Q Iteration – First Experiences with a Data Efficient Neural Reinforcement Learning Method. In: Gama, J., Camacho, R., Brazdil, P.B., Jorge, A.M., Torgo, L. (eds) Machine Learning: ECML 2005. ECML 2005. Lecture Notes in Computer Science(), vol 3720. Springer, Berlin, Heidelberg. https://doi.org/10.1007/11564096_32 +""" +Base.@kwdef struct NFQ{A<:AbstractApproximator, F, R} <: AbstractLearner + approximator::A + num_iterations::Integer = 20 + epochs::Integer = 100 + loss_function::F = mse + sampler::BatchSampler{SARTS} = BatchSampler(32) + rng::R = Random.GLOBAL_RNG + γ::Float32 = 0.9f0 +end + +function NFQ(; + approximator::A, + num_iterations::Integer = 20, + epochs::Integer = 1000, + loss_function::F = mse, + batch_size::Integer=32, + rng=Random.GLOBAL_RNG, + γ::Float32 = 0.9f0, + ) where {A<:AbstractApproximator, F} + NFQ(approximator, num_iterations, epochs, loss_function, BatchSampler{SARTS}(batch_size), rng, γ) +end + +# Copied from BasicDQN but sure whether it's appropriate +Flux.functor(x::NFQ) = (Q = x.approximator,), y -> begin + x = @set x.approximator = y.Q + x +end + +function (learner::NFQ)(env) + as = action_space(env) + return vcat(repeat(state(env), inner=(1, length(as))), transpose(as)) |> x -> send_to_device(device(learner.approximator), x) |> learner.approximator |> send_to_host |> vec +end + +function RLBase.update!(learner::NFQ, traj::AbstractTrajectory, ::AbstractEnv, ::PreExperimentStage) end +function RLBase.update!(learner::NFQ, traj::AbstractTrajectory, ::AbstractEnv, ::PreActStage) end +function RLBase.update!(learner::NFQ, traj::AbstractTrajectory, ::AbstractEnv, ::PreEpisodeStage) end +function RLBase.update!(learner::NFQ, traj::AbstractTrajectory, env::AbstractEnv, ::PostEpisodeStage) + isempty(traj) && return + inds, batch = sample(learner.rng, traj, learner.sampler) + update!(learner, batch, env) +end + +function RLBase.update!(learner::NFQ, batch::NamedTuple{SARTS}, env::AbstractEnv) + Q = learner.approximator + γ = learner.γ + loss_func = learner.loss_function + as = action_space(env) + las = length(as) + + (s, a, r, ss) = batch[[:state, :action, :reward, :next_state]] + a = Float32.(a) + s, a, r, ss = 
map(x->send_to_device(device(Q), x), (s, a, r, ss)) + for i = 1:learner.num_iterations + # Make an input x samples x |action space| array -- Q --> samples x |action space| -- max --> samples + G = r .+ γ .* (cat(repeat(ss, inner=(1, 1, las)), reshape(repeat(as, outer=(1, size(ss, 2))), (1, size(ss, 2), las)), dims=1) |> Q |> x -> maximum(x, dims=3) |> vec) + for e = 1:learner.epochs + Flux.train!((x, y) -> loss_func(Q(x), y), params(Q.model), [(vcat(s, transpose(a)), transpose(G))], Q.optimizer) + end + end +end From c42304894a70fa07644022dc0efad1a11712a5b9 Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Fri, 2 Jun 2023 15:25:38 +0200 Subject: [PATCH 02/24] NFQ after refactor --- .../src/algorithms/offline_rl/NFQ.jl | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/src/ReinforcementLearningZoo/src/algorithms/offline_rl/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/offline_rl/NFQ.jl index 3336520bd..145f611e9 100644 --- a/src/ReinforcementLearningZoo/src/algorithms/offline_rl/NFQ.jl +++ b/src/ReinforcementLearningZoo/src/algorithms/offline_rl/NFQ.jl @@ -15,12 +15,11 @@ Neural Fitted Q-iteration as implemented in [1] # References [1] Riedmiller, M. (2005). Neural Fitted Q Iteration – First Experiences with a Data Efficient Neural Reinforcement Learning Method. In: Gama, J., Camacho, R., Brazdil, P.B., Jorge, A.M., Torgo, L. (eds) Machine Learning: ECML 2005. ECML 2005. Lecture Notes in Computer Science(), vol 3720. Springer, Berlin, Heidelberg. https://doi.org/10.1007/11564096_32 """ -Base.@kwdef struct NFQ{A<:AbstractApproximator, F, R} <: AbstractLearner +Base.@kwdef struct NFQ{A<:NeuralNetworkApproximator, F, R} <: AbstractLearner approximator::A num_iterations::Integer = 20 epochs::Integer = 100 loss_function::F = mse - sampler::BatchSampler{SARTS} = BatchSampler(32) rng::R = Random.GLOBAL_RNG γ::Float32 = 0.9f0 end @@ -30,11 +29,10 @@ function NFQ(; num_iterations::Integer = 20, epochs::Integer = 1000, loss_function::F = mse, - batch_size::Integer=32, rng=Random.GLOBAL_RNG, γ::Float32 = 0.9f0, - ) where {A<:AbstractApproximator, F} - NFQ(approximator, num_iterations, epochs, loss_function, BatchSampler{SARTS}(batch_size), rng, γ) + ) where {A<:NeuralNetworkApproximator, F} + NFQ(approximator, num_iterations, epochs, loss_function, rng, γ) end # Copied from BasicDQN but sure whether it's appropriate @@ -43,27 +41,30 @@ Flux.functor(x::NFQ) = (Q = x.approximator,), y -> begin x end -function (learner::NFQ)(env) +function RLBase.plan!(learner::NFQ, env::AbstractEnv) as = action_space(env) return vcat(repeat(state(env), inner=(1, length(as))), transpose(as)) |> x -> send_to_device(device(learner.approximator), x) |> learner.approximator |> send_to_host |> vec end -function RLBase.update!(learner::NFQ, traj::AbstractTrajectory, ::AbstractEnv, ::PreExperimentStage) end -function RLBase.update!(learner::NFQ, traj::AbstractTrajectory, ::AbstractEnv, ::PreActStage) end -function RLBase.update!(learner::NFQ, traj::AbstractTrajectory, ::AbstractEnv, ::PreEpisodeStage) end -function RLBase.update!(learner::NFQ, traj::AbstractTrajectory, env::AbstractEnv, ::PostEpisodeStage) - isempty(traj) && return - inds, batch = sample(learner.rng, traj, learner.sampler) - update!(learner, batch, env) +# Avoid optimisation in the middle of an episode +function RLBase.optimise!(::NFQ, ::NamedTuple) end + +# Instead do optimisation at the end of an episode +function Base.push!(agent::Agent{<:QBasedPolicy{<:NFQ}}, ::PostEpisodeStage, env::AbstractEnv) + for batch in 
agent.trajectory + _optimise!(agent.policy.learner, batch, env) + end end -function RLBase.update!(learner::NFQ, batch::NamedTuple{SARTS}, env::AbstractEnv) +function _optimise!(learner::NFQ, batch::NamedTuple, env::AbstractEnv) Q = learner.approximator γ = learner.γ loss_func = learner.loss_function + as = action_space(env) las = length(as) + (s, a, r, ss) = batch[[:state, :action, :reward, :next_state]] a = Float32.(a) s, a, r, ss = map(x->send_to_device(device(Q), x), (s, a, r, ss)) From c88559e7f8a44269f11e8c5e64a8f7ca7cc405a9 Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Wed, 7 Jun 2023 17:49:36 +0200 Subject: [PATCH 03/24] Move to dqns --- .../src/algorithms/{offline_rl => dqns}/NFQ.jl | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/ReinforcementLearningZoo/src/algorithms/{offline_rl => dqns}/NFQ.jl (100%) diff --git a/src/ReinforcementLearningZoo/src/algorithms/offline_rl/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl similarity index 100% rename from src/ReinforcementLearningZoo/src/algorithms/offline_rl/NFQ.jl rename to src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl From 1560ad1b8acbed34a3bfcad1f1cb12f062dad6ed Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Thu, 8 Jun 2023 09:59:02 +0200 Subject: [PATCH 04/24] Refactor --- .../src/algorithms/dqns/NFQ.jl | 33 ++++++++++++------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl index 145f611e9..176697e7e 100644 --- a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl +++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl @@ -1,3 +1,8 @@ +export NFQ + +using Flux +using Functors: @functor + """ NFQ{A<:AbstractApproximator, F, R} <: AbstractLearner NFQ(approximator::A, num_iterations::Integer epochs::Integer, loss_function::F, batch_size::Integer, rng::R, γ::Float32) where {A<:AbstractApproximator, F, R} @@ -15,11 +20,11 @@ Neural Fitted Q-iteration as implemented in [1] # References [1] Riedmiller, M. (2005). Neural Fitted Q Iteration – First Experiences with a Data Efficient Neural Reinforcement Learning Method. In: Gama, J., Camacho, R., Brazdil, P.B., Jorge, A.M., Torgo, L. (eds) Machine Learning: ECML 2005. ECML 2005. Lecture Notes in Computer Science(), vol 3720. Springer, Berlin, Heidelberg. 
https://doi.org/10.1007/11564096_32 """ -Base.@kwdef struct NFQ{A<:NeuralNetworkApproximator, F, R} <: AbstractLearner +Base.@kwdef struct NFQ{A, R} <: AbstractLearner approximator::A num_iterations::Integer = 20 epochs::Integer = 100 - loss_function::F = mse + loss_function::Any = mse rng::R = Random.GLOBAL_RNG γ::Float32 = 0.9f0 end @@ -28,22 +33,26 @@ function NFQ(; approximator::A, num_iterations::Integer = 20, epochs::Integer = 1000, - loss_function::F = mse, + loss_function::Any = mse, rng=Random.GLOBAL_RNG, γ::Float32 = 0.9f0, - ) where {A<:NeuralNetworkApproximator, F} + ) where {A} NFQ(approximator, num_iterations, epochs, loss_function, rng, γ) end # Copied from BasicDQN but sure whether it's appropriate -Flux.functor(x::NFQ) = (Q = x.approximator,), y -> begin - x = @set x.approximator = y.Q - x -end +@functor NFQ (approximator,) + +RLCore.forward(L::NFQ, s::AbstractArray) = RLCore.forward(L.approximator, s) + +# Flux.functor(x::NFQ) = (Q = x.approximator,), y -> begin +# x = @set x.approximator = y.Q +# x +# end -function RLBase.plan!(learner::NFQ, env::AbstractEnv) +function RLCore.forward(learner::NFQ, env::AbstractEnv) as = action_space(env) - return vcat(repeat(state(env), inner=(1, length(as))), transpose(as)) |> x -> send_to_device(device(learner.approximator), x) |> learner.approximator |> send_to_host |> vec + return vcat(repeat(state(env), inner=(1, length(as))), transpose(as)) |> x -> send_to_device(device(learner.approximator), x) |> x->RLCore.forward(learner, x) |> send_to_host |> vec end # Avoid optimisation in the middle of an episode @@ -70,9 +79,9 @@ function _optimise!(learner::NFQ, batch::NamedTuple, env::AbstractEnv) s, a, r, ss = map(x->send_to_device(device(Q), x), (s, a, r, ss)) for i = 1:learner.num_iterations # Make an input x samples x |action space| array -- Q --> samples x |action space| -- max --> samples - G = r .+ γ .* (cat(repeat(ss, inner=(1, 1, las)), reshape(repeat(as, outer=(1, size(ss, 2))), (1, size(ss, 2), las)), dims=1) |> Q |> x -> maximum(x, dims=3) |> vec) + G = r .+ γ .* (cat(repeat(ss, inner=(1, 1, las)), reshape(repeat(as, outer=(1, size(ss, 2))), (1, size(ss, 2), las)), dims=1) |> x -> maximum(RLCore.forward(Q, x), dims=3) |> vec) for e = 1:learner.epochs - Flux.train!((x, y) -> loss_func(Q(x), y), params(Q.model), [(vcat(s, transpose(a)), transpose(G))], Q.optimizer) + Flux.train!((x, y) -> loss_func(RLCore.forward(Q, x), y), params(Q.model), [(vcat(s, transpose(a)), transpose(G))], Q.optimiser) end end end From 02ab01b111bd036f6c78b7d0e75d3f25defe3627 Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Wed, 7 Jun 2023 19:23:57 +0200 Subject: [PATCH 05/24] Add NFQ to RLZoo --- src/ReinforcementLearningZoo/src/algorithms/dqns/dqns.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/dqns.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/dqns.jl index b20517abb..49f88f54a 100644 --- a/src/ReinforcementLearningZoo/src/algorithms/dqns/dqns.jl +++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/dqns.jl @@ -1,8 +1,9 @@ include("basic_dqn.jl") +include("NFQ.jl") include("dqn.jl") include("prioritized_dqn.jl") include("qr_dqn.jl") include("rem_dqn.jl") include("iqn.jl") include("rainbow.jl") -# include("common.jl") \ No newline at end of file +# include("common.jl") From 093f1f4a642412be2db8902b4a0f1ecd405e2f6e Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Wed, 7 Jun 2023 19:23:07 +0200 Subject: [PATCH 06/24] Set up experiment --- .../experiments/DQN/JuliaRL_NFQ_CartPole.jl 
| 92 +++++++++++++++++++ .../experiments/experiments/DQN/config.json | 1 + .../src/ReinforcementLearningExperiments.jl | 1 + .../test/runtests.jl | 1 + 4 files changed, 95 insertions(+) create mode 100644 src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl diff --git a/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl b/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl new file mode 100644 index 000000000..a53576080 --- /dev/null +++ b/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl @@ -0,0 +1,92 @@ +# --- +# title: JuliaRL\_NFQ\_PendulumDiscrete +# cover: assets/JuliaRL_BasicDQN_CartPole.png +# description: NFQ applied to discrete Pendulum +# date: 2023-06 +# author: "[Lucas Bex](https://github.com/CasBex)" +# --- + +#+ tangle=true +using ReinforcementLearningCore, ReinforcementLearningBase, ReinforcementLearningZoo, ReinforcementLearningZoo +using ReinforcementLearningEnvironments +using Flux +using Flux: glorot_uniform + +using StableRNGs: StableRNG +using Flux.Losses: huber_loss + +function RLCore.Experiment( + ::Val{:JuliaRL}, + ::Val{:NFQ}, + ::Val{:CartPole}, + seed = 123, +) + rng = StableRNG(seed) + env = CartPoleEnv(; T=Float32, rng=rng) + ns, na = length(state(env)), length(first(action_space(env))) + + agent = Agent( + policy=QBasedPolicy( + learner=NFQ( + approximator=Approximator( + model=Chain( + Dense(ns, 128, relu; init=glorot_uniform(rng)), + Dense(128, 128, relu; init=glorot_uniform(rng)), + Dense(128, na; init=glorot_uniform(rng)), + ) |> gpu, + optimiser=RMSProp() + ), + loss_func=huber_loss, + epochs=500, + num_iterations=10 + ), + explorer=EpsilonGreedyExplorer( + kind=:exp, + ϵ_stable=0.01, + decay_steps=500, + rng=rng, + ), + ), + trajectory=Trajectory( + container=CircularArraySARTTraces( + capacity=1000, + state=Float32 => (ns,), + ), + sampler=BatchSampler{SS′ART}( + batch_size=1000, + rng=rng + ), + controller=InsertSampleRatioController( + threshold=100, + n_inserted=-1 + ) + ) + ) + + stop_condition = StopAfterStep(10_000, is_show_progress=!haskey(ENV, "CI")) + hook = TotalRewardPerEpisode() + Experiment(agent, env, stop_condition, hook) + end + +#+ tangle=false +using Plots +pyplot() # hide +ex = E`JuliaRL_NFQ_CartPole` +run(ex) +plot(ex.hook.rewards) +savefig("assets/JuliaRL_NFQ_CartPole.png") #hide + +#= +## Watch a demo episode with the trained agent + +```julia +demo = Experiment(ex.policy, + CartPoleEnv(), + StopWhenDone(), + RolloutHook(plot, closeall), + "DQN <-> Demo") +run(demo) +``` +=# + +# ![](assets/JuliaRL_NFQ_CartPole.png) diff --git a/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/config.json b/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/config.json index e6568ee09..65db0d95d 100644 --- a/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/config.json +++ b/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/config.json @@ -1,6 +1,7 @@ { "description": "DQN related experiments.", "order": [ + "JuliaRL_NFQ_CartPole.jl", "JuliaRL_BasicDQN_CartPole.jl", "JuliaRL_BasicDQN_MountainCar.jl", "JuliaRL_BasicDQN_PendulumDiscrete.jl", diff --git a/src/ReinforcementLearningExperiments/src/ReinforcementLearningExperiments.jl b/src/ReinforcementLearningExperiments/src/ReinforcementLearningExperiments.jl index a280e7746..ca27a1031 100644 --- 
a/src/ReinforcementLearningExperiments/src/ReinforcementLearningExperiments.jl +++ b/src/ReinforcementLearningExperiments/src/ReinforcementLearningExperiments.jl @@ -8,6 +8,7 @@ const EXPERIMENTS_DIR = joinpath(@__DIR__, "experiments") # for f in readdir(EXPERIMENTS_DIR) # include(joinpath(EXPERIMENTS_DIR, f)) # end +include(joinpath(EXPERIMENTS_DIR, "JuliaRL_NFQ_CartPole.jl")) include(joinpath(EXPERIMENTS_DIR, "JuliaRL_BasicDQN_CartPole.jl")) include(joinpath(EXPERIMENTS_DIR, "JuliaRL_DQN_CartPole.jl")) include(joinpath(EXPERIMENTS_DIR, "JuliaRL_PrioritizedDQN_CartPole.jl")) diff --git a/src/ReinforcementLearningExperiments/test/runtests.jl b/src/ReinforcementLearningExperiments/test/runtests.jl index 723a5dbd3..1086313e0 100644 --- a/src/ReinforcementLearningExperiments/test/runtests.jl +++ b/src/ReinforcementLearningExperiments/test/runtests.jl @@ -3,6 +3,7 @@ using CUDA CUDA.allowscalar(false) +run(E`JuliaRL_NFQ_CartPole`) run(E`JuliaRL_BasicDQN_CartPole`) run(E`JuliaRL_DQN_CartPole`) run(E`JuliaRL_PrioritizedDQN_CartPole`) From c1e49da5eac7b5cfef9f97cb6485d119d0881961 Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Tue, 13 Jun 2023 10:18:24 +0200 Subject: [PATCH 07/24] Update algorithm for refactor --- .../experiments/DQN/JuliaRL_NFQ_CartPole.jl | 9 +++++---- .../src/algorithms/dqns/NFQ.jl | 16 +++++----------- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl b/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl index a53576080..eb9e8395b 100644 --- a/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl +++ b/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl @@ -28,15 +28,16 @@ function RLCore.Experiment( agent = Agent( policy=QBasedPolicy( learner=NFQ( + action_space=action_space(env), approximator=Approximator( model=Chain( - Dense(ns, 128, relu; init=glorot_uniform(rng)), - Dense(128, 128, relu; init=glorot_uniform(rng)), - Dense(128, na; init=glorot_uniform(rng)), + Dense(ns+na, 64, relu; init=glorot_uniform(rng)), + Dense(64, 64, relu; init=glorot_uniform(rng)), + Dense(64, 1; init=glorot_uniform(rng)), ) |> gpu, optimiser=RMSProp() ), - loss_func=huber_loss, + loss_function=huber_loss, epochs=500, num_iterations=10 ), diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl index 176697e7e..a6fdd9c23 100644 --- a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl +++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl @@ -21,6 +21,7 @@ Neural Fitted Q-iteration as implemented in [1] [1] Riedmiller, M. (2005). Neural Fitted Q Iteration – First Experiences with a Data Efficient Neural Reinforcement Learning Method. In: Gama, J., Camacho, R., Brazdil, P.B., Jorge, A.M., Torgo, L. (eds) Machine Learning: ECML 2005. ECML 2005. Lecture Notes in Computer Science(), vol 3720. Springer, Berlin, Heidelberg. 
https://doi.org/10.1007/11564096_32 """ Base.@kwdef struct NFQ{A, R} <: AbstractLearner + action_space::AbstractVector approximator::A num_iterations::Integer = 20 epochs::Integer = 100 @@ -30,6 +31,7 @@ Base.@kwdef struct NFQ{A, R} <: AbstractLearner end function NFQ(; + action_space::AbstractVector, approximator::A, num_iterations::Integer = 20, epochs::Integer = 1000, @@ -37,7 +39,7 @@ function NFQ(; rng=Random.GLOBAL_RNG, γ::Float32 = 0.9f0, ) where {A} - NFQ(approximator, num_iterations, epochs, loss_function, rng, γ) + NFQ(action_space, approximator, num_iterations, epochs, loss_function, rng, γ) end # Copied from BasicDQN but sure whether it's appropriate @@ -58,19 +60,11 @@ end # Avoid optimisation in the middle of an episode function RLBase.optimise!(::NFQ, ::NamedTuple) end -# Instead do optimisation at the end of an episode -function Base.push!(agent::Agent{<:QBasedPolicy{<:NFQ}}, ::PostEpisodeStage, env::AbstractEnv) - for batch in agent.trajectory - _optimise!(agent.policy.learner, batch, env) - end -end - -function _optimise!(learner::NFQ, batch::NamedTuple, env::AbstractEnv) +function RLBase.optimise!(learner::NFQ, ::PostEpisodeStage, batch::NamedTuple) Q = learner.approximator γ = learner.γ loss_func = learner.loss_function - - as = action_space(env) + as = learner.action_space las = length(as) From 0edd287181f7abe3b56ffef578166e59ed158373 Mon Sep 17 00:00:00 2001 From: Henri Dehaybe <47037088+HenriDeh@users.noreply.github.com> Date: Thu, 15 Jun 2023 10:09:12 +0200 Subject: [PATCH 08/24] rng and loss type --- src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl index a6fdd9c23..6ecc09b6f 100644 --- a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl +++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl @@ -20,13 +20,13 @@ Neural Fitted Q-iteration as implemented in [1] # References [1] Riedmiller, M. (2005). Neural Fitted Q Iteration – First Experiences with a Data Efficient Neural Reinforcement Learning Method. In: Gama, J., Camacho, R., Brazdil, P.B., Jorge, A.M., Torgo, L. (eds) Machine Learning: ECML 2005. ECML 2005. Lecture Notes in Computer Science(), vol 3720. Springer, Berlin, Heidelberg. 
https://doi.org/10.1007/11564096_32 """ -Base.@kwdef struct NFQ{A, R} <: AbstractLearner +Base.@kwdef struct NFQ{A, R, F} <: AbstractLearner action_space::AbstractVector approximator::A num_iterations::Integer = 20 epochs::Integer = 100 - loss_function::Any = mse - rng::R = Random.GLOBAL_RNG + loss_function::F = mse + rng::R = Random.default_rng() γ::Float32 = 0.9f0 end @@ -36,7 +36,7 @@ function NFQ(; num_iterations::Integer = 20, epochs::Integer = 1000, loss_function::Any = mse, - rng=Random.GLOBAL_RNG, + rng=Random.default_rng(), γ::Float32 = 0.9f0, ) where {A} NFQ(action_space, approximator, num_iterations, epochs, loss_function, rng, γ) From 8461c159bb15769f9c2eba2246a37fea14a42768 Mon Sep 17 00:00:00 2001 From: Henri Dehaybe <47037088+HenriDeh@users.noreply.github.com> Date: Thu, 15 Jun 2023 10:10:24 +0200 Subject: [PATCH 09/24] remove duplicate --- .../deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl b/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl index eb9e8395b..d2ed6e724 100644 --- a/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl +++ b/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl @@ -7,7 +7,7 @@ # --- #+ tangle=true -using ReinforcementLearningCore, ReinforcementLearningBase, ReinforcementLearningZoo, ReinforcementLearningZoo +using ReinforcementLearningCore, ReinforcementLearningBase, ReinforcementLearningZoo using ReinforcementLearningEnvironments using Flux using Flux: glorot_uniform From b89f67e5eadff840ddffd12e80fd4aaa666629fd Mon Sep 17 00:00:00 2001 From: Henri Dehaybe <47037088+HenriDeh@users.noreply.github.com> Date: Thu, 15 Jun 2023 11:10:06 +0200 Subject: [PATCH 10/24] dispatch on trajectory --- src/ReinforcementLearningCore/src/policies/q_based_policy.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ReinforcementLearningCore/src/policies/q_based_policy.jl b/src/ReinforcementLearningCore/src/policies/q_based_policy.jl index cb3376038..91d97d8ca 100644 --- a/src/ReinforcementLearningCore/src/policies/q_based_policy.jl +++ b/src/ReinforcementLearningCore/src/policies/q_based_policy.jl @@ -37,4 +37,4 @@ end RLBase.prob(p::QBasedPolicy{L,Ex}, env::AbstractEnv) where {L<:AbstractLearner,Ex<:AbstractExplorer} = prob(p.explorer, forward(p.learner, env), legal_action_space_mask(env)) -RLBase.optimise!(p::QBasedPolicy{L,Ex}, stage::S, x::NamedTuple) where {L<:AbstractLearner,Ex<:AbstractExplorer, S<:AbstractStage} = optimise!(p.learner, x) +RLBase.optimise!(p::QBasedPolicy{L,Ex}, stage::S, t::Trajectory) where {L<:AbstractLearner,Ex<:AbstractExplorer, S<:AbstractStage} = optimise!(p.learner, stage, t) From 19e0a977258b06480d457cdc25eb43d18f4e2999 Mon Sep 17 00:00:00 2001 From: Henri Dehaybe <47037088+HenriDeh@users.noreply.github.com> Date: Thu, 15 Jun 2023 11:10:27 +0200 Subject: [PATCH 11/24] optimise is dummy by default --- src/ReinforcementLearningCore/src/policies/agent/base.jl | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/ReinforcementLearningCore/src/policies/agent/base.jl b/src/ReinforcementLearningCore/src/policies/agent/base.jl index dc7798dc2..798442f8b 100644 --- a/src/ReinforcementLearningCore/src/policies/agent/base.jl +++ b/src/ReinforcementLearningCore/src/policies/agent/base.jl @@ -46,11 +46,8 @@ 
RLBase.optimise!(::SyncTrajectoryStyle, agent::Agent, stage::S) where {S<:Abstra # already spawn a task to optimise inner policy when initializing the agent RLBase.optimise!(::AsyncTrajectoryStyle, agent::Agent, stage::S) where {S<:AbstractStage} = nothing -function RLBase.optimise!(policy::AbstractPolicy, stage::S, trajectory::Trajectory) where {S<:AbstractStage} - for batch in trajectory - optimise!(policy, stage, batch) - end -end +#by default, optimise does nothing at all stage +function RLBase.optimise!(policy::AbstractPolicy, stage::AbstractStage, trajectory::Trajectory) end @functor Agent (policy,) From 98444e42fafd3a9c17b6a671fc71f46766fabe40 Mon Sep 17 00:00:00 2001 From: Henri Dehaybe <47037088+HenriDeh@users.noreply.github.com> Date: Thu, 15 Jun 2023 11:11:06 +0200 Subject: [PATCH 12/24] optimise! is dispatched on traj and loops it --- .../src/algorithms/dqns/NFQ.jl | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl index 6ecc09b6f..00c620ad6 100644 --- a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl +++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl @@ -58,24 +58,24 @@ function RLCore.forward(learner::NFQ, env::AbstractEnv) end # Avoid optimisation in the middle of an episode -function RLBase.optimise!(::NFQ, ::NamedTuple) end +function RLBase.optimise!(::NFQ, ::AbstractStage, ::Trajectory) end -function RLBase.optimise!(learner::NFQ, ::PostEpisodeStage, batch::NamedTuple) +function RLBase.optimise!(learner::NFQ, ::PostEpisodeStage, trajectory::Trajectory) Q = learner.approximator γ = learner.γ loss_func = learner.loss_function as = learner.action_space las = length(as) - - - (s, a, r, ss) = batch[[:state, :action, :reward, :next_state]] - a = Float32.(a) - s, a, r, ss = map(x->send_to_device(device(Q), x), (s, a, r, ss)) - for i = 1:learner.num_iterations - # Make an input x samples x |action space| array -- Q --> samples x |action space| -- max --> samples - G = r .+ γ .* (cat(repeat(ss, inner=(1, 1, las)), reshape(repeat(as, outer=(1, size(ss, 2))), (1, size(ss, 2), las)), dims=1) |> x -> maximum(RLCore.forward(Q, x), dims=3) |> vec) - for e = 1:learner.epochs - Flux.train!((x, y) -> loss_func(RLCore.forward(Q, x), y), params(Q.model), [(vcat(s, transpose(a)), transpose(G))], Q.optimiser) + for batch in trajectory + (s, a, r, ss) = batch[[:state, :action, :reward, :next_state]] + a = Float32.(a) + s, a, r, ss = map(x->send_to_device(device(Q), x), (s, a, r, ss)) + for i = 1:learner.num_iterations + # Make an input x samples x |action space| array -- Q --> samples x |action space| -- max --> samples + G = r .+ γ .* (cat(repeat(ss, inner=(1, 1, las)), reshape(repeat(as, outer=(1, size(ss, 2))), (1, size(ss, 2), las)), dims=1) |> x -> maximum(RLCore.forward(Q, x), dims=3) |> vec) + for e = 1:learner.epochs + Flux.train!((x, y) -> loss_func(RLCore.forward(Q, x), y), params(Q.model), [(vcat(s, transpose(a)), transpose(G))], Q.optimiser) + end end end end From 6be2450a0f5d802be341d4b3accb00ac20a5e217 Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Fri, 16 Jun 2023 14:59:52 +0200 Subject: [PATCH 13/24] Fix precompilation warnings --- .../src/algorithms/dqns/NFQ.jl | 27 ++++++++----------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl index 00c620ad6..ceb686c72 100644 --- 
a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl +++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl @@ -30,28 +30,23 @@ Base.@kwdef struct NFQ{A, R, F} <: AbstractLearner γ::Float32 = 0.9f0 end -function NFQ(; - action_space::AbstractVector, - approximator::A, - num_iterations::Integer = 20, - epochs::Integer = 1000, - loss_function::Any = mse, - rng=Random.default_rng(), - γ::Float32 = 0.9f0, - ) where {A} - NFQ(action_space, approximator, num_iterations, epochs, loss_function, rng, γ) -end +# function NFQ(; +# action_space::AbstractVector, +# approximator::A, +# num_iterations::Integer = 20, +# epochs::Integer = 1000, +# loss_function::Any = mse, +# rng=Random.default_rng(), +# γ::Float32 = 0.9f0, +# ) where {A} +# NFQ(action_space, approximator, num_iterations, epochs, loss_function, rng, γ) +# end # Copied from BasicDQN but sure whether it's appropriate @functor NFQ (approximator,) RLCore.forward(L::NFQ, s::AbstractArray) = RLCore.forward(L.approximator, s) -# Flux.functor(x::NFQ) = (Q = x.approximator,), y -> begin -# x = @set x.approximator = y.Q -# x -# end - function RLCore.forward(learner::NFQ, env::AbstractEnv) as = action_space(env) return vcat(repeat(state(env), inner=(1, length(as))), transpose(as)) |> x -> send_to_device(device(learner.approximator), x) |> x->RLCore.forward(learner, x) |> send_to_host |> vec From 2ed5ffbba34915fcca9d8c3b2e26d7eb6f146784 Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Fri, 16 Jun 2023 15:00:05 +0200 Subject: [PATCH 14/24] Avoid running post episode optimise! multiple times --- .../src/algorithms/dqns/NFQ.jl | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl index ceb686c72..fb5ed80b1 100644 --- a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl +++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl @@ -61,16 +61,20 @@ function RLBase.optimise!(learner::NFQ, ::PostEpisodeStage, trajectory::Trajecto loss_func = learner.loss_function as = learner.action_space las = length(as) - for batch in trajectory - (s, a, r, ss) = batch[[:state, :action, :reward, :next_state]] - a = Float32.(a) - s, a, r, ss = map(x->send_to_device(device(Q), x), (s, a, r, ss)) - for i = 1:learner.num_iterations - # Make an input x samples x |action space| array -- Q --> samples x |action space| -- max --> samples - G = r .+ γ .* (cat(repeat(ss, inner=(1, 1, las)), reshape(repeat(as, outer=(1, size(ss, 2))), (1, size(ss, 2), las)), dims=1) |> x -> maximum(RLCore.forward(Q, x), dims=3) |> vec) - for e = 1:learner.epochs - Flux.train!((x, y) -> loss_func(RLCore.forward(Q, x), y), params(Q.model), [(vcat(s, transpose(a)), transpose(G))], Q.optimiser) - end + batch = nothing + for b in trajectory + batch = b + end + batch === nothing && return + + (s, a, r, ss) = batch[[:state, :action, :reward, :next_state]] + a = Float32.(a) + s, a, r, ss = map(x->send_to_device(device(Q), x), (s, a, r, ss)) + for i = 1:learner.num_iterations + # Make an input x samples x |action space| array -- Q --> samples x |action space| -- max --> samples + G = r .+ γ .* (cat(repeat(ss, inner=(1, 1, las)), reshape(repeat(as, outer=(1, size(ss, 2))), (1, size(ss, 2), las)), dims=1) |> x -> maximum(RLCore.forward(Q, x), dims=3) |> vec) + for e = 1:learner.epochs + Flux.train!((x, y) -> loss_func(RLCore.forward(Q, x), y), params(Q.model), [(vcat(s, a), transpose(G))], Q.optimiser) end end end From 
da384b206929d56ad6163047f502954d6364055a Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Fri, 16 Jun 2023 15:27:25 +0200 Subject: [PATCH 15/24] Tune experiment --- .../experiments/DQN/JuliaRL_NFQ_CartPole.jl | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl b/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl index d2ed6e724..e313004d6 100644 --- a/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl +++ b/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl @@ -1,7 +1,7 @@ # --- -# title: JuliaRL\_NFQ\_PendulumDiscrete +# title: JuliaRL\_NFQ\_CartPole # cover: assets/JuliaRL_BasicDQN_CartPole.png -# description: NFQ applied to discrete Pendulum +# description: NFQ applied to the cartpole environment # date: 2023-06 # author: "[Lucas Bex](https://github.com/CasBex)" # --- @@ -13,7 +13,7 @@ using Flux using Flux: glorot_uniform using StableRNGs: StableRNG -using Flux.Losses: huber_loss +using Flux.Losses: mse function RLCore.Experiment( ::Val{:JuliaRL}, @@ -31,30 +31,32 @@ function RLCore.Experiment( action_space=action_space(env), approximator=Approximator( model=Chain( - Dense(ns+na, 64, relu; init=glorot_uniform(rng)), - Dense(64, 64, relu; init=glorot_uniform(rng)), - Dense(64, 1; init=glorot_uniform(rng)), + Dense(ns+na, 5, σ; init=glorot_uniform(rng)), + Dense(5, 5, σ; init=glorot_uniform(rng)), + Dense(5, 1; init=glorot_uniform(rng)), ) |> gpu, optimiser=RMSProp() ), - loss_function=huber_loss, - epochs=500, - num_iterations=10 + loss_function=mse, + epochs=100, + num_iterations=10, + γ = 0.95f0 ), explorer=EpsilonGreedyExplorer( kind=:exp, - ϵ_stable=0.01, - decay_steps=500, + ϵ_stable=0.001, + warmup_steps=500, rng=rng, ), ), trajectory=Trajectory( container=CircularArraySARTTraces( - capacity=1000, + capacity=10_000, state=Float32 => (ns,), + action=Float32 => (na,), ), sampler=BatchSampler{SS′ART}( - batch_size=1000, + batch_size=10_000, rng=rng ), controller=InsertSampleRatioController( @@ -71,7 +73,7 @@ function RLCore.Experiment( #+ tangle=false using Plots -pyplot() # hide +# pyplot() # hide ex = E`JuliaRL_NFQ_CartPole` run(ex) plot(ex.hook.rewards) From 5ab7a1c850e5755a7969d5448d008e144ccf8f32 Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Mon, 19 Jun 2023 09:36:48 +0200 Subject: [PATCH 16/24] Remove commented code --- .../src/algorithms/dqns/NFQ.jl | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl index fb5ed80b1..a132ec7d1 100644 --- a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl +++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl @@ -30,19 +30,6 @@ Base.@kwdef struct NFQ{A, R, F} <: AbstractLearner γ::Float32 = 0.9f0 end -# function NFQ(; -# action_space::AbstractVector, -# approximator::A, -# num_iterations::Integer = 20, -# epochs::Integer = 1000, -# loss_function::Any = mse, -# rng=Random.default_rng(), -# γ::Float32 = 0.9f0, -# ) where {A} -# NFQ(action_space, approximator, num_iterations, epochs, loss_function, rng, γ) -# end - -# Copied from BasicDQN but sure whether it's appropriate @functor NFQ (approximator,) RLCore.forward(L::NFQ, s::AbstractArray) = RLCore.forward(L.approximator, s) From afc21b6522b393cd5519967305c0e689d99c22ce Mon Sep 17 00:00:00 
2001 From: CasBex <123587431+CasBex@users.noreply.github.com> Date: Mon, 19 Jun 2023 09:39:06 +0200 Subject: [PATCH 17/24] Drop gpu call Co-authored-by: Henri Dehaybe <47037088+HenriDeh@users.noreply.github.com> --- .../deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl b/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl index e313004d6..07a48ecef 100644 --- a/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl +++ b/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl @@ -34,7 +34,7 @@ function RLCore.Experiment( Dense(ns+na, 5, σ; init=glorot_uniform(rng)), Dense(5, 5, σ; init=glorot_uniform(rng)), Dense(5, 1; init=glorot_uniform(rng)), - ) |> gpu, + ), optimiser=RMSProp() ), loss_function=mse, From 033dcdf3a59878d2438bb168a8136dd3e3885efe Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Mon, 19 Jun 2023 10:59:37 +0200 Subject: [PATCH 18/24] Use `sample` to get batch from trajectory --- src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl index a132ec7d1..f4de8b127 100644 --- a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl +++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl @@ -48,11 +48,7 @@ function RLBase.optimise!(learner::NFQ, ::PostEpisodeStage, trajectory::Trajecto loss_func = learner.loss_function as = learner.action_space las = length(as) - batch = nothing - for b in trajectory - batch = b - end - batch === nothing && return + batch = ReinforcementLearningTrajectories.sample(trajectory) (s, a, r, ss) = batch[[:state, :action, :reward, :next_state]] a = Float32.(a) From 31b55b5f85dcb98bd5c757bc2454b1ccb60b6bf6 Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Mon, 19 Jun 2023 13:38:15 +0200 Subject: [PATCH 19/24] optimise! 
for AbstractLearner --- src/ReinforcementLearningCore/src/policies/learners.jl | 2 ++ src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/ReinforcementLearningCore/src/policies/learners.jl b/src/ReinforcementLearningCore/src/policies/learners.jl index 0a0197a4d..13f505e24 100644 --- a/src/ReinforcementLearningCore/src/policies/learners.jl +++ b/src/ReinforcementLearningCore/src/policies/learners.jl @@ -10,6 +10,8 @@ Base.show(io::IO, m::MIME"text/plain", L::AbstractLearner) = show(io, m, convert # Take Learner and Environment, get state, send to RLCore.forward(Learner, State) forward(L::Le, env::E) where {Le <: AbstractLearner, E <: AbstractEnv} = env |> state |> send_to_device(L.approximator) |> x -> forward(L, x) |> send_to_device(env) +function RLBase.optimise!(::AbstractLearner, ::AbstractStage, ::Trajectory) end + Base.@kwdef mutable struct Approximator{M,O} model::M optimiser::O diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl index f4de8b127..8dbbaa339 100644 --- a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl +++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl @@ -39,9 +39,6 @@ function RLCore.forward(learner::NFQ, env::AbstractEnv) return vcat(repeat(state(env), inner=(1, length(as))), transpose(as)) |> x -> send_to_device(device(learner.approximator), x) |> x->RLCore.forward(learner, x) |> send_to_host |> vec end -# Avoid optimisation in the middle of an episode -function RLBase.optimise!(::NFQ, ::AbstractStage, ::Trajectory) end - function RLBase.optimise!(learner::NFQ, ::PostEpisodeStage, trajectory::Trajectory) Q = learner.approximator γ = learner.γ From b53c96b4cefd347e07943e793ce6b9a16494bcbb Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Mon, 19 Jun 2023 14:07:06 +0200 Subject: [PATCH 20/24] NFQ optimise! 
calls at the correct time --- src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl index 8dbbaa339..16990fc84 100644 --- a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl +++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl @@ -39,6 +39,12 @@ function RLCore.forward(learner::NFQ, env::AbstractEnv) return vcat(repeat(state(env), inner=(1, length(as))), transpose(as)) |> x -> send_to_device(device(learner.approximator), x) |> x->RLCore.forward(learner, x) |> send_to_host |> vec end +function RLBase.optimise!(::NFQ, ::NamedTuple) end + +function RLBase.optimise!(p::QBasedPolicy{L,Ex}, s::PostEpisodeStage, trajectory::Trajectory) where {L<:NFQ,Ex<:AbstractExplorer} + RLBase.optimise!(p.learner, s, trajectory) +end + function RLBase.optimise!(learner::NFQ, ::PostEpisodeStage, trajectory::Trajectory) Q = learner.approximator γ = learner.γ From f77e198efae830091373893e926ba4afec56ee76 Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Fri, 23 Jun 2023 11:51:42 +0200 Subject: [PATCH 21/24] Remove superfluous function due to main merge --- src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl index 16990fc84..8dbbaa339 100644 --- a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl +++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl @@ -39,12 +39,6 @@ function RLCore.forward(learner::NFQ, env::AbstractEnv) return vcat(repeat(state(env), inner=(1, length(as))), transpose(as)) |> x -> send_to_device(device(learner.approximator), x) |> x->RLCore.forward(learner, x) |> send_to_host |> vec end -function RLBase.optimise!(::NFQ, ::NamedTuple) end - -function RLBase.optimise!(p::QBasedPolicy{L,Ex}, s::PostEpisodeStage, trajectory::Trajectory) where {L<:NFQ,Ex<:AbstractExplorer} - RLBase.optimise!(p.learner, s, trajectory) -end - function RLBase.optimise!(learner::NFQ, ::PostEpisodeStage, trajectory::Trajectory) Q = learner.approximator γ = learner.γ From 66ea89bf4c7d6af2f58a937258ebd401ce1c6aef Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Fri, 23 Jun 2023 11:52:01 +0200 Subject: [PATCH 22/24] Anonymous loop variable --- src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl index 8dbbaa339..30442bdbb 100644 --- a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl +++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl @@ -53,7 +53,7 @@ function RLBase.optimise!(learner::NFQ, ::PostEpisodeStage, trajectory::Trajecto for i = 1:learner.num_iterations # Make an input x samples x |action space| array -- Q --> samples x |action space| -- max --> samples G = r .+ γ .* (cat(repeat(ss, inner=(1, 1, las)), reshape(repeat(as, outer=(1, size(ss, 2))), (1, size(ss, 2), las)), dims=1) |> x -> maximum(RLCore.forward(Q, x), dims=3) |> vec) - for e = 1:learner.epochs + for _ = 1:learner.epochs Flux.train!((x, y) -> loss_func(RLCore.forward(Q, x), y), params(Q.model), [(vcat(s, a), transpose(G))], Q.optimiser) end end From 37be2a6353a862702bfc4b8bec11b35af982b14a Mon Sep 17 00:00:00 2001 From: Cas Bex Date: Fri, 23 Jun 2023 11:54:34 +0200 Subject: [PATCH 23/24] Update NFQ 
docs --- src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl index 30442bdbb..a14d36c5b 100644 --- a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl +++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl @@ -5,15 +5,15 @@ using Functors: @functor """ NFQ{A<:AbstractApproximator, F, R} <: AbstractLearner - NFQ(approximator::A, num_iterations::Integer epochs::Integer, loss_function::F, batch_size::Integer, rng::R, γ::Float32) where {A<:AbstractApproximator, F, R} + NFQ(action_space::AbstractVector, approximator::A, num_iterations::Integer epochs::Integer, loss_function::F, rng::R, γ::Float32) where {A, F, R} Neural Fitted Q-iteration as implemented in [1] # Keyword arguments -- `approximator::AbstractApproximator` neural network +- `action_space::AbstractVector` Action space of the environment (necessary in the optimise! step) +- `approximator::A` Q-function approximator (typically a neural network) - `num_iterations::Integer` number of value iteration iterations in FQI loop (i.e. the outer loop) -- `epochs` number of epochs to train neural network per iteration +- `epochs::Integer` number of epochs to train neural network per iteration - `loss_function::F` loss function of the NN -- `sampler::BatchSampler{SARTS}` data sampler - `rng::R` random number generator - `γ::Float32` discount rate From c43f37ab60a634960020bdbbb7c993e6bd888bc7 Mon Sep 17 00:00:00 2001 From: Henri Dehaybe <47037088+HenriDeh@users.noreply.github.com> Date: Mon, 26 Jun 2023 14:46:00 +0200 Subject: [PATCH 24/24] Update julia_words.txt --- .cspell/julia_words.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.cspell/julia_words.txt b/.cspell/julia_words.txt index 08196fd32..eb3b29b18 100644 --- a/.cspell/julia_words.txt +++ b/.cspell/julia_words.txt @@ -5294,4 +5294,5 @@ sqmahal logdpf devmode logpdfs -kldivs \ No newline at end of file +kldivs +Riedmiller
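
Note on the action-scoring input built in `RLCore.forward(learner::NFQ, env)` (patches 04 and 13 above): the state vector is tiled once per discrete action and the action value is appended as an extra input row, so a single network call returns Q(s, a) for every action. Below is a minimal standalone sketch of that construction, assuming a plain `Flux.Chain` in place of the package's `Approximator` wrapper; the state values, action values, and layer sizes are made-up examples, not taken from the patches.

```julia
using Flux

s  = Float32[0.1, -0.2, 0.03, 0.0]           # made-up CartPole-like state (ns = 4)
as = Float32[1, 2]                            # discrete action values
model = Chain(Dense(5, 5, σ), Dense(5, 5, σ), Dense(5, 1))   # ns + 1 inputs → one Q-value

x = vcat(repeat(s, inner=(1, length(as))), transpose(as))    # (ns + 1) × |A| batch; column j = [s; as[j]]
q = vec(model(x))                             # Q(s, a) for every action, length |A|
```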
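
Note on the fitted-Q update in `RLBase.optimise!(learner::NFQ, ::PostEpisodeStage, trajectory)` (patches 12, 14, and 18 above): each outer iteration rebuilds the Bellman target G = r + γ·maxₐ Q(s′, a) with the current network and then regresses Q(s, a) onto G for a fixed number of epochs. The sketch below re-expresses that logic as explicit loops, using Flux's explicit-gradient API rather than the `params`/`Flux.train!` call in the patch; `model`, `opt_state`, and the argument names are assumptions for illustration, not the package API.

```julia
using Flux

# Illustrative re-expression of the NFQ optimise! loop; not the package code.
function fitted_q_sketch!(model, opt_state, s, a, r, s′, as; γ=0.95f0, iters=10, epochs=100)
    B = size(s, 2)                                   # number of sampled transitions
    for _ in 1:iters
        # Bellman backup: max over candidate actions of Q(s′, a′)
        qmax = fill(-Inf32, B)
        for a′ in as
            q = vec(model(vcat(s′, fill(Float32(a′), 1, B))))
            qmax .= max.(qmax, q)
        end
        G = r .+ γ .* qmax                           # regression targets, length B
        x = vcat(s, reshape(Float32.(a), 1, B))      # (ns + 1) × B state-action inputs
        y = reshape(G, 1, B)
        for _ in 1:epochs                            # inner supervised fit on fixed targets
            grads = Flux.gradient(m -> Flux.Losses.mse(m(x), y), model)
            Flux.update!(opt_state, model, grads[1])
        end
    end
    return model
end
```

In this explicit style, `opt_state = Flux.setup(RMSProp(), model)` would be created once up front; the patch instead keeps the optimiser inside the `Approximator` and drives the fit with the implicit `params`-based `Flux.train!`.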