diff --git a/.cspell/julia_words.txt b/.cspell/julia_words.txt
index 08196fd32..eb3b29b18 100644
--- a/.cspell/julia_words.txt
+++ b/.cspell/julia_words.txt
@@ -5294,4 +5294,5 @@ sqmahal
 logdpf
 devmode
 logpdfs
-kldivs
\ No newline at end of file
+kldivs
+Riedmiller
diff --git a/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl b/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl
new file mode 100644
index 000000000..07a48ecef
--- /dev/null
+++ b/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/JuliaRL_NFQ_CartPole.jl
@@ -0,0 +1,95 @@
+# ---
+# title: JuliaRL\_NFQ\_CartPole
+# cover: assets/JuliaRL_BasicDQN_CartPole.png
+# description: NFQ applied to the cartpole environment
+# date: 2023-06
+# author: "[Lucas Bex](https://github.com/CasBex)"
+# ---
+
+#+ tangle=true
+using ReinforcementLearningCore, ReinforcementLearningBase, ReinforcementLearningZoo
+using ReinforcementLearningEnvironments
+using Flux
+using Flux: glorot_uniform
+
+using StableRNGs: StableRNG
+using Flux.Losses: mse
+
+function RLCore.Experiment(
+    ::Val{:JuliaRL},
+    ::Val{:NFQ},
+    ::Val{:CartPole},
+    seed = 123,
+)
+    rng = StableRNG(seed)
+    env = CartPoleEnv(; T=Float32, rng=rng)
+    ns, na = length(state(env)), length(first(action_space(env)))
+
+    agent = Agent(
+        policy=QBasedPolicy(
+            learner=NFQ(
+                action_space=action_space(env),
+                approximator=Approximator(
+                    model=Chain(
+                        Dense(ns+na, 5, σ; init=glorot_uniform(rng)),
+                        Dense(5, 5, σ; init=glorot_uniform(rng)),
+                        Dense(5, 1; init=glorot_uniform(rng)),
+                    ),
+                    optimiser=RMSProp()
+                ),
+                loss_function=mse,
+                epochs=100,
+                num_iterations=10,
+                γ = 0.95f0
+            ),
+            explorer=EpsilonGreedyExplorer(
+                kind=:exp,
+                ϵ_stable=0.001,
+                warmup_steps=500,
+                rng=rng,
+            ),
+        ),
+        trajectory=Trajectory(
+            container=CircularArraySARTTraces(
+                capacity=10_000,
+                state=Float32 => (ns,),
+                action=Float32 => (na,),
+            ),
+            sampler=BatchSampler{SS′ART}(
+                batch_size=10_000,
+                rng=rng
+            ),
+            controller=InsertSampleRatioController(
+                threshold=100,
+                n_inserted=-1
+            )
+        )
+    )
+
+    stop_condition = StopAfterStep(10_000, is_show_progress=!haskey(ENV, "CI"))
+    hook = TotalRewardPerEpisode()
+    Experiment(agent, env, stop_condition, hook)
+end
+
+#+ tangle=false
+using Plots
+# pyplot() # hide
+ex = E`JuliaRL_NFQ_CartPole`
+run(ex)
+plot(ex.hook.rewards)
+savefig("assets/JuliaRL_NFQ_CartPole.png") #hide
+
+#=
+## Watch a demo episode with the trained agent
+
+```julia
+demo = Experiment(ex.policy,
+                  CartPoleEnv(),
+                  StopWhenDone(),
+                  RolloutHook(plot, closeall),
+                  "DQN <-> Demo")
+run(demo)
+```
+=#
+
+# ![](assets/JuliaRL_NFQ_CartPole.png)
diff --git a/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/config.json b/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/config.json
index e6568ee09..65db0d95d 100644
--- a/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/config.json
+++ b/src/ReinforcementLearningExperiments/deps/experiments/experiments/DQN/config.json
@@ -1,6 +1,7 @@
 {
     "description": "DQN related experiments.",
     "order": [
+        "JuliaRL_NFQ_CartPole.jl",
         "JuliaRL_BasicDQN_CartPole.jl",
         "JuliaRL_BasicDQN_MountainCar.jl",
         "JuliaRL_BasicDQN_PendulumDiscrete.jl",
diff --git a/src/ReinforcementLearningExperiments/src/ReinforcementLearningExperiments.jl b/src/ReinforcementLearningExperiments/src/ReinforcementLearningExperiments.jl
index a280e7746..ca27a1031 100644
--- a/src/ReinforcementLearningExperiments/src/ReinforcementLearningExperiments.jl
+++ b/src/ReinforcementLearningExperiments/src/ReinforcementLearningExperiments.jl
@@ -8,6 +8,7 @@ const EXPERIMENTS_DIR = joinpath(@__DIR__, "experiments")
 # for f in readdir(EXPERIMENTS_DIR)
 #     include(joinpath(EXPERIMENTS_DIR, f))
 # end
+include(joinpath(EXPERIMENTS_DIR, "JuliaRL_NFQ_CartPole.jl"))
 include(joinpath(EXPERIMENTS_DIR, "JuliaRL_BasicDQN_CartPole.jl"))
 include(joinpath(EXPERIMENTS_DIR, "JuliaRL_DQN_CartPole.jl"))
 include(joinpath(EXPERIMENTS_DIR, "JuliaRL_PrioritizedDQN_CartPole.jl"))
diff --git a/src/ReinforcementLearningExperiments/test/runtests.jl b/src/ReinforcementLearningExperiments/test/runtests.jl
index 723a5dbd3..1086313e0 100644
--- a/src/ReinforcementLearningExperiments/test/runtests.jl
+++ b/src/ReinforcementLearningExperiments/test/runtests.jl
@@ -3,6 +3,7 @@ using CUDA
 
 CUDA.allowscalar(false)
 
+run(E`JuliaRL_NFQ_CartPole`)
 run(E`JuliaRL_BasicDQN_CartPole`)
 run(E`JuliaRL_DQN_CartPole`)
 run(E`JuliaRL_PrioritizedDQN_CartPole`)
diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl
new file mode 100644
index 000000000..a14d36c5b
--- /dev/null
+++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/NFQ.jl
@@ -0,0 +1,60 @@
+export NFQ
+
+using Flux
+using Functors: @functor
+
+"""
+    NFQ{A, R, F} <: AbstractLearner
+    NFQ(; action_space, approximator, num_iterations=20, epochs=100, loss_function=mse, rng=Random.default_rng(), γ=0.9f0)
+Neural Fitted Q-iteration as described in [1].
+
+# Keyword arguments
+- `action_space::AbstractVector`: action space of the environment (needed in the `optimise!` step)
+- `approximator::A`: Q-function approximator (typically a neural network)
+- `num_iterations::Integer`: number of value-iteration sweeps in the FQI loop (i.e. the outer loop)
+- `epochs::Integer`: number of epochs to train the neural network per iteration
+- `loss_function::F`: loss function of the neural network
+- `rng::R`: random number generator
+- `γ::Float32`: discount rate
+
+# References
+[1] Riedmiller, M. (2005). Neural Fitted Q Iteration – First Experiences with a Data Efficient Neural Reinforcement Learning Method. In: Gama, J., Camacho, R., Brazdil, P.B., Jorge, A.M., Torgo, L. (eds) Machine Learning: ECML 2005. Lecture Notes in Computer Science, vol 3720. Springer, Berlin, Heidelberg. https://doi.org/10.1007/11564096_32
+"""
+Base.@kwdef struct NFQ{A, R, F} <: AbstractLearner
+    action_space::AbstractVector
+    approximator::A
+    num_iterations::Integer = 20
+    epochs::Integer = 100
+    loss_function::F = mse
+    rng::R = Random.default_rng()
+    γ::Float32 = 0.9f0
+end
+
+@functor NFQ (approximator,)
+
+RLCore.forward(L::NFQ, s::AbstractArray) = RLCore.forward(L.approximator, s)
+
+function RLCore.forward(learner::NFQ, env::AbstractEnv)
+    as = action_space(env)
+    return vcat(repeat(state(env), inner=(1, length(as))), transpose(as)) |> x -> send_to_device(device(learner.approximator), x) |> x -> RLCore.forward(learner, x) |> send_to_host |> vec
+end
+
+function RLBase.optimise!(learner::NFQ, ::PostEpisodeStage, trajectory::Trajectory)
+    Q = learner.approximator
+    γ = learner.γ
+    loss_func = learner.loss_function
+    as = learner.action_space
+    las = length(as)
+    batch = ReinforcementLearningTrajectories.sample(trajectory)
+
+    (s, a, r, ss) = batch[[:state, :action, :reward, :next_state]]
+    a = Float32.(a)
+    s, a, r, ss = map(x -> send_to_device(device(Q), x), (s, a, r, ss))
+    for i = 1:learner.num_iterations
+        # Build an input x samples x |action space| array; the Q-network maps it to samples x |action space| values, and the max over the action dimension gives one target per sample
+        G = r .+ γ .* (cat(repeat(ss, inner=(1, 1, las)), reshape(repeat(as, outer=(1, size(ss, 2))), (1, size(ss, 2), las)), dims=1) |> x -> maximum(RLCore.forward(Q, x), dims=3) |> vec)
+        for _ = 1:learner.epochs
+            Flux.train!((x, y) -> loss_func(RLCore.forward(Q, x), y), params(Q.model), [(vcat(s, a), transpose(G))], Q.optimiser)
+        end
+    end
+end
diff --git a/src/ReinforcementLearningZoo/src/algorithms/dqns/dqns.jl b/src/ReinforcementLearningZoo/src/algorithms/dqns/dqns.jl
index b20517abb..49f88f54a 100644
--- a/src/ReinforcementLearningZoo/src/algorithms/dqns/dqns.jl
+++ b/src/ReinforcementLearningZoo/src/algorithms/dqns/dqns.jl
@@ -1,8 +1,9 @@
 include("basic_dqn.jl")
+include("NFQ.jl")
 include("dqn.jl")
 include("prioritized_dqn.jl")
 include("qr_dqn.jl")
 include("rem_dqn.jl")
 include("iqn.jl")
 include("rainbow.jl")
-# include("common.jl")
\ No newline at end of file
+# include("common.jl")
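For readers new to NFQ, the update performed in `optimise!` above is easier to see in isolation. Below is a minimal, self-contained sketch of one fitted Q-iteration step, assuming a CartPole-like setup (four state features, two discrete actions encoded by their integer value, exactly as in the experiment above) and using the same implicit-parameter `Flux.train!` style as the PR. The names `q_net`, `q_all_actions` and `build_targets` are illustrative only and are not part of the ReinforcementLearning.jl API.

```julia
using Flux
using Flux.Losses: mse

ns = 4                       # CartPole-like: four state features
actions = Float32[1, 2]      # two discrete actions, appended as one extra input feature
γ = 0.95f0

# Q-network scoring a concatenated (state, action) input, as in the experiment above.
q_net = Chain(Dense(ns + 1, 5, σ), Dense(5, 5, σ), Dense(5, 1))

# Q(s, a) for every action of a single state: repeat the state column,
# append each action value, and evaluate the network once.
q_all_actions(s) = vec(q_net(vcat(repeat(reshape(s, :, 1), inner=(1, length(actions))), transpose(actions))))

# Fitted Q-iteration regression target for a batch: G = r .+ γ .* max_a′ Q(s′, a′)
build_targets(r, s′) = r .+ γ .* [maximum(q_all_actions(col)) for col in eachcol(s′)]

# A fake batch of eight transitions, just to show the shapes involved.
s  = rand(Float32, ns, 8)
a  = transpose(Float32.(rand(1:2, 8)))   # 1×8 row of action values
r  = rand(Float32, 8)
s′ = rand(Float32, ns, 8)

opt = RMSProp()
for _ in 1:10                            # outer FQI sweeps (num_iterations)
    G = build_targets(r, s′)             # targets are held fixed during the inner loop
    for _ in 1:100                       # inner supervised epochs (epochs)
        Flux.train!((x, y) -> mse(q_net(x), y), Flux.params(q_net),
                    [(vcat(s, a), transpose(G))], opt)
    end
end
```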
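A note on the experiment configuration above: NFQ is a batch method that refits the network on stored experience rather than on a single minibatch (Riedmiller, 2005). This appears to be why the `BatchSampler` batch size is set equal to the replay capacity (10_000), and why `optimise!` runs only at `PostEpisodeStage`, performing `num_iterations` outer sweeps of `epochs` training epochs each time it is called.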