diff --git a/pax/conf/experiment/cg/pre_train.yaml b/pax/conf/experiment/cg/pre_train.yaml index fdb10963..62624d03 100644 --- a/pax/conf/experiment/cg/pre_train.yaml +++ b/pax/conf/experiment/cg/pre_train.yaml @@ -61,8 +61,29 @@ num_seeds: 10 # model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-0/2022-09-19_10.25.28.808811/generation_2900 # {NEW} EARL pretrained against PPO RNN -run_path: ucl-dark/cg/v348mp4r -model_path: exp/EARL-PPO_memory-vs-PPO_memory/run-seed-0/2022-09-25_23.12.56.999833/generation_400 +# run_path: ucl-dark/cg/v348mp4r +# model_path: exp/EARL-PPO_memory-vs-PPO_memory/run-seed-0/2022-09-25_23.12.56.999833/generation_400 + +# FINAL EARL pre-trained against PPO RNN +# run_path: ucl-dark/cg/z0ckvwtf +# model_path: exp/EARL-PPO_memory-vs-PPO_memory/run-seed-0/2022-09-28_01.58.33.843138/generation_1200 + +# FINAL EARL pre-trained against Tabular +# run_path: ucl-dark/cg/2dxz9fup +# model_path: exp/EARL-PPO_memory-vs-Tabular/run-seed-0/2022-09-28_01.59.11.312943/generation_50 + +# FINAL MFOS pre-trained against PPO RNN +# run_path: ucl-dark/cg/34iaxeps +# model_path: exp/MFOS-vs-PPO_memory/run-seed-0/2022-09-28_14.23.22.065961/generation_570 + +# FINAL GS trained against PPO RNN +# run_path: ucl-dark/cg/2qrono7s +# model_path: exp/GS-PPO-vs-PPO_memory/run-seed-0/2022-09-28_01.51.37.913074/generation_1350 + +# FINAL GS traind against Tab +run_path: ucl-dark/cg/2xc30qob +model_path: exp/GS-PPO-vs-Tabular/run-seed-0/2022-09-28_02.01.03.381847/generation_50 + # PPO agent parameters ppo: diff --git a/pax/conf/experiment/ipd/earl_v_ppo_mem.yaml b/pax/conf/experiment/ipd/earl_v_ppo_mem.yaml index 9047197b..fba7a4cc 100644 --- a/pax/conf/experiment/ipd/earl_v_ppo_mem.yaml +++ b/pax/conf/experiment/ipd/earl_v_ppo_mem.yaml @@ -15,7 +15,6 @@ payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] evo: True num_devices: 1 - # Training top_k: 5 popsize: 1000 @@ -27,24 +26,6 @@ num_generations: 5000 total_timesteps: 1e11 eval_every: 1e11 -# Evaluation -num_seeds: 20 -# # EARL vs. PPO trained on seed=0 -# run_path: ucl-dark/ipd/13o3v95p -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-0-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.15.31.908871/generation_2900 -# EARL vs. PPO trained on seed=1 -# run_path: ucl-dark/ipd/dopodr9n -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-1-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.15.58.912526/generation_2900 -# EARL vs. PPO trained on seed=2 -# run_path: ucl-dark/ipd/265ftn32 -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-2-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.17.00.437954/generation_2900 -# EARL vs. PPO trained on seed=3 -# run_path: ucl-dark/ipd/1hffijy2 -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-3-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.19.01.387867/generation_2900 -# EARL vs. PPO trained on seed=25 -run_path: ucl-dark/ipd/1ui7wfop -model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-25-OpenES-pop-size-1000-num-opps-1/2022-09-15_02.32.16.559924/generation_2900 - # PPO agent parameters ppo: num_minibatches: 4 diff --git a/pax/conf/experiment/ipd/earl_v_tabular.yaml b/pax/conf/experiment/ipd/earl_v_tabular.yaml index a6ab1a30..1170ab40 100644 --- a/pax/conf/experiment/ipd/earl_v_tabular.yaml +++ b/pax/conf/experiment/ipd/earl_v_tabular.yaml @@ -41,8 +41,12 @@ num_seeds: 20 # run_path: ucl-dark/ipd/1hffijy2 # model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-3-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.19.01.387867/generation_2900 # EARL vs. 
PPO trained on seed=25 -run_path: ucl-dark/ipd/1ui7wfop -model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-25-OpenES-pop-size-1000-num-opps-1/2022-09-15_02.32.16.559924/generation_2900 +# run_path: ucl-dark/ipd/1ui7wfop +# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-25-OpenES-pop-size-1000-num-opps-1/2022-09-15_02.32.16.559924/generation_2900 + +# hardstop +run_path: ucl-dark/ipd/1ow3zit1 +model_path: exp/EARL-PPO_memory-vs-Tabular/run-seed-0-OpenES-pop-size-1000-num-opps-1/2022-09-28_14.32.56.208897/generation_4900 # PPO agent parameters ppo: diff --git a/pax/conf/experiment/ipd/gs_v_ppo.yaml b/pax/conf/experiment/ipd/gs_v_ppo.yaml index a2bc1b1b..f4f1a3ca 100644 --- a/pax/conf/experiment/ipd/gs_v_ppo.yaml +++ b/pax/conf/experiment/ipd/gs_v_ppo.yaml @@ -27,24 +27,6 @@ num_generations: 5000 total_timesteps: 1e11 num_devices: 1 -# Evaluation -num_seeds: 20 -# # EARL vs. PPO trained on seed=0 -# run_path: ucl-dark/ipd/13o3v95p -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-0-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.15.31.908871/generation_2900 -# EARL vs. PPO trained on seed=1 -# run_path: ucl-dark/ipd/dopodr9n -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-1-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.15.58.912526/generation_2900 -# EARL vs. PPO trained on seed=2 -# run_path: ucl-dark/ipd/265ftn32 -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-2-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.17.00.437954/generation_2900 -# EARL vs. PPO trained on seed=3 -# run_path: ucl-dark/ipd/1hffijy2 -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-3-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.19.01.387867/generation_2900 -# EARL vs. PPO trained on seed=25 -run_path: ucl-dark/ipd/1ui7wfop -model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-25-OpenES-pop-size-1000-num-opps-1/2022-09-15_02.32.16.559924/generation_2900 - # PPO agent parameters ppo: num_minibatches: 4 diff --git a/pax/conf/experiment/ipd/gs_v_ppo_mem.yaml b/pax/conf/experiment/ipd/gs_v_ppo_mem.yaml index 49bc149c..45f3cf57 100644 --- a/pax/conf/experiment/ipd/gs_v_ppo_mem.yaml +++ b/pax/conf/experiment/ipd/gs_v_ppo_mem.yaml @@ -27,24 +27,6 @@ num_generations: 5000 total_timesteps: 1e11 num_devices: 1 -# Evaluation -num_seeds: 20 -# # EARL vs. PPO trained on seed=0 -# run_path: ucl-dark/ipd/13o3v95p -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-0-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.15.31.908871/generation_2900 -# EARL vs. PPO trained on seed=1 -# run_path: ucl-dark/ipd/dopodr9n -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-1-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.15.58.912526/generation_2900 -# EARL vs. PPO trained on seed=2 -# run_path: ucl-dark/ipd/265ftn32 -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-2-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.17.00.437954/generation_2900 -# EARL vs. PPO trained on seed=3 -# run_path: ucl-dark/ipd/1hffijy2 -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-3-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.19.01.387867/generation_2900 -# EARL vs. 
PPO trained on seed=25 -run_path: ucl-dark/ipd/1ui7wfop -model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-25-OpenES-pop-size-1000-num-opps-1/2022-09-15_02.32.16.559924/generation_2900 - # PPO agent parameters ppo: num_minibatches: 4 diff --git a/pax/conf/experiment/ipd/gs_v_tabular.yaml b/pax/conf/experiment/ipd/gs_v_tabular.yaml index 3dbf239b..8e2240d4 100644 --- a/pax/conf/experiment/ipd/gs_v_tabular.yaml +++ b/pax/conf/experiment/ipd/gs_v_tabular.yaml @@ -29,21 +29,46 @@ num_devices: 1 # Evaluation num_seeds: 20 -# # EARL vs. PPO trained on seed=0 -# run_path: ucl-dark/ipd/13o3v95p -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-0-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.15.31.908871/generation_2900 -# EARL vs. PPO trained on seed=1 -# run_path: ucl-dark/ipd/dopodr9n -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-1-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.15.58.912526/generation_2900 -# EARL vs. PPO trained on seed=2 -# run_path: ucl-dark/ipd/265ftn32 -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-2-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.17.00.437954/generation_2900 -# EARL vs. PPO trained on seed=3 -# run_path: ucl-dark/ipd/1hffijy2 -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-3-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.19.01.387867/generation_2900 -# EARL vs. PPO trained on seed=25 -run_path: ucl-dark/ipd/1ui7wfop -model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-25-OpenES-pop-size-1000-num-opps-1/2022-09-15_02.32.16.559924/generation_2900 +# GS vs. Tabular trained on seed=0, where Naive Learners have their learning rate annealed halfway through the trial +# run_path: ucl-dark/ipd/1gg0p92x +# model_path: exp/GS-PPO-vs-Tabular/run-seed-0-pop-size-1000/2022-09-28_01.57.34.854198/generation_4900 + +# GS vs. Tabular trained on seed=1, where Naive Learners have their learning rate annealed halfway through the trial +# run_path: ucl-dark/ipd/scffrmfv +# model_path: exp/GS-PPO-vs-Tabular/run-seed-1-pop-size-1000/2022-09-28_05.00.56.131987/generation_4900 + +# GS vs. Tabular trained on seed=2, where Naive Learners have their learning rate annealed halfway through the trial +# run_path: ucl-dark/ipd/2858x8sa +# model_path: exp/GS-PPO-vs-Tabular/run-seed-2-pop-size-1000/2022-09-28_07.38.37.221049/generation_4900 + +# GS vs. Tabular trained on seed=3, where Naive Learners have their learning rate annealed halfway through the trial +# run_path: ucl-dark/ipd/1y9tefvj +# model_path: exp/GS-PPO-vs-Tabular/run-seed-3-pop-size-1000/2022-09-28_01.57.40.696321/generation_4900 + +# GS vs. Tabular trained on seed=4, where Naive Learners have their learning rate annealed halfway through the trial +# run_path: ucl-dark/ipd/8j6zmb6h +# model_path: exp/GS-PPO-vs-Tabular/run-seed-4-pop-size-1000/2022-09-28_05.11.49.206169/generation_4900 + +# GS vs. Tabular trained on seed = 0 +# run_path: ucl-dark/ipd/tywwxijw +# model_path: exp/GS-PPO-vs-Tabular/run-seed-0-pop-size-1000/2022-09-25_16.06.55.715665/generation_4900 +# GS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/2lyn9n10 +# model_path: exp/GS-PPO-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_16.07.48.978281/generation_4900 +# GS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/f2xhuhcz +# model_path: exp/GS-PPO-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_16.08.35.015944/generation_4900 +# GS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/16wzxeb6 +# model_path: exp/GS-PPO-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_16.09.01.274669/generation_4900 +# GS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/3dzkof3f +# model_path: exp/GS-PPO-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_16.41.50.643263/generation_4900 + + +# hard stop +run_path: ucl-dark/ipd/2kyx0680 +model_path: exp/EARL-PPO-vs-Tabular/run-seed-0-OpenES-pop-size-1000-num-opps-1/2022-09-28_14.35.25.775807/generation_4900 # PPO agent parameters ppo: diff --git a/pax/conf/experiment/ipd/mfos_v_ppo.yaml b/pax/conf/experiment/ipd/mfos_v_ppo.yaml index 5e5e32ed..c2439c96 100644 --- a/pax/conf/experiment/ipd/mfos_v_ppo.yaml +++ b/pax/conf/experiment/ipd/mfos_v_ppo.yaml @@ -26,24 +26,6 @@ num_generations: 5000 total_timesteps: 1e11 num_devices: 1 -# Evaluation -num_seeds: 20 -# # EARL vs. PPO trained on seed=0 -# run_path: ucl-dark/ipd/13o3v95p -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-0-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.15.31.908871/generation_2900 -# EARL vs. PPO trained on seed=1 -# run_path: ucl-dark/ipd/dopodr9n -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-1-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.15.58.912526/generation_2900 -# EARL vs. PPO trained on seed=2 -# run_path: ucl-dark/ipd/265ftn32 -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-2-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.17.00.437954/generation_2900 -# EARL vs. PPO trained on seed=3 -# run_path: ucl-dark/ipd/1hffijy2 -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-3-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.19.01.387867/generation_2900 -# EARL vs. PPO trained on seed=25 -run_path: ucl-dark/ipd/1ui7wfop -model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-25-OpenES-pop-size-1000-num-opps-1/2022-09-15_02.32.16.559924/generation_2900 - # PPO agent parameters ppo: num_minibatches: 4 diff --git a/pax/conf/experiment/ipd/mfos_v_ppo_mem.yaml b/pax/conf/experiment/ipd/mfos_v_ppo_mem.yaml index 98ea3e4b..9e7df109 100644 --- a/pax/conf/experiment/ipd/mfos_v_ppo_mem.yaml +++ b/pax/conf/experiment/ipd/mfos_v_ppo_mem.yaml @@ -26,24 +26,6 @@ num_generations: 5000 total_timesteps: 1e11 num_devices: 1 -# Evaluation -num_seeds: 20 -# # EARL vs. PPO trained on seed=0 -# run_path: ucl-dark/ipd/13o3v95p -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-0-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.15.31.908871/generation_2900 -# EARL vs. PPO trained on seed=1 -# run_path: ucl-dark/ipd/dopodr9n -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-1-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.15.58.912526/generation_2900 -# EARL vs. PPO trained on seed=2 -# run_path: ucl-dark/ipd/265ftn32 -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-2-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.17.00.437954/generation_2900 -# EARL vs. PPO trained on seed=3 -# run_path: ucl-dark/ipd/1hffijy2 -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-3-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.19.01.387867/generation_2900 -# EARL vs. PPO trained on seed=25 -run_path: ucl-dark/ipd/1ui7wfop -model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-25-OpenES-pop-size-1000-num-opps-1/2022-09-15_02.32.16.559924/generation_2900 - # PPO agent parameters ppo: num_minibatches: 4 diff --git a/pax/conf/experiment/ipd/mfos_v_tabular.yaml b/pax/conf/experiment/ipd/mfos_v_tabular.yaml index aa764724..183fb1cc 100644 --- a/pax/conf/experiment/ipd/mfos_v_tabular.yaml +++ b/pax/conf/experiment/ipd/mfos_v_tabular.yaml @@ -28,22 +28,25 @@ num_devices: 1 # Evaluation num_seeds: 20 -# # EARL vs. 
PPO trained on seed=0 -# run_path: ucl-dark/ipd/13o3v95p -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-0-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.15.31.908871/generation_2900 -# EARL vs. PPO trained on seed=1 -# run_path: ucl-dark/ipd/dopodr9n -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-1-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.15.58.912526/generation_2900 -# EARL vs. PPO trained on seed=2 -# run_path: ucl-dark/ipd/265ftn32 -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-2-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.17.00.437954/generation_2900 -# EARL vs. PPO trained on seed=3 -# run_path: ucl-dark/ipd/1hffijy2 -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-3-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.19.01.387867/generation_2900 -# EARL vs. PPO trained on seed=25 -run_path: ucl-dark/ipd/1ui7wfop -model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-25-OpenES-pop-size-1000-num-opps-1/2022-09-15_02.32.16.559924/generation_2900 +# MFOS vs. Tabular trained on seed = 0 +# run_path: ucl-dark/ipd/xyq4feoj +# model_path: exp/MFOS-vs-Tabular/run-seed-0-pop-size-1000/2022-09-28_22.45.26.138403/generation_4900 +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 +# hardstop +run_path: ucl-dark/ipd/32k6kn5v +model_path: exp/MFOS-vs-Tabular/run-seed-0-pop-size-1000/2022-09-28_14.54.58.821778/generation_4900 # PPO agent parameters ppo: num_minibatches: 4 diff --git a/pax/conf/experiment/mp/earl_v_tabular.yaml b/pax/conf/experiment/mp/earl_v_tabular.yaml index e7e79fe0..38d38cbf 100644 --- a/pax/conf/experiment/mp/earl_v_tabular.yaml +++ b/pax/conf/experiment/mp/earl_v_tabular.yaml @@ -28,21 +28,21 @@ num_devices: 1 # Evaluation num_seeds: 20 -# # EARL vs. PPO trained on seed=0 -# run_path: ucl-dark/ipd/13o3v95p -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-0-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.15.31.908871/generation_2900 -# EARL vs. PPO trained on seed=1 -# run_path: ucl-dark/ipd/dopodr9n -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-1-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.15.58.912526/generation_2900 -# EARL vs. PPO trained on seed=2 -# run_path: ucl-dark/ipd/265ftn32 -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-2-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.17.00.437954/generation_2900 -# EARL vs. PPO trained on seed=3 -# run_path: ucl-dark/ipd/1hffijy2 -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-3-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.19.01.387867/generation_2900 -# EARL vs. 
PPO trained on seed=25 -run_path: ucl-dark/ipd/1ui7wfop -model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-25-OpenES-pop-size-1000-num-opps-1/2022-09-15_02.32.16.559924/generation_2900 +# seed = 0 +# run_path: ucl-dark/mp/36w1tuju +# model_path: exp/GS-PPO_memory-vs-Tabular/run-seed-0-pop-size-1000/2022-09-25_15.52.33.707967/generation_4900 +# seed = 1 +# run_path: ucl-dark/mp/3074jksy +# model_path: exp/GS-PPO_memory-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_15.53.12.637138/generation_4900 +# seed = 2 +# run_path: ucl-dark/mp/1h18aq5c +# model_path: exp/GS-PPO_memory-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_15.53.49.913885/generation_4900 +# seed = 3 +# run_path: ucl-dark/mp/3ew2bidu +# model_path: exp/GS-PPO_memory-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_15.54.11.059838/generation_4900 +# seed = 4 +run_path: ucl-dark/mp/glpfg7zd +model_path: exp/GS-PPO_memory-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_15.55.16.444095/generation_4900 # PPO agent parameters ppo: diff --git a/pax/conf/experiment/mp/gs_v_tabular.yaml b/pax/conf/experiment/mp/gs_v_tabular.yaml index 8e7a7f0d..c67a2f90 100644 --- a/pax/conf/experiment/mp/gs_v_tabular.yaml +++ b/pax/conf/experiment/mp/gs_v_tabular.yaml @@ -28,21 +28,23 @@ num_devices: 1 # Evaluation num_seeds: 20 -# # EARL vs. PPO trained on seed=0 -# run_path: ucl-dark/ipd/13o3v95p -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-0-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.15.31.908871/generation_2900 -# EARL vs. PPO trained on seed=1 -# run_path: ucl-dark/ipd/dopodr9n -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-1-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.15.58.912526/generation_2900 -# EARL vs. PPO trained on seed=2 -# run_path: ucl-dark/ipd/265ftn32 -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-2-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.17.00.437954/generation_2900 -# EARL vs. PPO trained on seed=3 -# run_path: ucl-dark/ipd/1hffijy2 -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-3-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.19.01.387867/generation_2900 -# EARL vs. PPO trained on seed=25 -run_path: ucl-dark/ipd/1ui7wfop -model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-25-OpenES-pop-size-1000-num-opps-1/2022-09-15_02.32.16.559924/generation_2900 +# seed = 0 +# run_path: ucl-dark/mp/11v18zvw +# model_path: exp/GS-PPO-vs-Tabular/run-seed-0-pop-size-1000/2022-09-25_15.56.30.500721/generation_4900 +# # seed = 1 +# run_path: ucl-dark/mp/krvvohwg +# model_path: exp/GS-PPO-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_15.57.21.586407/generation_4900 +# # seed = 2 +# run_path: ucl-dark/mp/1yjlt34b +# model_path: exp/GS-PPO-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_15.58.28.437732/generation_4900 +# # seed = 3 +# run_path: ucl-dark/mp/20q2wjkp +# model_path: exp/GS-PPO-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_16.03.58.776658/generation_4900 +# seed = 4 +run_path: ucl-dark/mp/3gnmuhf6 +model_path: exp/GS-PPO-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_16.04.41.243205/generation_4900 + + # PPO agent parameters ppo: diff --git a/pax/conf/experiment/mp/mfos_v_tabular.yaml b/pax/conf/experiment/mp/mfos_v_tabular.yaml index 42234ce5..d9a7c43c 100644 --- a/pax/conf/experiment/mp/mfos_v_tabular.yaml +++ b/pax/conf/experiment/mp/mfos_v_tabular.yaml @@ -28,21 +28,21 @@ num_devices: 1 # Evaluation num_seeds: 20 -# # EARL vs. 
PPO trained on seed=0 -# run_path: ucl-dark/ipd/13o3v95p -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-0-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.15.31.908871/generation_2900 -# EARL vs. PPO trained on seed=1 -# run_path: ucl-dark/ipd/dopodr9n -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-1-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.15.58.912526/generation_2900 -# EARL vs. PPO trained on seed=2 -# run_path: ucl-dark/ipd/265ftn32 -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-2-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.17.00.437954/generation_2900 -# EARL vs. PPO trained on seed=3 -# run_path: ucl-dark/ipd/1hffijy2 -# model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-3-OpenES-pop-size-1000-num-opps-1/2022-09-15_00.19.01.387867/generation_2900 -# EARL vs. PPO trained on seed=25 -run_path: ucl-dark/ipd/1ui7wfop -model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-25-OpenES-pop-size-1000-num-opps-1/2022-09-15_02.32.16.559924/generation_2900 +# seed=0 +run_path: ucl-dark/mp/2erj6940 +model_path: exp/GS-MFOS-vs-Tabular/run-seed-0-pop-size-1000/2022-09-28_01.55.03.327061/generation_4900 +# seed=1 +run_path: ucl-dark/mp/fj2ximl5 +model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-28_01.55.09.949508/generation_4900 +# seed=2 +run_path: ucl-dark/mp/vcf845u1 +model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-28_01.55.17.814611/generation_4900 +# seed=3 +run_path: ucl-dark/mp/5c3buqf2 +model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-28_01.55.31.913577/generation_4900 +# seed=4 +run_path: ucl-dark/mp/1vrh77zx +model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-28_06.32.22.210096/generation_4900 # PPO agent parameters ppo: diff --git a/pax/evaluation_ipd.py b/pax/evaluation_ipd.py index a7574ad2..fcd51ccc 100644 --- a/pax/evaluation_ipd.py +++ b/pax/evaluation_ipd.py @@ -27,7 +27,7 @@ class Sample(NamedTuple): class EvalRunnerIPD: """Holds the runner's state.""" - def __init__(self, args): + def __init__(self, args, param_reshaper): self.algo = args.es.algo self.args = args self.num_opps = args.num_opps @@ -43,6 +43,7 @@ def __init__(self, args): self.model_path = args.model_path self.ipd_stats = jax.jit(ipd_visitation) self.cg_stats = jax.jit(cg_visitation) + self.param_reshaper = param_reshaper def _reshape_opp_dim(x): # x: [num_opps, num_envs ...] 
@@ -54,6 +55,7 @@ def _reshape_opp_dim(x): self.reduce_opp_dim = jax.jit(_reshape_opp_dim) + # flake8: noqa: C901 def eval_loop(self, env, agents, num_episodes, watchers): """Run training of agents in environment""" @@ -109,13 +111,13 @@ def _inner_rollout(carry, unused): traj2, ) - def _outer_rollout(carry, unused): + def _outer_rollout_fixed(carry, unused): """Runner for trial""" t1, t2, a1_state, a1_mem, a2_state, a2_mem, env_state = carry # play episode of the game vals, trajectories = jax.lax.scan( _inner_rollout, - carry, + (t1, t2, a1_state, a1_mem, a2_state, a2_mem, env_state), None, length=env.inner_episode_length, ) @@ -123,6 +125,38 @@ def _outer_rollout(carry, unused): # update second agent t1, t2, a1_state, a1_mem, a2_state, a2_mem, env_state = vals + # MFOS has to takes a meta-action for each episode + if self.args.agent1 == "MFOS": + a1_mem = a1_mem._replace(th=a1_mem.curr_th) + + return ( + t1, + t2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + ), trajectories + + def _outer_rollout_training(carry, unused): + """Runner for trial""" + t1, t2, a1_state, a1_mem, a2_state, a2_mem, env_state = carry + # play episode of the game + vals, trajectories = jax.lax.scan( + _inner_rollout, + (t1, t2, a1_state, a1_mem, a2_state, a2_mem, env_state), + None, + length=env.inner_episode_length, + ) + + # update second agent + t1, t2, a1_state, a1_mem, a2_state, a2_mem, env_state = vals + + # MFOS has to takes a meta-action for each episode + if self.args.agent1 == "MFOS": + a1_mem = a1_mem._replace(th=a1_mem.curr_th) + # do second agent update final_t2 = t2._replace( step_type=2 * jnp.ones_like(vals[1].step_type) @@ -158,8 +192,8 @@ def _outer_rollout(carry, unused): wandb.restore( name=self.model_path, run_path=self.run_path, root=os.getcwd() ) + # if self.args.agent1 == "MFOS": params = load(self.model_path) - a1_state, a1_mem = agent1._state, agent1._mem a2_state, a2_mem = agent2._state, agent2._mem @@ -174,6 +208,8 @@ def _outer_rollout(carry, unused): mean_cooperation_prob = jnp.zeros( shape=(num_seeds, env.num_trials, 5) ) + all_mean_rewards_p1 = jnp.zeros(shape=(num_seeds,)) + all_mean_rewards_p2 = jnp.zeros(shape=(num_seeds,)) for opp_i in range(num_seeds): rng, rng_run = jax.random.split(rng) @@ -190,29 +226,30 @@ def _outer_rollout(carry, unused): jax.random.split(rng, self.num_opps), a2_mem.hidden ) + training_trials = 0.5 * env.num_trials # run trials - vals, stack = jax.lax.scan( - _outer_rollout, + vals, stack1 = jax.lax.scan( + _outer_rollout_training, (*t_init, a1_state, a1_mem, a2_state, a2_mem, env_state), None, - length=env.num_trials, + length=training_trials, ) - traj_1, traj_2, a2_metrics = stack - # update outer agent - final_t1 = vals[0]._replace( - step_type=2 * jnp.ones_like(vals[0].step_type) + # hardstop part + vals, stack2 = jax.lax.scan( + _outer_rollout_fixed, + (*t_init, a1_state, a1_mem, a2_state, a2_mem, env_state), + None, + length=training_trials, ) - a1_state = vals[2] - a1_mem = vals[3] - a1_mem = agent1.batch_reset(a1_mem, True) + traj11, traj12, a2_metrics = stack1 + traj21, traj22 = stack2 + t1, t2, a1_state, a1_mem, a2_state, a2_mem, env_state = vals - # update second agent - a2_state, a2_mem = vals[4], vals[5] + a1_mem = agent1.batch_reset(a1_mem, True) t1, t2, a1_state, a1_mem, a2_state, a2_mem, env_state = vals - traj_1, traj_2, a2_metrics = stack # logging if self.args.env_type == "coin_game": @@ -224,8 +261,8 @@ def _outer_rollout(carry, unused): rewards_1 = traj_2.rewards.sum(axis=1).mean() if self.args.env_type == "ipd": - 
rewards_0 = stack[0].rewards.mean() - rewards_1 = stack[1].rewards.mean() + rewards_0 = traj11.rewards.mean() + traj21.rewards.mean() + rewards_1 = traj12.rewards.mean() + traj22.rewards.mean() elif self.args.env_type in [ "meta", @@ -235,15 +272,21 @@ def _outer_rollout(carry, unused): step_type=2 * jnp.ones_like(t1.step_type) ) env_stats = jax.tree_util.tree_map( - lambda x: x.item(), + lambda x: x.mean(), self.ipd_stats( - traj_1.observations, - traj_1.actions, + jnp.concatenate( + [traj11.observations, traj21.observations] + ), + jnp.concatenate([traj11.actions, traj21.actions]), final_t1.observation, ), ) - rewards_0 = traj_1.rewards.mean() - rewards_1 = traj_2.rewards.mean() + rewards_0 = jnp.concatenate( + [traj11.rewards, traj21.rewards] + ).mean() + rewards_1 = jnp.concatenate( + [traj12.rewards, traj22.rewards] + ).mean() else: env_stats = {} print(f"Summary | Opponent: {opp_i+1}") @@ -258,13 +301,21 @@ def _outer_rollout(carry, unused): "--------------------------------------------------------------------------" ) - inner_steps = 0 + traj1_reward = jnp.concatenate([traj11.rewards, traj21.rewards]) + traj2_reward = jnp.concatenate([traj12.rewards, traj22.rewards]) + + traj1_obs = jnp.concatenate( + [traj11.observations, traj21.observations] + ) + + traj1_actions = jnp.concatenate([traj11.actions, traj21.actions]) + for out_step in range(env.num_trials): - rewards_trial_mean_p1 = traj_1.rewards[out_step].mean() - rewards_trial_mean_p2 = traj_2.rewards[out_step].mean() + rewards_trial_mean_p1 = traj1_reward[out_step].mean() + rewards_trial_mean_p2 = traj2_reward[out_step].mean() trial_env_stats = self.ipd_stats( - traj_1.observations[out_step], - traj_1.actions[out_step], + traj1_obs[out_step], + traj1_actions[out_step], final_t1.observation[out_step], ) @@ -329,25 +380,18 @@ def _outer_rollout(carry, unused): } ) - for in_step in range(env.inner_episode_length): - rewards_step_p1 = traj_1.rewards[out_step, in_step] - rewards_step_p2 = traj_2.rewards[out_step, in_step] - if watchers: - wandb.log( - { - "eval/timestep": inner_steps + 1, - f"eval/reward_step/player_1_opp_{opp_i+1}": rewards_step_p1, - f"eval/reward_step/player_2_opp_{opp_i+1}": rewards_step_p2, - } - ) - inner_steps += 1 - mean_rewards_p1 = mean_rewards_p1.at[opp_i, out_step].set( rewards_trial_mean_p1 ) # jnp.zeros(shape=(num_iters, env.num_trials)) mean_rewards_p2 = mean_rewards_p2.at[opp_i, out_step].set( rewards_trial_mean_p2 ) + all_mean_rewards_p1 = all_mean_rewards_p1.at[opp_i].set( + rewards_0 + ) + all_mean_rewards_p2 = all_mean_rewards_p2.at[opp_i].set( + rewards_1 + ) # TODO: Remove when you move the number of iterations outside # of the eval loop into experiments.py mean_visits = mean_visits.at[opp_i, out_step, :].set( @@ -410,6 +454,24 @@ def _outer_rollout(carry, unused): ), } ) + + wandb.log( + { + "eval/meta_episode": 1, + "eval/mean_reward_over_seeds/p1": mean_rewards_p1.mean(), + "eval/mean_reward_over_seeds/p2": mean_rewards_p2.mean(), + "eval/median_reward_over_seeds/p1": jnp.median( + mean_rewards_p1.reshape(mean_rewards_p1.shape[0], -1).mean( + axis=1 + ) + ), + "eval/median_reward_over_seeds/p2": jnp.median( + mean_rewards_p2.reshape(mean_rewards_p2.shape[0], -1).mean( + axis=1 + ) + ), + } + ) for out_step in range(env.num_trials): if watchers: wandb.log( diff --git a/pax/experiment.py b/pax/experiment.py index 23d8f8a1..985bb6a5 100644 --- a/pax/experiment.py +++ b/pax/experiment.py @@ -206,8 +206,12 @@ def runner_setup(args, agents, save_dir, logger): return RunnerPretrained(args, save_dir, 
param_reshaper) if args.eval: if args.env_id == "ipd": + agent1, _ = agents.agents + param_reshaper = ParameterReshaper( + agent1._state.params, n_devices=args.num_devices + ) logger.info("Evaluating with EvalRunnerIPD") - return EvalRunnerIPD(args) + return EvalRunnerIPD(args, param_reshaper) elif args.env_id == "coin_game": logger.info("Evaluating with EvalRunnerCG") return EvalRunnerCG(args) diff --git a/pax/runner_evo.py b/pax/runner_evo.py index 4f1de965..7976dae9 100644 --- a/pax/runner_evo.py +++ b/pax/runner_evo.py @@ -52,6 +52,7 @@ def __init__(self, args, strategy, es_params, param_reshaper, save_dir): self.ipd_stats = jax.jit(ipd_visitation) self.cg_stats = jax.jit(cg_visitation) + # flake8: noqa: C901 def train_loop(self, env, agents, num_generations, watchers): """Run training of agents in environment""" @@ -107,13 +108,41 @@ def _inner_rollout(carry, unused): traj2, ) - def _outer_rollout(carry, unused): + def _outer_rollout_fixed(carry, unused): """Runner for trial""" t1, t2, a1_state, a1_mem, a2_state, a2_mem, env_state = carry # play episode of the game vals, trajectories = jax.lax.scan( _inner_rollout, - carry, + (t1, t2, a1_state, a1_mem, a2_state, a2_mem, env_state), + None, + length=env.inner_episode_length, + ) + + # update second agent + t1, t2, a1_state, a1_mem, a2_state, a2_mem, env_state = vals + + # MFOS has to takes a meta-action for each episode + if self.args.agent1 == "MFOS": + a1_mem = a1_mem._replace(th=a1_mem.curr_th) + + return ( + t1, + t2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + ), trajectories + + def _outer_rollout_training(carry, unused): + """Runner for trial""" + t1, t2, a1_state, a1_mem, a2_state, a2_mem, env_state = carry + # play episode of the game + vals, trajectories = jax.lax.scan( + _inner_rollout, + (t1, t2, a1_state, a1_mem, a2_state, a2_mem, env_state), None, length=env.inner_episode_length, ) @@ -169,21 +198,36 @@ def evo_rollout( ), agent2._mem.hidden, ) + split = jax.random.uniform(rng_run, minval=0.2) + training_trials = int(split * env.num_trials) + non_training_trials = int((1 - split) * env.num_trials) - vals, stack = jax.lax.scan( - _outer_rollout, + vals, stack1 = jax.lax.scan( + _outer_rollout_training, (*t_init, a1_state, a1_mem, a2_state, a2_mem, env_state), None, - length=env.num_trials, + length=training_trials, ) - traj_1, traj_2, a2_metrics = stack + # hardstop part + vals, stack2 = jax.lax.scan( + _outer_rollout_fixed, + (*t_init, a1_state, a1_mem, a2_state, a2_mem, env_state), + None, + length=non_training_trials, + ) + + traj11, traj12, a2_metrics = stack1 + traj21, traj22 = stack2 t1, t2, a1_state, a1_mem, a2_state, a2_mem, env_state = vals # Fitness - fitness = traj_1.rewards.mean(axis=(0, 1, 3, 4)) - other_fitness = traj_2.rewards.mean(axis=(0, 1, 3, 4)) - + fitness = jnp.concatenate([traj11.rewards, traj21.rewards]).mean( + axis=(0, 1, 3, 4) + ) + other_fitness = jnp.concatenate( + [traj12.rewards, traj22.rewards] + ).mean(axis=(0, 1, 3, 4)) # Stats if self.args.env_type == "coin_game": env_stats = jax.tree_util.tree_map( @@ -191,8 +235,8 @@ def evo_rollout( self.cg_stats(env_state), ) - rewards_0 = traj_1.rewards.sum(axis=1).mean() - rewards_1 = traj_2.rewards.sum(axis=1).mean() + # rewards_0 = traj_1.rewards.sum(axis=1).mean() + # rewards_1 = traj_2.rewards.sum(axis=1).mean() elif self.args.env_type in [ "meta", @@ -204,13 +248,19 @@ def evo_rollout( env_stats = jax.tree_util.tree_map( lambda x: x.mean(), self.ipd_stats( - traj_1.observations, - traj_1.actions, + jnp.concatenate( + 
[traj11.observations, traj21.observations] + ), + jnp.concatenate([traj11.actions, traj21.actions]), final_t1.observation, ), ) - rewards_0 = traj_1.rewards.mean() - rewards_1 = traj_2.rewards.mean() + rewards_0 = jnp.concatenate( + [traj11.rewards, traj21.rewards] + ).mean() + rewards_1 = jnp.concatenate( + [traj12.rewards, traj22.rewards] + ).mean() return ( fitness, other_fitness, @@ -276,19 +326,19 @@ def evo_rollout( ) a1_state, a1_mem = agent1._state, agent1._mem - evo_rollout = jax.pmap( - evo_rollout, - in_axes=(0, None, None, None, None), - ) + # evo_rollout = jax.pmap( + # evo_rollout, + # in_axes=(0, None, None, None, None), + # ) for gen in range(num_gens): rng, rng_run, rng_gen, rng_key = jax.random.split(rng, 4) # Ask x, evo_state = strategy.ask(rng_gen, evo_state, es_params) params = param_reshaper.reshape(x) - if num_devices == 1: - params = jax.tree_util.tree_map( - lambda x: jax.lax.expand_dims(x, (0,)), params - ) + # if num_devices == 1: + # params = jax.tree_util.tree_map( + # lambda x: jax.lax.expand_dims(x, (0,)), params + # ) # Evo Rollout ( fitness, diff --git a/pax/runner_pretrained.py b/pax/runner_pretrained.py index 23600b92..4d2efd49 100644 --- a/pax/runner_pretrained.py +++ b/pax/runner_pretrained.py @@ -133,6 +133,8 @@ def _outer_rollout(carry, unused): final_t2 = t2._replace(step_type=2 * jnp.ones_like(t2.step_type)) + if self.args.agent1 == "MFOS": + a1_mem = a1_mem._replace(th=a1_mem.curr_th) a2_state, a2_memory, a2_metrics = agent2.batch_update( trajectories[1], final_t2, a2_state, a2_memory ) @@ -160,14 +162,16 @@ def _outer_rollout(carry, unused): name=self.model_path, run_path=self.run_path, root=os.getcwd() ) pretrained_params = load(self.model_path) - pretrained_params = self.param_reshaper.reshape_single_net( - pretrained_params - ) + # pretrained_params = self.param_reshaper.reshape_single_net( + # pretrained_params + # ) a1_state = a1_state._replace(params=pretrained_params) num_iters = max(int(num_episodes / (env.num_envs * self.num_opps)), 1) log_interval = max(num_iters / MAX_WANDB_CALLS, 5) print(f"Log Interval {log_interval}") + cum_reward_0 = 0 + cum_reward_1 = 0 # run actual loop for i in range(num_episodes): rng, rng_run = jax.random.split(rng) @@ -220,6 +224,10 @@ def _outer_rollout(carry, unused): print(f"Saving iteration {i} locally") # logging + rewards_0 = traj_1.rewards.sum(axis=1).mean() + rewards_1 = traj_2.rewards.sum(axis=1).mean() + cum_reward_0 += float(rewards_0.mean()) + cum_reward_1 += float(rewards_1.mean()) self.train_episodes += 1 if i % log_interval == 0: print(f"Episode {i}") @@ -253,6 +261,9 @@ def _outer_rollout(carry, unused): print( f"Total Episode Reward: {float(rewards_0.mean()), float(rewards_1.mean())}" ) + print( + f"Cumalative Avg. Reward: {cum_reward_0/(i+1), cum_reward_1/(i+1)}" + ) print() if watchers: @@ -274,6 +285,10 @@ def _outer_rollout(carry, unused): "train/episode_reward/player_2": float( rewards_1.mean() ), + "train/cum_avg_reward/player_1": cum_reward_0 + / (i + 1), + "train/cum_avg_reward/player_2": cum_reward_1 + / (i + 1), } | env_stats, )
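Note on the change above: the single _outer_rollout scan is split into _outer_rollout_training (the second agent still runs its between-trial update) and a "hardstop" phase, _outer_rollout_fixed (the second agent keeps acting but is never updated), and the two trajectory stacks (traj11/traj21, traj12/traj22) are concatenated before fitness and visitation stats are computed. The toy sketch below illustrates only that two-phase scan pattern; it is not the pax API — the logit policies, payoff matrix, and naive-learner update are hypothetical stand-ins, and unlike the runner code it chains the second scan from the carry left by the first.

import jax
import jax.numpy as jnp

# Toy IPD payoff matrix: rows are my action (C, D), columns are the opponent's.
PAYOFF = jnp.array([[-1.0, -3.0],
                    [0.0, -2.0]])


def expected_payoff(my_logit, their_logit):
    # Each policy is a Bernoulli over (C, D), parameterised by a single logit.
    p_me = jnp.stack([jax.nn.sigmoid(my_logit), 1.0 - jax.nn.sigmoid(my_logit)])
    p_them = jnp.stack([jax.nn.sigmoid(their_logit), 1.0 - jax.nn.sigmoid(their_logit)])
    return p_me @ PAYOFF @ p_them


def trial_training(carry, _):
    # Learning phase: the opponent takes one naive gradient step on its own payoff.
    learner, opp = carry
    r1 = expected_payoff(learner, opp)
    r2 = expected_payoff(opp, learner)
    opp = opp + 1.0 * jax.grad(expected_payoff)(opp, learner)
    return (learner, opp), (r1, r2)


def trial_fixed(carry, _):
    # Hardstop phase: identical rollout, but the opponent's parameters stay frozen.
    learner, opp = carry
    r1 = expected_payoff(learner, opp)
    r2 = expected_payoff(opp, learner)
    return (learner, opp), (r1, r2)


@jax.jit
def meta_episode(learner_logit, opp_logit):
    num_trials, train_frac = 20, 0.5
    n_train = int(train_frac * num_trials)  # lax.scan needs a static length
    carry = (learner_logit, opp_logit)
    carry, (r1_a, r2_a) = jax.lax.scan(trial_training, carry, None, length=n_train)
    carry, (r1_b, r2_b) = jax.lax.scan(trial_fixed, carry, None, length=num_trials - n_train)
    # Concatenate both phases before averaging, mirroring the traj11/traj21 handling above.
    return jnp.concatenate([r1_a, r1_b]).mean(), jnp.concatenate([r2_a, r2_b]).mean()


fitness, opp_return = meta_episode(jnp.array(1.0), jnp.array(0.0))
print(float(fitness), float(opp_return))

Because jax.lax.scan is traced with a fixed length, the size of each phase (0.5 * env.num_trials in evaluation_ipd.py, or the sampled split in runner_evo.py) must be a static, non-traced integer when the two scans are built.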