diff --git a/.github/workflows/cpu-tests.yaml b/.github/workflows/cpu-tests.yaml index 2cc88abf8..6c82a44bf 100644 --- a/.github/workflows/cpu-tests.yaml +++ b/.github/workflows/cpu-tests.yaml @@ -22,7 +22,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest, windows-latest] - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.9", "3.10", "3.11"] timeout-minutes: 20 diff --git a/benchmarks/benchmark_sb3.py b/benchmarks/benchmark_sb3.py index 5cf92b1ce..e2168b36c 100644 --- a/benchmarks/benchmark_sb3.py +++ b/benchmarks/benchmark_sb3.py @@ -27,13 +27,13 @@ # print(sb3.common.evaluation.evaluate_policy(model.policy, env)) -# Stable Baselines3 SAC - LunarLanderContinuous-v2 +# Stable Baselines3 SAC - LunarLanderContinuous-v3 # Decomment below to run SAC benchmarks # if __name__ == "__main__": # with timer("run_time", SumMetric, sync_on_compute=False): # env = sb3.common.vec_env.DummyVecEnv( -# [lambda: gym.make("LunarLanderContinuous-v2", render_mode="rgb_array") for _ in range(4)] +# [lambda: gym.make("LunarLanderContinuous-v3", render_mode="rgb_array") for _ in range(4)] # ) # model = SAC("MlpPolicy", env, verbose=0, device="cpu") # model.learn(total_timesteps=1024 * 64, log_interval=None) diff --git a/howto/select_observations.md b/howto/select_observations.md index 61a6188c6..7979f63b9 100644 --- a/howto/select_observations.md +++ b/howto/select_observations.md @@ -80,13 +80,13 @@ The algorithms that work with only vector observations are reported here: * SAC * Droq -For any of them you **must select** only the environments that provide vector observations. For instance, you can train the *SAC* algorithm on the `LunarLanderContinuous-v2` environment, but you cannot train it on the `CarRacing-v2` environment. +For any of them you **must select** only the environments that provide vector observations. 
For instance, you can train the *SAC* algorithm on the `LunarLanderContinuous-v3` environment, but you cannot train it on the `CarRacing-v3` environment. For these algorithms, you have to specify the *mlp* keys you want to encode. As usual, you have to specify them through the `mlp_keys.encoder` and `mlp_keys.decoder` arguments (in the command or the configs). -For instance, you can train a SAC agent on the `LunarLanderContinuous-v2` with the following command: +For instance, you can train a SAC agent on the `LunarLanderContinuous-v3` with the following command: ```bash -python sheeprl.py exp=sac env=gym env.id=LunarLanderContinuous-v2 algo.mlp_keys.encoder=[state] +python sheeprl.py exp=sac env=gym env.id=LunarLanderContinuous-v3 algo.mlp_keys.encoder=[state] ``` diff --git a/pyproject.toml b/pyproject.toml index c89f0a500..17e1ae2f3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,22 +24,25 @@ maintainers = [ keywords = ["reinforcement", "machine", "learning", "distributed", "production"] license = { file = "LICENSE" } readme = { file = "docs/README.md", content-type = "text/markdown" } -requires-python = ">=3.8,<3.12" +requires-python = ">=3.9,<3.12" classifiers = ["Programming Language :: Python", "Topic :: Scientific/Engineering :: Artificial Intelligence"] dependencies = [ - "gymnasium==0.29.*", - "pygame >=2.1.3", + "gymnasium==1.0.0", + "pygame>=2.1.3", "moviepy>=1.0.3", "tensorboard>=2.10", "python-dotenv>=1.0.0", "lightning>=2.0", - "lightning-utilities<=0.9", + "lightning-utilities<=0.11.9", "hydra-core==1.3.0", "torchmetrics", "rich==13.5.*", - "opencv-python==4.8.0.*", - "torch>=2.0,!=2.2.0", - "numpy<2.0" + "opencv-python==4.10.*", + "torch==2.3.1", + # Windows only: + "numpy==1.26.0; sys_platform == 'win32'", + # Non-Windows (Linux, macOS, etc.): + "numpy>=2.0.0; sys_platform != 'win32'", ] dynamic = ["version"] @@ -74,13 +77,13 @@ dev = [ "autoflake==2.2.1", "ruff==0.1.11", ] -mujoco = ["mujoco>=2.3.3", "gymnasium[mujoco]==0.29.*"] +mujoco = 
["mujoco>=2.3.3", "gymnasium[mujoco]==1.0.0"] dmc = ["dm_control>=1.0.12"] -box2d = ["gymnasium[box2d]==0.29.*"] +box2d = ["gymnasium[box2d]==1.0.0"] atari = [ - "gymnasium[atari]==0.29.*", - "gymnasium[accept-rom-license]==0.29.*", - "gymnasium[other]==0.29.*", + "gymnasium[atari]==1.0.0", + "gymnasium[accept-rom-license]==1.0.0", + "gymnasium[other]==1.0.0", ] minedojo = ["minedojo==0.1", "importlib_resources==5.12.0", "gym==0.21.0"] minerl = ["setuptools==66.0.0", "minerl==0.4.4", "gym==0.19.0"] diff --git a/sheeprl/configs/exp/sac.yaml b/sheeprl/configs/exp/sac.yaml index 505e87198..e496534c0 100644 --- a/sheeprl/configs/exp/sac.yaml +++ b/sheeprl/configs/exp/sac.yaml @@ -26,7 +26,7 @@ buffer: # Environment env: - id: LunarLanderContinuous-v2 + id: LunarLanderContinuous-v3 metric: aggregator: diff --git a/sheeprl/configs/exp/sac_benchmarks.yaml b/sheeprl/configs/exp/sac_benchmarks.yaml index 43e089457..097d16c26 100644 --- a/sheeprl/configs/exp/sac_benchmarks.yaml +++ b/sheeprl/configs/exp/sac_benchmarks.yaml @@ -10,7 +10,7 @@ run_benchmarks: True # Environment env: - id: LunarLanderContinuous-v2 + id: LunarLanderContinuous-v3 capture_video: False num_envs: 4 diff --git a/sheeprl/envs/dummy.py b/sheeprl/envs/dummy.py index fcaf74601..750ab17f0 100644 --- a/sheeprl/envs/dummy.py +++ b/sheeprl/envs/dummy.py @@ -18,7 +18,7 @@ def __init__( if self._dict_obs_space: self.observation_space = gym.spaces.Dict( { - "rgb": gym.spaces.Box(0, 256, shape=image_size, dtype=np.uint8), + "rgb": gym.spaces.Box(0, 255, shape=image_size, dtype=np.uint8), "state": gym.spaces.Box(-20, 20, shape=vector_shape, dtype=np.float32), } ) @@ -43,7 +43,7 @@ def get_obs(self) -> Dict[str, np.ndarray]: if self._dict_obs_space: return { # da sostituire con np.random.rand - "rgb": np.full(self.observation_space["rgb"].shape, self._current_step % 256, dtype=np.uint8), + "rgb": np.full(self.observation_space["rgb"].shape, self._current_step % 255, dtype=np.uint8), "state": 
np.full(self.observation_space["state"].shape, self._current_step, dtype=np.uint8), } else: diff --git a/sheeprl/envs/wrappers.py b/sheeprl/envs/wrappers.py index cc285b11b..0b042a1e3 100644 --- a/sheeprl/envs/wrappers.py +++ b/sheeprl/envs/wrappers.py @@ -25,7 +25,7 @@ class MaskVelocityWrapper(gym.ObservationWrapper): "MountainCarContinuous-v0": np.array([1]), "Pendulum-v1": np.array([2]), - "LunarLander-v2": np.array([2, 3, 5]), + "LunarLander-v3": np.array([2, 3, 5]), - "LunarLanderContinuous-v2": np.array([2, 3, 5]), + "LunarLanderContinuous-v3": np.array([2, 3, 5]), } def __init__(self, env: gym.Env): diff --git a/sheeprl/utils/env.py b/sheeprl/utils/env.py index 750d85ee5..312f37cae 100644 --- a/sheeprl/utils/env.py +++ b/sheeprl/utils/env.py @@ -107,10 +107,14 @@ def thunk() -> gym.Env: f"is allowed in {cfg.env.id}, " f"only the first one is kept: {cfg.algo.cnn_keys.encoder[0]}" ) + obs_key = "state" if encoder_mlp_keys_length > 0: - gym.wrappers.pixel_observation.STATE_KEY = cfg.algo.mlp_keys.encoder[0] - env = gym.wrappers.PixelObservationWrapper( - env, pixels_only=encoder_mlp_keys_length == 0, pixel_keys=(cfg.algo.cnn_keys.encoder[0],) + obs_key = cfg.algo.mlp_keys.encoder[0] + env = gym.wrappers.AddRenderObservation( + env, + render_only=encoder_mlp_keys_length == 0, + render_key=cfg.algo.cnn_keys.encoder[0], + obs_key=obs_key, ) else: if encoder_mlp_keys_length > 1: @@ -120,7 +124,7 @@ def thunk() -> gym.Env: f"only the first one is kept: {cfg.algo.mlp_keys.encoder[0]}" ) mlp_key = cfg.algo.mlp_keys.encoder[0] - env = gym.wrappers.TransformObservation(env, lambda obs: {mlp_key: obs}) + env = gym.wrappers.TransformObservation(env, lambda obs: {mlp_key: obs}, None) env.observation_space = gym.spaces.Dict({mlp_key: env.observation_space}) elif isinstance(env.observation_space, gym.spaces.Box) and 2 <= len(env.observation_space.shape) <= 3: # Pixel only observation @@ -136,7 +140,9 @@ def thunk() -> gym.Env: "Please set at least one cnn key in the config file: 
`algo.cnn_keys.encoder=[your_cnn_key]`" ) cnn_key = cfg.algo.cnn_keys.encoder[0] - env = gym.wrappers.TransformObservation(env, lambda obs: {cnn_key: obs}) + env = gym.wrappers.TransformObservation( + env, lambda obs: {cnn_key: obs}, None + ) env.observation_space = gym.spaces.Dict({cnn_key: env.observation_space}) if ( @@ -195,7 +201,7 @@ def transform_obs(obs: Dict[str, Any]): return obs - env = gym.wrappers.TransformObservation(env, transform_obs) + env = gym.wrappers.TransformObservation(env, transform_obs, None) for k in cnn_keys: env.observation_space[k] = gym.spaces.Box( 0, 255, (1 if cfg.env.grayscale else 3, cfg.env.screen_size, cfg.env.screen_size), np.uint8 ) @@ -222,7 +228,7 @@ def transform_obs(obs: Dict[str, Any]): if cfg.env.capture_video and rank == 0 and vector_env_idx == 0 and run_name is not None: if cfg.env.grayscale: env = GrayscaleRenderWrapper(env) - env = gym.experimental.wrappers.RecordVideoV0( + env = gym.wrappers.RecordVideo( env, os.path.join(run_name, prefix + "_videos" if prefix else "videos"), disable_logger=True ) env.metadata["render_fps"] = env.frames_per_sec diff --git a/tests/test_data/test_buffers.py b/tests/test_data/test_buffers.py index c5c069ccb..d3caee2aa 100644 --- a/tests/test_data/test_buffers.py +++ b/tests/test_data/test_buffers.py @@ -34,7 +34,7 @@ def test_replay_buffer_add_single_td_not_full(): rb.add(td1) assert not rb.full assert rb._pos == 2 - np.testing.assert_allclose(rb["a"][:2], td1["a"]) + assert np.allclose(rb["a"][:2], td1["a"]) def test_replay_buffer_add_tds(): @@ -51,7 +51,7 @@ def test_replay_buffer_add_tds(): assert rb["a"][0] == td3["a"][-2] assert rb["a"][1] == td3["a"][-1] assert rb._pos == 2 - np.testing.assert_allclose(rb["a"][2:4], td2["a"]) + assert np.allclose(rb["a"][2:4], td2["a"]) def test_replay_buffer_add_tds_exceeding_buf_size_multiple_times(): @@ -68,7 +68,7 @@ def test_replay_buffer_add_tds_exceeding_buf_size_multiple_times(): assert rb.full 
assert rb._pos == 5 remainder = len(td3["a"]) % buf_size - np.testing.assert_allclose(rb["a"][: rb._pos], td3["a"][rb.buffer_size - rb._pos + remainder :]) + assert np.allclose(rb["a"][: rb._pos], td3["a"][rb.buffer_size - rb._pos + remainder :]) def test_replay_buffer_add_single_td_size_is_not_multiple(): @@ -80,8 +80,8 @@ def test_replay_buffer_add_single_td_size_is_not_multiple(): assert rb.full assert rb._pos == 2 remainder = len(td1["a"]) % buf_size - np.testing.assert_allclose(rb["a"][:remainder], td1["a"][-remainder:]) - np.testing.assert_allclose(rb["a"][remainder:], td1["a"][-buf_size:-remainder]) + assert np.allclose(rb["a"][:remainder], td1["a"][-remainder:]) + assert np.allclose(rb["a"][remainder:], td1["a"][-buf_size:-remainder]) def test_replay_buffer_add_single_td_size_is_multiple(): @@ -92,7 +92,7 @@ def test_replay_buffer_add_single_td_size_is_multiple(): rb.add(td1) assert rb.full assert rb._pos == 0 - np.testing.assert_allclose(rb["a"], td1["a"][-buf_size:]) + assert np.allclose(rb["a"], td1["a"][-buf_size:]) def test_replay_buffer_add_replay_buffer(): diff --git a/tests/test_data/test_sequential_buffer.py b/tests/test_data/test_sequential_buffer.py index 95bf8a8b8..362a6219c 100644 --- a/tests/test_data/test_sequential_buffer.py +++ b/tests/test_data/test_sequential_buffer.py @@ -31,7 +31,7 @@ def test_seq_replay_buffer_add_tds(): assert rb.full assert rb["a"][0] == td3["a"][-2] assert rb["a"][1] == td3["a"][-1] - np.testing.assert_allclose(rb["a"][2:4], td2["a"]) + assert rb["a"][2] == td2["a"][-2] def test_seq_replay_buffer_add_single_td(): diff --git a/tests/test_envs/test_actions_as_observations.py b/tests/test_envs/test_actions_as_observations.py index 35ec9d36c..69110d2bc 100644 --- a/tests/test_envs/test_actions_as_observations.py +++ b/tests/test_envs/test_actions_as_observations.py @@ -62,7 +62,7 @@ def test_actions_as_observation_wrapper(env_id: str, num_stack, dilation): expected_actions_stack = list(expected_actions)[dilation - 1 :: 
dilation] expected_actions_stack = np.concatenate(expected_actions_stack, axis=-1).astype(np.float32) - np.testing.assert_array_equal(o["action_stack"], expected_actions_stack) + assert np.allclose(o["action_stack"], expected_actions_stack) @pytest.mark.parametrize("num_stack", [-1, 0]) diff --git a/tests/test_envs/test_frame_stack.py b/tests/test_envs/test_frame_stack.py index e7e3e825c..58e3a51f2 100644 --- a/tests/test_envs/test_frame_stack.py +++ b/tests/test_envs/test_frame_stack.py @@ -87,7 +87,7 @@ def test_framestack(num_stack, dilation): ], axis=0, ) - np.testing.assert_array_equal(obs["rgb"], expected_frame) + assert np.allclose(obs["rgb"], expected_frame) @pytest.mark.parametrize("env_id", ENVIRONMENTS.keys()) diff --git a/tests/test_envs/test_wrappers.py b/tests/test_envs/test_wrappers.py index 8189b573e..ce0b76bc6 100644 --- a/tests/test_envs/test_wrappers.py +++ b/tests/test_envs/test_wrappers.py @@ -14,7 +14,7 @@ def test_mask_velocities_fail(): with pytest.raises(NotImplementedError): - env = gym.make("CarRacing-v2") + env = gym.make("CarRacing-v3") env = MaskVelocityWrapper(env) @@ -48,7 +48,7 @@ def test_rewards_as_observation_wrapper_step_method(env_id, dict_obs_space): else: assert "obs" in obs assert "reward" in obs - np.testing.assert_array_equal(obs["reward"], np.array([0.0])) + assert np.allclose(obs["reward"], np.array([0.0])) @pytest.mark.parametrize("env_id", ENVIRONMENTS.keys()) @@ -65,7 +65,7 @@ def test_rewards_as_observation_wrapper_reset_method(env_id, dict_obs_space): else: assert "obs" in obs assert "reward" in obs - np.testing.assert_array_equal(obs["reward"], np.array([0.0])) + assert np.allclose(obs["reward"], np.array([0.0])) @pytest.mark.parametrize("amount", [-1.3, -1, 0])