From 98a089da7a5d281f8060d9a6c8b36e706d08bb96 Mon Sep 17 00:00:00 2001 From: Waris Radji Date: Tue, 21 Nov 2023 16:18:44 +0100 Subject: [PATCH] Refactorization of the repository (#379) * Move some class to rlberry-scoo and rlberry-research * Update and remove some files in agents and envs * Update the .gitignore * Updates imports 'paths' * add tests to better coverage (env with action space in Box) * add tests to better coverage (observation_space as Dict) * add tests to better coverage (check_gym_env_warnings) * increase writer coverage * add tests to better coverage (check_gym_env_warnings) * add tests to better coverage (writer) * removing old doc * update rlberry-researche -> update poetry.lock * update display on API doc * add YannBerthelot to contributor * update tests on writers --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: JulienT01 Co-authored-by: TimotheeMathieu --- .gitignore | 2 +- azure-pipelines.yml | 16 + codecov.yml | 2 - docs/api.rst | 142 +-- docs/basics/experiment_setup.rst | 4 +- docs/basics/multiprocess.rst | 4 +- docs/basics/rlberry how to.rst | 2 +- docs/contributors.rst | 4 + examples/comparison_agents.py | 4 +- examples/demo_agents/video_plot_a2c.py | 4 +- examples/demo_agents/video_plot_mbqvi.py | 4 +- examples/demo_agents/video_plot_ppo.py | 4 +- examples/demo_agents/video_plot_vi.py | 4 +- examples/demo_bandits/plot_TS_bandit.py | 4 +- .../plot_compare_index_bandits.py | 4 +- examples/demo_bandits/plot_exp3_bandit.py | 4 +- examples/demo_bandits/plot_mirror_bandit.py | 2 +- examples/demo_bandits/plot_ucb_bandit.py | 4 +- examples/demo_env/video_plot_apple_gold.py | 4 +- examples/demo_env/video_plot_chain.py | 2 +- examples/demo_env/video_plot_gridworld.py | 4 +- examples/demo_env/video_plot_pball.py | 2 +- examples/demo_env/video_plot_rooms.py | 4 +- examples/demo_env/video_plot_twinrooms.py | 4 +- examples/demo_experiment/room.yaml | 2 +- examples/demo_experiment/rsucbvi.yaml | 2 +- examples/demo_network/run_server.py | 8 +- examples/plot_agent_manager.py | 2 +- examples/plot_kernels.py | 2 +- examples/plot_writer_wrapper.py | 4 +- poetry.lock | 74 +- pyproject.toml | 2 + rlberry/agents/__init__.py | 13 - rlberry/agents/adaptiveql/__init__.py | 1 - rlberry/agents/adaptiveql/adaptiveql.py | 195 ---- rlberry/agents/adaptiveql/tree.py | 219 ----- rlberry/agents/adaptiveql/utils.py | 61 -- rlberry/agents/bandits/__init__.py | 19 - rlberry/agents/bandits/bandit_base.py | 123 --- rlberry/agents/bandits/index_agents.py | 101 --- rlberry/agents/bandits/indices.py | 421 --------- rlberry/agents/bandits/priors.py | 151 ---- rlberry/agents/bandits/randomized_agents.py | 115 --- rlberry/agents/bandits/tools/__init__.py | 1 - rlberry/agents/bandits/tools/tracker.py | 231 ----- rlberry/agents/bandits/ts_agents.py | 157 ---- rlberry/agents/dynprog/__init__.py | 1 - rlberry/agents/dynprog/utils.py | 272 ------ rlberry/agents/dynprog/value_iteration.py | 82 -- rlberry/agents/experimental/__init__.py | 0 rlberry/agents/experimental/tests/__init__.py | 0 rlberry/agents/experimental/torch/__init__.py | 0 rlberry/agents/features/__init__.py | 1 - rlberry/agents/features/feature_map.py | 29 - rlberry/agents/kernel_based/__init__.py | 2 - rlberry/agents/kernel_based/common.py | 34 - rlberry/agents/kernel_based/kernels.py | 58 -- .../agents/kernel_based/rs_kernel_ucbvi.py | 390 -------- rlberry/agents/kernel_based/rs_ucbvi.py | 332 ------- rlberry/agents/linear/__init__.py | 1 - rlberry/agents/linear/lsvi_ucb.py | 
356 -------- rlberry/agents/mbqvi/__init__.py | 1 - rlberry/agents/mbqvi/mbqvi.py | 152 ---- rlberry/agents/optql/__init__.py | 1 - rlberry/agents/optql/optql.py | 206 ----- rlberry/agents/psrl/__init__.py | 1 - rlberry/agents/psrl/psrl.py | 257 ------ rlberry/agents/rlsvi/__init__.py | 1 - rlberry/agents/rlsvi/rlsvi.py | 280 ------ rlberry/agents/tabular_rl/__init__.py | 2 - rlberry/agents/tabular_rl/qlearning.py | 127 --- rlberry/agents/tabular_rl/sarsa.py | 125 --- rlberry/agents/tests/test_adaptiveql.py | 12 - rlberry/agents/tests/test_bandits.py | 131 --- rlberry/agents/tests/test_dynprog.py | 156 ---- rlberry/agents/tests/test_kernel_based.py | 58 -- rlberry/agents/tests/test_lsvi_ucb.py | 218 ----- rlberry/agents/tests/test_mbqvi.py | 27 - rlberry/agents/tests/test_optql.py | 9 - rlberry/agents/tests/test_psrl.py | 29 - rlberry/agents/tests/test_replay.py | 28 +- rlberry/agents/tests/test_rlsvi.py | 19 - rlberry/agents/tests/test_tabular_rl.py | 33 - rlberry/agents/tests/test_ucbvi.py | 30 - rlberry/agents/torch/__init__.py | 7 - rlberry/agents/torch/a2c/__init__.py | 1 - rlberry/agents/torch/a2c/a2c.py | 338 ------- rlberry/agents/torch/dqn/__init__.py | 2 - rlberry/agents/torch/dqn/dqn.py | 513 ----------- rlberry/agents/torch/dqn/dqn_utils.py | 142 --- rlberry/agents/torch/dqn/mdqn.py | 478 ---------- rlberry/agents/torch/ppo/__init__.py | 1 - rlberry/agents/torch/ppo/ppo.py | 843 ------------------ rlberry/agents/torch/ppo/ppo_utils.py | 193 ---- rlberry/agents/torch/reinforce/__init__.py | 1 - rlberry/agents/torch/reinforce/reinforce.py | 270 ------ rlberry/agents/torch/sac/__init__.py | 1 - rlberry/agents/torch/sac/sac.py | 543 ----------- rlberry/agents/torch/sac/sac_utils.py | 38 - rlberry/agents/torch/tests/__init__.py | 0 rlberry/agents/torch/tests/test_a2c.py | 122 --- rlberry/agents/torch/tests/test_dqn.py | 138 --- rlberry/agents/torch/tests/test_factory.py | 23 - rlberry/agents/torch/tests/test_mdqn.py | 40 - rlberry/agents/torch/tests/test_ppo.py | 201 ----- rlberry/agents/torch/tests/test_reinforce.py | 49 - rlberry/agents/torch/tests/test_sac.py | 68 -- .../agents/torch/tests/test_torch_atari.py | 287 ------ .../agents/torch/tests/test_torch_models.py | 47 - .../agents/torch/tests/test_torch_training.py | 32 - rlberry/agents/torch/utils/__init__.py | 0 rlberry/agents/torch/utils/models.py | 534 ----------- rlberry/agents/torch/utils/training.py | 148 --- rlberry/agents/ucbvi/__init__.py | 1 - rlberry/agents/ucbvi/ucbvi.py | 332 ------- rlberry/agents/ucbvi/utils.py | 83 -- rlberry/agents/utils/memories.py | 59 -- rlberry/colab_utils/__init__.py | 0 rlberry/colab_utils/display_setup.py | 37 - rlberry/envs/__init__.py | 3 +- rlberry/envs/bandits/__init__.py | 3 - rlberry/envs/bandits/bandit_base.py | 115 --- rlberry/envs/bandits/corrupted_bandits.py | 90 -- rlberry/envs/bandits/stochastic_bandits.py | 58 -- rlberry/envs/benchmarks/__init__.py | 0 .../benchmarks/ball_exploration/__init__.py | 1 - .../benchmarks/ball_exploration/ball2d.py | 220 ----- .../envs/benchmarks/ball_exploration/pball.py | 482 ---------- .../benchmarks/generalization/__init__.py | 0 .../benchmarks/generalization/twinrooms.py | 185 ---- .../benchmarks/grid_exploration/__init__.py | 0 .../benchmarks/grid_exploration/apple_gold.py | 180 ---- .../benchmarks/grid_exploration/four_room.py | 130 --- .../envs/benchmarks/grid_exploration/nroom.py | 305 ------- .../benchmarks/grid_exploration/six_room.py | 151 ---- rlberry/envs/bullet3/data/__init__.py | 6 - rlberry/envs/bullet3/data/mjcf/pendulum.xml | 
28 - rlberry/envs/bullet3/data/pendulum.urdf | 51 -- .../envs/bullet3/pybullet_envs/__init__.py | 40 - .../pybullet_envs/gym_pendulum_envs.py | 80 -- .../envs/bullet3/pybullet_envs/robot_bases.py | 123 --- .../bullet3/pybullet_envs/robot_pendula.py | 46 - .../envs/classic_control/SpringCartPole.py | 604 ------------- rlberry/envs/classic_control/__init__.py | 4 - rlberry/envs/classic_control/acrobot.py | 394 -------- rlberry/envs/classic_control/mountain_car.py | 202 ----- rlberry/envs/classic_control/pendulum.py | 132 --- rlberry/envs/finite/__init__.py | 3 - rlberry/envs/finite/chain.py | 132 --- rlberry/envs/finite/gridworld.py | 490 ---------- rlberry/envs/finite/gridworld_utils.py | 70 -- rlberry/envs/{finite => }/finite_mdp.py | 0 rlberry/envs/tests/test_bandits.py | 61 -- rlberry/envs/tests/test_env_seeding.py | 14 +- rlberry/envs/tests/test_gym_make.py | 4 +- rlberry/envs/tests/test_instantiation.py | 252 ------ rlberry/envs/tests/test_spring_env.py | 104 --- rlberry/experiment/tests/room.yaml | 2 +- rlberry/experiment/tests/rsucbvi.yaml | 2 +- .../tests/test_experiment_generator.py | 2 +- rlberry/experiment/yaml_utils.py | 4 +- rlberry/exploration_tools/__init__.py | 0 rlberry/exploration_tools/discrete_counter.py | 100 --- .../online_discretization_counter.py | 189 ---- rlberry/exploration_tools/tests/__init__.py | 0 .../tests/test_discrete_counter.py | 113 --- rlberry/exploration_tools/torch/__init__.py | 0 rlberry/exploration_tools/torch/rnd.py | 212 ----- .../exploration_tools/torch/tests/__init__.py | 0 .../exploration_tools/torch/tests/test_rnd.py | 27 - rlberry/exploration_tools/typing.py | 85 -- .../uncertainty_estimator.py | 34 - rlberry/manager/__init__.py | 5 +- rlberry/manager/experiment_manager.py | 4 +- rlberry/manager/remote_experiment_manager.py | 235 ----- rlberry/manager/tests/test_comparisons.py | 2 +- .../manager/tests/test_experiment_manager.py | 26 +- .../tests/test_experiment_manager_seeding.py | 4 +- .../manager/tests/test_hyperparam_optim.py | 7 +- rlberry/manager/tests/test_plot.py | 4 +- rlberry/network/__init__.py | 0 rlberry/network/client.py | 53 -- rlberry/network/interface.py | 103 --- rlberry/network/server.py | 174 ---- rlberry/network/server_utils.py | 118 --- rlberry/network/tests/__init__.py | 0 rlberry/network/tests/conftest.py | 43 - rlberry/network/tests/test_server.py | 91 -- rlberry/network/utils.py | 83 -- rlberry/rendering/__init__.py | 3 - rlberry/rendering/common_shapes.py | 39 - rlberry/rendering/core.py | 56 -- rlberry/rendering/opengl_render2d.py | 252 ------ rlberry/rendering/pygame_render2d.py | 197 ---- rlberry/rendering/render_interface.py | 162 ---- rlberry/rendering/tests/__init__.py | 0 .../tests/test_rendering_interface.py | 125 --- rlberry/rendering/utils.py | 73 -- rlberry/tests/test_agent_extra.py | 10 +- rlberry/tests/test_agents_base.py | 29 +- rlberry/tests/test_envs.py | 14 +- .../tests/test_rlberry_main_agents_and_env.py | 133 +++ rlberry/utils/__init__.py | 1 - rlberry/utils/check_agent.py | 12 +- rlberry/utils/check_bandit_agent.py | 62 -- rlberry/utils/io.py | 33 - rlberry/utils/tests/test_check.py | 4 +- rlberry/utils/tests/test_writer.py | 10 +- rlberry/utils/writers.py | 4 +- rlberry/wrappers/tests/old_env/old_acrobot.py | 4 +- .../wrappers/tests/old_env/old_apple_gold.py | 2 +- .../wrappers/tests/old_env/old_gridworld.py | 6 +- .../tests/old_env/old_mountain_car.py | 2 +- rlberry/wrappers/tests/old_env/old_nroom.py | 2 +- rlberry/wrappers/tests/old_env/old_pball.py | 2 +- 
.../wrappers/tests/old_env/old_pendulum.py | 4 +- .../wrappers/tests/old_env/old_six_room.py | 2 +- .../wrappers/tests/old_env/old_twinrooms.py | 4 +- rlberry/wrappers/tests/test_basewrapper.py | 2 +- .../wrappers/tests/test_common_wrappers.py | 9 +- .../wrappers/tests/test_wrapper_seeding.py | 8 +- rlberry/wrappers/tests/test_writer_utils.py | 4 +- rlberry/wrappers/vis2d.py | 4 +- scripts/fetch_contributors.py | 3 +- 224 files changed, 417 insertions(+), 18516 deletions(-) delete mode 100644 rlberry/agents/adaptiveql/__init__.py delete mode 100644 rlberry/agents/adaptiveql/adaptiveql.py delete mode 100644 rlberry/agents/adaptiveql/tree.py delete mode 100644 rlberry/agents/adaptiveql/utils.py delete mode 100644 rlberry/agents/bandits/__init__.py delete mode 100644 rlberry/agents/bandits/bandit_base.py delete mode 100644 rlberry/agents/bandits/index_agents.py delete mode 100644 rlberry/agents/bandits/indices.py delete mode 100644 rlberry/agents/bandits/priors.py delete mode 100644 rlberry/agents/bandits/randomized_agents.py delete mode 100644 rlberry/agents/bandits/tools/__init__.py delete mode 100644 rlberry/agents/bandits/tools/tracker.py delete mode 100644 rlberry/agents/bandits/ts_agents.py delete mode 100644 rlberry/agents/dynprog/__init__.py delete mode 100644 rlberry/agents/dynprog/utils.py delete mode 100644 rlberry/agents/dynprog/value_iteration.py delete mode 100644 rlberry/agents/experimental/__init__.py delete mode 100644 rlberry/agents/experimental/tests/__init__.py delete mode 100644 rlberry/agents/experimental/torch/__init__.py delete mode 100644 rlberry/agents/features/__init__.py delete mode 100644 rlberry/agents/features/feature_map.py delete mode 100644 rlberry/agents/kernel_based/__init__.py delete mode 100644 rlberry/agents/kernel_based/common.py delete mode 100644 rlberry/agents/kernel_based/kernels.py delete mode 100644 rlberry/agents/kernel_based/rs_kernel_ucbvi.py delete mode 100644 rlberry/agents/kernel_based/rs_ucbvi.py delete mode 100644 rlberry/agents/linear/__init__.py delete mode 100644 rlberry/agents/linear/lsvi_ucb.py delete mode 100644 rlberry/agents/mbqvi/__init__.py delete mode 100644 rlberry/agents/mbqvi/mbqvi.py delete mode 100644 rlberry/agents/optql/__init__.py delete mode 100644 rlberry/agents/optql/optql.py delete mode 100644 rlberry/agents/psrl/__init__.py delete mode 100644 rlberry/agents/psrl/psrl.py delete mode 100644 rlberry/agents/rlsvi/__init__.py delete mode 100644 rlberry/agents/rlsvi/rlsvi.py delete mode 100644 rlberry/agents/tabular_rl/__init__.py delete mode 100644 rlberry/agents/tabular_rl/qlearning.py delete mode 100644 rlberry/agents/tabular_rl/sarsa.py delete mode 100644 rlberry/agents/tests/test_adaptiveql.py delete mode 100644 rlberry/agents/tests/test_bandits.py delete mode 100644 rlberry/agents/tests/test_dynprog.py delete mode 100644 rlberry/agents/tests/test_kernel_based.py delete mode 100644 rlberry/agents/tests/test_lsvi_ucb.py delete mode 100644 rlberry/agents/tests/test_mbqvi.py delete mode 100644 rlberry/agents/tests/test_optql.py delete mode 100644 rlberry/agents/tests/test_psrl.py delete mode 100644 rlberry/agents/tests/test_rlsvi.py delete mode 100644 rlberry/agents/tests/test_tabular_rl.py delete mode 100644 rlberry/agents/tests/test_ucbvi.py delete mode 100644 rlberry/agents/torch/__init__.py delete mode 100644 rlberry/agents/torch/a2c/__init__.py delete mode 100644 rlberry/agents/torch/a2c/a2c.py delete mode 100644 rlberry/agents/torch/dqn/__init__.py delete mode 100644 rlberry/agents/torch/dqn/dqn.py delete mode 
100644 rlberry/agents/torch/dqn/dqn_utils.py delete mode 100644 rlberry/agents/torch/dqn/mdqn.py delete mode 100644 rlberry/agents/torch/ppo/__init__.py delete mode 100644 rlberry/agents/torch/ppo/ppo.py delete mode 100644 rlberry/agents/torch/ppo/ppo_utils.py delete mode 100644 rlberry/agents/torch/reinforce/__init__.py delete mode 100644 rlberry/agents/torch/reinforce/reinforce.py delete mode 100644 rlberry/agents/torch/sac/__init__.py delete mode 100644 rlberry/agents/torch/sac/sac.py delete mode 100644 rlberry/agents/torch/sac/sac_utils.py delete mode 100644 rlberry/agents/torch/tests/__init__.py delete mode 100644 rlberry/agents/torch/tests/test_a2c.py delete mode 100644 rlberry/agents/torch/tests/test_dqn.py delete mode 100644 rlberry/agents/torch/tests/test_factory.py delete mode 100644 rlberry/agents/torch/tests/test_mdqn.py delete mode 100644 rlberry/agents/torch/tests/test_ppo.py delete mode 100644 rlberry/agents/torch/tests/test_reinforce.py delete mode 100644 rlberry/agents/torch/tests/test_sac.py delete mode 100644 rlberry/agents/torch/tests/test_torch_atari.py delete mode 100644 rlberry/agents/torch/tests/test_torch_models.py delete mode 100644 rlberry/agents/torch/tests/test_torch_training.py delete mode 100644 rlberry/agents/torch/utils/__init__.py delete mode 100644 rlberry/agents/torch/utils/models.py delete mode 100644 rlberry/agents/torch/utils/training.py delete mode 100644 rlberry/agents/ucbvi/__init__.py delete mode 100644 rlberry/agents/ucbvi/ucbvi.py delete mode 100644 rlberry/agents/ucbvi/utils.py delete mode 100644 rlberry/agents/utils/memories.py delete mode 100644 rlberry/colab_utils/__init__.py delete mode 100644 rlberry/colab_utils/display_setup.py delete mode 100644 rlberry/envs/bandits/__init__.py delete mode 100644 rlberry/envs/bandits/bandit_base.py delete mode 100644 rlberry/envs/bandits/corrupted_bandits.py delete mode 100644 rlberry/envs/bandits/stochastic_bandits.py delete mode 100644 rlberry/envs/benchmarks/__init__.py delete mode 100644 rlberry/envs/benchmarks/ball_exploration/__init__.py delete mode 100644 rlberry/envs/benchmarks/ball_exploration/ball2d.py delete mode 100644 rlberry/envs/benchmarks/ball_exploration/pball.py delete mode 100644 rlberry/envs/benchmarks/generalization/__init__.py delete mode 100644 rlberry/envs/benchmarks/generalization/twinrooms.py delete mode 100644 rlberry/envs/benchmarks/grid_exploration/__init__.py delete mode 100644 rlberry/envs/benchmarks/grid_exploration/apple_gold.py delete mode 100644 rlberry/envs/benchmarks/grid_exploration/four_room.py delete mode 100644 rlberry/envs/benchmarks/grid_exploration/nroom.py delete mode 100644 rlberry/envs/benchmarks/grid_exploration/six_room.py delete mode 100644 rlberry/envs/bullet3/data/__init__.py delete mode 100644 rlberry/envs/bullet3/data/mjcf/pendulum.xml delete mode 100644 rlberry/envs/bullet3/data/pendulum.urdf delete mode 100644 rlberry/envs/bullet3/pybullet_envs/__init__.py delete mode 100644 rlberry/envs/bullet3/pybullet_envs/gym_pendulum_envs.py delete mode 100644 rlberry/envs/bullet3/pybullet_envs/robot_bases.py delete mode 100644 rlberry/envs/bullet3/pybullet_envs/robot_pendula.py delete mode 100644 rlberry/envs/classic_control/SpringCartPole.py delete mode 100644 rlberry/envs/classic_control/__init__.py delete mode 100644 rlberry/envs/classic_control/acrobot.py delete mode 100644 rlberry/envs/classic_control/mountain_car.py delete mode 100644 rlberry/envs/classic_control/pendulum.py delete mode 100644 rlberry/envs/finite/__init__.py delete mode 100644 
rlberry/envs/finite/chain.py delete mode 100644 rlberry/envs/finite/gridworld.py delete mode 100644 rlberry/envs/finite/gridworld_utils.py rename rlberry/envs/{finite => }/finite_mdp.py (100%) delete mode 100644 rlberry/envs/tests/test_bandits.py delete mode 100644 rlberry/envs/tests/test_instantiation.py delete mode 100644 rlberry/envs/tests/test_spring_env.py delete mode 100644 rlberry/exploration_tools/__init__.py delete mode 100644 rlberry/exploration_tools/discrete_counter.py delete mode 100644 rlberry/exploration_tools/online_discretization_counter.py delete mode 100644 rlberry/exploration_tools/tests/__init__.py delete mode 100644 rlberry/exploration_tools/tests/test_discrete_counter.py delete mode 100644 rlberry/exploration_tools/torch/__init__.py delete mode 100644 rlberry/exploration_tools/torch/rnd.py delete mode 100644 rlberry/exploration_tools/torch/tests/__init__.py delete mode 100644 rlberry/exploration_tools/torch/tests/test_rnd.py delete mode 100644 rlberry/exploration_tools/typing.py delete mode 100644 rlberry/exploration_tools/uncertainty_estimator.py delete mode 100644 rlberry/manager/remote_experiment_manager.py delete mode 100644 rlberry/network/__init__.py delete mode 100644 rlberry/network/client.py delete mode 100644 rlberry/network/interface.py delete mode 100644 rlberry/network/server.py delete mode 100644 rlberry/network/server_utils.py delete mode 100644 rlberry/network/tests/__init__.py delete mode 100644 rlberry/network/tests/conftest.py delete mode 100644 rlberry/network/tests/test_server.py delete mode 100644 rlberry/network/utils.py delete mode 100644 rlberry/rendering/__init__.py delete mode 100644 rlberry/rendering/common_shapes.py delete mode 100644 rlberry/rendering/core.py delete mode 100644 rlberry/rendering/opengl_render2d.py delete mode 100644 rlberry/rendering/pygame_render2d.py delete mode 100644 rlberry/rendering/render_interface.py delete mode 100644 rlberry/rendering/tests/__init__.py delete mode 100644 rlberry/rendering/tests/test_rendering_interface.py delete mode 100644 rlberry/rendering/utils.py create mode 100644 rlberry/tests/test_rlberry_main_agents_and_env.py delete mode 100644 rlberry/utils/check_bandit_agent.py delete mode 100644 rlberry/utils/io.py diff --git a/.gitignore b/.gitignore index 533f60754..4644ebe3d 100644 --- a/.gitignore +++ b/.gitignore @@ -167,4 +167,4 @@ dmypy.json .pydevproject -profile.prof +*.prof diff --git a/azure-pipelines.yml b/azure-pipelines.yml index d11f38b93..df4118f15 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -153,6 +153,12 @@ jobs: set -xe pip install . displayName: 'Install rlberry' + + - script: | + pip install git+https://github.com/rlberry-py/rlberry-scool.git + pip install git+https://github.com/rlberry-py/rlberry-research.git + displayName: 'Install rlberry-scool and rlberry-research' + #ignore les tests qui viennent des extras : torch, experimental, stablebaselines, optuna - script: | pip install pytest==7.0.1 pytest-azurepipelines pytest-xvfb @@ -186,6 +192,11 @@ jobs: pip install . displayName: 'Install rlberry' + - script: | + pip install git+https://github.com/rlberry-py/rlberry-scool.git + pip install git+https://github.com/rlberry-py/rlberry-research.git + displayName: 'Install rlberry-scool and rlberry-research' + - script: | pip install pytest==7.0.1 pytest-azurepipelines pytest-xvfb pytest rlberry/tests/test_agents_base.py rlberry/tests/test_envs.py @@ -215,6 +226,11 @@ jobs: pip install . 
displayName: 'Install rlberry' + - script: | + pip install git+https://github.com/rlberry-py/rlberry-scool.git + pip install git+https://github.com/rlberry-py/rlberry-research.git + displayName: 'Install rlberry-scool and rlberry-research' + - script: | pip install pytest==7.0.1 pytest-azurepipelines pytest-xvfb pytest rlberry/tests/test_agents_base.py rlberry/tests/test_envs.py diff --git a/codecov.yml b/codecov.yml index 90677b227..809ad8b6c 100644 --- a/codecov.yml +++ b/codecov.yml @@ -25,5 +25,3 @@ ignore: - "./rlberry/wrappers/tests/old_env/*.py" - "./rlberry/rendering/pygame_render2d.py" - "./rlberry/colab_utils/display_setup.py" - - "./rlberry/agents/experimental/jax/**/*.py" - - "./rlberry/network/**/*.py" diff --git a/docs/api.rst b/docs/api.rst index 1969d1658..63c7b5552 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -44,25 +44,6 @@ Base classes agents.Agent agents.AgentWithSimplePolicy -Basic Agents --------------------- - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - agents.QLAgent - agents.SARSAAgent - agents.ValueIterationAgent - agents.MBQVIAgent - agents.UCBVIAgent - agents.RSUCBVIAgent - agents.RSKernelUCBVIAgent - agents.OptQLAgent - agents.LSVIUCBAgent - agents.RLSVIAgent - agents.PSRLAgent - Agent importation tools ----------------------- @@ -74,22 +55,6 @@ Agent importation tools agents.stable_baselines.StableBaselinesAgent -Torch Agents ---------------------------- - - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - agents.torch.SACAgent - agents.torch.A2CAgent - agents.torch.PPOAgent - agents.torch.DQNAgent - agents.torch.MunchausenDQNAgent - agents.torch.REINFORCEAgent - - Environments ============ @@ -116,23 +81,6 @@ Spaces spaces.MultiBinary spaces.Dict -Benchmark Environments ----------------------- - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - envs.Acrobot - envs.benchmarks.ball_exploration.PBall2D - envs.benchmarks.generalization.twinrooms.TwinRooms - envs.benchmarks.grid_exploration.apple_gold.AppleGold - envs.benchmarks.grid_exploration.nroom.NRoom - envs.classic_control.MountainCar - envs.SpringCartPole - envs.finite.Chain - envs.finite.GridWorld - Environment tools ----------------- @@ -171,6 +119,7 @@ Manager Utilitis .. autosummary:: :toctree: generated/ :template: function.rst + manager.preset_manager @@ -208,16 +157,6 @@ Logging Utilities utils.logging.set_level -Typing ------- - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - types.Env - - Environment Wrappers ==================== @@ -230,82 +169,3 @@ Environment Wrappers wrappers.RescaleRewardWrapper wrappers.vis2d.Vis2dWrapper wrappers.WriterWrapper - - -Neural Networks -=============== - - -Torch ------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - agents.torch.utils.training.model_factory - utils.torch.choose_device - - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - agents.torch.utils.models.MultiLayerPerceptron - agents.torch.utils.models.ConvolutionalNetwork - agents.torch.utils.models.DuelingNetwork - agents.torch.utils.models.Table - - -Bandits -======= - -Bandit environments -------------------- - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - envs.bandits.AdversarialBandit - envs.bandits.Bandit - envs.bandits.BernoulliBandit - envs.bandits.NormalBandit - envs.bandits.CorruptedNormalBandit - -Bandit algorithms ------------------ -The bandits algorithms use mainly the following tracker tool: - -.. 
autosummary:: - :toctree: generated/ - :template: class.rst - - agents.bandits.tools.BanditTracker - -Some general class of bandit algorithms are provided. - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - agents.bandits.BanditWithSimplePolicy - agents.bandits.IndexAgent - agents.bandits.RandomizedAgent - agents.bandits.TSAgent - -A number of indices are provided to use in bandits algorithms: - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - agents.bandits.makeBoundedIMEDIndex - agents.bandits.makeBoundedMOSSIndex - agents.bandits.makeBoundedNPTSIndex - agents.bandits.makeBoundedUCBIndex - agents.bandits.makeBoundedUCBVIndex - agents.bandits.makeETCIndex - agents.bandits.makeEXP3Index - agents.bandits.makeSubgaussianMOSSIndex - agents.bandits.makeSubgaussianUCBIndex diff --git a/docs/basics/experiment_setup.rst b/docs/basics/experiment_setup.rst index e4d82e96d..a13fa600e 100644 --- a/docs/basics/experiment_setup.rst +++ b/docs/basics/experiment_setup.rst @@ -36,7 +36,7 @@ This can be done very succinctly as in the example below: .. code-block:: yaml - constructor: 'rlberry.envs.benchmarks.grid_exploration.nroom.NRoom' + constructor: 'rlberry_research.envs.benchmarks.grid_exploration.nroom.NRoom' params: reward_free: false array_observation: true @@ -46,7 +46,7 @@ This can be done very succinctly as in the example below: .. code-block:: yaml - agent_class: 'rlberry.agents.kernel_based.rs_ucbvi.RSUCBVIAgent' + agent_class: 'rlberry_research.agents.kernel_based.rs_ucbvi.RSUCBVIAgent' init_kwargs: gamma: 1.0 lp_metric: 2 diff --git a/docs/basics/multiprocess.rst b/docs/basics/multiprocess.rst index de25cae22..ae65ad2a7 100644 --- a/docs/basics/multiprocess.rst +++ b/docs/basics/multiprocess.rst @@ -29,9 +29,9 @@ The advised method of parallelization is spawn (parameter :code:`mp_context="spa .. code:: python - from rlberry.agents.torch import A2CAgent + from rlberry_research.agents.torch import A2CAgent from rlberry.manager import ExperimentManager - from rlberry.envs.benchmarks.ball_exploration import PBall2D + from rlberry_research.envs.benchmarks.ball_exploration import PBall2D n_steps = 1e5 batch_size = 256 diff --git a/docs/basics/rlberry how to.rst b/docs/basics/rlberry how to.rst index f84e79c5f..dfa443df3 100644 --- a/docs/basics/rlberry how to.rst +++ b/docs/basics/rlberry how to.rst @@ -6,7 +6,7 @@ Libraries import numpy as np import pandas as pd from rlberry.agents import ValueIterationAgent, AgentWithSimplePolicy - from rlberry.envs import GridWorld + from rlberry_research.envs import GridWorld from rlberry.manager import ExperimentManager, evaluate_agents diff --git a/docs/contributors.rst b/docs/contributors.rst index 85b1b29ac..ae2a9edd4 100644 --- a/docs/contributors.rst +++ b/docs/contributors.rst @@ -65,4 +65,8 @@

Riccardo Della Vecchia

+
+
+YannBerthelot

+
diff --git a/examples/comparison_agents.py b/examples/comparison_agents.py index 897dbb014..113be1645 100644 --- a/examples/comparison_agents.py +++ b/examples/comparison_agents.py @@ -15,9 +15,9 @@ from rlberry.manager.comparison import compare_agents from rlberry.manager import AgentManager -from rlberry.envs.bandits import BernoulliBandit +from rlberry_research.envs.bandits import BernoulliBandit from rlberry.wrappers import WriterWrapper -from rlberry.agents.bandits import ( +from rlberry_research.agents.bandits import ( IndexAgent, makeBoundedMOSSIndex, makeBoundedNPTSIndex, diff --git a/examples/demo_agents/video_plot_a2c.py b/examples/demo_agents/video_plot_a2c.py index 6e20c537f..80d7a4fe8 100644 --- a/examples/demo_agents/video_plot_a2c.py +++ b/examples/demo_agents/video_plot_a2c.py @@ -11,8 +11,8 @@ """ # sphinx_gallery_thumbnail_path = 'thumbnails/video_plot_a2c.jpg' -from rlberry.agents.torch import A2CAgent -from rlberry.envs.benchmarks.ball_exploration import PBall2D +from rlberry_research.agents.torch import A2CAgent +from rlberry_research.envs.benchmarks.ball_exploration import PBall2D from gymnasium.wrappers import TimeLimit diff --git a/examples/demo_agents/video_plot_mbqvi.py b/examples/demo_agents/video_plot_mbqvi.py index 906aec11e..d98ddcf77 100644 --- a/examples/demo_agents/video_plot_mbqvi.py +++ b/examples/demo_agents/video_plot_mbqvi.py @@ -10,8 +10,8 @@ """ # sphinx_gallery_thumbnail_path = 'thumbnails/video_plot_mbqvi.jpg' -from rlberry.agents.mbqvi import MBQVIAgent -from rlberry.envs.finite import GridWorld +from rlberry_scool.agents.mbqvi import MBQVIAgent +from rlberry_research.envs.finite import GridWorld params = {} params["n_samples"] = 100 # samples per state-action pair diff --git a/examples/demo_agents/video_plot_ppo.py b/examples/demo_agents/video_plot_ppo.py index 47e4c6629..8834c7960 100644 --- a/examples/demo_agents/video_plot_ppo.py +++ b/examples/demo_agents/video_plot_ppo.py @@ -11,8 +11,8 @@ """ # sphinx_gallery_thumbnail_path = 'thumbnails/video_plot_a2c.jpg' -from rlberry.agents.torch import PPOAgent -from rlberry.envs.benchmarks.ball_exploration import PBall2D +from rlberry_research.agents.torch import PPOAgent +from rlberry_research.envs.benchmarks.ball_exploration import PBall2D env = PBall2D() diff --git a/examples/demo_agents/video_plot_vi.py b/examples/demo_agents/video_plot_vi.py index ce84a0dbc..65f4e4b8f 100644 --- a/examples/demo_agents/video_plot_vi.py +++ b/examples/demo_agents/video_plot_vi.py @@ -11,8 +11,8 @@ """ # sphinx_gallery_thumbnail_path = 'thumbnails/video_plot_vi.jpg' -from rlberry.agents.dynprog import ValueIterationAgent -from rlberry.envs.finite import Chain +from rlberry_research.agents.dynprog import ValueIterationAgent +from rlberry_research.envs.finite import Chain env = Chain() agent = ValueIterationAgent(env, gamma=0.95) diff --git a/examples/demo_bandits/plot_TS_bandit.py b/examples/demo_bandits/plot_TS_bandit.py index 599033dbc..41de68770 100644 --- a/examples/demo_bandits/plot_TS_bandit.py +++ b/examples/demo_bandits/plot_TS_bandit.py @@ -11,8 +11,8 @@ """ import numpy as np -from rlberry.envs.bandits import BernoulliBandit, NormalBandit -from rlberry.agents.bandits import ( +from rlberry_research.envs.bandits import BernoulliBandit, NormalBandit +from rlberry_research.agents.bandits import ( IndexAgent, TSAgent, makeBoundedUCBIndex, diff --git a/examples/demo_bandits/plot_compare_index_bandits.py b/examples/demo_bandits/plot_compare_index_bandits.py index f089c5ac3..25e520aa3 100644 --- 
a/examples/demo_bandits/plot_compare_index_bandits.py +++ b/examples/demo_bandits/plot_compare_index_bandits.py @@ -8,10 +8,10 @@ """ import numpy as np import matplotlib.pyplot as plt -from rlberry.envs.bandits import BernoulliBandit +from rlberry_research.envs.bandits import BernoulliBandit from rlberry.manager import ExperimentManager, plot_writer_data from rlberry.wrappers import WriterWrapper -from rlberry.agents.bandits import ( +from rlberry_research.agents.bandits import ( IndexAgent, RandomizedAgent, makeBoundedIMEDIndex, diff --git a/examples/demo_bandits/plot_exp3_bandit.py b/examples/demo_bandits/plot_exp3_bandit.py index f4716a219..7452f85b3 100644 --- a/examples/demo_bandits/plot_exp3_bandit.py +++ b/examples/demo_bandits/plot_exp3_bandit.py @@ -8,8 +8,8 @@ """ import numpy as np -from rlberry.envs.bandits import AdversarialBandit -from rlberry.agents.bandits import ( +from rlberry_research.envs.bandits import AdversarialBandit +from rlberry_research.agents.bandits import ( RandomizedAgent, TSAgent, makeEXP3Index, diff --git a/examples/demo_bandits/plot_mirror_bandit.py b/examples/demo_bandits/plot_mirror_bandit.py index 4e9b9757d..a89602943 100644 --- a/examples/demo_bandits/plot_mirror_bandit.py +++ b/examples/demo_bandits/plot_mirror_bandit.py @@ -16,7 +16,7 @@ from rlberry.manager import ExperimentManager, read_writer_data from rlberry.envs.interface import Model -from rlberry.agents.bandits import BanditWithSimplePolicy +from rlberry_research.agents.bandits import BanditWithSimplePolicy from rlberry.wrappers import WriterWrapper import rlberry.spaces as spaces diff --git a/examples/demo_bandits/plot_ucb_bandit.py b/examples/demo_bandits/plot_ucb_bandit.py index 92b9d8ae2..43e4d1e70 100644 --- a/examples/demo_bandits/plot_ucb_bandit.py +++ b/examples/demo_bandits/plot_ucb_bandit.py @@ -7,8 +7,8 @@ """ import numpy as np -from rlberry.envs.bandits import NormalBandit -from rlberry.agents.bandits import IndexAgent, makeSubgaussianUCBIndex +from rlberry_research.envs.bandits import NormalBandit +from rlberry_research.agents.bandits import IndexAgent, makeSubgaussianUCBIndex from rlberry.manager import ExperimentManager, plot_writer_data import matplotlib.pyplot as plt from rlberry.wrappers import WriterWrapper diff --git a/examples/demo_env/video_plot_apple_gold.py b/examples/demo_env/video_plot_apple_gold.py index 74282cca4..9e6eb34c6 100644 --- a/examples/demo_env/video_plot_apple_gold.py +++ b/examples/demo_env/video_plot_apple_gold.py @@ -10,8 +10,8 @@ """ # sphinx_gallery_thumbnail_path = 'thumbnails/video_plot_apple_gold.jpg' -from rlberry.envs.benchmarks.grid_exploration.apple_gold import AppleGold -from rlberry.agents.dynprog import ValueIterationAgent +from rlberry_research.envs.benchmarks.grid_exploration.apple_gold import AppleGold +from rlberry_research.agents.dynprog import ValueIterationAgent env = AppleGold(reward_free=False, array_observation=False) diff --git a/examples/demo_env/video_plot_chain.py b/examples/demo_env/video_plot_chain.py index 6437d3988..42c2b3c8b 100644 --- a/examples/demo_env/video_plot_chain.py +++ b/examples/demo_env/video_plot_chain.py @@ -11,7 +11,7 @@ # sphinx_gallery_thumbnail_path = 'thumbnails/video_plot_chain.jpg' -from rlberry.envs.finite import Chain +from rlberry_research.envs.finite import Chain env = Chain(10, 0.1) env.enable_rendering() diff --git a/examples/demo_env/video_plot_gridworld.py b/examples/demo_env/video_plot_gridworld.py index 129e5a7e6..872b46fbb 100644 --- a/examples/demo_env/video_plot_gridworld.py +++ 
b/examples/demo_env/video_plot_gridworld.py @@ -12,8 +12,8 @@ """ # sphinx_gallery_thumbnail_path = 'thumbnails/video_plot_gridworld.jpg' -from rlberry.agents.dynprog import ValueIterationAgent -from rlberry.envs.finite import GridWorld +from rlberry_research.agents.dynprog import ValueIterationAgent +from rlberry_research.envs.finite import GridWorld env = GridWorld(7, 10, walls=((2, 2), (3, 3))) diff --git a/examples/demo_env/video_plot_pball.py b/examples/demo_env/video_plot_pball.py index af6c7c637..e9765fa5f 100644 --- a/examples/demo_env/video_plot_pball.py +++ b/examples/demo_env/video_plot_pball.py @@ -11,7 +11,7 @@ # sphinx_gallery_thumbnail_path = 'thumbnails/video_plot_pball.jpg' import numpy as np -from rlberry.envs.benchmarks.ball_exploration import PBall2D +from rlberry_research.envs.benchmarks.ball_exploration import PBall2D p = 5 A = np.array([[1.0, 0.1], [-0.1, 1.0]]) diff --git a/examples/demo_env/video_plot_rooms.py b/examples/demo_env/video_plot_rooms.py index 9cee6bf6f..5119c8957 100644 --- a/examples/demo_env/video_plot_rooms.py +++ b/examples/demo_env/video_plot_rooms.py @@ -10,8 +10,8 @@ """ # sphinx_gallery_thumbnail_path = 'thumbnails/video_plot_rooms.jpg' -from rlberry.envs.benchmarks.grid_exploration.nroom import NRoom -from rlberry.agents.dynprog import ValueIterationAgent +from rlberry_research.envs.benchmarks.grid_exploration.nroom import NRoom +from rlberry_scool.agents.dynprog import ValueIterationAgent env = NRoom( nrooms=9, diff --git a/examples/demo_env/video_plot_twinrooms.py b/examples/demo_env/video_plot_twinrooms.py index 22c36683a..f8ae6ab11 100644 --- a/examples/demo_env/video_plot_twinrooms.py +++ b/examples/demo_env/video_plot_twinrooms.py @@ -10,8 +10,8 @@ """ # sphinx_gallery_thumbnail_path = 'thumbnails/video_plot_twinrooms.jpg' -from rlberry.envs.benchmarks.generalization.twinrooms import TwinRooms -from rlberry.agents.mbqvi import MBQVIAgent +from rlberry_research.envs.benchmarks.generalization.twinrooms import TwinRooms +from rlberry_scool.agents.mbqvi import MBQVIAgent from rlberry.wrappers.discretize_state import DiscretizeStateWrapper from rlberry.seeding import Seeder diff --git a/examples/demo_experiment/room.yaml b/examples/demo_experiment/room.yaml index 3223015c2..977d239ed 100644 --- a/examples/demo_experiment/room.yaml +++ b/examples/demo_experiment/room.yaml @@ -3,7 +3,7 @@ # Demo: room.yaml # ===================== # """ -constructor: 'rlberry.envs.benchmarks.grid_exploration.nroom.NRoom' +constructor: 'rlberry_research.envs.benchmarks.grid_exploration.nroom.NRoom' params: reward_free: false array_observation: true diff --git a/examples/demo_experiment/rsucbvi.yaml b/examples/demo_experiment/rsucbvi.yaml index 25cfadebc..e4a47b69e 100644 --- a/examples/demo_experiment/rsucbvi.yaml +++ b/examples/demo_experiment/rsucbvi.yaml @@ -3,7 +3,7 @@ # Demo: rsucbvi.yaml # ===================== # """ -agent_class: 'rlberry.agents.kernel_based.rs_ucbvi.RSUCBVIAgent' +agent_class: 'rlberry_research.agents.kernel_based.rs_ucbvi.RSUCBVIAgent' init_kwargs: gamma: 1.0 lp_metric: 2 diff --git a/examples/demo_network/run_server.py b/examples/demo_network/run_server.py index c1b6a15b5..a8b9b04c7 100644 --- a/examples/demo_network/run_server.py +++ b/examples/demo_network/run_server.py @@ -3,11 +3,11 @@ Demo: run_server ===================== """ -from rlberry.network.interface import ResourceItem -from rlberry.network.server import BerryServer +from rlberry_research.network.interface import ResourceItem +from rlberry_research.network.server import 
BerryServer from rlberry.agents import ValueIterationAgent -from rlberry.agents.torch import REINFORCEAgent, A2CAgent -from rlberry.envs import GridWorld, gym_make +from rlberry_research.agents.torch import REINFORCEAgent, A2CAgent +from rlberry_research.envs import GridWorld, gym_make from rlberry.utils.writers import DefaultWriter if __name__ == "__main__": diff --git a/examples/plot_agent_manager.py b/examples/plot_agent_manager.py index 338ee417d..076ccc5fd 100644 --- a/examples/plot_agent_manager.py +++ b/examples/plot_agent_manager.py @@ -17,7 +17,7 @@ Finally, we compare with a baseline provided by a random policy using the Agent Manager class which trains, evaluates and gathers statistics about the two agents. """ -from rlberry.envs import GridWorld +from rlberry_research.envs import GridWorld # Create a grid world environment and an agent with a value iteration policy env_ctor = GridWorld diff --git a/examples/plot_kernels.py b/examples/plot_kernels.py index 84b2b2cfc..6557426e2 100644 --- a/examples/plot_kernels.py +++ b/examples/plot_kernels.py @@ -8,7 +8,7 @@ import matplotlib.pyplot as plt import numpy as np -from rlberry.agents.kernel_based.kernels import kernel_func +from rlberry_research.agents.kernel_based.kernels import kernel_func kernel_types = [ "uniform", diff --git a/examples/plot_writer_wrapper.py b/examples/plot_writer_wrapper.py index 069d4de00..63bcad943 100644 --- a/examples/plot_writer_wrapper.py +++ b/examples/plot_writer_wrapper.py @@ -23,9 +23,9 @@ import numpy as np from rlberry.wrappers import WriterWrapper -from rlberry.envs import GridWorld +from rlberry_research.envs import GridWorld from rlberry.manager import plot_writer_data, ExperimentManager -from rlberry.agents import UCBVIAgent +from rlberry_scool.agents import UCBVIAgent import matplotlib.pyplot as plt # We wrape the default writer of the agent in a WriterWrapper diff --git a/poetry.lock b/poetry.lock index 38d4e07b7..9f1db6b27 100644 --- a/poetry.lock +++ b/poetry.lock @@ -26,7 +26,7 @@ files = [ name = "ale-py" version = "0.8.1" description = "The Arcade Learning Environment (ALE) - a platform for AI research." 
-optional = true +optional = false python-versions = ">=3.7" files = [ {file = "ale_py-0.8.1-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:b2aa2f69a4169742800615970efe6914fa856e33eaf7fa9133c0e06a617a80e2"}, @@ -82,7 +82,7 @@ tz = ["python-dateutil"] name = "autorom" version = "0.4.2" description = "Automated installation of Atari ROMs for Gym/ALE-Py" -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "AutoROM-0.4.2-py3-none-any.whl", hash = "sha256:719c9d363ef08391fdb7003d70df235b68f36de628d289a946c4a59a3adefa13"}, @@ -102,7 +102,7 @@ accept-rom-license = ["AutoROM.accept-rom-license"] name = "autorom-accept-rom-license" version = "0.6.1" description = "Automated installation of Atari ROMs for Gym/ALE-Py" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "AutoROM.accept-rom-license-0.6.1.tar.gz", hash = "sha256:0c905a708d634a076f686802f672817d3585259ce3be0bde8713a4fb59e3159e"}, @@ -1343,6 +1343,16 @@ files = [ {file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win32.whl", hash = "sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f67c7038d560d92149c060157d623c542173016c4babc0c1913cca0564b9939"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9aad3c1755095ce347e26488214ef77e0485a3c34a50c5a5e2471dff60b9dd9c"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:14ff806850827afd6b07a5f32bd917fb7f45b046ba40c57abdb636674a8b559c"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8f9293864fe09b8149f0cc42ce56e3f0e54de883a9de90cd427f191c346eb2e1"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-win32.whl", hash = "sha256:715d3562f79d540f251b99ebd6d8baa547118974341db04f5ad06d5ea3eb8007"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707"}, @@ -2365,6 +2375,7 @@ files = [ {file = 
"PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, + {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -2372,8 +2383,15 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, + {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, + {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, + {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, + {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -2390,6 +2408,7 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = 
"PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, + {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -2397,6 +2416,7 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, + {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ -2441,6 +2461,46 @@ requests = ">=2.0.0" [package.extras] rsa = ["oauthlib[signedtoken] (>=3.0.0)"] +[[package]] +name = "rlberry-research" +version = "0.6.1" +description = "Algorithms and envs for research with rlberry" +optional = false +python-versions = "*" +files = [] +develop = false + +[package.dependencies] +rlberry = {git = "https://github.com/rlberry-py/rlberry"} + +[package.extras] +deploy = ["sphinx", "sphinx_rtd_theme"] +torch-agents = ["torch (>=1.6.0)"] + +[package.source] +type = "git" +url = "https://github.com/rlberry-py/rlberry-research.git" +reference = "HEAD" +resolved_reference = "973358e77d4e931361b4bb955e295b1537f5e7e9" + +[[package]] +name = "rlberry-scool" +version = "0.5.0.post29.dev0+2b871b8" +description = "Teaching Reinforcement Learning made easy" +optional = false +python-versions = "*" +files = [] +develop = false + +[package.dependencies] +rlberry = "*" + +[package.source] +type = "git" +url = "https://github.com/rlberry-py/rlberry-scool.git" +reference = "HEAD" +resolved_reference = "b534a999289909c6c1b589658a71d22490452de7" + [[package]] name = "rsa" version = "4.9" @@ -2538,7 +2598,7 @@ testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jar name = "shimmy" version = "0.2.1" description = "API for converting popular non-gymnasium environments to a gymnasium compatible environment." 
-optional = true +optional = false python-versions = ">=3.7" files = [ {file = "Shimmy-0.2.1-py3-none-any.whl", hash = "sha256:2d7d21c4ca679a64bb452e6a4232c6b0f5dba7589f5420454ddc1f0634334334"}, @@ -3177,10 +3237,10 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [extras] doc = ["matplotlib", "myst-parser", "numpydoc", "sphinx", "sphinx-gallery", "sphinx-math-dollar", "sphinxcontrib-video"] -extras = ["ffmpeg-python", "optuna", "pyopengl", "pyvirtualdisplay"] -torch = ["Gymnasium", "ale-py", "gymnasium", "opencv-python", "stable-baselines3", "tensorboard", "torch"] +extras = ["ffmpeg-python", "numba", "optuna", "pyopengl", "pyvirtualdisplay"] +torch = ["ale-py", "opencv-python", "stable-baselines3", "tensorboard", "torch"] [metadata] lock-version = "2.0" python-versions = "^3.9, <3.13" -content-hash = "bb56a4ca54235fed2aa5c567a5921f6d27bbf6b8d3da8a2feb5c6656c735c875" +content-hash = "5edef0e50e0c75c099db79ac5e569d004998a988495ef3bc14f3fd9c6b426faa" diff --git a/pyproject.toml b/pyproject.toml index e70cfc756..660aa4270 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,6 +62,8 @@ pytest-xprocess = "^0.23.0" codecov = "^2.1.13" black = "23.9.1" pre-commit = "^3.5.0" +rlberry-research = {git = "https://github.com/rlberry-py/rlberry-research.git"} +rlberry-scool = {git = "https://github.com/rlberry-py/rlberry-scool.git"} [build-system] requires = ["poetry-core"] diff --git a/rlberry/agents/__init__.py b/rlberry/agents/__init__.py index 60fd5b8a4..49c0ff952 100644 --- a/rlberry/agents/__init__.py +++ b/rlberry/agents/__init__.py @@ -2,16 +2,3 @@ from .agent import Agent from .agent import AgentWithSimplePolicy from .agent import AgentTorch - -# Basic agents (in alphabetical order) -# basic = does not require torch, jax, etc... -from .adaptiveql import AdaptiveQLAgent -from .dynprog import ValueIterationAgent -from .kernel_based import RSUCBVIAgent, RSKernelUCBVIAgent -from .linear import LSVIUCBAgent -from .mbqvi import MBQVIAgent -from .optql import OptQLAgent -from .psrl import PSRLAgent -from .rlsvi import RLSVIAgent -from .ucbvi import UCBVIAgent -from .tabular_rl import QLAgent, SARSAAgent diff --git a/rlberry/agents/adaptiveql/__init__.py b/rlberry/agents/adaptiveql/__init__.py deleted file mode 100644 index b0498beee..000000000 --- a/rlberry/agents/adaptiveql/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .adaptiveql import AdaptiveQLAgent diff --git a/rlberry/agents/adaptiveql/adaptiveql.py b/rlberry/agents/adaptiveql/adaptiveql.py deleted file mode 100644 index 667ed54e0..000000000 --- a/rlberry/agents/adaptiveql/adaptiveql.py +++ /dev/null @@ -1,195 +0,0 @@ -import gymnasium.spaces as spaces -import numpy as np -from rlberry.agents import AgentWithSimplePolicy -from rlberry.agents.adaptiveql.tree import MDPTreePartition - -import rlberry - -logger = rlberry.logger - - -class AdaptiveQLAgent(AgentWithSimplePolicy): - """ - Adaptive Q-Learning algorithm [1]_ implemented for enviroments - with continuous (Box) states and **discrete actions**. - - .. todo:: Handle continuous actios too. - - Parameters - ---------- - env : gym.Env - Environment with continuous states and discrete actions. - gamma : double, default: 1.0 - Discount factor in [0, 1]. - horizon : int, default: 50 - Horizon of the objective function. - bonus_scale_factor : double, default: 1.0 - Constant by which to multiply the exploration bonus, controls - the level of exploration. - bonus_type : string, default: "simplified_bernstein" - Type of exploration bonus. 
Currently, only "simplified_bernstein" - is implemented. - - Attributes - ---------- - gamma : double, default: 1.0 - Discount factor in [0, 1]. - horizon : int, default: 50 - Horizon of the objective function. - bonus_scale_factor : double, default: 1.0 - Constant by which to multiply the exploration bonus, controls - the level of exploration. - bonus_type : string, default: "simplified_bernstein" - Type of exploration bonus. Currently, only "simplified_bernstein" - is implemented. - v_max : ndarray - Array of the maximum state value as a function of the Horizon. - Qtree : MDPTreePartition - Tree structure to represent the MDP model of transition. - episode : int - Number of episodes done during training of the adaptiveql agent. - - References - ---------- - .. [1] Sinclair, Sean R., Siddhartha Banerjee, and Christina Lee Yu. - "Adaptive Discretization for Episodic Reinforcement Learning in Metric Spaces." - Proceedings of the ACM on Measurement and Analysis of Computing Systems 3.3 (2019): 1-44. - - Notes - ------ - Uses the metric induced by the l-infinity norm. - """ - - name = "AdaptiveQLearning" - - def __init__( - self, - env, - gamma=1.0, - horizon=50, - bonus_scale_factor=1.0, - bonus_type="simplified_bernstein", - **kwargs - ): - AgentWithSimplePolicy.__init__(self, env, **kwargs) - - assert isinstance(self.env.observation_space, spaces.Box) - assert isinstance(self.env.action_space, spaces.Discrete) - - self.gamma = gamma - self.horizon = horizon - self.bonus_scale_factor = bonus_scale_factor - self.bonus_type = bonus_type - - # maximum value - r_range = self.env.reward_range[1] - self.env.reward_range[0] - if r_range == np.inf or r_range == 0.0: - logger.warning( - "{}: Reward range is zero or infinity. ".format(self.name) - + "Setting it to 1." 
- ) - r_range = 1.0 - - self.v_max = np.zeros(self.horizon) - self.v_max[-1] = r_range - for hh in reversed(range(self.horizon - 1)): - self.v_max[hh] = r_range + self.gamma * self.v_max[hh + 1] - - self.reset() - - def reset(self): - self.Qtree = MDPTreePartition( - self.env.observation_space, self.env.action_space, self.horizon - ) - - # info - self.episode = 0 - - def policy(self, observation): - action, _ = self.Qtree.get_argmax_and_node(observation, 0) - return action - - def _get_action_and_node(self, observation, hh): - action, node = self.Qtree.get_argmax_and_node(observation, hh) - return action, node - - def _update(self, node, state, action, next_state, reward, hh): - # split node if necessary - node_to_check = self.Qtree.update_counts(state, action, hh) - if node_to_check.n_visits >= (self.Qtree.dmax / node_to_check.radius) ** 2.0: - node_to_check.split() - assert id(node_to_check) == id(node) - - tt = node.n_visits # number of visits to the selected state-action node - - # value at next_state - value_next_state = 0 - if hh < self.horizon - 1: - value_next_state = min( - self.v_max[hh + 1], - self.Qtree.get_argmax_and_node(next_state, hh + 1)[1].qvalue, - ) - - # learning rate - alpha = (self.horizon + 1.0) / (self.horizon + tt) - - bonus = self._compute_bonus(tt, hh) - target = reward + bonus + self.gamma * value_next_state - - # update Q - node.qvalue = (1 - alpha) * node.qvalue + alpha * target - - def _compute_bonus(self, n, hh): - if self.bonus_type == "simplified_bernstein": - bonus = self.bonus_scale_factor * np.sqrt(1.0 / n) + self.v_max[hh] / n - bonus = min(bonus, self.v_max[hh]) - return bonus - else: - raise ValueError( - "Error: bonus type {} not implemented".format(self.bonus_type) - ) - - def _run_episode(self): - # interact for H steps - episode_rewards = 0 - observation, info = self.env.reset() - for hh in range(self.horizon): - action, node = self._get_action_and_node(observation, hh) - next_observation, reward, terminated, truncated, info = self.env.step( - action - ) - done = terminated or truncated - episode_rewards += reward - - self._update(node, observation, action, next_observation, reward, hh) - - observation = next_observation - if done: - break - - # update info - self.episode += 1 - - # writer - if self.writer is not None: - self.writer.add_scalar("episode_rewards", episode_rewards, self.episode) - - # return sum of rewards collected in the episode - return episode_rewards - - def fit(self, budget: int, **kwargs): - """ - Train the agent using the provided environment. - - Parameters - ---------- - budget: int - number of episodes. Each episode runs for self.horizon unless it - enconters a terminal state in which case it stops early. - """ - del kwargs - n_episodes_to_run = budget - count = 0 - while count < n_episodes_to_run: - self._run_episode() - count += 1 diff --git a/rlberry/agents/adaptiveql/tree.py b/rlberry/agents/adaptiveql/tree.py deleted file mode 100644 index 4aaeb7948..000000000 --- a/rlberry/agents/adaptiveql/tree.py +++ /dev/null @@ -1,219 +0,0 @@ -import gymnasium.spaces as spaces -import numpy as np -import matplotlib.pyplot as plt -from rlberry.agents.adaptiveql.utils import bounds_contains, split_bounds - - -class TreeNode: - """ - Node representing an l-infinity ball in R^d, that points - to sub-balls (node children). - Stores a value, a number of visits, and (possibly) rewards and transition probability - to a list of other nodes. 
- - This class is used to represent (and store data about) - a tuple (state, action, stage) = (x, a, h). - - Parameters - ---------- - bounds : numpy.ndarray - Bounds of each dimension [ [x0, y0], [x1, y1], ..., [xd, yd] ], - representing the cartesian product in R^d: - [x0, y0] X [x1, y1] X ... X [xd, yd] - depth: int - Node depth, root is at depth 0. - qvalue : double, default: 0 - Initial node Q value - n_visits : int, default = 0 - Number of visits to the node. - - """ - - def __init__(self, bounds, depth, qvalue=0.0, n_visits=0): - self.dim = len(bounds) - - self.radius = (bounds[:, 1] - bounds[:, 0]).max() / 2.0 - assert self.radius > 0.0 - - self.bounds = bounds - self.depth = depth - self.qvalue = qvalue - self.n_visits = n_visits - self.children = [] - - # - # For AdaMB - # - - # Value V, initialized as Q - self.vvalue = qvalue - # Reward estimate - self.reward_est = 0.0 - # Dictionary node_id -> transition_prob - # node_id = id(node), where id() is a built-in python function - self.transition_probs = {} - # Dictionary node_id -> node - self.transition_nodes = {} - - def is_leaf(self): - return len(self.children) == 0 - - def contains(self, x): - """Check if `x` is contained in the node/ball.""" - return bounds_contains(self.bounds, x) - - def split(self): - """Spawn children nodes by splitting the ball.""" - child_bounds = split_bounds(self.bounds) - for bounds in child_bounds: - self.children.append( - TreeNode(bounds, self.depth + 1, self.qvalue, self.n_visits) - ) - - -class TreePartition: - """ - Tree-based partition of an l-infinity ball in R^d. - - Each node is of type TreeNode. - - Parameters - ---------- - space: gym.spaces.Box - Domain of the function. - initial_value: double - Value to initialize the root node. - """ - - def __init__(self, space, initial_value=0.0): - assert isinstance(space, spaces.Box) - assert space.is_bounded() - - bounds = np.vstack((space.low, space.high)).T - self.root = TreeNode(bounds, depth=0, qvalue=initial_value) - self.dim = bounds.shape[0] - self.dmax = self.root.radius - - def traverse(self, x, update=False): - """ - Returns leaf node containing x. - - If `update=true`, increments number of visits of each - node in the path. - - Parameters - ---------- - x : numpy.ndarray - Array of shape (d,) - """ - node = self.root - - # traverse the tree until leaf - while True: - if update: - node.n_visits += 1 - if node.is_leaf(): - break - for cc in node.children: - if cc.contains(x): - node = cc - break - - # return value at leaf - return node - - def plot( - self, - fignum="tree plot", - colormap_name="cool", - max_value=10, - node=None, - root=True, - ): - """ - Visualize the function (2d domain only). - Shows the hierarchical partition. - """ - if root: - assert ( - self.dim == 2 - ), "TreePartition plot only available for 2-dimensional spaces." - node = self.root - plt.figure(fignum) - - # draw region corresponding to the leaf - if node.is_leaf(): - x0, x1 = node.bounds[0, :] - y0, y1 = node.bounds[1, :] - - colormap_fn = plt.get_cmap(colormap_name) - color = colormap_fn(node.qvalue / max_value) - rectangle = plt.Rectangle( - (x0, y0), x1 - x0, y1 - y0, ec="black", color=color - ) - plt.gca().add_patch(rectangle) - plt.axis("scaled") - - else: - for cc in node.children: - self.plot( - max_value=max_value, - colormap_name=colormap_name, - node=cc, - root=False, - ) - - -class MDPTreePartition: - """ - Set of H x A TreePartition instances. - - Used to store/manipulate a Q function, a reward function and a transition model. 
- """ - - def __init__(self, observation_space, action_space, horizon): - self.horizon = horizon - self.n_actions = action_space.n - self.trees = [] - for hh in range(horizon): - self.trees.append({}) - for aa in range(self.n_actions): - self.trees[hh][aa] = TreePartition( - observation_space, initial_value=horizon - hh - ) - - self.dmax = self.trees[0][0].dmax - - def get_argmax_and_node(self, x, hh): - """ - Returns a* = argmax_a Q_h(x, a) and the node corresponding to (x, a*). - """ - # trees for each action at hh - trees_hh = self.trees[hh] - - best_action = 0 - best_node = trees_hh[0].traverse(x, update=False) - best_val = best_node.qvalue - for aa in range(1, self.n_actions): - node = trees_hh[aa].traverse(x, update=False) - val = node.qvalue - if val > best_val: - best_val = val - best_action = aa - best_node = node - - return best_action, best_node - - def update_counts(self, x, aa, hh): - """ - Increment counters associated to (x, aa, hh) and returns the node. - """ - tree = self.trees[hh][aa] - node = tree.traverse(x, update=True) - return node - - def plot(self, a, h): - """ - Visualize Q_h(x, a) - """ - self.trees[h][a].plot(max_value=self.horizon - h) diff --git a/rlberry/agents/adaptiveql/utils.py b/rlberry/agents/adaptiveql/utils.py deleted file mode 100644 index f04f96c3c..000000000 --- a/rlberry/agents/adaptiveql/utils.py +++ /dev/null @@ -1,61 +0,0 @@ -from rlberry.utils.jit_setup import numba_jit - - -@numba_jit -def bounds_contains(bounds, x): - """ - Returns True if `x` is contained in the bounds, and False otherwise. - - Parameters - ---------- - bounds : numpy.ndarray - Array of shape (d, 2). - Bounds of each dimension [ [x0, y0], [x1, y1], ..., [xd, yd] ], - representing the following cartesian product in R^d: - [x0, y0] X [x1, y1] X ... X [xd, yd]. - x : numpy.ndarray - Array of shape (d,) - """ - dim = x.shape[0] - for dd in range(dim): - if x[dd] < bounds[dd, 0] or x[dd] > bounds[dd, 1]: - return False - return True - - -def split_bounds(bounds, dim=0): - """ - Split an array representing an l-infinity ball in R^d in R^d - into a list of 2^d arrays representing the ball split. - - Parameters - ---------- - bounds : numpy.ndarray - Array of shape (d, 2). - Bounds of each dimension [ [x0, y0], [x1, y1], ..., [xd, yd] ], - representing the cartesian product in R^d: - [x0, y0] X [x1, y1] X ... X [xd, yd]. - - dim : int, default: 0 - Dimension from which to start splitting. - - Returns - ------- - List of arrays of shape (d, 2) containing the bounds to be split. 
- """ - if dim == bounds.shape[0]: - return [bounds] - left = bounds[dim, 0] - right = bounds[dim, 1] - middle = (left + right) / 2.0 - - left_interval = bounds.copy() - right_interval = bounds.copy() - - left_interval[dim, 0] = left - left_interval[dim, 1] = middle - - right_interval[dim, 0] = middle - right_interval[dim, 1] = right - - return split_bounds(left_interval, dim + 1) + split_bounds(right_interval, dim + 1) diff --git a/rlberry/agents/bandits/__init__.py b/rlberry/agents/bandits/__init__.py deleted file mode 100644 index b35c171cf..000000000 --- a/rlberry/agents/bandits/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -from .bandit_base import BanditWithSimplePolicy -from .index_agents import IndexAgent -from .indices import ( - makeBoundedIMEDIndex, - makeBoundedMOSSIndex, - makeBoundedNPTSIndex, - makeBoundedUCBIndex, - makeBoundedUCBVIndex, - makeETCIndex, - makeEXP3Index, - makeSubgaussianMOSSIndex, - makeSubgaussianUCBIndex, -) -from .priors import ( - makeBetaPrior, - makeGaussianPrior, -) -from .randomized_agents import RandomizedAgent -from .ts_agents import TSAgent diff --git a/rlberry/agents/bandits/bandit_base.py b/rlberry/agents/bandits/bandit_base.py deleted file mode 100644 index 2558b30a2..000000000 --- a/rlberry/agents/bandits/bandit_base.py +++ /dev/null @@ -1,123 +0,0 @@ -import numpy as np -from rlberry.agents import AgentWithSimplePolicy -from .tools import BanditTracker -import pickle - -from pathlib import Path - -import rlberry - -logger = rlberry.logger - - -class BanditWithSimplePolicy(AgentWithSimplePolicy): - """ - Base class for bandits algorithms. - - The fit function must result in self.optimal_action being set for the save - and load functions to work. - - Parameters - ----------- - env: rlberry bandit environment - See :class:`~rlberry.envs.bandits.Bandit`. - - tracker_params: dict - Parameters for the tracker object, typically to decide what to store. - - """ - - name = "" - - def __init__(self, env, tracker_params={}, **kwargs): - AgentWithSimplePolicy.__init__(self, env, **kwargs) - self.n_arms = self.env.action_space.n - self.arms = np.arange(self.n_arms) - self.tracker = BanditTracker(self, tracker_params) - - @property - def total_time(self): - return self.tracker.t - - def fit(self, budget=None, **kwargs): - """ - Example fit function. Should be overwritten by your own implementation. - - Parameters - ---------- - budget: int - Total number of iterations, also called horizon. - """ - horizon = budget - rewards = np.zeros(horizon) - - for ep in range(horizon): - # choose the optimal action - # for demo purpose, we will always choose action 0 - action = 0 - _, reward, _, _, _ = self.env.step(action) - self.tracker.update(action, reward) - rewards[ep] = reward - - self.optimal_action = 0 - info = {"episode_reward": np.sum(rewards)} - return info - - def policy(self, observation): - return self.optimal_action - - def save(self, filename): - """ - Save agent object. - - Parameters - ---------- - filename: Path or str - File in which to save the Agent. - - Returns - ------- - If save() is successful, a Path object corresponding to the filename is returned. - Otherwise, None is returned. - Important: the returned filename might differ from the input filename: For instance, - the method can append the correct suffix to the name before saving. 
- - """ - - dico = { - "_writer": self.writer, - "seeder": self.seeder, - "_execution_metadata": self._execution_metadata, - "_unique_id": self._unique_id, - "_output_dir": self._output_dir, - "optimal_action": self.optimal_action, - } - - # save - filename = Path(filename).with_suffix(".pickle") - filename.parent.mkdir(parents=True, exist_ok=True) - with filename.open("wb") as ff: - pickle.dump(dico, ff) - - return filename - - @classmethod - def load(cls, filename, **kwargs): - """Load agent object. - - If overridden, save() method must also be overriden. - - Parameters - ---------- - **kwargs: dict - Arguments to required by the __init__ method of the Agent subclass. - """ - filename = Path(filename).with_suffix(".pickle") - - obj = cls(**kwargs) - with filename.open("rb") as ff: - tmp_dict = pickle.load(ff) - - obj.__dict__.update(tmp_dict) - - return obj diff --git a/rlberry/agents/bandits/index_agents.py b/rlberry/agents/bandits/index_agents.py deleted file mode 100644 index c7335aef7..000000000 --- a/rlberry/agents/bandits/index_agents.py +++ /dev/null @@ -1,101 +0,0 @@ -import numpy as np -from rlberry.agents.bandits import BanditWithSimplePolicy - - -import rlberry - -logger = rlberry.logger - -# TODO : fix bug when doing several fit, the fit do not resume. Should define -# self.rewards and self.action and resume training. - - -class IndexAgent(BanditWithSimplePolicy): - """ - Agent for bandit environment using Index-based policy like UCB. - - Parameters - ----------- - env : rlberry bandit environment - See :class:`~rlberry.envs.bandits.Bandit`. - - index_function : callable or None, default = None - Compute the index for an arm using the past rewards on this arm and - the current time t. If None, use UCB bound for Bernoulli. - - **kwargs: arguments - Arguments to be passed to :class:`~rlberry.agents.bandit.BanditWithSimplePolicy`. - In particular, one may want to pass the following parameters: - tracker_params: dict - Parameters for the tracker object, typically to decide what to store. - in particular may contain a function "update", used to define additional statistics - that have to be saved in the tracker. See :class:~rlberry.agents.bandit.BanditTracker`. - - Examples - -------- - >>> from rlberry.agents.bandits import IndexAgent - >>> import numpy as np - >>> class UCBAgent(IndexAgent): - >>> name = "UCB" - >>> def __init__(self, env, **kwargs): - >>> def index(tr): - >>> return [ - >>> tr.mu_hat(arm) - >>> + np.sqrt( - >>> np.log(tr.t ** 2) - >>> / (2 * tr.n_pulls(arm)) - >>> ) - >>> for arm in tr.arms - >>> ] - >>> IndexAgent.__init__(self, env, index, **kwargs) - - """ - - name = "IndexAgent" - - def __init__(self, env, index_function=None, **kwargs): - BanditWithSimplePolicy.__init__(self, env, **kwargs) - if index_function is None: - - def index_function(tr): - return [ - tr.mu_hat(arm) + np.sqrt(np.log(tr.t**2) / (2 * tr.n_pulls(arm))) - for arm in tr.arms - ] - - self.index_function = index_function - - def fit(self, budget=None, **kwargs): - """ - Train the bandit using the provided environment. - - Parameters - ---------- - budget: int - Total number of iterations, also called horizon. 
- """ - horizon = budget - total_reward = 0.0 - indices = np.inf * np.ones(self.n_arms) - - for ep in range(horizon): - # Warmup: play every arm one before starting computing indices - if ep < self.n_arms: - action = ep - else: - # Compute index for each arm and play the highest one - indices = self.index_function(self.tracker) - action = np.argmax(indices) - - _, reward, _, _, _ = self.env.step(action) - - # Feed the played action and the resulting reward to the tracker - self.tracker.update(action, reward) - - total_reward += reward - - # Best action in hinsight is the one with highest index - self.optimal_action = np.argmax(indices) - - info = {"episode_reward": total_reward} - return info diff --git a/rlberry/agents/bandits/indices.py b/rlberry/agents/bandits/indices.py deleted file mode 100644 index ebea3ac3f..000000000 --- a/rlberry/agents/bandits/indices.py +++ /dev/null @@ -1,421 +0,0 @@ -import numpy as np -from typing import Callable - - -def makeETCIndex(A: int = 2, m: int = 1): - """ - Explore-Then-Commit index, see Chapter 6 in [1]. - - Parameters - ---------- - A: int - Number of arms. - - m : int, default: 1 - Number of exploration pulls per arm. - - Return - ------ - Callable - ETC index. - - Dict - Extra parameters for the BanditTracker object. - By default the tracker stores the number of pulls and the - empirical average reward for each arm. If you want it to store - all rewards for instance, return {'store_rewards': True}. - - References - ---------- - .. [1] Lattimore, Tor, and Csaba Szepesvári. Bandit algorithms. - Cambridge University Press, 2020. - """ - - def index(tr): - return [-tr.n_pulls(arm) if tr.t < m * A else tr.mu_hat(arm) for arm in tr.arms] - - return index, {} - - -def makeSubgaussianUCBIndex( - sigma: float = 1.0, - delta: Callable = lambda t: 1 / (1 + (t + 1) * np.log(t + 1) ** 2), -): - """ - UCB index for sub-Gaussian distributions, see Chapters 7 & 8 in [1]. - - Parameters - ---------- - sigma : float, default: 1.0 - Sub-Gaussian parameter. - - delta: Callable, - Confidence level. Default is tuned to have asymptotically optimal - regret, see Chapter 8 in [1]. - - Return - ------ - Callable - UCB index for sigma-sub-Gaussian distributions. - - Dict - Extra parameters for the BanditTracker object. - By default the tracker stores the number of pulls and the - empirical average reward for each arm. If you want it to store - all rewards for instance, return {'store_rewards': True}. - - References - ---------- - .. [1] Lattimore, Tor, and Csaba Szepesvári. Bandit algorithms. - Cambridge University Press, 2020. - """ - - def index(tr): - return [ - tr.mu_hat(arm) - + sigma * np.sqrt(2 * np.log(1 / delta(tr.t)) / tr.n_pulls(arm)) - for arm in tr.arms - ] - - return index, {} - - -def makeBoundedUCBIndex( - lower_bound: float = 0.0, - upper_bound: float = 1.0, - delta: Callable = lambda t: 1 / (1 + (t + 1) * np.log(t + 1) ** 2), -): - """ - UCB index for bounded distributions, see Chapters 7 & 8 in [1]. - By Hoeffding's lemma, such distributions are sigma-sub-Gaussian with - sigma = (upper_bound - lower_bound) / 2. - - Parameters - ---------- - lower_bound: float, default: 0.0 - Lower bound on the rewards. - - upper_bound: float, default: 1.0 - Upper bound on the rewards. - - delta: Callable, - Confidence level. Default is tuned to have asymptotically optimal - regret, see Chapter 8 in [1]. - - Return - ------ - Callable - UCB index for bounded distributions. - - Dict - Extra parameters for the BanditTracker object. 
- By default the tracker stores the number of pulls and the - empirical average reward for each arm. If you want it to store - all rewards for instance, return {'store_rewards': True}. - - References - ---------- - .. [1] Lattimore, Tor, and Csaba Szepesvári. Bandit algorithms. - Cambridge University Press, 2020. - """ - return makeSubgaussianUCBIndex((upper_bound - lower_bound) / 2, delta) - - -def makeSubgaussianMOSSIndex(T: int = 1, A: int = 2, sigma: float = 1.0): - """ - MOSS index for sub-Gaussian distributions, see Chapters 9 in [1]. - - Parameters - ---------- - T: int - Time horizon. - - A: int - Number of arms. - - sigma : float, default: 1.0 - Sub-Gaussian parameter. - - Return - ------ - Callable - MOSS index for sigma-sub-Gaussian distributions. - - Dict - Extra parameters for the BanditTracker object. - By default the tracker stores the number of pulls and the - empirical average reward for each arm. If you want it to store - all rewards for instance, return {'store_rewards': True}. - - References - ---------- - .. [1] Lattimore, Tor, and Csaba Szepesvári. Bandit algorithms. - Cambridge University Press, 2020. - """ - - def index(tr): - return [ - tr.mu_hat(arm) - + sigma - * np.sqrt( - 4 / tr.n_pulls(arm) * np.maximum(0, np.log(T / (A * tr.n_pulls(arm)))) - ) - for arm in tr.arms - ] - - return index, {} - - -def makeBoundedMOSSIndex( - T: float = 1, A: float = 2, lower_bound: float = 0.0, upper_bound: float = 1.0 -): - """ - MOSS index for bounded distributions, see Chapters 9 in [1]. - By Hoeffding's lemma, such distributions are sigma-sub-Gaussian with - sigma = (upper_bound - lower_bound) / 2. - - Parameters - ---------- - T: int - Time horizon. - - A: int - Number of arms. - - lower_bound: float, default: 0.0 - Lower bound on the rewards. - - upper_bound: float, default: 1.0 - Upper bound on the rewards. - - Return - ------ - Callable - MOSS index for bounded distributions. - - Dict - Extra parameters for the BanditTracker object. - By default the tracker stores the number of pulls and the - empirical average reward for each arm. If you want it to store - all rewards for instance, return {'store_rewards': True}. - - References - ---------- - .. [1] Lattimore, Tor, and Csaba Szepesvári. Bandit algorithms. - Cambridge University Press, 2020. - """ - return makeSubgaussianMOSSIndex(T, A, (upper_bound - lower_bound) / 2) - - -def makeEXP3Index(): - """ - EXP3 index for distributions in [0, 1], see Chapters 11 in [1] and [2]. - - Return - ------ - Callable - EXP3 index for [0, 1] distributions. - - Dict - Extra parameters for the BanditTracker object. - By default the tracker stores the number of pulls and the - empirical average reward for each arm. If you want it to store - all rewards for instance, return {'store_rewards': True}. - - References - ---------- - .. [1] Lattimore, Tor, and Csaba Szepesvári. Bandit algorithms. - Cambridge University Press, 2020. - - .. [2] Seldin, Yevgeny, et al. Evaluation and analysis of the - performance of the EXP3 algorithm in stochastic environments. - European Workshop on Reinforcement Learning. PMLR, 2013. 
- """ - - def prob(tr): - w = np.zeros(tr.n_arms) - for arm in tr.arms: - eta = np.minimum( - np.sqrt(np.log(tr.n_arms) / (tr.n_arms * (tr.t + 1))), - 1 / tr.n_arms, - ) - w[arm] = np.exp(eta * tr.iw_total_reward(arm)) - w /= w.sum() - return (1 - tr.n_arms * eta) * w + eta * np.ones(tr.n_arms) - - return prob, {"do_iwr": True} - - -def makeBoundedIMEDIndex(upper_bound: float = 1.0): - """ - IMED index for semi-bounded distributions, see [1]. - - Parameters - ---------- - upper_bound: float, default: 1.0 - Upper bound on the rewards. - - Return - ------ - Callable - IMED index for sigma-sub-Gaussian distributions. - - Dict - Extra parameters for the BanditTracker object. - By default the tracker stores the number of pulls and the - empirical average reward for each arm. If you want it to store - all rewards for instance, return {'store_rewards': True}. - - References - ---------- - .. [1] Honda, Junya, and Akimichi Takemura. Non-asymptotic analysis of - a new bandit algorithm for semi-bounded rewards. - J. Mach. Learn. Res. 16 (2015): 3721-3756. - """ - from scipy.optimize import minimize_scalar - - def index(tr): - mu_hat_star = np.max([tr.mu_hat(arm) for arm in tr.arms]) - indices = np.zeros(tr.n_arms) - for arm in tr.arms: - X = np.array(tr.rewards(arm)) - - def dual(u): - return -np.mean(np.log(1 - (X - mu_hat_star) * u)) - - eps = 1e-12 - ret = minimize_scalar( - dual, - method="bounded", - bounds=(eps, 1.0 / (upper_bound - mu_hat_star + eps)), - ) - if ret.success: - kinf = -ret.fun - else: - # if not successful, just make this arm ineligible this turn - kinf = np.inf - - indices[arm] = -kinf * len(X) - np.log(len(X)) - return indices - - return index, {"store_rewards": True} - - -def makeBoundedNPTSIndex(upper_bound: float = 1.0): - """ - NPTS index for bounded distributions, see [1]. - - Parameters - ---------- - upper_bound: float, default: 1.0 - Upper bound on the rewards. - - - Return - ------ - Callable - NPTS index for bounded distributions. - - Dict - Extra parameters for the BanditTracker object. - By default the tracker stores the number of pulls and the - empirical average reward for each arm. If you want it to store - all rewards for instance, return {'store_rewards': True}. - - References - ---------- - .. [1] Riou, Charles, and Junya Honda. Bandit algorithms based on - thompson sampling for bounded reward distributions. - Algorithmic Learning Theory. PMLR, 2020. - - """ - - def index(tr): - indices = np.zeros(tr.n_arms) - for arm in tr.arms: - X = np.array(tr.rewards(arm)) - w = tr.rng.dirichlet(np.ones(len(X) + 1)) - indices[arm] = w[:-1] @ X + upper_bound * w[-1] - return indices - - return index, {"store_rewards": True} - - -def makeBoundedUCBVIndex( - upper_bound: float = 1.0, - c: float = 0.34, - zeta: float = 1.0, - delta: Callable = lambda t: 1 / t, -): - """ - UCBV index for bounded distributions, see [1]. In particular, the index - recommended on p10 is implemented. - The empirical variance is computed sequentially using Welford's algorithm. - Parameters - ---------- - upper_bound: float, default: 1.0 - Upper bound on the rewards. - - c: float, default: 0.34 - Parameter in UCBV algorithm. See Equation (18) in [1] - - zeta: float, default: 1.0 - Parameter in UCBV algorithm. See Equation (18) in [1] - - delta: Callable, - Confidence level. See [1]. - - Return - ------ - Callable - UCBV index for bounded distributions. - - Dict - Extra parameters for the BanditTracker object. 
- By default the tracker stores the number of pulls and the - empirical average reward for each arm. If you want it to store - all rewards for instance, return {'store_rewards': True}. - - References - ---------- - .. [1] Audibert, J. Y., Munos, R., & Szepesvári, C. (2009). - Exploration–exploitation tradeoff using variance estimates - in multi-armed bandits. Theoretical Computer Science, 410(19), 1876-1902. - - """ - - def update_fun(tr, arm): - """ - Sequentially add variance estimate to tracker - """ - if tr.n_pulls(arm) == 1: - tr.add_scalars(arm, {"v_hat": 0}) - else: - # compute variance sequentially using Welford's algorithm. - reward = tr.reward(arm) - old_muhat = (tr.total_reward(arm) - reward) / ( - tr.n_pulls(arm) - 1 - ) # compute mu at time n-1 - new_muhat = tr.mu_hat(arm) - old_vhat = tr.read_last_tag_value("v_hat", arm) - new_vhat = ( - old_vhat - + ((reward - old_muhat) * (reward - new_muhat) - old_vhat) / tr.t - ) - tr.add_scalars(arm, {"v_hat": new_vhat}) - - def index(tr): - return [ - tr.mu_hat(arm) - + np.sqrt( - 2 - * zeta - * tr.read_last_tag_value("v_hat", arm) - * np.log(1 / delta(tr.t)) - / tr.n_pulls(arm) - ) - + 3 * c * zeta * upper_bound * np.log(1 / delta(tr.t)) / tr.n_pulls(arm) - for arm in tr.arms - ] - - return index, {"update": update_fun} diff --git a/rlberry/agents/bandits/priors.py b/rlberry/agents/bandits/priors.py deleted file mode 100644 index 060fb5036..000000000 --- a/rlberry/agents/bandits/priors.py +++ /dev/null @@ -1,151 +0,0 @@ -import numpy as np - - -def makeBetaPrior(): - """ - Beta prior for Bernoulli bandits, see Chapter 3 in [1]. - - Parameters - ---------- - None - - Return - ------ - Dict - Callable - Beta sampler. - - Callable - Function that computes the parameters of the prior distribution - from the bandit tracker. - - Callable - Function that computes the optimal action from the prior distributions. - - Dict - Extra parameters for the BanditTracker object. - By default the tracker stores the number of pulls and the - empirical average reward for each arm. If you want it to store - all rewards for instance, return {'store_rewards': True}. - - References - ---------- - .. [1] Russo, Daniel J., et al. "A tutorial on Thompson Sampling." - Foundations and Trends in Machine Learning 11.1 (2018): 1-96. - """ - - def prior_params(tr): - """ - The mean of a Bernoulli arm B(p) has prior distribution Beta(a, b), - where a is the number of success + 1, b the number of failures + 1. - """ - return [ - [ - tr.total_reward(arm) + 1, - tr.n_pulls(arm) - tr.total_reward(arm) + 1, - ] - for arm in tr.arms - ] - - def prior_sampler(tr): - """ - Beta prior. - """ - params = prior_params(tr) - return [tr.rng.beta(params[arm][0], params[arm][1]) for arm in tr.arms] - - def optimal_action(tr): - """ - The mean of a Bernoulli arm B(p) has prior distribution Beta(a, b), - where a is the number of success + 1, b the number of failures + 1. - The expectation of p is a / (a + b), therefore the optimal arm w.r.t - the Beta prior is the one with highest a / (a + b). - """ - params = prior_params(tr) - return np.argmax( - [params[arm][0] / (params[arm][0] + params[arm][1]) for arm in tr.arms] - ) - - prior_info = { - "params": prior_params, - "sampler": prior_sampler, - "optimal_action": optimal_action, - } - - return prior_info, {} - - -def makeGaussianPrior(sigma: float = 1.0): - """ - Gaussian prior for Gaussian bandits with known variance, see [1]. - - Parameters - ---------- - sigma : float, default: 1.0 - Gaussian standard deviation. 
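# The UCBV helper above estimates each arm's variance sequentially with Welford's
# algorithm. Below is a standalone sketch of the classical recurrence; note that the
# removed helper normalizes by the tracker's global time t rather than the per-arm
# pull count, so its numbers can differ from this textbook form.
import numpy as np

rewards = [0.1, 0.9, 0.4, 0.7, 0.3]

n, mean, var = 0, 0.0, 0.0
for x in rewards:
    n += 1
    old_mean = mean
    mean += (x - old_mean) / n                       # running mean
    var += ((x - old_mean) * (x - mean) - var) / n   # running (population) variance
print(var, np.var(rewards))  # both give the population variance, up to floating point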
- - Return - ------ - Dict - Callable - Gaussian sampler. - - Callable - Function that computes the parameters of the prior distribution - from the bandit tracker. - - Callable - Function that computes the optimal action from the prior distributions. - - Dict - Extra parameters for the BanditTracker object. - By default the tracker stores the number of pulls and the - empirical average reward for each arm. If you want it to store - all rewards for instance, return {'store_rewards': True}. - - References - ---------- - .. [1] Korda, Nathaniel, Emilie Kaufmann, and Remi Munos. - "Thompson sampling for 1-dimensional exponential family bandits." - Advances in Neural Information Processing Systems 26 (2013). - """ - - def prior_params(tr): - """ - The mean of a Gaussian arm N(mu, sigma^2) has prior distribution - N(mu_hat, s^2), where mu_hat is the empirical average reward and - s^2 = sigma^2 / n, n being the number of pulls for this arm. - """ - return [ - [ - tr.mu_hat(arm), - sigma / np.sqrt(tr.n_pulls(arm)), - ] - for arm in tr.arms - ] - - def prior_sampler(tr): - """ - Normal prior. - """ - params = prior_params(tr) - return [tr.rng.normal(params[arm][0], params[arm][1]) for arm in tr.arms] - - def optimal_action(tr): - """ - The mean of a Gaussian arm N(mu, sigma^2) has prior distribution - N(mu_hat, s^2), where mu_hat is the empirical average reward and - s^2 = sigma^2 / n, n being the number of pulls for this arm. - The expectation of mu is mu_hat, therefore the optimal arm w.r.t - the Gaussian prior is the one with highest mu_hat. - """ - params = prior_params(tr) - return np.argmax([params[arm][0] for arm in tr.arms]) - - prior_info = { - "params": prior_params, - "sampler": prior_sampler, - "optimal_action": optimal_action, - } - - return prior_info, {} diff --git a/rlberry/agents/bandits/randomized_agents.py b/rlberry/agents/bandits/randomized_agents.py deleted file mode 100644 index 76a82c97f..000000000 --- a/rlberry/agents/bandits/randomized_agents.py +++ /dev/null @@ -1,115 +0,0 @@ -import numpy as np -from rlberry.agents.bandits import BanditWithSimplePolicy - - -import rlberry - -logger = rlberry.logger - - -class RandomizedAgent(BanditWithSimplePolicy): - """ - Agent for bandit environment using randomized policy like EXP3. - - Parameters - ----------- - env : rlberry bandit environment - See :class:`~rlberry.envs.bandits.Bandit`. - - index_function : callable or None, default = None - Compute the index for an arm using the past rewards and sampling - probability on this arm and the current time t. - If None, use loss-based importance weighted estimator. - - prob_function : callable or None, default = None - Compute the sampling probability for an arm using its index. - If None, EXP3 softmax probabilities. - References: Seldin, Yevgeny, et al. "Evaluation and analysis of the - performance of the EXP3 algorithm in stochastic environments.". - European Workshop on Reinforcement Learning. PMLR, 2013. - - **kwargs: arguments - Arguments to be passed to BanditWithSimplePolicy. In particular, - one may want to pass the following parameter: - tracker_params: dict - Parameters for the tracker object, typically to decide what to store. 
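# One Thompson-sampling step with the Gaussian prior defined above: the mean of each
# arm has prior N(mu_hat, sigma^2 / n_pulls), one sample is drawn per arm and the
# highest sample is played. The statistics below are toy values.
import numpy as np

rng = np.random.default_rng(1)
sigma = 1.0
n_pulls = np.array([50, 20, 5])
mu_hat = np.array([0.30, 0.25, 0.10])

samples = rng.normal(mu_hat, sigma / np.sqrt(n_pulls))
action = int(np.argmax(samples))
print(samples, action)   # arms with few pulls get wider, potentially optimistic draws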
- - Examples - -------- - >>> from rlberry.agents.bandits import IndexAgent - >>> import numpy as np - >>> class EXP3Agent(RandomizedAgent): - >>> name = "EXP3" - >>> def __init__(self, env, **kwargs): - >>> def prob_function(tr): - >>> w = np.zeros(tr.n_arms) - >>> for arm in tr.arms: - >>> eta = np.minimum( - >>> np.sqrt( - >>> np.log(tr.n_arms) / (tr.n_arms * (tr.t + 1)) - >>> ), - >>> 1 / tr.n_arms, - >>> ) - >>> w[arm] = np.exp(eta * tr.iw_total_reward(arm)) - >>> w /= w.sum() - >>> return (1 - tr.n_arms * eta) * w + eta * np.ones(tr.n_arms) - >>> - >>> RandomizedAgent.__init__(self, env, index, prob, **kwargs) - - """ - - name = "RandomizedAgent" - - def __init__(self, env, prob_function=None, **kwargs): - BanditWithSimplePolicy.__init__(self, env, **kwargs) - - if prob_function is None: - - def prob_function(tr): - w = np.zeros(tr.n_arms) - for arm in tr.arms: - eta = np.minimum( - np.sqrt(np.log(tr.n_arms) / (tr.n_arms * (tr.t + 1))), - 1 / tr.n_arms, - ) - w[arm] = np.exp(eta * tr.iw_total_reward(arm)) - w /= w.sum() - return (1 - tr.n_arms * eta) * w + eta * np.ones(tr.n_arms) - - self.prob_function = prob_function - - def fit(self, budget=None, **kwargs): - """ - Train the bandit using the provided environment. - - Parameters - ---------- - budget: int - Total number of iterations, also called horizon. - """ - horizon = budget - total_reward = 0.0 - - for ep in range(horizon): - # Warmup: play every arm one before starting computing indices - if ep < self.n_arms: - action = ep - probs = [float(k == action) for k in self.arms] - else: - # Compute sampling probability for each arm - # and play one at random - probs = self.prob_function(self.tracker) - action = self.rng.choice(self.arms, p=probs) - - _, reward, _, _, _ = self.env.step(action) - - # Feed the played action and the resulting reward and sampling - # probability to the tracker. - self.tracker.update(action, reward, {"p": probs[action]}) - - total_reward += reward - - # Best action in hinsight is the one with highest sampling probability - self.optimal_action = np.argmax(probs[:]) - info = {"episode_reward": total_reward} - return info diff --git a/rlberry/agents/bandits/tools/__init__.py b/rlberry/agents/bandits/tools/__init__.py deleted file mode 100644 index 7237ffcb5..000000000 --- a/rlberry/agents/bandits/tools/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .tracker import BanditTracker diff --git a/rlberry/agents/bandits/tools/tracker.py b/rlberry/agents/bandits/tools/tracker.py deleted file mode 100644 index 0aa6a17b0..000000000 --- a/rlberry/agents/bandits/tools/tracker.py +++ /dev/null @@ -1,231 +0,0 @@ -from rlberry import metadata_utils -from rlberry.utils.writers import DefaultWriter - -import rlberry - -logger = rlberry.logger - - -class BanditTracker(DefaultWriter): - """ - Container class for rewards and various statistics (means...) collected - during the run of a bandit algorithm. - - BanditTracker is a companion class for - :class:`~rlberry.agents.bandits.BanditWithSimplePolicy` (and other agents - based on it), where a default tracker is automatically constructed, and can - then be used e.g as an entry for an index function. - - It inherits the logic of DefaultWriter to write/store/read - various data of interest for the execution of a bandit agent. - - Data are stored in the data attribute and indexed by a specific tag. - Except for the tag "t" (corresponding to the running total number of time - steps played by the agent), all tags are arm-specific (n_pulls, - total_reward...). 
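# Numeric illustration of the default EXP3 probabilities used by RandomizedAgent above:
# exponential weights on the importance-weighted total rewards, mixed with a uniform
# exploration term eta. The iw_total_reward values are made up.
import numpy as np

t = 200
n_arms = 3
iw_total_reward = np.array([120.0, 90.0, 60.0])

eta = min(np.sqrt(np.log(n_arms) / (n_arms * (t + 1))), 1 / n_arms)
w = np.exp(eta * iw_total_reward)
w /= w.sum()
probs = (1 - n_arms * eta) * w + eta * np.ones(n_arms)
print(probs, probs.sum())   # a valid distribution that favours the first arm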
Each tag entry is stored as a deque with fixed maximum - length (FIFO). By default, this maximum length is set to 1, i.e each new - update to the tag erases the previously stored entry. The maximum length - can be changed on a tag-by-tag basis with the dict maxlen_by_tag. - - Data can be interacted with by using the following DefaultWriter accessors: - * Read: - * read_last_tag_value(tag, arm)): returns the last entry of the - deque corresponding to arm-specific tag. - * read_tag_value(tag, arm)): returns the full deque corresponding - to the arm-specific tag. - * Write: - * add_scalar(tag, value): add a single scalar value to the deque - corresponding to the tag. - * add_scalars(arm, {tags: values}): add multiple arm-specific - tagged values to each corresponding deque. - - For ease of use, wrapper methods are provided to access common tag such as - t, n_pulls, total_reward... without explicitly calling the - read_last_tag_value/read_tag_value methods. - - Parameters - ---------- - agent: rlberry bandit agent - See :class:`~rlberry.agents.bandits`. - - params: dict - Other parameters to condition what to store and compute. - In particuler if params contains store_rewards=True, the - rewards will be saved for each arm at each step and if - store_actions=True, the actions are saved. - It can also contain a function named "update" that will - be called at the end of the update phase. def update(tr, arm): ... - - - Examples - -------- - >>> def index(tr): - ''' Compute UCB index for rewards in [0,1]''' - return [ - tr.mu_hat(arm) + np.sqrt( - 0.5 * np.log(1 / delta(tr.t))) / tr.n_pulls(arm) - ) - for arm in tr.arms - ] - - """ - - name = "BanditTracker" - - def __init__(self, agent, params={}): - self.n_arms = agent.n_arms - self.arms = agent.arms - self.rng = agent.rng - - # Store rewards for each arm or not - self.store_rewards = params.get("store_rewards", False) - # Store the actions for each arm or not - self.store_actions = params.get("store_actions", False) - # Additional update function - self.additional_update = params.get("update", None) - - # Add importance weighted rewards or not - self.do_iwr = params.get("do_iwr", False) - - # By default, store a single attribute (the most recent) - maxlen = 1 - # To store all rewards, override the maxlen for the corresponding tags - maxlen_by_tag = dict() - if self.store_rewards: - for arm in self.arms: - maxlen_by_tag[str(arm) + "_reward"] = None - if self.store_actions: - maxlen_by_tag["action"] = None - - _tracker_kwargs = dict( - name="BanditTracker", - execution_metadata=metadata_utils.ExecutionMetadata(), - maxlen=maxlen, - maxlen_by_tag=maxlen_by_tag, - ) - DefaultWriter.__init__(self, print_log=False, **_tracker_kwargs) - - self.reset_tracker() - - def reset_tracker(self): - self.add_scalar("t", 0) - - tag_scalar_dict = dict() - for arm in self.arms: - tag_scalar_dict["n_pulls"] = 0 - tag_scalar_dict["total_reward"] = 0.0 - if self.do_iwr: - tag_scalar_dict["iw_total_reward"] = 0.0 - self.add_scalars(arm, tag_scalar_dict) - - @property - def t(self): - """ - Current running time of the bandit algorithm played by the associated - bandit agent. - """ - return self.read_last_tag_value("t") - - def n_pulls(self, arm): - """ - Current number of pulls by the associated bandit agent to a given arm. - """ - return self.read_last_tag_value("n_pulls", arm) - - def rewards(self, arm): - """ - All rewards collected so far by the associated bandit agent for a given - arm and currently stored. 
If maxlen_by_tag[str(arm) + "_reward"] is None - or maxlen is None, all the reward history is stored at anytime. - """ - return self.read_tag_value("reward", arm) - - def reward(self, arm): - """ - Last collected reward for a given arm. - """ - return self.read_last_tag_value("reward", arm) - - def actions(self, arm): - """ - All actions collected so far by the associated bandit agent for a given - arm and currently stored. If maxlen_by_tag["action"] is None - or maxlen is None, all the action history is stored at anytime. - """ - return self.read_tag_value("action") - - def action(self, arm): - """ - Last collected action for a given arm. - """ - return self.read_last_tag_value("action") - - def total_reward(self, arm): - """ - Current total reward collected so far by the associated bandit agent - for a given arm. - """ - return self.read_last_tag_value("total_reward", arm) - - def mu_hat(self, arm): - """ - Current empirical mean reward for a given arm estimated by the - associated bandit agent. - """ - return self.read_last_tag_value("mu_hat", arm) - - def iw_total_reward(self, arm): - """ - Empirical Importance weighted total reward collected so far by the - associated bandit agent for a given arm. Used by randomized algorithms. - The IW total reward is the sum of rewards for a given arm inversely - weighted by the arm sampling probabilities at each pull. - In this implementation, we update the loss-based estimator, i.e for - a reward r in [0, 1], we weight 1 - r instead of r - (see Note 9, Chapter 11 of [1]). - - .. [1] Lattimore, Tor, and Csaba Szepesvári. Bandit algorithms. - Cambridge University Press, 2020. - """ - return self.read_last_tag_value("iw_total_reward", arm) - - def update(self, arm, reward, params={}): - """ - After the associated bandit agent played a given arm and collected a - given reward, update the stored data. - By default, only standard statistics are calculated and stored (number - of pulls, current reward, total reward and current empirical mean - reward). Special parameters can be passed in params, e.g the sampling - probability for randomized algorithms (to update the importance - weighted total reward). - """ - # Update current running time - self.add_scalar("t", self.t + 1) - - # Total number of pulls for current arm - n_pulls_arm = self.n_pulls(arm) + 1 - # Sum of rewards for current arm - total_reward_arm = self.total_reward(arm) + reward - - tag_scalar_dict = { - "n_pulls": n_pulls_arm, - "reward": reward, - "total_reward": total_reward_arm, - "mu_hat": total_reward_arm / n_pulls_arm, - } - - # Importance weighted total rewards for randomized algorithns - if self.do_iwr: - p = params.get("p", 1.0) - iw_total_reward_arm = self.iw_total_reward(arm) - tag_scalar_dict["iw_total_reward"] = ( - iw_total_reward_arm + 1 - (1 - reward) / p - ) - - # Write all tracked statistics - self.add_scalars(arm, tag_scalar_dict) - self.add_scalar("action", arm) - - # Do the additional update - if self.additional_update is not None: - self.additional_update(self, arm) diff --git a/rlberry/agents/bandits/ts_agents.py b/rlberry/agents/bandits/ts_agents.py deleted file mode 100644 index 528fae0c0..000000000 --- a/rlberry/agents/bandits/ts_agents.py +++ /dev/null @@ -1,157 +0,0 @@ -import numpy as np -from rlberry.agents.bandits import BanditWithSimplePolicy - - -import rlberry - -logger = rlberry.logger - - -class TSAgent(BanditWithSimplePolicy): - """ - Agent for bandit environment using Thompson sampling. 
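# Sketch of the loss-based importance-weighted update performed by BanditTracker.update
# above: for a reward r in [0, 1] played with probability p, the arm's estimator grows
# by 1 - (1 - r) / p. A bare dict stands in for the tracker storage; names are illustrative.
stats = {"n_pulls": 0, "total_reward": 0.0, "iw_total_reward": 0.0}


def update_arm(stats, reward, p=1.0):
    stats["n_pulls"] += 1
    stats["total_reward"] += reward
    stats["mu_hat"] = stats["total_reward"] / stats["n_pulls"]
    stats["iw_total_reward"] += 1 - (1 - reward) / p   # loss-based IW estimator
    return stats


print(update_arm(stats, reward=0.7, p=0.25))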
- - Parameters - ----------- - env : rlberry bandit environment - See :class:`~rlberry.envs.bandits.Bandit`. - - prior : str in {"gaussian", "beta"} - Family of priors used in Thompson sampling algorithm. - - prior_params : arary of size (2,n_actions) or None, default = None - Only used if prior = "gaussian", means and std of the gaussian prior distributions. - If None, use an array of all 0 and an array of all 1. - - - Examples - -------- - >>> from rlberry.agents.bandits import TSAgent - >>> import numpy as np - >>> class BernoulliTSAgent(TSAgent): - >>> name = "TS" - >>> def __init__(self, env, **kwargs): - >>> def prior_params(tr): - >>> return [ - >>> [ - >>> tr.total_reward(arm) + 1, - >>> tr.n_pulls(arm) - tr.total_reward(arm) + 1, - >>> ] - >>> for arm in tr.arms - >>> ] - >>> - >>> def prior_sampler(tr): - >>> params = prior_params(tr) - >>> return [tr.rng.beta(params[arm][0], params[arm][1]) for arm in tr.arms] - >>> - >>> def optimal_action(tr): - >>> params = prior_params(tr) - >>> return np.argmax( - >>> [ - >>> params[arm][0] / (params[arm][0] + params[arm][1]) - >>> for arm in tr.arms - >>> ] - >>> ) - >>> - >>> prior = { - >>> "params": prior_params, - >>> "sampler": prior_sampler, - >>> "optimal_action": optimal_action, - >>> } - >>> - >>> TSAgent.__init__(self, env, prior, **kwargs) - - """ - - name = "TSAgent" - - def __init__(self, env, prior_info=None, **kwargs): - BanditWithSimplePolicy.__init__(self, env, **kwargs) - if prior_info is None: - # Beta-Bernoulli prior by default - def prior_params(tr): - """ - The mean of a Bernoulli arm B(p) has prior distribution Beta(a, b), - where a is the number of success + 1, b the number of failures + 1. - """ - return [ - [ - tr.total_reward(arm) + 1, - tr.n_pulls(arm) - tr.total_reward(arm) + 1, - ] - for arm in tr.arms - ] - - def prior_sampler(tr): - """ - Beta prior. - """ - params = prior_params(tr) - return [tr.rng.beta(params[arm][0], params[arm][1]) for arm in tr.arms] - - def optimal_action(tr): - """ - The mean of a Bernoulli arm B(p) has prior distribution Beta(a, b), - where a is the number of success + 1, b the number of failures + 1. - The expectation of p is a / (a + b), therefore the optimal arm w.r.t - the Beta prior is the one with highest a / (a + b). - """ - params = prior_params(tr) - return np.argmax( - [ - params[arm][0] / (params[arm][0] + params[arm][1]) - for arm in tr.arms - ] - ) - - self.prior_info = { - "params": prior_params, - "sampler": prior_sampler, - "optimal_action": optimal_action, - } - else: - self.prior_info = prior_info - - @property - def prior_sampler(self): - return self.prior_info.get("sampler") - - @property - def get_optimal_action(self): - return self.prior_info.get("optimal_action") - - def fit(self, budget=None, **kwargs): - """ - Train the bandit using the provided environment. - - Parameters - ---------- - budget: int - Total number of iterations, also called horizon. 
- """ - horizon = budget - - total_reward = 0.0 - - for ep in range(horizon): - # Warmup: play every arm one before starting computing indices - if ep < self.n_arms: - action = ep - else: - # Sample from mean parameters from prior distributions - sample_mu = self.prior_sampler(self.tracker) - # Play the best sampled mean - action = np.argmax(sample_mu) - - _, reward, _, _, _ = self.env.step(action) - - # Feed the played action and the resulting reward to the tracker - self.tracker.update(action, reward) - - total_reward += reward - - # Best action in hinsight is the one with highest index - self.optimal_action = self.get_optimal_action(self.tracker) - - info = {"episode_reward": total_reward} - return info diff --git a/rlberry/agents/dynprog/__init__.py b/rlberry/agents/dynprog/__init__.py deleted file mode 100644 index 8af8271d3..000000000 --- a/rlberry/agents/dynprog/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .value_iteration import ValueIterationAgent diff --git a/rlberry/agents/dynprog/utils.py b/rlberry/agents/dynprog/utils.py deleted file mode 100644 index 0d01c93b4..000000000 --- a/rlberry/agents/dynprog/utils.py +++ /dev/null @@ -1,272 +0,0 @@ -import numpy as np -from rlberry.utils.jit_setup import numba_jit - - -@numba_jit -def backward_induction(R, P, horizon, gamma=1.0, vmax=np.inf): - """Backward induction to compute Q and V functions in the finite horizon - setting. - - Parameters - ---------- - R : numpy.ndarray - array of shape (S, A) contaning the rewards, where S is the number - of states and A is the number of actions - P : numpy.ndarray - array of shape (S, A, S) such that P[s,a,ns] is the probability of - arriving at ns by taking action a in state s. - horizon : int - problem horizon - gamma : double, default: 1.0 - discount factor - vmax : double, default: np.inf - maximum possible value in V - - Returns - -------- - tuple (Q, V) containing the Q and V functions, of shapes (horizon, S, A) - and (horizon, S), respectively. - """ - S, A = R.shape - V = np.zeros((horizon, S)) - Q = np.zeros((horizon, S, A)) - for hh in range(horizon - 1, -1, -1): - for ss in range(S): - max_q = -np.inf - for aa in range(A): - q_aa = R[ss, aa] - if hh < horizon - 1: - # not using .dot instead of loop to avoid scipy dependency - # (numba seems to require scipy for linear - # algebra operations in numpy) - for ns in range(S): - q_aa += gamma * P[ss, aa, ns] * V[hh + 1, ns] - if q_aa > max_q: - max_q = q_aa - Q[hh, ss, aa] = q_aa - V[hh, ss] = max_q - if V[hh, ss] > vmax: - V[hh, ss] = vmax - return Q, V - - -@numba_jit -def backward_induction_reward_sd(Q, V, R, P, gamma=1.0, vmax=np.inf): - """ - Backward induction to compute Q and V functions in - the finite horizon setting. - - Assumes R is stage-dependent, but P is stage-independent. - - Takes as input the arrays where to store Q and V. - - Parameters - ---------- - Q: numpy.ndarray - array of shape (horizon, S, A) where to store the Q function - V: numpy.ndarray - array of shape (horizon, S) where to store the V function - R : numpy.ndarray - array of shape (horizon, S, A) contaning the rewards, where S is the number - of states and A is the number of actions - P : numpy.ndarray - array of shape (S, A, S) such that P[s,a,ns] is the probability of - arriving at ns by taking action a in state s. 
- horizon : int - problem horizon - gamma : double - discount factor, default = 1.0 - vmax : double - maximum possible value in V - default = np.inf - """ - H, S, A = R.shape - horizon = H - for hh in range(horizon - 1, -1, -1): - for ss in range(S): - max_q = -np.inf - for aa in range(A): - q_aa = R[hh, ss, aa] - if hh < horizon - 1: - # not using .dot instead of loop to avoid scipy dependency - # (numba seems to require scipy for linear algebra - # operations in numpy) - for ns in range(S): - q_aa += gamma * P[ss, aa, ns] * V[hh + 1, ns] - if q_aa > max_q: - max_q = q_aa - Q[hh, ss, aa] = q_aa - V[hh, ss] = max_q - if V[hh, ss] > vmax: - V[hh, ss] = vmax - - -@numba_jit -def backward_induction_in_place(Q, V, R, P, horizon, gamma=1.0, vmax=np.inf): - """ - Backward induction to compute Q and V functions in - the finite horizon setting. - Takes as input the arrays where to store Q and V. - - Parameters - ---------- - Q: numpy.ndarray - array of shape (horizon, S, A) where to store the Q function - V: numpy.ndarray - array of shape (horizon, S) where to store the V function - R : numpy.ndarray - array of shape (S, A) contaning the rewards, where S is the number - of states and A is the number of actions - P : numpy.ndarray - array of shape (S, A, S) such that P[s,a,ns] is the probability of - arriving at ns by taking action a in state s. - horizon : int - problem horizon - gamma : double - discount factor, default = 1.0 - vmax : double - maximum possible value in V - default = np.inf - """ - S, A = R.shape - for hh in range(horizon - 1, -1, -1): - for ss in range(S): - max_q = -np.inf - for aa in range(A): - q_aa = R[ss, aa] - if hh < horizon - 1: - # not using .dot instead of loop to avoid scipy dependency - # (numba seems to require scipy for linear algebra - # operations in numpy) - for ns in range(S): - q_aa += gamma * P[ss, aa, ns] * V[hh + 1, ns] - if q_aa > max_q: - max_q = q_aa - Q[hh, ss, aa] = q_aa - V[hh, ss] = max_q - if V[hh, ss] > vmax: - V[hh, ss] = vmax - - -@numba_jit -def backward_induction_sd(Q, V, R, P, gamma=1.0, vmax=np.inf): - """ - In-place implementation of backward induction to compute Q and V functions - in the finite horizon setting. - - Assumes R and P are stage-dependent. - - Parameters - ---------- - Q: numpy.ndarray - array of shape (H, S, A) where to store the Q function - V: numpy.ndarray - array of shape (H, S) where to store the V function - R : numpy.ndarray - array of shape (H, S, A) contaning the rewards, where S is the number - of states and A is the number of actions - P : numpy.ndarray - array of shape (H, S, A, S) such that P[h, s, a, ns] is the probability of - arriving at ns by taking action a in state s at stage h. - gamma : double, default: 1.0 - discount factor - vmax : double, default: np.inf - maximum possible value in V - - """ - H, S, A = R.shape - for hh in range(H - 1, -1, -1): - for ss in range(S): - max_q = -np.inf - for aa in range(A): - q_aa = R[hh, ss, aa] - if hh < H - 1: - # not using .dot instead of loop to avoid scipy dependency - # (numba seems to require scipy for linear - # algebra operations in numpy) - for ns in range(S): - q_aa += gamma * P[hh, ss, aa, ns] * V[hh + 1, ns] - if q_aa > max_q: - max_q = q_aa - Q[hh, ss, aa] = q_aa - V[hh, ss] = max_q - # clip V - if V[hh, ss] > vmax: - V[hh, ss] = vmax - - -@numba_jit -def value_iteration(R, P, gamma, epsilon=1e-6): - """ - Value iteration for discounted problems. 
- - Parameters - ---------- - R : numpy.ndarray - array of shape (S, A) contaning the rewards, where S is the number - of states and A is the number of actions - P : numpy.ndarray - array of shape (S, A, S) such that P[s,a,ns] is the probability of - arriving at ns by taking action a in state s. - gamma : double - discount factor - epsilon : double - precision - - Returns - -------- - tuple (Q, V, n_it) containing the epsilon-optimal Q and V functions, - of shapes (S, A) and (S,), respectively, and n_it, the number of iterations - """ - S, A = R.shape - Q = np.zeros((S, A)) - Q_aux = np.full((S, A), np.inf) - n_it = 0 - while np.abs(Q - Q_aux).max() > epsilon: - Q_aux = Q - Q = bellman_operator(Q, R, P, gamma) - n_it += 1 - V = np.zeros(S) - # numba does not support np.max(Q, axis=1) - for ss in range(S): - V[ss] = Q[ss, :].max() - return Q, V, n_it - - -@numba_jit -def bellman_operator(Q, R, P, gamma): - """ - Bellman optimality operator for Q functions - - Parameters - ---------- - Q : numpy.ndarray - array of shape (S, A) containing the Q function to which apply - the operator - R : numpy.ndarray - array of shape (S, A) contaning the rewards, where S is the number - of states and A is the number of actions - P : numpy.ndarray - array of shape (S, A, S) such that P[s,a,ns] is the probability of - arriving at ns by taking action a in state s. - gamma : double - discount factor - - Returns - -------- - TQ, array of shape (S, A) containing the result of the Bellman operator - applied to the input Q - """ - S, A = Q.shape - TQ = np.zeros((S, A)) - V = np.zeros(S) - # numba does not support np.max(Q, axis=1) - for ss in range(S): - V[ss] = Q[ss, :].max() - # - for ss in range(S): - for aa in range(A): - TQ[ss, aa] = R[ss, aa] - for ns in range(S): - TQ[ss, aa] += gamma * P[ss, aa, ns] * V[ns] - return TQ diff --git a/rlberry/agents/dynprog/value_iteration.py b/rlberry/agents/dynprog/value_iteration.py deleted file mode 100644 index fd9a4ec3a..000000000 --- a/rlberry/agents/dynprog/value_iteration.py +++ /dev/null @@ -1,82 +0,0 @@ -from rlberry.agents.agent import AgentWithSimplePolicy -from rlberry.agents.dynprog.utils import backward_induction, value_iteration -from rlberry.envs.finite.finite_mdp import FiniteMDP - - -class ValueIterationAgent(AgentWithSimplePolicy): - """ - Value iteration for enviroments of type FiniteMDP - (rlberry.envs.finite.finite_mdp.FiniteMDP) - - Important: the discount gamma is also used if the problem is - finite horizon, but, in this case, gamma can be set to 1.0. - - Parameters - ----------- - env : rlberry.envs.finite.finite_mdp.FiniteMDP - Environment used to fit the agent. - gamma : double - Discount factor in [0, 1] - horizon : int - Horizon, if the problem is finite-horizon. if None, the discounted - problem is solved - default = None - epsilon : double - Precision of value iteration, only used in discounted problems - (when horizon is None). - - """ - - name = "ValueIteration" - - def __init__(self, env, gamma=0.95, horizon=None, epsilon=1e-6, **kwargs): - AgentWithSimplePolicy.__init__(self, env, **kwargs) - - # initialize base class - assert isinstance( - self.env, FiniteMDP - ), "Value iteration requires a FiniteMDP model." - # - - self.gamma = gamma # attribute gamma - - self.horizon = horizon - self.epsilon = epsilon - - # value functions - self.Q = None - self.V = None - - def fit(self, budget=None, **kwargs): - """ - Run value iteration. - - Parameters - ---------- - budget: None - Not used. Only defined for compatibility purpose with rlberry. 
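# Compact vectorized version of the discounted value iteration implemented above
# (the removed code applies the same Bellman optimality operator with explicit loops
# for numba). Same array shapes: R is (S, A), P is (S, A, S); the MDP is a toy example.
import numpy as np

S, A, gamma, epsilon = 2, 2, 0.9, 1e-6
R = np.array([[1.0, 0.0], [0.0, 0.5]])
P = np.zeros((S, A, S))
P[:, 0, 0] = 1.0
P[:, 1, 1] = 1.0

Q = np.zeros((S, A))
n_it = 0
while True:
    V = Q.max(axis=1)
    TQ = R + gamma * P @ V               # Bellman optimality operator
    if np.abs(TQ - Q).max() <= epsilon:
        break
    Q = TQ
    n_it += 1
print(Q.max(axis=1), n_it)               # epsilon-optimal values and iteration count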
- Changing `budget` value has no effect. - """ - del kwargs - info = {} - if self.horizon is None: - assert self.gamma < 1.0, "The discounted setting requires gamma < 1.0" - self.Q, self.V, n_it = value_iteration( - self.env.R, self.env.P, self.gamma, self.epsilon - ) - info["n_iterations"] = n_it - info["precision"] = self.epsilon - else: - self.Q, self.V = backward_induction( - self.env.R, self.env.P, self.horizon, self.gamma - ) - info["n_iterations"] = self.horizon - info["precision"] = 0.0 - return info - - def policy(self, observation): - state = observation - if self.horizon is None: - return self.Q[state, :].argmax() - else: - return self.Q[0, state, :].argmax() diff --git a/rlberry/agents/experimental/__init__.py b/rlberry/agents/experimental/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/agents/experimental/tests/__init__.py b/rlberry/agents/experimental/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/agents/experimental/torch/__init__.py b/rlberry/agents/experimental/torch/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/agents/features/__init__.py b/rlberry/agents/features/__init__.py deleted file mode 100644 index 1e473c0dc..000000000 --- a/rlberry/agents/features/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .feature_map import FeatureMap diff --git a/rlberry/agents/features/feature_map.py b/rlberry/agents/features/feature_map.py deleted file mode 100644 index d847556aa..000000000 --- a/rlberry/agents/features/feature_map.py +++ /dev/null @@ -1,29 +0,0 @@ -from abc import ABC, abstractmethod - - -class FeatureMap(ABC): - """ - Class representing a feature map, from (observation, action) pairs - to numpy arrays. - - Attributes - ---------- - shape : tuple - Shape of feature array. - - Methods - -------- - map() - Maps a (observation, action) pair to a numpy array. - """ - - def __init__(self): - ABC.__init__(self) - self.shape = () - - @abstractmethod - def map(self, observation, action): - """ - Maps a (observation, action) pair to a numpy array. 
- """ - pass diff --git a/rlberry/agents/kernel_based/__init__.py b/rlberry/agents/kernel_based/__init__.py deleted file mode 100644 index 275e51c86..000000000 --- a/rlberry/agents/kernel_based/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .rs_ucbvi import RSUCBVIAgent -from .rs_kernel_ucbvi import RSKernelUCBVIAgent diff --git a/rlberry/agents/kernel_based/common.py b/rlberry/agents/kernel_based/common.py deleted file mode 100644 index 33757f66a..000000000 --- a/rlberry/agents/kernel_based/common.py +++ /dev/null @@ -1,34 +0,0 @@ -import numpy as np -from rlberry.utils.jit_setup import numba_jit -from rlberry.utils.metrics import metric_lp - - -@numba_jit -def map_to_representative( - state, - lp_metric, - representative_states, - n_representatives, - min_dist, - scaling, - accept_new_repr, -): - """Map state to representative state.""" - dist_to_closest = np.inf - argmin = -1 - for ii in range(n_representatives): - dist = metric_lp(state, representative_states[ii, :], lp_metric, scaling) - if dist < dist_to_closest: - dist_to_closest = dist - argmin = ii - - max_representatives = representative_states.shape[0] - if ( - (dist_to_closest > min_dist) - and (n_representatives < max_representatives) - and accept_new_repr - ): - new_index = n_representatives - representative_states[new_index, :] = state - return new_index - return argmin diff --git a/rlberry/agents/kernel_based/kernels.py b/rlberry/agents/kernel_based/kernels.py deleted file mode 100644 index 88954432c..000000000 --- a/rlberry/agents/kernel_based/kernels.py +++ /dev/null @@ -1,58 +0,0 @@ -import numpy as np -from rlberry.utils.jit_setup import numba_jit - - -@numba_jit -def kernel_func(z, kernel_type): - """ - Returns a kernel function to the real value z. - - Kernel types: - - "uniform" : 1.0*(abs(z) <= 1) - "triangular" : max(0, 1 - abs(z)) - "gaussian" : exp(-z^2/2) - "epanechnikov" : max(0, 1-z^2) - "quartic" : (1-z^2)^2 *(abs(z) <= 1) - "triweight" : (1-z^2)^3 *(abs(z) <= 1) - "tricube" : (1-abs(z)^3)^3 *(abs(z) <= 1) - "cosine" : cos( z * (pi/2) ) *(abs(z) <= 1) - "exp-n" : exp(-abs(z)^n/2), for n integer - - Parameters - ---------- - z : double - kernel_type : string - """ - if kernel_type == "uniform": - return 1.0 * (np.abs(z) <= 1) - elif kernel_type == "triangular": - return (1.0 - np.abs(z)) * (np.abs(z) <= 1) - elif kernel_type == "gaussian": - return np.exp(-np.power(z, 2.0) / 2.0) - elif kernel_type == "epanechnikov": - return (1.0 - np.power(z, 2.0)) * (np.abs(z) <= 1) - elif kernel_type == "quartic": - return np.power((1.0 - np.power(z, 2.0)), 2.0) * (np.abs(z) <= 1) - elif kernel_type == "triweight": - return np.power((1.0 - np.power(z, 2.0)), 3.0) * (np.abs(z) <= 1) - elif kernel_type == "tricube": - return np.power((1.0 - np.power(np.abs(z), 3.0)), 3.0) * (np.abs(z) <= 1) - elif kernel_type == "cosine": - return np.cos(z * np.pi / 2) * (np.abs(z) <= 1) - elif "exp-" in kernel_type: - exponent = _str_to_int(kernel_type.split("-")[1]) - return np.exp(-np.power(np.abs(z), exponent) / 2.0) - else: - raise NotImplementedError("Unknown kernel type.") - - -@numba_jit -def _str_to_int(s): - """ - Source: https://github.com/numba/numba/issues/5650#issuecomment-623511109 - """ - final_index, result = len(s) - 1, 0 - for i, v in enumerate(s): - result += (ord(v) - 48) * (10 ** (final_index - i)) - return result diff --git a/rlberry/agents/kernel_based/rs_kernel_ucbvi.py b/rlberry/agents/kernel_based/rs_kernel_ucbvi.py deleted file mode 100644 index f27449577..000000000 --- 
a/rlberry/agents/kernel_based/rs_kernel_ucbvi.py +++ /dev/null @@ -1,390 +0,0 @@ -import numpy as np -from rlberry.utils.jit_setup import numba_jit - -import gymnasium.spaces as spaces -from rlberry.agents import AgentWithSimplePolicy -from rlberry.agents.dynprog.utils import backward_induction -from rlberry.agents.dynprog.utils import backward_induction_in_place -from rlberry.utils.metrics import metric_lp -from rlberry.agents.kernel_based.kernels import kernel_func -from rlberry.agents.kernel_based.common import map_to_representative - -import rlberry - -logger = rlberry.logger - - -@numba_jit -def update_model( - repr_state, - action, - repr_next_state, - reward, - n_representatives, - repr_states, - lp_metric, - scaling, - bandwidth, - bonus_scale_factor, - beta, - v_max, - bonus_type, - kernel_type, - N_sa, - B_sa, - P_hat, - R_hat, -): - """ - Model update function, lots of arguments so we can use JIT :) - """ - # aux var for transition update - dirac_next_s = np.zeros(n_representatives) - dirac_next_s[repr_next_state] = 1.0 - - for u_repr_state in range(n_representatives): - # compute weight - dist = metric_lp( - repr_states[repr_state, :], repr_states[u_repr_state, :], lp_metric, scaling - ) - weight = kernel_func(dist / bandwidth, kernel_type=kernel_type) - - # aux variables - prev_N_sa = beta + N_sa[u_repr_state, action] # regularization beta - current_N_sa = prev_N_sa + weight - - # update weights - N_sa[u_repr_state, action] += weight - - # update transitions - P_hat[u_repr_state, action, :n_representatives] = ( - dirac_next_s * weight / current_N_sa - + (prev_N_sa / current_N_sa) - * P_hat[u_repr_state, action, :n_representatives] - ) - - # update rewards - R_hat[u_repr_state, action] = ( - weight * reward / current_N_sa - + (prev_N_sa / current_N_sa) * R_hat[u_repr_state, action] - ) - - # update bonus - B_sa[u_repr_state, action] = compute_bonus( - N_sa[u_repr_state, action], beta, bonus_scale_factor, v_max, bonus_type - ) - - -@numba_jit -def compute_bonus(sum_weights, beta, bonus_scale_factor, v_max, bonus_type): - n = beta + sum_weights - if bonus_type == "simplified_bernstein": - return bonus_scale_factor * np.sqrt(1.0 / n) + (1 + beta) * (v_max) / n - else: - raise NotImplementedError("Error: unknown bonus type.") - - -class RSKernelUCBVIAgent(AgentWithSimplePolicy): - """ - Implements KernelUCBVI [1] with representative states [2, 3]. - - Value iteration with exploration bonuses for continuous-state environments, - using a online discretization strategy + kernel smoothing: - - Build (online) a set of representative states - - Using smoothing kernels, estimate transtions an rewards on the - finite set of representative states and actions. - - Criterion: finite-horizon with discount factor gamma. - If the discount is not 1, only the Q function at h=0 is used. - - The recommended policy after all the episodes is computed without - exploration bonuses. - - - Parameters - ---------- - env : Model - Online model with continuous (Box) state space and discrete actions - gamma : double - Discount factor in [0, 1]. If gamma is 1.0, the problem is set to - be finite-horizon. - horizon : int - Horizon of the objective function. If None and gamma<1, set to - 1/(1-gamma). - lp_metric: int - The metric on the state space is the one induced by the p-norm, - where p = lp_metric. Default = 2, for the Euclidean metric. - kernel_type : string - See rlberry.agents.kernel_based.kernels.kernel_func for - possible kernel types. 
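# Illustrative sketch only -- not the removed rlberry implementation.
# Standalone version of the kernel-weighted model update performed by
# `update_model` above, for a single observed transition, one action, and two
# 1D representative states. The Epanechnikov kernel and the constants
# (bandwidth, beta) are illustrative choices.
import numpy as np


def epanechnikov(z):
    return max(0.0, 1.0 - z**2)


def kernel_weighted_update(s, a, s_next, reward, repr_states,
                           N_sa, P_hat, R_hat, bandwidth=0.5, beta=0.01):
    M = repr_states.shape[0]
    dirac_next = np.zeros(M)
    dirac_next[s_next] = 1.0
    for u in range(M):
        dist = np.abs(repr_states[u] - repr_states[s])   # 1D state, L1 metric
        w = epanechnikov(dist / bandwidth)
        prev_n = beta + N_sa[u, a]                        # regularized count
        cur_n = prev_n + w
        N_sa[u, a] += w
        P_hat[u, a, :] = (w * dirac_next + prev_n * P_hat[u, a, :]) / cur_n
        R_hat[u, a] = (w * reward + prev_n * R_hat[u, a]) / cur_n


# toy run: two representative states at 0.0 and 0.3, one action
repr_states = np.array([0.0, 0.3])
N_sa = np.zeros((2, 1))
P_hat = np.zeros((2, 1, 2))
R_hat = np.zeros((2, 1))
kernel_weighted_update(0, 0, 1, reward=1.0, repr_states=repr_states,
                       N_sa=N_sa, P_hat=P_hat, R_hat=R_hat)
# representative states close to the visited one receive a larger share of
# the transition and reward update.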
- scaling: numpy.ndarray - Must have the same size as state array, used to scale the states - before computing the metric. - If None, set to: - - (env.observation_space.high - env.observation_space.low) if high - and low are bounded - - np.ones(env.observation_space.shape[0]) if high or low - are unbounded - bandwidth : double - Kernel bandwidth. - min_dist : double - Minimum distance between two representative states - max_repr : int - Maximum number of representative states. - If None, it is set to (sqrt(d)/min_dist)**d, where d - is the dimension of the state space - bonus_scale_factor : double - Constant by which to multiply the exploration bonus, - controls the level of exploration. - beta : double - Regularization constant. - bonus_type : string - Type of exploration bonus. Currently, only "simplified_bernstein" - is implemented. - - - References - ---------- - [1] Domingues et al., 2020 - Regret Bounds for Kernel-Based Reinforcement Learning - https://arxiv.org/abs/2004.05599 - [2] Domingues et al., 2020 - A Kernel-Based Approach to Non-Stationary Reinforcement Learning - in Metric Spaces - https://arxiv.org/abs/2007.05078 - [3] Kveton & Theocharous, 2012 - Kernel-Based Reinforcement Learning on Representative States - https://www.aaai.org/ocs/index.php/AAAI/AAAI12/paper/viewFile/4967/5509 - """ - - name = "RSKernelUCBVI" - - def __init__( - self, - env, - gamma=0.99, - horizon=None, - lp_metric=2, - kernel_type="epanechnikov", - scaling=None, - bandwidth=0.05, - min_dist=0.1, - max_repr=1000, - bonus_scale_factor=1.0, - beta=0.01, - bonus_type="simplified_bernstein", - **kwargs - ): - # init base class - AgentWithSimplePolicy.__init__(self, env, **kwargs) - - self.gamma = gamma - self.horizon = horizon - self.lp_metric = lp_metric - self.kernel_type = kernel_type - self.bandwidth = bandwidth - self.min_dist = min_dist - self.bonus_scale_factor = bonus_scale_factor - self.beta = beta - self.bonus_type = bonus_type - - # check environment - assert isinstance(self.env.observation_space, spaces.Box) - assert isinstance(self.env.action_space, spaces.Discrete) - - # other checks - assert gamma >= 0 and gamma <= 1.0 - if self.horizon is None: - assert gamma < 1.0, "If no horizon is given, gamma must be smaller than 1." - self.horizon = int(np.ceil(1.0 / (1.0 - gamma))) - - # state dimension - self.state_dim = self.env.observation_space.shape[0] - - # compute scaling, if it is None - if scaling is None: - # if high and low are bounded - if (self.env.observation_space.high == np.inf).sum() == 0 and ( - self.env.observation_space.low == -np.inf - ).sum() == 0: - scaling = ( - self.env.observation_space.high - self.env.observation_space.low - ) - # if high or low are unbounded - else: - scaling = np.ones(self.state_dim) - else: - assert scaling.ndim == 1 - assert scaling.shape[0] == self.state_dim - self.scaling = scaling - - # maximum value - r_range = self.env.reward_range[1] - self.env.reward_range[0] - if r_range == np.inf or r_range == 0.0: - logger.warning( - "{}: Reward range is zero or infinity. ".format(self.name) - + "Setting it to 1." 
- ) - r_range = 1.0 - - if self.gamma == 1.0: - self.v_max = r_range * horizon - else: - self.v_max = ( - r_range - * (1.0 - np.power(self.gamma, self.horizon)) - / (1.0 - self.gamma) - ) - - # number of representative states and number of actions - if max_repr is None: - max_repr = int( - np.ceil( - (1.0 * np.sqrt(self.state_dim) / self.min_dist) ** self.state_dim - ) - ) - self.max_repr = max_repr - - # current number of representative states - self.M = None - self.A = self.env.action_space.n - - # declaring variables - self.episode = None # current episode - self.representative_states = None # coordinates of all repr states - self.N_sa = None # sum of weights at (s, a) - self.B_sa = None # bonus at (s, a) - self.R_hat = None # reward estimate - self.P_hat = None # transitions estimate - self.Q = None # Q function - self.V = None # V function - - self.Q_policy = None # Q function for recommended policy - - # initialize - self.reset() - - def reset(self, **kwargs): - self.M = 0 - self.representative_states = np.zeros((self.max_repr, self.state_dim)) - self.N_sa = np.zeros((self.max_repr, self.A)) - self.B_sa = self.v_max * np.ones((self.max_repr, self.A)) - - self.R_hat = np.zeros((self.max_repr, self.A)) - self.P_hat = np.zeros((self.max_repr, self.A, self.max_repr)) - - self.V = np.zeros((self.horizon, self.max_repr)) - self.Q = np.zeros((self.horizon, self.max_repr, self.A)) - self.Q_policy = None - - self.episode = 0 - - def policy(self, observation): - state = observation - assert self.Q_policy is not None - repr_state = self._map_to_repr(state, False) - return self.Q_policy[0, repr_state, :].argmax() - - def fit(self, budget: int, **kwargs): - """ - Train the agent using the provided environment. - - Parameters - ---------- - budget: int - number of episodes. Each episode runs for self.horizon unless it - enconters a terminal state in which case it stops early. 
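# Illustrative sketch only -- not part of the removed rlberry code.
# The value upper bound computed above is the truncated geometric sum
# r_range * (1 + gamma + ... + gamma^(horizon-1)); quick check of the closed form.
import numpy as np

r_range, gamma, horizon = 1.0, 0.95, 50
v_max_closed = r_range * (1.0 - gamma**horizon) / (1.0 - gamma)
v_max_sum = r_range * sum(gamma**h for h in range(horizon))
assert np.isclose(v_max_closed, v_max_sum)
# for gamma == 1.0 the code above uses r_range * horizon instead.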
- """ - del kwargs - for _ in range(budget): - self._run_episode() - - # compute Q function for the recommended policy - self.Q_policy, _ = backward_induction( - self.R_hat[: self.M, :], - self.P_hat[: self.M, :, : self.M], - self.horizon, - self.gamma, - ) - - def _map_to_repr(self, state, accept_new_repr=True): - repr_state = map_to_representative( - state, - self.lp_metric, - self.representative_states, - self.M, - self.min_dist, - self.scaling, - accept_new_repr, - ) - # check if new representative state - if repr_state == self.M: - self.M += 1 - return repr_state - - def _update(self, state, action, next_state, reward): - repr_state = self._map_to_repr(state) - repr_next_state = self._map_to_repr(next_state) - - update_model( - repr_state, - action, - repr_next_state, - reward, - self.M, - self.representative_states, - self.lp_metric, - self.scaling, - self.bandwidth, - self.bonus_scale_factor, - self.beta, - self.v_max, - self.bonus_type, - self.kernel_type, - self.N_sa, - self.B_sa, - self.P_hat, - self.R_hat, - ) - - def _get_action(self, state, hh=0): - assert self.Q is not None - repr_state = self._map_to_repr(state, False) - return self.Q[hh, repr_state, :].argmax() - - def _run_episode(self): - # interact for H steps - episode_rewards = 0 - observation, info = self.env.reset() - for hh in range(self.horizon): - action = self._get_action(observation, hh) - next_observation, reward, terminated, truncated, info = self.env.step( - action - ) - done = terminated or truncated - self._update(observation, action, next_observation, reward) - observation = next_observation - episode_rewards += reward - - if done: - break - - # run backward induction - backward_induction_in_place( - self.Q[:, : self.M, :], - self.V[:, : self.M], - self.R_hat[: self.M, :] + self.B_sa[: self.M, :], - self.P_hat[: self.M, :, : self.M], - self.horizon, - self.gamma, - self.v_max, - ) - - self.episode += 1 - # - if self.writer is not None: - self.writer.add_scalar("episode_rewards", episode_rewards, self.episode) - self.writer.add_scalar("representative states", self.M, self.episode) - - # return sum of rewards collected in the episode - return episode_rewards diff --git a/rlberry/agents/kernel_based/rs_ucbvi.py b/rlberry/agents/kernel_based/rs_ucbvi.py deleted file mode 100644 index cee45ce56..000000000 --- a/rlberry/agents/kernel_based/rs_ucbvi.py +++ /dev/null @@ -1,332 +0,0 @@ -from rlberry.agents.agent import AgentWithSimplePolicy -import numpy as np - -import gymnasium.spaces as spaces -from rlberry.agents.dynprog.utils import backward_induction -from rlberry.agents.dynprog.utils import backward_induction_in_place -from rlberry.agents.kernel_based.common import map_to_representative - -import rlberry - -logger = rlberry.logger - - -class RSUCBVIAgent(AgentWithSimplePolicy): - """ - Value iteration with exploration bonuses for continuous-state environments, - using a online discretization strategy. - - The strategy: - - Build (online) a set of representative states - - Estimate transtions an rewards on the finite set of representative states - and actions. - - Criterion: finite-horizon with discount factor gamma. - If the discount is not 1, only the Q function at h=0 is used. - - The recommended policy after all the episodes is computed without - exploration bonuses. - - Parameters - ---------- - env : Model - Online model with continuous (Box) state space and discrete actions - gamma : double - Discount factor in [0, 1]. If gamma is 1.0, the problem is set to - be finite-horizon. 
- horizon : int - Horizon of the objective function. If None and gamma<1, set to - 1/(1-gamma). - lp_metric: int - The metric on the state space is the one induced by the p-norm, - where p = lp_metric. Default = 2, for the Euclidean metric. - scaling: numpy.ndarray - Must have the same size as state array, used to scale the states - before computing the metric. - If None, set to: - - (env.observation_space.high - env.observation_space.low) if high - and low are bounded - - np.ones(env.observation_space.shape[0]) if high or low are - unbounded - min_dist: double - Minimum distance between two representative states - max_repr: int - Maximum number of representative states. - If None, it is set to (sqrt(d)/min_dist)**d, where d - is the dimension of the state space - bonus_scale_factor : double - Constant by which to multiply the exploration bonus, controls - the level of exploration. - bonus_type : string - Type of exploration bonus. Currently, only "simplified_bernstein" - is implemented. If `reward_free` is true, this parameter is ignored - and the algorithm uses 1/n bonuses. - reward_free : bool - If true, ignores rewards and uses only 1/n bonuses. - - References - ---------- - .. [1] Azar, Mohammad Gheshlaghi, Ian Osband, and Rémi Munos. - "Minimax regret bounds for reinforcement learning." - Proceedings of the 34th ICML, 2017. - - .. [2] Strehl, Alexander L., and Michael L. Littman. - "An analysis of model-based interval estimation for Markov decision - processes." - Journal of Computer and System Sciences 74.8 (2008): 1309-1331. - - .. [3] Kveton, Branislav, and Georgios Theocharous. - "Kernel-Based Reinforcement Learning on Representative States." - AAAI, 2012. - - .. [4] Domingues, O. D., Ménard, P., Pirotta, M., Kaufmann, E., & Valko, M.(2020). - A kernel-based approach to non-stationary reinforcement learning in metric - spaces. - arXiv preprint arXiv:2007.05078. - """ - - name = "RSUCBVI" - - def __init__( - self, - env, - gamma=0.99, - horizon=100, - lp_metric=2, - scaling=None, - min_dist=0.1, - max_repr=1000, - bonus_scale_factor=1.0, - bonus_type="simplified_bernstein", - reward_free=False, - **kwargs - ): - # init base class - AgentWithSimplePolicy.__init__(self, env, **kwargs) - - self.gamma = gamma - self.horizon = horizon - self.lp_metric = lp_metric - self.min_dist = min_dist - self.bonus_scale_factor = bonus_scale_factor - self.bonus_type = bonus_type - self.reward_free = reward_free - - # check environment - assert isinstance(self.env.observation_space, spaces.Box) - assert isinstance(self.env.action_space, spaces.Discrete) - - # other checks - assert gamma >= 0 and gamma <= 1.0 - if self.horizon is None: - assert gamma < 1.0, "If no horizon is given, gamma must be smaller than 1." - self.horizon = int(np.ceil(1.0 / (1.0 - gamma))) - - # state dimension - self.state_dim = self.env.observation_space.shape[0] - - # compute scaling, if it is None - if scaling is None: - # if high and low are bounded - if (self.env.observation_space.high == np.inf).sum() == 0 and ( - self.env.observation_space.low == -np.inf - ).sum() == 0: - scaling = ( - self.env.observation_space.high - self.env.observation_space.low - ) - # if high or low are unbounded - else: - scaling = np.ones(self.state_dim) - else: - assert scaling.ndim == 1 - assert scaling.shape[0] == self.state_dim - self.scaling = scaling - - # maximum value - r_range = self.env.reward_range[1] - self.env.reward_range[0] - if r_range == np.inf or r_range == 0.0: - logger.warning( - "{}: Reward range is zero or infinity. 
".format(self.name) - + "Setting it to 1." - ) - r_range = 1.0 - - if self.gamma == 1.0: - self.v_max = r_range * horizon - else: - self.v_max = ( - r_range - * (1.0 - np.power(self.gamma, self.horizon)) - / (1.0 - self.gamma) - ) - - # number of representative states and number of actions - if max_repr is None: - max_repr = int( - np.ceil( - (1.0 * np.sqrt(self.state_dim) / self.min_dist) ** self.state_dim - ) - ) - self.max_repr = max_repr - - # current number of representative states - self.M = None - self.A = self.env.action_space.n - - # declaring variables - self.episode = None # current episode - self.representative_states = None # coordinates of all repr states - self.N_sa = None # visits to (s, a) - self.N_sas = None # visits to (s, a, s') - self.S_sa = None # sum of rewards at (s, a) - self.B_sa = None # bonus at (s, a) - self.Q = None # Q function - self.V = None # V function - - self.Q_policy = None # Q function for recommended policy - - # initialize - self.reset() - - def reset(self, **kwargs): - self.M = 0 - self.representative_states = np.zeros((self.max_repr, self.state_dim)) - self.N_sa = np.zeros((self.max_repr, self.A)) - self.N_sas = np.zeros((self.max_repr, self.A, self.max_repr)) - self.S_sa = np.zeros((self.max_repr, self.A)) - self.B_sa = self.v_max * np.ones((self.max_repr, self.A)) - - self.R_hat = np.zeros((self.max_repr, self.A)) - self.P_hat = np.zeros((self.max_repr, self.A, self.max_repr)) - - self.V = np.zeros((self.horizon, self.max_repr)) - self.Q = np.zeros((self.horizon, self.max_repr, self.A)) - self.Q_policy = None - - self.episode = 0 - - def policy(self, observation): - state = observation - assert self.Q_policy is not None - repr_state = self._map_to_repr(state, False) - return self.Q_policy[0, repr_state, :].argmax() - - def fit(self, budget: int, **kwargs): - """ - Train the agent using the provided environment. - - Parameters - ---------- - budget: int - number of episodes. Each episode runs for self.horizon unless it - enconters a terminal state in which case it stops early. 
- """ - del kwargs - n_episodes_to_run = budget - count = 0 - while count < n_episodes_to_run: - self._run_episode() - count += 1 - - # compute Q function for the recommended policy - self.Q_policy, _ = backward_induction( - self.R_hat[: self.M, :], - self.P_hat[: self.M, :, : self.M], - self.horizon, - self.gamma, - ) - - def _map_to_repr(self, state, accept_new_repr=True): - repr_state = map_to_representative( - state, - self.lp_metric, - self.representative_states, - self.M, - self.min_dist, - self.scaling, - accept_new_repr, - ) - # check if new representative state - if repr_state == self.M: - self.M += 1 - return repr_state - - def _update(self, state, action, next_state, reward): - repr_state = self._map_to_repr(state) - repr_next_state = self._map_to_repr(next_state) - - self.N_sa[repr_state, action] += 1 - self.N_sas[repr_state, action, repr_next_state] += 1 - self.S_sa[repr_state, action] += reward - - self.R_hat[repr_state, action] = ( - self.S_sa[repr_state, action] / self.N_sa[repr_state, action] - ) - self.P_hat[repr_state, action, :] = ( - self.N_sas[repr_state, action, :] / self.N_sa[repr_state, action] - ) - self.B_sa[repr_state, action] = self._compute_bonus( - self.N_sa[repr_state, action] - ) - - def _compute_bonus(self, n): - # reward-free - if self.reward_free: - bonus = 1.0 / n - return bonus - - # not reward-free - if self.bonus_type == "simplified_bernstein": - bonus = self.bonus_scale_factor * np.sqrt(1.0 / n) + self.v_max / n - bonus = min(bonus, self.v_max) - return bonus - else: - raise NotImplementedError( - "Error: bonus type {} not implemented".format(self.bonus_type) - ) - - def _get_action(self, state, hh=0): - assert self.Q is not None - repr_state = self._map_to_repr(state, False) - return self.Q[hh, repr_state, :].argmax() - - def _run_episode(self): - # interact for H steps - episode_rewards = 0 - observation, info = self.env.reset() - for hh in range(self.horizon): - action = self._get_action(observation, hh) - next_observation, reward, terminated, truncated, info = self.env.step( - action - ) - done = terminated or truncated - episode_rewards += reward # used for logging only - - if self.reward_free: - reward = 0.0 # set to zero before update if reward_free - - self._update(observation, action, next_observation, reward) - - observation = next_observation - if done: - break - - # run backward induction - backward_induction_in_place( - self.Q[:, : self.M, :], - self.V[:, : self.M], - self.R_hat[: self.M, :] + self.B_sa[: self.M, :], - self.P_hat[: self.M, :, : self.M], - self.horizon, - self.gamma, - self.v_max, - ) - - self.episode += 1 - # - if self.writer is not None: - self.writer.add_scalar("episode_rewards", episode_rewards, self.episode) - self.writer.add_scalar("representative states", self.M, self.episode) - - # return sum of rewards collected in the episode - return episode_rewards diff --git a/rlberry/agents/linear/__init__.py b/rlberry/agents/linear/__init__.py deleted file mode 100644 index 3db7865c3..000000000 --- a/rlberry/agents/linear/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .lsvi_ucb import LSVIUCBAgent diff --git a/rlberry/agents/linear/lsvi_ucb.py b/rlberry/agents/linear/lsvi_ucb.py deleted file mode 100644 index e777d05d9..000000000 --- a/rlberry/agents/linear/lsvi_ucb.py +++ /dev/null @@ -1,356 +0,0 @@ -import numpy as np -from rlberry.agents import AgentWithSimplePolicy -from gymnasium.spaces import Discrete -from rlberry.utils.jit_setup import numba_jit - -import rlberry - -logger = rlberry.logger - - -@numba_jit -def 
run_lsvi_jit( - dim, - horizon, - bonus_factor, - lambda_mat_inv, - reward_hist, - gamma, - feat_hist, - n_actions, - feat_ns_all_actions, - v_max, - total_time_steps, -): - """ - Jit version of Least-Squares Value Iteration. - - Parameters - ---------- - dim : int - Dimension of the features - horiton : int - - bonus_factor : int - - lambda_mat_inv : numpy array (dim, dim) - Inverse of the design matrix - - reward_hist : numpy array (time,) - - gamma : double - - feat_hist : numpy array (time, dim) - - n_actions : int - - feat_ns_all_actions : numpy array (time, n_actions, dim) - History of next state features for all actions - - vmax : double - Maximum value of the value function - - total_time_steps : int - Current step count - """ - # run value iteration - q_w = np.zeros((horizon + 1, dim)) - for hh in range(horizon - 1, -1, -1): - T = total_time_steps - b = np.zeros(dim) - for tt in range(T): - # compute q function at next state, q_ns - q_ns = np.zeros(n_actions) - for aa in range(n_actions): - # - feat_ns_aa = feat_ns_all_actions[tt, aa, :] - inverse_counts = feat_ns_aa.dot(lambda_mat_inv.T.dot(feat_ns_aa)) - bonus = bonus_factor * np.sqrt( - inverse_counts - ) + v_max * inverse_counts * (bonus_factor > 0.0) - # - q_ns[aa] = feat_ns_aa.dot(q_w[hh + 1, :]) + bonus - q_ns[aa] = min(q_ns[aa], v_max) - - # compute regretion targets - target = reward_hist[tt] + gamma * q_ns.max() - feat = feat_hist[tt, :] - b = b + target * feat - - # solve M x = b, where x = q_w, and M = self.lambda_mat - q_w[hh, :] = lambda_mat_inv.T @ b - return q_w - - -class LSVIUCBAgent(AgentWithSimplePolicy): - """ - A version of Least-Squares Value Iteration with UCB (LSVI-UCB), - proposed by Jin et al. (2020). - - If bonus_scale_factor is 0.0, performs random exploration. - - Notes - ----- - The computation of exploration bonuses was adapted to match the "simplified Bernstein" - bonuses that works well empirically for UCBVI in the tabular case. - - The transition probabilities are assumed to be *independent* of the timestep h. - - Parameters - ---------- - env : Model - Online model of an environment. - horizon : int - Maximum length of each episode. - feature_map_fn : function(env, kwargs) - Function that returns a feature map instance - (rlberry.agents.features.FeatureMap class). - feature_map_kwargs: - kwargs for feature_map_fn - gamma : double - Discount factor. - bonus_scale_factor : double - Constant by which to multiply the exploration bonus. - reg_factor : double - Linear regression regularization factor. - - References - ---------- - Jin, C., Yang, Z., Wang, Z., & Jordan, M. I. (2020, July). - Provably efficient reinforcement learning with linear - function approximation. In Conference on Learning Theory (pp. 2137-2143). - """ - - name = "LSVI-UCB" - - def __init__( - self, - env, - horizon, - feature_map_fn, - feature_map_kwargs=None, - gamma=0.99, - bonus_scale_factor=1.0, - reg_factor=0.1, - **kwargs - ): - AgentWithSimplePolicy.__init__(self, env, **kwargs) - - self.n_episodes = None - self.horizon = horizon - self.gamma = gamma - self.bonus_scale_factor = bonus_scale_factor - self.reg_factor = reg_factor - feature_map_kwargs = feature_map_kwargs or {} - self.feature_map = feature_map_fn(self.env, **feature_map_kwargs) - - # - if self.bonus_scale_factor == 0.0: - self.name = "LSVI-Random-Expl" - - # maximum value - r_range = self.env.reward_range[1] - self.env.reward_range[0] - if r_range == np.inf: - logger.warning( - "{}: Reward range is infinity. ".format(self.name) + "Clipping it to 1." 
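# Illustrative sketch only -- not the removed jitted implementation.
# The regression step inside `run_lsvi_jit` above is a ridge regression: with
# design matrix Lambda = reg * I + sum_t phi_t phi_t^T and targets
# y_t = r_t + gamma * max_a Q(s_{t+1}, a) (plus a bonus), the Q-weights are
# w = Lambda^{-1} sum_t y_t phi_t. Random toy data below.
import numpy as np

rng = np.random.default_rng(0)
dim, T, reg = 4, 20, 0.1
feats = rng.normal(size=(T, dim))   # phi(s_t, a_t)
targets = rng.normal(size=T)        # stand-ins for r_t + gamma * max_a Q(s_{t+1}, a)

lambda_mat = reg * np.eye(dim) + feats.T @ feats
b = feats.T @ targets               # sum_t y_t * phi_t
w = np.linalg.solve(lambda_mat, b)  # what the agent computes as lambda_mat_inv @ b

# a new pair is then scored as phi(s, a) @ w, plus the optimism bonus
# bonus_factor * sqrt(phi @ lambda_mat_inv @ phi) in the removed agent.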
- ) - r_range = 1.0 - - if self.gamma == 1.0: - self.v_max = r_range * horizon - else: - self.v_max = ( - r_range - * (1.0 - np.power(self.gamma, self.horizon)) - / (1.0 - self.gamma) - ) - - # - assert isinstance( - self.env.action_space, Discrete - ), "LSVI-UCB requires discrete actions." - - # - assert len(self.feature_map.shape) == 1 - self.dim = self.feature_map.shape[0] - - # attributes initialized in reset() - self.episode = None - self.lambda_mat = None # lambda matrix - self.lambda_mat_inv = None # inverse of lambda matrix - self.w_vec = None # vector representation of Q - self.w_policy = None # representation of Q for final policy - self.reward_hist = None # reward history - self.state_hist = None # state history - self.action_hist = None # action history - self.nstate_hist = None # next state history - - self.feat_hist = None # feature history - self.feat_ns_all_actions = None # next state features for all actions - # - - # aux variables (init in reset() too) - self._rewards = None - - def reset(self): - self.episode = 0 - self.total_time_steps = 0 - self.lambda_mat = self.reg_factor * np.eye(self.dim) - self.lambda_mat_inv = (1.0 / self.reg_factor) * np.eye(self.dim) - self.w_vec = np.zeros((self.horizon + 1, self.dim)) - self.reward_hist = np.zeros(self.n_episodes * self.horizon) - self.state_hist = [] - self.action_hist = [] - self.nstate_hist = [] - # episode rewards - self._rewards = np.zeros(self.n_episodes) - # - self.feat_hist = np.zeros((self.n_episodes * self.horizon, self.dim)) - self.feat_ns_all_actions = np.zeros( - (self.n_episodes * self.horizon, self.env.action_space.n, self.dim) - ) - # - self.w_policy = None - - def fit(self, budget, **kwargs): - """ - Train the agent using the provided environment. - - Parameters - ---------- - budget: int - number of episodes. Each episode runs for self.horizon unless it - enconters a terminal state in which case it stops early. - Warning: Calling fit() more than once will reset the algorithm - (to realocate memory according to the number of episodes) - """ - del kwargs - - # Allocate memory according to budget. - # TODO: avoid the need to reset() the algorithm if fit() is called again. - if self.n_episodes is not None: - logger.warning( - "[LSVI-UCB]: Calling fit() more than once will reset the algorithm" - + " (to realocate memory according to the number of episodes)." 
- ) - self.n_episodes = budget - self.reset() - - for ep in range(self.n_episodes): - self.run_episode() - if self.bonus_scale_factor > 0.0 or ep == self.n_episodes - 1: - # update Q function representation - self.w_vec = self._run_lsvi(self.bonus_scale_factor) - - self.w_policy = self._run_lsvi(bonus_factor=0.0)[0, :] - - def policy(self, observation): - q_w = self.w_policy - assert q_w is not None - # - q_vec = self._compute_q_vec(q_w, observation, 0.0) - return q_vec.argmax() - - def _optimistic_policy(self, observation, hh): - q_w = self.w_vec[hh, :] - q_vec = self._compute_q_vec(q_w, observation, self.bonus_scale_factor) - return q_vec.argmax() - - def run_episode(self): - observation, info = self.env.reset() - episode_rewards = 0 - for hh in range(self.horizon): - if self.bonus_scale_factor == 0.0: - action = self.env.action_space.sample() - else: - action = self._optimistic_policy(observation, hh) - - next_observation, reward, terminated, truncated, info = self.env.step( - action - ) - done = terminated or truncated - - feat = self.feature_map.map(observation, action) - outer_prod = np.outer(feat, feat) - inv = self.lambda_mat_inv - - # - self.lambda_mat += np.outer(feat, feat) - # update inverse - self.lambda_mat_inv -= (inv @ outer_prod @ inv) / (1 + feat @ inv.T @ feat) - - # update history - self.reward_hist[self.total_time_steps] = reward - self.state_hist.append(observation) - self.action_hist.append(action) - self.nstate_hist.append(next_observation) - - # - tt = self.total_time_steps - self.feat_hist[tt, :] = self.feature_map.map(observation, action) - for aa in range(self.env.action_space.n): - self.feat_ns_all_actions[tt, aa, :] = self.feature_map.map( - next_observation, aa - ) - - # increments - self.total_time_steps += 1 - episode_rewards += reward - - # - observation = next_observation - if done: - break - - # store data - self._rewards[self.episode] = episode_rewards - - # update ep - self.episode += 1 - - # - if self.writer is not None: - self.writer.add_scalar("episode_rewards", episode_rewards, self.episode) - - return episode_rewards - - def _compute_q(self, q_w, state, action, bonus_factor): - """q_w is the vector representation of the Q function.""" - feat = self.feature_map.map(state, action) - inverse_counts = feat @ (self.lambda_mat_inv.T @ feat) - bonus = bonus_factor * np.sqrt(inverse_counts) + self.v_max * inverse_counts * ( - bonus_factor > 0.0 - ) - q = feat.dot(q_w) + bonus - return q - - def _compute_q_vec(self, q_w, state, bonus_factor): - A = self.env.action_space.n - q_vec = np.zeros(A) - for aa in range(A): - # q_vec[aa] = self._compute_q(q_w, state, aa, bonus_factor) - feat = self.feature_map.map(state, aa) - inverse_counts = feat @ (self.lambda_mat_inv.T @ feat) - bonus = bonus_factor * np.sqrt( - inverse_counts - ) + self.v_max * inverse_counts * (bonus_factor > 0.0) - q_vec[aa] = feat.dot(q_w) + bonus - # q_vec[aa] = min(q_vec[aa], self.v_max) # !!!!!!!!! 
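# Illustrative sketch only -- not part of the removed rlberry code.
# The incremental inverse update in `run_episode` above is the Sherman-Morrison
# formula for a rank-one update:
#   (A + f f^T)^{-1} = A^{-1} - (A^{-1} f f^T A^{-1}) / (1 + f^T A^{-1} f).
# Quick numerical check on random symmetric positive definite data.
import numpy as np

rng = np.random.default_rng(0)
dim, reg = 4, 0.1
B = rng.normal(size=(dim, dim))
A = B @ B.T + reg * np.eye(dim)            # symmetric positive definite
A_inv = np.linalg.inv(A)
f = rng.normal(size=dim)

outer = np.outer(f, f)
A_inv_updated = A_inv - (A_inv @ outer @ A_inv) / (1.0 + f @ A_inv @ f)
assert np.allclose(A_inv_updated, np.linalg.inv(A + outer))
# this keeps the per-step cost at O(dim^2) instead of a full O(dim^3) inverse.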
- return q_vec - - def _run_lsvi(self, bonus_factor): - # run value iteration - q_w = run_lsvi_jit( - self.dim, - self.horizon, - bonus_factor, - self.lambda_mat_inv, - self.reward_hist, - self.gamma, - self.feat_hist, - self.env.action_space.n, - self.feat_ns_all_actions, - self.v_max, - self.total_time_steps, - ) - return q_w diff --git a/rlberry/agents/mbqvi/__init__.py b/rlberry/agents/mbqvi/__init__.py deleted file mode 100644 index 4856b69b5..000000000 --- a/rlberry/agents/mbqvi/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .mbqvi import MBQVIAgent diff --git a/rlberry/agents/mbqvi/mbqvi.py b/rlberry/agents/mbqvi/mbqvi.py deleted file mode 100644 index 83031a168..000000000 --- a/rlberry/agents/mbqvi/mbqvi.py +++ /dev/null @@ -1,152 +0,0 @@ -import numpy as np - - -from rlberry.agents import AgentWithSimplePolicy -from rlberry.agents.dynprog.utils import backward_induction, value_iteration -from gymnasium.spaces import Discrete - -import rlberry - -logger = rlberry.logger - - -class MBQVIAgent(AgentWithSimplePolicy): - """ - Model-Basel Q-Value iteration (MBQVI). - - Builds an empirical MDP and runs value iteration on it. - Corresponds to the "indirect" algorithm studied by Kearns and Singh (1999). - - Parameters - ----------- - env : Model - generative model with finite state-action space - n_samples : int - number of samples *per state-action pair* used to estimate - the empirical MDP. - gamma : double - discount factor in [0, 1] - horizon : int - horizon, if the problem is finite-horizon. if None, the discounted - problem is solved. default = None - epsilon : double - precision of value iteration, only used in discounted problems - (when horizon is None). - - - References - ---------- - Kearns, Michael J., and Satinder P. Singh. - "Finite-sample convergence rates for Q-learning and indirect algorithms." - Advances in neural information processing systems. 1999. - """ - - name = "MBQVI" - - def __init__( - self, env, n_samples=10, gamma=0.99, horizon=None, epsilon=1e-6, **kwargs - ): - AgentWithSimplePolicy.__init__(self, env, **kwargs) - - # initialize base class - assert self.env.is_generative(), "MBQVI requires a generative model." - assert isinstance( - self.env.observation_space, Discrete - ), "MBQVI requires a finite state space." - assert isinstance( - self.env.action_space, Discrete - ), "MBQVI requires a finite action space." - - # - self.n_samples = n_samples - self.gamma = gamma - self.horizon = horizon - self.epsilon = epsilon - - # empirical MDP, created in fit() - self.R_hat = None - self.P_hat = None - - # value functions - self.V = None - self.Q = None - - def _update(self, state, action, next_state, reward): - """Update model statistics.""" - self.N_sa[state, action] += 1 - self.N_sas[state, action, next_state] += 1 - self.S_sa[state, action] += reward - - def fit(self, budget=None, **kwargs): - """ - Build empirical MDP and run value iteration. - - Parameters - ---------- - budget: None - Not used. Only defined for compatibility purpose with rlberry. - Changing `budget` value has no effect. - """ - del kwargs - S = self.env.observation_space.n - A = self.env.action_space.n - self.N_sa = np.zeros((S, A)) - self.N_sas = np.zeros((S, A, S)) - self.S_sa = np.zeros((S, A)) - - # collect data - total_samples = S * A * self.n_samples - count = 0 - logger.debug( - f"[{self.name}] collecting {self.n_samples} samples per (s,a)" - f", total = {total_samples} samples." 
- ) - for ss in range(S): - for aa in range(A): - for _ in range(self.n_samples): - next_state, reward, _, _, _ = self.env.sample(ss, aa) - self._update(ss, aa, next_state, reward) - - count += 1 - if count % 10000 == 0: - completed = 100 * count / total_samples - logger.debug( - "[{}] ... {}/{} ({:0.0f}%)".format( - self.name, count, total_samples, completed - ) - ) - - # build model and run VI - logger.debug(f"{self.name} building model and running backward induction...") - - N_sa = np.maximum(self.N_sa, 1) - self.R_hat = self.S_sa / N_sa - self.P_hat = np.zeros((S, A, S)) - for ss in range(S): - self.P_hat[:, :, ss] = self.N_sas[:, :, ss] / N_sa - - info = {} - info["n_samples"] = self.n_samples - info["total_samples"] = total_samples - if self.horizon is None: - assert self.gamma < 1.0, "The discounted setting requires gamma < 1.0" - self.Q, self.V, n_it = value_iteration( - self.R_hat, self.P_hat, self.gamma, self.epsilon - ) - info["n_iterations"] = n_it - info["precision"] = self.epsilon - else: - self.Q, self.V = backward_induction( - self.R_hat, self.P_hat, self.horizon, self.gamma - ) - info["n_iterations"] = self.horizon - info["precision"] = 0.0 - return info - - def policy(self, observation): - state = observation - assert self.env.observation_space.contains(state) - if self.horizon is None: - return self.Q[state, :].argmax() - else: - return self.Q[0, state, :].argmax() diff --git a/rlberry/agents/optql/__init__.py b/rlberry/agents/optql/__init__.py deleted file mode 100644 index 5c538141b..000000000 --- a/rlberry/agents/optql/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .optql import OptQLAgent diff --git a/rlberry/agents/optql/optql.py b/rlberry/agents/optql/optql.py deleted file mode 100644 index 951d1d834..000000000 --- a/rlberry/agents/optql/optql.py +++ /dev/null @@ -1,206 +0,0 @@ -import numpy as np - -import gymnasium.spaces as spaces -from rlberry.agents import AgentWithSimplePolicy -from rlberry.exploration_tools.discrete_counter import DiscreteCounter - -import rlberry - -logger = rlberry.logger - - -class OptQLAgent(AgentWithSimplePolicy): - """ - Optimistic Q-Learning [1]_ with custom exploration bonuses. - - Parameters - ---------- - env : gym.Env - Environment with discrete states and actions. - gamma : double, default: 1.0 - Discount factor in [0, 1]. - horizon : int - Horizon of the objective function. - bonus_scale_factor : double, default: 1.0 - Constant by which to multiply the exploration bonus, controls - the level of exploration. - bonus_type : {"simplified_bernstein"} - Type of exploration bonus. Currently, only "simplified_bernstein" - is implemented. - add_bonus_after_update : bool, default: False - If True, add bonus to the Q function after performing the update, - instead of adding it to the update target. - - References - ---------- - .. [1] Jin et al., 2018 - Is Q-Learning Provably Efficient? 
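# Illustrative sketch only -- not the removed rlberry implementation.
# Standalone version of the empirical-MDP construction in MBQVI's `fit()`
# above: draw n_samples transitions per (s, a) from a generative model, then
# normalize the counts. `toy_sample` is a made-up 2-state generative model,
# not a rlberry API.
import numpy as np

rng = np.random.default_rng(0)
S, A, n_samples = 2, 2, 100


def toy_sample(s, a):
    """Hypothetical generative model: returns (next_state, reward)."""
    next_state = rng.integers(S)
    reward = float(next_state == 1) * (a + 1) / A
    return next_state, reward


N_sa = np.zeros((S, A))
N_sas = np.zeros((S, A, S))
S_sa = np.zeros((S, A))
for s in range(S):
    for a in range(A):
        for _ in range(n_samples):
            s_next, r = toy_sample(s, a)
            N_sa[s, a] += 1
            N_sas[s, a, s_next] += 1
            S_sa[s, a] += r

N_safe = np.maximum(N_sa, 1)              # avoid division by zero
R_hat = S_sa / N_safe
P_hat = N_sas / N_safe[:, :, None]
# value iteration (discounted) or backward induction (finite horizon) is then
# run on (R_hat, P_hat) to obtain the policy.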
- https://arxiv.org/abs/1807.03765 - """ - - name = "OptQL" - - def __init__( - self, - env, - gamma=1.0, - horizon=100, - bonus_scale_factor=1.0, - bonus_type="simplified_bernstein", - add_bonus_after_update=False, - **kwargs - ): - # init base class - AgentWithSimplePolicy.__init__(self, env, **kwargs) - - self.gamma = gamma - self.horizon = horizon - self.bonus_scale_factor = bonus_scale_factor - self.bonus_type = bonus_type - self.add_bonus_after_update = add_bonus_after_update - - # check environment - assert isinstance(self.env.observation_space, spaces.Discrete) - assert isinstance(self.env.action_space, spaces.Discrete) - - # maximum value - r_range = self.env.reward_range[1] - self.env.reward_range[0] - if r_range == np.inf or r_range == 0.0: - logger.warning( - "{}: Reward range is zero or infinity. ".format(self.name) - + "Setting it to 1." - ) - r_range = 1.0 - - self.v_max = np.zeros(self.horizon) - self.v_max[-1] = r_range - for hh in reversed(range(self.horizon - 1)): - self.v_max[hh] = r_range + self.gamma * self.v_max[hh + 1] - - # initialize - self.reset() - - def reset(self, **kwargs): - H = self.horizon - S = self.env.observation_space.n - A = self.env.action_space.n - - # (s, a) visit counter - self.N_sa = np.zeros((H, S, A)) - - # Value functions - self.V = np.ones((H + 1, S)) - self.V[H, :] = 0 - self.Q = np.ones((H, S, A)) - self.Q_bar = np.ones((H, S, A)) - for hh in range(self.horizon): - self.V[hh, :] *= self.horizon - hh - self.Q[hh, :, :] *= self.horizon - hh - self.Q_bar[hh, :, :] *= self.horizon - hh - - if self.add_bonus_after_update: - self.Q *= 0.0 - - # ep counter - self.episode = 0 - - # useful object to compute total number of visited states & entropy of visited states - self.counter = DiscreteCounter( - self.env.observation_space, self.env.action_space - ) - - def policy(self, observation): - """Recommended policy.""" - state = observation - return self.Q_bar[0, state, :].argmax() - - def _get_action(self, state, hh=0): - """Sampling policy.""" - return self.Q_bar[hh, state, :].argmax() - - def _compute_bonus(self, n, hh): - if self.bonus_type == "simplified_bernstein": - bonus = self.bonus_scale_factor * np.sqrt(1.0 / n) + self.v_max[hh] / n - bonus = min(bonus, self.v_max[hh]) - return bonus - else: - raise ValueError( - "Error: bonus type {} not implemented".format(self.bonus_type) - ) - - def _update(self, state, action, next_state, reward, hh): - self.N_sa[hh, state, action] += 1 - nn = self.N_sa[hh, state, action] - - # learning rate - alpha = (self.horizon + 1.0) / (self.horizon + nn) - bonus = self._compute_bonus(nn, hh) - - # bonus in the update - if not self.add_bonus_after_update: - target = reward + bonus + self.gamma * self.V[hh + 1, next_state] - self.Q[hh, state, action] = (1 - alpha) * self.Q[ - hh, state, action - ] + alpha * target - self.V[hh, state] = min(self.v_max[hh], self.Q[hh, state, :].max()) - self.Q_bar[hh, state, action] = self.Q[hh, state, action] - # bonus outside the update - else: - target = reward + self.gamma * self.V[hh + 1, next_state] # bonus not here - self.Q[hh, state, action] = (1 - alpha) * self.Q[ - hh, state, action - ] + alpha * target - self.Q_bar[hh, state, action] = ( - self.Q[hh, state, action] + bonus - ) # bonus here - self.V[hh, state] = min(self.v_max[hh], self.Q_bar[hh, state, :].max()) - - def _run_episode(self): - # interact for H steps - episode_rewards = 0 - observation, info = self.env.reset() - for hh in range(self.horizon): - action = self._get_action(observation, hh) - next_observation, 
reward, terminated, truncated, info = self.env.step( - action - ) - done = terminated or truncated - episode_rewards += reward # used for logging only - - self.counter.update(observation, action) - - self._update(observation, action, next_observation, reward, hh) - - observation = next_observation - if done: - break - - # update info - self.episode += 1 - - # writer - if self.writer is not None: - self.writer.add_scalar("episode_rewards", episode_rewards, self.episode) - self.writer.add_scalar( - "n_visited_states", self.counter.get_n_visited_states(), self.episode - ) - - # return sum of rewards collected in the episode - return episode_rewards - - def fit(self, budget: int, **kwargs): - """ - Train the agent using the provided environment. - - Parameters - ---------- - budget: int - number of episodes. Each episode runs for self.horizon unless it - enconters a terminal state in which case it stops early. - """ - del kwargs - n_episodes_to_run = budget - count = 0 - while count < n_episodes_to_run: - self._run_episode() - count += 1 diff --git a/rlberry/agents/psrl/__init__.py b/rlberry/agents/psrl/__init__.py deleted file mode 100644 index 417e37106..000000000 --- a/rlberry/agents/psrl/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .psrl import PSRLAgent diff --git a/rlberry/agents/psrl/psrl.py b/rlberry/agents/psrl/psrl.py deleted file mode 100644 index dac8440b9..000000000 --- a/rlberry/agents/psrl/psrl.py +++ /dev/null @@ -1,257 +0,0 @@ -import numpy as np - -import gymnasium.spaces as spaces -from rlberry.agents import AgentWithSimplePolicy -from rlberry.exploration_tools.discrete_counter import DiscreteCounter -from rlberry.agents.dynprog.utils import ( - backward_induction_in_place, - backward_induction_sd, -) - -import rlberry - -logger = rlberry.logger - - -class PSRLAgent(AgentWithSimplePolicy): - """ - PSRL algorithm from [1] with beta prior for the "Bernoullized" rewards - (instead of Gaussian-gamma prior). - - Notes - ----- - The recommended policy after all the episodes is computed without - exploration bonuses. - - Parameters - ---------- - env : gym.Env - Environment with discrete states and actions. - gamma : double, default: 1.0 - Discount factor in [0, 1]. If gamma is 1.0, the problem is set to - be finite-horizon. - horizon : int - Horizon of the objective function. If None and gamma<1, set to - 1/(1-gamma). - scale_prior_reward : double, delfault: 1.0 - scale of the Beta (uniform) prior, - i.e prior is Beta(scale_prior_reward*(1,1)) - scale_prior_transition : double, default: 1/number of state - scale of the (uniform) Dirichlet prior, - i.e prior is Dirichlet(scale_prior_transition*(1,...,1)) - bernoullized_reward: bool, default: True - If true the rewards are Bernoullized - reward_free : bool, default: False - If true, ignores rewards and uses only 1/n bonuses. - stage_dependent : bool, default: False - If true, assume that transitions and rewards can change with the stage h. - - References - ---------- - .. 
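# Illustrative sketch only -- not the removed rlberry implementation.
# Standalone version of one optimistic Q-learning update as in OptQL's
# `_update` above (bonus inside the target, i.e. add_bonus_after_update=False):
#   alpha_n = (H + 1) / (H + n)
#   Q_h(s, a) <- (1 - alpha_n) Q_h(s, a) + alpha_n (r + bonus_h(n) + gamma V_{h+1}(s'))
# Toy numbers only.
import numpy as np

H, gamma, bonus_scale = 10, 1.0, 1.0
v_max = np.arange(H, 0, -1).astype(float)     # v_max[h] = H - h for unit rewards


def bonus(n, h):
    return min(bonus_scale * np.sqrt(1.0 / n) + v_max[h] / n, v_max[h])


def optimistic_update(q_sa, v_next, r, n, h):
    alpha = (H + 1.0) / (H + n)
    target = r + bonus(n, h) + gamma * v_next
    return (1.0 - alpha) * q_sa + alpha * target


q = H - 3.0                                    # optimistic initialization at h = 3
q = optimistic_update(q, v_next=5.0, r=1.0, n=1, h=3)
# with n = 1 the step size is (H + 1) / (H + 1) = 1, so the first target
# fully overwrites the optimistic initialization.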
[1] Osband et al., 2013 - (More) Efficient Reinforcement Learning via Posterior Sampling - https://arxiv.org/abs/1306.0940 - - """ - - name = "PSRL" - - def __init__( - self, - env, - gamma=1.0, - horizon=100, - scale_prior_reward=1, - scale_prior_transition=None, - bernoullized_reward=True, - reward_free=False, - stage_dependent=False, - **kwargs - ): - # init base class - AgentWithSimplePolicy.__init__(self, env, **kwargs) - - self.gamma = gamma - self.horizon = horizon - self.scale_prior_reward = scale_prior_reward - self.scale_prior_transition = scale_prior_transition - if scale_prior_transition is None: - self.scale_prior_transition = 1.0 / self.env.observation_space.n - self.bernoullized_reward = bernoullized_reward - self.reward_free = reward_free - self.stage_dependent = stage_dependent - - # check environment - assert isinstance(self.env.observation_space, spaces.Discrete) - assert isinstance(self.env.action_space, spaces.Discrete) - - # other checks - assert gamma >= 0 and gamma <= 1.0 - if self.horizon is None: - assert gamma < 1.0, "If no horizon is given, gamma must be smaller than 1." - self.horizon = int(np.ceil(1.0 / (1.0 - gamma))) - - # maximum value - r_range = self.env.reward_range[1] - self.env.reward_range[0] - if r_range == np.inf or r_range == 0.0: - logger.warning( - "{}: Reward range is zero or infinity. ".format(self.name) - + "Setting it to 1." - ) - r_range = 1.0 - - self.v_max = np.zeros(self.horizon) - self.v_max[-1] = r_range - for hh in reversed(range(self.horizon - 1)): - self.v_max[hh] = r_range + self.gamma * self.v_max[hh + 1] - - # initialize - self.reset() - - def reset(self, **kwargs): - H = self.horizon - S = self.env.observation_space.n - A = self.env.action_space.n - - if self.stage_dependent: - shape_hsa = (H, S, A) - shape_hsas = (H, S, A, S) - else: - shape_hsa = (S, A) - shape_hsas = (S, A, S) - - # Prior transitions - self.N_sas = self.scale_prior_transition * np.ones(shape_hsas) - - # Prior rewards - self.M_sa = self.scale_prior_reward * np.ones(shape_hsa + (2,)) - - # Value functions - self.V = np.zeros((H, S)) - self.Q = np.zeros((H, S, A)) - # for rec. 
policy - self.V_policy = np.zeros((H, S)) - self.Q_policy = np.zeros((H, S, A)) - - # ep counter - self.episode = 0 - - # useful object to compute total number of visited states & entropy of visited states - self.counter = DiscreteCounter( - self.env.observation_space, self.env.action_space - ) - - def policy(self, observation): - state = observation - assert self.Q_policy is not None - return self.Q_policy[0, state, :].argmax() - - def _get_action(self, state, hh=0): - """Sampling policy.""" - assert self.Q is not None - return self.Q[hh, state, :].argmax() - - def _update(self, state, action, next_state, reward, hh): - bern_reward = reward - if self.bernoullized_reward: - bern_reward = self.rng.binomial(1, reward) - # update posterior - if self.stage_dependent: - self.N_sas[hh, state, action, next_state] += 1 - self.M_sa[hh, state, action, 0] += bern_reward - self.M_sa[hh, state, action, 1] += 1 - bern_reward - - else: - self.N_sas[state, action, next_state] += 1 - self.M_sa[state, action, 0] += bern_reward - self.M_sa[state, action, 1] += 1 - bern_reward - - def _run_episode(self): - # sample reward and transitions from posterior - self.R_sample = self.rng.beta(self.M_sa[..., 0], self.M_sa[..., 1]) - self.P_sample = self.rng.gamma(self.N_sas) - self.P_sample = self.P_sample / self.P_sample.sum(-1, keepdims=True) - # run backward induction - if self.stage_dependent: - backward_induction_sd( - self.Q, self.V, self.R_sample, self.P_sample, self.gamma, self.v_max[0] - ) - else: - backward_induction_in_place( - self.Q, - self.V, - self.R_sample, - self.P_sample, - self.horizon, - self.gamma, - self.v_max[0], - ) - # interact for H steps - episode_rewards = 0 - observation, info = self.env.reset() - for hh in range(self.horizon): - action = self._get_action(observation, hh) - next_observation, reward, terminated, truncated, info = self.env.step( - action - ) - done = terminated or truncated - episode_rewards += reward # used for logging only - - self.counter.update(observation, action) - - if self.reward_free: - reward = 0.0 # set to zero before update if reward_free - - self._update(observation, action, next_observation, reward, hh) - - observation = next_observation - if done: - break - - # update info - self.episode += 1 - - # writer - if self.writer is not None: - self.writer.add_scalar("episode_rewards", episode_rewards, self.episode) - self.writer.add_scalar( - "n_visited_states", self.counter.get_n_visited_states(), self.episode - ) - - # return sum of rewards collected in the episode - return episode_rewards - - def fit(self, budget: int, **kwargs): - """ - Train the agent using the provided environment. - - Parameters - ---------- - budget: int - number of episodes. Each episode runs for self.horizon unless it - enconters a terminal state in which case it stops early. 
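# Illustrative sketch only -- not the removed rlberry implementation.
# Standalone version of the posterior sampling step in PSRL's `_run_episode`
# above: Bernoullized rewards get a Beta posterior, and transitions get a
# Dirichlet posterior, sampled here via normalized Gamma draws as the removed
# code does. Toy pseudo-counts only.
import numpy as np

rng = np.random.default_rng(0)
S, A = 3, 2

M_sa = np.ones((S, A, 2))          # Beta(successes, failures) pseudo-counts
M_sa[0, 1] = [5.0, 2.0]            # (s=0, a=1) mostly returned reward 1
N_sas = np.ones((S, A, S))         # Dirichlet pseudo-counts
N_sas[0, 1] = [1.0, 8.0, 1.0]      # (s=0, a=1) mostly moved to state 1

R_sample = rng.beta(M_sa[..., 0], M_sa[..., 1])        # sampled mean rewards
P_sample = rng.gamma(N_sas)                             # Gamma(shape=k, scale=1)
P_sample = P_sample / P_sample.sum(-1, keepdims=True)   # = Dirichlet(N_sas)

# backward induction on (R_sample, P_sample) gives the Thompson-style policy
# for the next episode; the recommended policy uses posterior means instead.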
- """ - del kwargs - n_episodes_to_run = budget - count = 0 - while count < n_episodes_to_run: - self._run_episode() - count += 1 - - # compute Q function for the recommended policy - R_hat = self.M_sa[..., 0] / (self.M_sa[..., 0] + self.M_sa[..., 1]) - P_hat = self.N_sas / self.N_sas.sum(-1, keepdims=True) - if self.stage_dependent: - backward_induction_sd( - self.Q_policy, self.V_policy, R_hat, P_hat, self.gamma, self.v_max[0] - ) - else: - backward_induction_in_place( - self.Q_policy, - self.V_policy, - R_hat, - P_hat, - self.horizon, - self.gamma, - self.v_max[0], - ) diff --git a/rlberry/agents/rlsvi/__init__.py b/rlberry/agents/rlsvi/__init__.py deleted file mode 100644 index 11c5adc67..000000000 --- a/rlberry/agents/rlsvi/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .rlsvi import RLSVIAgent diff --git a/rlberry/agents/rlsvi/rlsvi.py b/rlberry/agents/rlsvi/rlsvi.py deleted file mode 100644 index 6e3c2c120..000000000 --- a/rlberry/agents/rlsvi/rlsvi.py +++ /dev/null @@ -1,280 +0,0 @@ -import numpy as np - -import gymnasium.spaces as spaces -from rlberry.agents import AgentWithSimplePolicy -from rlberry.exploration_tools.discrete_counter import DiscreteCounter -from rlberry.agents.dynprog.utils import ( - backward_induction_in_place, - backward_induction_reward_sd, - backward_induction_sd, -) - -import rlberry - -logger = rlberry.logger - - -class RLSVIAgent(AgentWithSimplePolicy): - """ - RLSVI algorithm from [1,2] with Gaussian noise. - - Notes - ----- - The recommended policy after all the episodes is computed with the empirical - MDP. - The std of the noise is of the form: - scale/sqrt(n)+ V_max/n - as for simplified Bernstein bonuses. - - Parameters - ---------- - env : gym.Env - Environment with discrete states and actions. - gamma : double, default: 1.0 - Discount factor in [0, 1]. If gamma is 1.0, the problem is set to - be finite-horizon. - horizon : int - Horizon of the objective function. If None and gamma<1, set to - 1/(1-gamma). - scale_std_noise : double, delfault: 1.0 - scale the std of the noise. At step h the std is - scale_std_noise/sqrt(n)+(H-h+1)/n - reward_free : bool, default: False - If true, ignores rewards. - stage_dependent : bool, default: False - If true, assume that transitions and rewards can change with the stage h. - - References - ---------- - .. [1] Osband et al., 2014 - Generalization and Exploration via Randomized Value Functions - https://arxiv.org/abs/1402.0635 - - .. [2] Russo, 2019 - Worst-Case Regret Bounds for Exploration via Randomized Value Functions - https://arxiv.org/abs/1906.02870 - - """ - - name = "RLSVI" - - def __init__( - self, - env, - gamma=1.0, - horizon=100, - scale_std_noise=1.0, - reward_free=False, - stage_dependent=False, - **kwargs - ): - # init base class - AgentWithSimplePolicy.__init__(self, env, **kwargs) - - self.gamma = gamma - self.horizon = horizon - self.scale_std_noise = scale_std_noise - self.reward_free = reward_free - self.stage_dependent = stage_dependent - - # check environment - assert isinstance(self.env.observation_space, spaces.Discrete) - assert isinstance(self.env.action_space, spaces.Discrete) - - # other checks - assert gamma >= 0 and gamma <= 1.0 - if self.horizon is None: - assert gamma < 1.0, "If no horizon is given, gamma must be smaller than 1." - self.horizon = int(np.ceil(1.0 / (1.0 - gamma))) - - # maximum value - r_range = self.env.reward_range[1] - self.env.reward_range[0] - if r_range == np.inf or r_range == 0.0: - logger.warning( - "{}: Reward range is zero or infinity. 
".format(self.name) - + "Setting it to 1." - ) - r_range = 1.0 - - self.v_max = np.zeros(self.horizon) - self.v_max[-1] = r_range - for hh in reversed(range(self.horizon - 1)): - self.v_max[hh] = r_range + self.gamma * self.v_max[hh + 1] - - # initialize - self.reset() - - def reset(self, **kwargs): - H = self.horizon - S = self.env.observation_space.n - A = self.env.action_space.n - - if self.stage_dependent: - shape_hsa = (H, S, A) - shape_hsas = (H, S, A, S) - else: - shape_hsa = (S, A) - shape_hsas = (S, A, S) - - # stds prior - self.std1_sa = self.scale_std_noise * np.ones((H, S, A)) - self.std2_sa = np.ones((H, S, A)) - # visit counter - self.N_sa = np.ones(shape_hsa) - - # MDP estimator - self.R_hat = np.zeros(shape_hsa) - self.P_hat = np.ones(shape_hsas) * 1.0 / S - - # Value functions - self.V = np.zeros((H, S)) - self.Q = np.zeros((H, S, A)) - # for rec. policy - self.V_policy = np.zeros((H, S)) - self.Q_policy = np.zeros((H, S, A)) - - # Init V and variances - for hh in range(self.horizon): - self.std2_sa[hh, :, :] *= self.v_max[hh] - - # ep counter - self.episode = 0 - - # useful object to compute total number of visited states & entropy of visited states - self.counter = DiscreteCounter( - self.env.observation_space, self.env.action_space - ) - - def policy(self, observation): - state = observation - assert self.Q_policy is not None - return self.Q_policy[0, state, :].argmax() - - def _get_action(self, state, hh=0): - """Sampling policy.""" - assert self.Q is not None - return self.Q[hh, state, :].argmax() - - def _update(self, state, action, next_state, reward, hh): - if self.stage_dependent: - self.N_sa[hh, state, action] += 1 - - nn = self.N_sa[hh, state, action] - prev_r = self.R_hat[hh, state, action] - prev_p = self.P_hat[hh, state, action, :] - - self.R_hat[hh, state, action] = ( - 1.0 - 1.0 / nn - ) * prev_r + reward * 1.0 / nn - - self.P_hat[hh, state, action, :] = (1.0 - 1.0 / nn) * prev_p - self.P_hat[hh, state, action, next_state] += 1.0 / nn - - else: - self.N_sa[state, action] += 1 - - nn = self.N_sa[state, action] - prev_r = self.R_hat[state, action] - prev_p = self.P_hat[state, action, :] - - self.R_hat[state, action] = (1.0 - 1.0 / nn) * prev_r + reward * 1.0 / nn - - self.P_hat[state, action, :] = (1.0 - 1.0 / nn) * prev_p - self.P_hat[state, action, next_state] += 1.0 / nn - - def _run_episode(self): - # interact for H steps - episode_rewards = 0 - # stds scale/sqrt(n)+(H-h+1)/n - std_sa = self.std1_sa / np.sqrt(self.N_sa) + self.std2_sa / self.N_sa - noise_sa = self.rng.normal(self.R_hat, std_sa) - # run backward noisy induction - if self.stage_dependent: - backward_induction_sd( - self.Q, - self.V, - self.R_hat + noise_sa, - self.P_hat, - self.gamma, - self.v_max[0], - ) - else: - backward_induction_reward_sd( - self.Q, - self.V, - self.R_hat + noise_sa, - self.P_hat, - self.gamma, - self.v_max[0], - ) - - observation, info = self.env.reset() - for hh in range(self.horizon): - action = self._get_action(observation, hh) - next_observation, reward, terminated, truncated, info = self.env.step( - action - ) - done = terminated or truncated - episode_rewards += reward # used for logging only - - self.counter.update(observation, action) - - if self.reward_free: - reward = 0.0 # set to zero before update if reward_free - - self._update(observation, action, next_observation, reward, hh) - - observation = next_observation - if done: - break - - # update info - self.episode += 1 - - # writer - if self.writer is not None: - self.writer.add_scalar("episode_rewards", 
episode_rewards, self.episode) - self.writer.add_scalar( - "n_visited_states", self.counter.get_n_visited_states(), self.episode - ) - - # return sum of rewards collected in the episode - return episode_rewards - - def fit(self, budget: int, **kwargs): - """ - Train the agent using the provided environment. - - Parameters - ---------- - budget: int - number of episodes. Each episode runs for self.horizon unless it - enconters a terminal state in which case it stops early. - """ - del kwargs - n_episodes_to_run = budget - count = 0 - while count < n_episodes_to_run: - self._run_episode() - count += 1 - - # compute Q function for the recommended policy - if self.stage_dependent: - backward_induction_sd( - self.Q_policy, - self.V_policy, - self.R_hat, - self.P_hat, - self.gamma, - self.v_max[0], - ) - else: - backward_induction_in_place( - self.Q_policy, - self.V_policy, - self.R_hat, - self.P_hat, - self.horizon, - self.gamma, - self.v_max[0], - ) diff --git a/rlberry/agents/tabular_rl/__init__.py b/rlberry/agents/tabular_rl/__init__.py deleted file mode 100644 index 5eefb55d5..000000000 --- a/rlberry/agents/tabular_rl/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .qlearning import QLAgent -from .sarsa import SARSAAgent diff --git a/rlberry/agents/tabular_rl/qlearning.py b/rlberry/agents/tabular_rl/qlearning.py deleted file mode 100644 index 024147acb..000000000 --- a/rlberry/agents/tabular_rl/qlearning.py +++ /dev/null @@ -1,127 +0,0 @@ -from typing import Optional, Literal -import numpy as np -from gymnasium import spaces -from scipy.special import softmax - -from rlberry import types -from rlberry.agents import AgentWithSimplePolicy - - -class QLAgent(AgentWithSimplePolicy): - """Q-Learning Agent. - - Parameters - ---------- - env: :class:`~rlberry.types.Env` - Environment with discrete states and actions. - gamma: float, default = 0.99 - Discount factor. - alpha: float, default = 0.1 - Learning rate. - exploration_type: {"epsilon", "boltzmann"}, default: None - If "epsilon": Epsilon-Greedy exploration. - If "boltzmann": Boltzmann exploration. - If None: No exploration. - exploration_rate: float, default: None - epsilon parameter for Epsilon-Greedy exploration or tau parameter for Boltzmann exploration. - - Attributes - ---------- - Q : ndarray - 2D array that stores the estimation ofexpected rewards for state-action pairs. 
- - Examples - -------- - >>> from rlberry.envs import GridWorld - >>> - >>> env = GridWorld(walls=(), nrows=5, ncols=5) - >>> agent = QLAgent() - >>> agent.fit(budget=1000) - >>> agent.policy(env.observation_space.sample()) - >>> agent.reset() - """ - - name = "QL" - - def __init__( - self, - env: types.Env, - gamma: float = 0.99, - alpha: float = 0.1, - exploration_type: Optional[Literal["epsilon", "boltzmann"]] = None, - exploration_rate: Optional[float] = None, - **kwargs - ): - # init base class - AgentWithSimplePolicy.__init__(self, env, **kwargs) - - self.gamma = gamma - self.alpha = alpha - self.exploration_type = exploration_type - self.exploration_rate = exploration_rate - # check environment - assert isinstance(self.env.observation_space, spaces.Discrete) - assert isinstance(self.env.action_space, spaces.Discrete) - - # check exploration type - if self.exploration_type is not None: - assert ( - exploration_type == "epsilon" or "boltzmann" - ) and exploration_rate is not None - - self.Q = np.zeros((self.env.observation_space.n, self.env.action_space.n)) - - def reset(self, **kwargs): - self.Q.fill(0) - - def policy(self, observation): - return self.Q[observation].argmax() - - def get_action(self, observation): - if ( - self.exploration_type == "epsilon" - and np.random.random() <= self.exploration_rate - ): - return np.random.choice(self.env.action_space.n) - elif self.exploration_type == "boltzmann": - return np.random.choice( - self.env.action_space.n, - p=softmax(self.exploration_rate * self.Q[observation]), - ) - else: - return self.Q[observation].argmax() - - def fit(self, budget: int, **kwargs): - """ - Train the agent using the provided environment. - Parameters - ---------- - budget: int - number of Q updates. - """ - del kwargs - observation, info = self.env.reset() - episode_rewards = 0 - for i in range(budget): - action = self.get_action(observation) - next_observation, reward, terminated, truncated, info = self.env.step( - action - ) - done = terminated or truncated - episode_rewards += reward - if self.writer is not None: - self.writer.add_scalar("episode_rewards", episode_rewards, i) - if done: - self.Q[observation, action] = reward - else: - self.Q[observation, action] = self.Q[ - observation, action - ] + self.alpha * ( - reward - + self.gamma * np.amax(self.Q[next_observation]) - - self.Q[observation, action] - ) - observation = next_observation - if done: - observation, info = self.env.reset() - episode_rewards = 0 diff --git a/rlberry/agents/tabular_rl/sarsa.py b/rlberry/agents/tabular_rl/sarsa.py deleted file mode 100644 index 3f097d4d4..000000000 --- a/rlberry/agents/tabular_rl/sarsa.py +++ /dev/null @@ -1,125 +0,0 @@ -from typing import Optional, Literal -import numpy as np -from gymnasium import spaces -from scipy.special import softmax - -from rlberry import types -from rlberry.agents import AgentWithSimplePolicy - - -class SARSAAgent(AgentWithSimplePolicy): - """SARSA Agent. - - Parameters - ---------- - env: :class:`~rlberry.types.Env` - Environment with discrete states and actions. - gamma: float, default = 0.99 - Discount factor. - alpha: float, default = 0.1 - Learning rate. - exploration_type: {"epsilon", "boltzmann"}, default: None - If "epsilon": Epsilon-Greedy exploration. - If "boltzmann": Boltzmann exploration. - If None: No exploration. - exploration_rate: float, default: None - epsilon parameter for Epsilon-Greedy exploration or tau parameter for Boltzmann exploration. 
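The fit loop of the Q-learning agent removed above applies the standard one-step, off-policy update. A minimal sketch for a single transition (obs, action, reward, next_obs, done), assuming a Q table `Q`, learning rate `alpha` and discount `gamma` as in that constructor (all placeholder names)::

    import numpy as np

    # terminal transitions store the reward directly, as in the removed fit loop
    if done:
        Q[obs, action] = reward
    else:
        # off-policy target: bootstrap with the greedy value at the next state
        Q[obs, action] += alpha * (
            reward + gamma * np.amax(Q[next_obs]) - Q[obs, action]
        )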
- - Attributes - ---------- - Q : ndarray - 2D array that stores the estimation ofexpected rewards for state-action pairs. - Examples - -------- - >>> from rlberry.envs import GridWorld - >>> - >>> env = GridWorld(walls=(), nrows=5, ncols=5) - >>> agent = SARSAAgent() - >>> agent.fit(budget=1000) - >>> agent.policy(env.observation_space.sample()) - >>> agent.reset() - """ - - def __init__( - self, - env: types.Env, - gamma: float = 0.99, - alpha: float = 0.1, - exploration_type: Optional[Literal["epsilon", "boltzmann"]] = None, - exploration_rate: Optional[float] = None, - **kwargs - ): - # init base class - AgentWithSimplePolicy.__init__(self, env, **kwargs) - - self.gamma = gamma - self.alpha = alpha - self.exploration_type = exploration_type - self.exploration_rate = exploration_rate - # check environment - assert isinstance(self.env.observation_space, spaces.Discrete) - assert isinstance(self.env.action_space, spaces.Discrete) - - # check exploration type - if self.exploration_type is not None: - assert ( - exploration_type == "epsilon" or "boltzmann" - ) and exploration_rate is not None - - self.Q = np.zeros((self.env.observation_space.n, self.env.action_space.n)) - - def reset(self, **kwargs): - self.Q.fill(0) - - def policy(self, observation): - return self.Q[observation].argmax() - - def get_action(self, observation): - if ( - self.exploration_type == "epsilon" - and np.random.random() <= self.exploration_rate - ): - return np.random.choice(self.env.action_space.n) - elif self.exploration_type == "boltzmann": - return np.random.choice( - self.env.action_space.n, - p=softmax(self.exploration_rate * self.Q[observation]), - ) - else: - return self.Q[observation].argmax() - - def fit(self, budget: int, **kwargs): - """ - Train the agent using the provided environment. - Parameters - ---------- - budget: int - number of Q updates. 
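SARSA differs from the Q-learning agent above only in its target: being on-policy, it bootstraps with the value of the action actually sampled for the next state instead of the greedy one. A sketch of the update performed in the loop below, with the same placeholder names as before and `get_action` standing for the exploration rule sketched earlier::

    # on-policy SARSA target: use the sampled next action, not the argmax
    next_action = get_action(next_obs)
    Q[obs, action] += alpha * (
        reward + gamma * Q[next_obs, next_action] - Q[obs, action]
    )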
- """ - del kwargs - observation, info = self.env.reset() - episode_rewards = 0 - for i in range(budget): - action = self.get_action(observation) - next_observation, reward, terminated, truncated, info = self.env.step( - action - ) - done = terminated or truncated - episode_rewards += reward - if self.writer is not None: - self.writer.add_scalar("episode_rewards", episode_rewards, i) - if done: - self.Q[observation, action] = reward - else: - next_action = self.get_action(next_observation) - self.Q[observation, action] = self.Q[ - observation, action - ] + self.alpha * ( - reward - + self.gamma * self.Q[next_observation, next_action] - - self.Q[observation, action] - ) - observation = next_observation - if done: - observation, info = self.env.reset() - episode_rewards = 0 diff --git a/rlberry/agents/tests/test_adaptiveql.py b/rlberry/agents/tests/test_adaptiveql.py deleted file mode 100644 index 4079dcbf9..000000000 --- a/rlberry/agents/tests/test_adaptiveql.py +++ /dev/null @@ -1,12 +0,0 @@ -from rlberry.agents import AdaptiveQLAgent -from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env -import matplotlib.pyplot as plt - - -def test_adaptive_ql(): - env = get_benchmark_env(level=2) - agent = AdaptiveQLAgent(env, horizon=30) - agent.fit(budget=50) - agent.policy(env.observation_space.sample()) - agent.Qtree.plot(0, 20) - plt.clf() diff --git a/rlberry/agents/tests/test_bandits.py b/rlberry/agents/tests/test_bandits.py deleted file mode 100644 index 441e5a3f0..000000000 --- a/rlberry/agents/tests/test_bandits.py +++ /dev/null @@ -1,131 +0,0 @@ -from rlberry.envs.bandits import NormalBandit, BernoulliBandit -from rlberry.agents.bandits import ( - IndexAgent, - RandomizedAgent, - TSAgent, - BanditWithSimplePolicy, - makeBetaPrior, - makeBoundedIMEDIndex, - makeBoundedMOSSIndex, - makeBoundedNPTSIndex, - makeBoundedUCBIndex, - makeETCIndex, - makeGaussianPrior, - makeEXP3Index, - makeSubgaussianMOSSIndex, - makeSubgaussianUCBIndex, - makeBoundedUCBVIndex, -) -from rlberry.utils import check_bandit_agent - - -TEST_SEED = 42 - - -def test_base_bandit(): - assert check_bandit_agent(BanditWithSimplePolicy, NormalBandit, seed=TEST_SEED) - - -bounded_indices = { - "IMED": makeBoundedIMEDIndex, - "MOSS": makeBoundedMOSSIndex, - "NPTS": makeBoundedNPTSIndex, - "UCB": makeBoundedUCBIndex, - "UCBV": makeBoundedUCBVIndex, -} -subgaussian_indices = { - "UCB": makeSubgaussianUCBIndex, - "MOSS": makeSubgaussianMOSSIndex, -} -misc_indices = { - "ETC": makeETCIndex, -} - - -def test_bounded_indices(): - for agent_name, makeIndex in bounded_indices.items(): - - class Agent(IndexAgent): - name = agent_name - - def __init__(self, env, **kwargs): - index, tracker_params = makeIndex() - IndexAgent.__init__( - self, env, index, tracker_params=tracker_params, **kwargs - ) - - assert check_bandit_agent( - Agent, BernoulliBandit, seed=TEST_SEED - ), "Agent not reproducible" - - -def test_subgaussian_indices(): - for agent_name, makeIndex in subgaussian_indices.items(): - - class Agent(IndexAgent): - name = agent_name - - def __init__(self, env, **kwargs): - index, tracker_params = makeIndex() - IndexAgent.__init__( - self, env, index, tracker_params=tracker_params, **kwargs - ) - - assert check_bandit_agent( - Agent, NormalBandit, seed=TEST_SEED - ), "Agent not reproducible" - - -def test_misc_indices(): - for agent_name, makeIndex in misc_indices.items(): - - class Agent(IndexAgent): - name = agent_name - - def __init__(self, env, **kwargs): - index, tracker_params = makeIndex() - 
IndexAgent.__init__( - self, env, index, tracker_params=tracker_params, **kwargs - ) - - assert check_bandit_agent( - Agent, BernoulliBandit, seed=TEST_SEED - ), "Agent not reproducible" - - -def test_randomized_bandits(): - class EXP3Agent(RandomizedAgent): - name = "EXP3" - - def __init__(self, env, **kwargs): - prob, tracker_params = makeEXP3Index() - RandomizedAgent.__init__( - self, env, prob, tracker_params=tracker_params, **kwargs - ) - - assert check_bandit_agent( - EXP3Agent, BernoulliBandit, seed=TEST_SEED - ), "Agent not reproducible" - - -priors = { - "Beta": (makeBetaPrior, BernoulliBandit), - "Gaussian": (makeGaussianPrior, NormalBandit), -} - - -def test_TS(): - for agent_name, (makePrior, Bandit) in priors.items(): - - class Agent(TSAgent): - name = agent_name - - def __init__(self, env, **kwargs): - prior_info, tracker_params = makePrior() - TSAgent.__init__( - self, env, prior_info, tracker_params=tracker_params, **kwargs - ) - - assert check_bandit_agent( - Agent, Bandit, seed=TEST_SEED - ), "Agent not reproducible" diff --git a/rlberry/agents/tests/test_dynprog.py b/rlberry/agents/tests/test_dynprog.py deleted file mode 100644 index 6d96b8f49..000000000 --- a/rlberry/agents/tests/test_dynprog.py +++ /dev/null @@ -1,156 +0,0 @@ -import numpy as np -import pytest - -import rlberry.seeding as seeding -from rlberry.agents.dynprog import ValueIterationAgent -from rlberry.agents.dynprog.utils import backward_induction -from rlberry.agents.dynprog.utils import backward_induction_in_place -from rlberry.agents.dynprog.utils import backward_induction_sd -from rlberry.agents.dynprog.utils import backward_induction_reward_sd -from rlberry.agents.dynprog.utils import bellman_operator -from rlberry.agents.dynprog.utils import value_iteration -from rlberry.envs.finite import FiniteMDP - -_rng = seeding.Seeder(123).rng - - -def get_random_mdp(S, A): - R = _rng.uniform(0.0, 1.0, (S, A)) - P = _rng.uniform(0.0, 1.0, (S, A, S)) - for ss in range(S): - for aa in range(A): - P[ss, aa, :] /= P[ss, aa, :].sum() - return R, P - - -@pytest.mark.parametrize( - "gamma, S, A", - [ - (0.001, 2, 1), - (0.25, 2, 1), - (0.5, 2, 1), - (0.75, 2, 1), - (0.999, 2, 1), - (0.001, 4, 2), - (0.25, 4, 2), - (0.5, 4, 2), - (0.75, 4, 2), - (0.999, 4, 2), - (0.001, 20, 4), - (0.25, 20, 4), - (0.5, 20, 4), - (0.75, 20, 4), - (0.999, 20, 4), - ], -) -def test_bellman_operator_monotonicity_and_contraction(gamma, S, A): - rng = seeding.Seeder(123).rng - vmax = 1.0 / (1.0 - gamma) - for _ in range(10): - # generate random MDP - R, P = get_random_mdp(S, A) - - # generate random Q functions - Q0 = rng.uniform(-vmax, vmax, (S, A)) - Q1 = rng.uniform(-vmax, vmax, (S, A)) - # apply Bellman operator - TQ0 = bellman_operator(Q0, R, P, gamma) - TQ1 = bellman_operator(Q1, R, P, gamma) - - # test contraction - norm_tq = np.abs(TQ1 - TQ0).max() - norm_q = np.abs(Q1 - Q0).max() - assert norm_tq <= gamma * norm_q - - # test monotonicity - Q2 = rng.uniform(-vmax / 2, vmax / 2, (S, A)) - Q3 = Q2 + rng.uniform(0.0, vmax / 2, (S, A)) - TQ2 = bellman_operator(Q2, R, P, gamma) - TQ3 = bellman_operator(Q3, R, P, gamma) - assert np.greater(TQ2, TQ3).sum() == 0 - - -@pytest.mark.parametrize( - "gamma, S, A", - [(0.01, 10, 4), (0.25, 10, 4), (0.5, 10, 4), (0.75, 10, 4), (0.99, 10, 4)], -) -def test_value_iteration(gamma, S, A): - for epsilon in np.logspace(-1, -6, num=5): - for sim in range(5): - # generate random MDP - R, P = get_random_mdp(S, A) - - # run value iteration - Q, V, n_it = value_iteration(R, P, gamma, epsilon) - # check 
precision - TQ = bellman_operator(Q, R, P, gamma) - assert np.abs(TQ - Q).max() <= epsilon - - -@pytest.mark.parametrize("horizon, S, A", [(10, 5, 4), (20, 10, 4)]) -def test_backward_induction(horizon, S, A): - for sim in range(5): - # generate random MDP - R, P = get_random_mdp(S, A) - - # run backward induction - Q, V = backward_induction(R, P, horizon) - - assert Q.max() <= horizon - assert V.max() <= horizon - - # run backward with clipping V to 1.0 - Q, V = backward_induction(R, P, horizon, vmax=1.0) - assert V.max() <= 1.0 - - # run bacward induction in place - Q2 = np.zeros((horizon, S, A)) - V2 = np.zeros((horizon, S)) - backward_induction_in_place(Q2, V2, R, P, horizon, vmax=1.0) - assert np.array_equal(Q, Q2) - assert np.array_equal(V, V2) - - -@pytest.mark.parametrize("horizon, S, A", [(10, 5, 4), (20, 10, 4)]) -def test_backward_induction_sd(horizon, S, A): - """ - Test stage-dependent MDPs - """ - for sim in range(5): - # generate random MDP - Rstat, Pstat = get_random_mdp(S, A) - R = np.zeros((horizon, S, A)) - P = np.zeros((horizon, S, A, S)) - for ii in range(horizon): - R[ii, :, :] = Rstat - P[ii, :, :, :] = Pstat - - # run backward induction in stationary MDP - Qstat, Vstat = backward_induction(Rstat, Pstat, horizon) - - # run backward induction in stage-dependent MDP - Q = np.zeros((horizon, S, A)) - V = np.zeros((horizon, S)) - backward_induction_sd(Q, V, R, P) - - # run backward induction with stage-dependent rewards - Q2 = np.zeros((horizon, S, A)) - V2 = np.zeros((horizon, S)) - backward_induction_reward_sd(Q2, V2, R, Pstat) - - assert np.array_equal(Q, Qstat) - assert np.array_equal(V, Vstat) - assert np.array_equal(Q2, Qstat) - assert np.array_equal(V2, Vstat) - - -@pytest.mark.parametrize("horizon, gamma, S, A", [(None, 0.5, 10, 4), (10, 1.0, 10, 4)]) -def test_value_iteration_agent(horizon, gamma, S, A): - for sim in range(5): - # generate random MDP - R, P = get_random_mdp(S, A) - # create env and agent - env = FiniteMDP(R, P) - agent = ValueIterationAgent(env, gamma=gamma, horizon=horizon) - # run - agent.fit() diff --git a/rlberry/agents/tests/test_kernel_based.py b/rlberry/agents/tests/test_kernel_based.py deleted file mode 100644 index 65abac706..000000000 --- a/rlberry/agents/tests/test_kernel_based.py +++ /dev/null @@ -1,58 +0,0 @@ -import pytest -from rlberry.agents.kernel_based import RSKernelUCBVIAgent -from rlberry.agents.kernel_based import RSUCBVIAgent -from rlberry.agents.kernel_based.kernels import _str_to_int -from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env - - -@pytest.mark.parametrize( - "kernel_type", - [ - "uniform", - "triangular", - "gaussian", - "epanechnikov", - "quartic", - "triweight", - "tricube", - "cosine", - "exp-2", - ], -) -def test_rs_kernel_ucbvi(kernel_type): - for horizon in [None, 30]: - env = get_benchmark_env(level=1) - agent = RSKernelUCBVIAgent( - env, - gamma=0.95, - horizon=horizon, - bonus_scale_factor=0.01, - min_dist=0.2, - bandwidth=0.05, - beta=1.0, - kernel_type=kernel_type, - ) - agent.fit(budget=5) - agent.policy(env.observation_space.sample()) - - -def test_str_to_int(): - for ii in range(100): - assert _str_to_int(str(ii)) == ii - - -def test_rs_ucbvi(): - env = get_benchmark_env(level=1) - agent = RSUCBVIAgent(env, gamma=0.99, horizon=30, bonus_scale_factor=0.1) - agent.fit(budget=5) - agent.policy(env.observation_space.sample()) - - -def test_rs_ucbvi_reward_free(): - env = get_benchmark_env(level=1) - agent = RSUCBVIAgent( - env, gamma=0.99, horizon=30, bonus_scale_factor=0.1, 
reward_free=True - ) - agent.fit(budget=5) - agent.policy(env.observation_space.sample()) - assert agent.R_hat.sum() == 0.0 diff --git a/rlberry/agents/tests/test_lsvi_ucb.py b/rlberry/agents/tests/test_lsvi_ucb.py deleted file mode 100644 index 03299b747..000000000 --- a/rlberry/agents/tests/test_lsvi_ucb.py +++ /dev/null @@ -1,218 +0,0 @@ -import numpy as np -import pytest -from rlberry.agents.features import FeatureMap -from rlberry.agents.linear.lsvi_ucb import LSVIUCBAgent -from rlberry.agents.dynprog import ValueIterationAgent -from rlberry.envs.finite import GridWorld - - -class OneHotFeatureMap(FeatureMap): - def __init__(self, S, A): - self.S = S - self.A = A - self.shape = (S * A,) - - def map(self, observation, action): - feat = np.zeros((self.S, self.A)) - feat[observation, action] = 1.0 - return feat.flatten() - - -class RandomFeatMap(FeatureMap): - def __init__(self, S, A): - self.feat_mat = np.random.randn(S, A, 10) - self.shape = (10,) - - def map(self, observation, action): - feat = self.feat_mat[observation, action, :] - return feat.copy() - - -@pytest.mark.parametrize("FeatMapClass", [OneHotFeatureMap, RandomFeatMap]) -def test_lsvi_ucb_matrix_inversion(FeatMapClass): - env = GridWorld(nrows=3, ncols=3, walls=()) - env.reseed(123) - - def feature_map_fn(_env): - return FeatMapClass(_env.observation_space.n, _env.action_space.n) - - reg_factor = 0.1 - agent = LSVIUCBAgent( - env, feature_map_fn=feature_map_fn, horizon=10, reg_factor=reg_factor - ) - agent.reseed(123) - agent.fit(budget=50) - assert np.allclose(np.linalg.inv(agent.lambda_mat), agent.lambda_mat_inv) - assert agent.episode == 50 - agent.policy(env.observation_space.sample()) - - # Check counts - if FeatMapClass != OneHotFeatureMap: - return - - S = env.observation_space.n - A = env.action_space.n - N_sa = np.zeros((S, A)) - for state, action in zip(agent.state_hist, agent.action_hist): - N_sa[state, action] += 1.0 - - assert np.allclose( - agent.lambda_mat_inv.diagonal(), 1.0 / (N_sa.flatten() + reg_factor) - ) - - for ss in range(S): - for aa in range(A): - feat = agent.feature_map.map(ss, aa) - assert np.allclose( - feat @ (agent.lambda_mat_inv.T @ feat), - 1.0 / (N_sa[ss, aa] + reg_factor), - ) - - -def test_lsvi_without_bonus(): - def lsvi_debug_gather_data(agent): - """ - Function to gather data sampling uniformly - states and actions - """ - N = agent.n_episodes * agent.horizon - count = 0 - while count < N: - state = agent.env.observation_space.sample() - action = agent.env.action_space.sample() - next_state, reward, terminated, truncated, info = agent.env.sample( - state, action - ) - done = terminated or truncated - # - # - feat = agent.feature_map.map(state, action) - outer_prod = np.outer(feat, feat) - inv = agent.lambda_mat_inv - - # - agent.lambda_mat += np.outer(feat, feat) - # update inverse - agent.lambda_mat_inv -= (inv @ outer_prod @ inv) / (1 + feat @ inv.T @ feat) - - # update history - agent.reward_hist[count] = reward - agent.state_hist.append(state) - agent.action_hist.append(action) - agent.nstate_hist.append(next_state) - - # - tt = agent.total_time_steps - agent.feat_hist[tt, :] = agent.feature_map.map(state, action) - for aa in range(agent.env.action_space.n): - agent.feat_ns_all_actions[tt, aa, :] = agent.feature_map.map( - next_state, aa - ) - - # increments - agent.total_time_steps += 1 - count += 1 - - env = GridWorld(nrows=2, ncols=2, walls=(), success_probability=0.95) - env.reseed(123) - - def feature_map_fn(_env): - return OneHotFeatureMap(_env.observation_space.n, 
_env.action_space.n) - - agent = LSVIUCBAgent( - env, feature_map_fn=feature_map_fn, horizon=20, gamma=0.99, reg_factor=1e-5 - ) - agent.reseed(123) - agent.n_episodes = 100 - agent.reset() - - lsvi_debug_gather_data(agent) - # estimated Q - S = env.observation_space.n - Q_est = agent._run_lsvi(bonus_factor=0.0)[0, :].reshape((S, -1)) - - # near optimal Q - agent_opt = ValueIterationAgent(env, gamma=0.99, horizon=20) - agent_opt.fit() - Q = agent_opt.Q[0, :, :] - - print(Q) - print("---") - print(Q_est) - - print("-------") - print(np.abs(Q - Q_est)) - # Check error - assert Q_est == pytest.approx(Q, rel=0.01) - - -def test_lsvi_random_exploration(): - env = GridWorld(nrows=2, ncols=2, walls=(), success_probability=0.95) - env.reseed(123) - - def feature_map_fn(_env): - return OneHotFeatureMap(_env.observation_space.n, _env.action_space.n) - - agent = LSVIUCBAgent( - env, - feature_map_fn=feature_map_fn, - horizon=20, - gamma=0.99, - reg_factor=1e-5, - bonus_scale_factor=0.0, - ) - agent.reseed(123) - agent.fit(budget=250) - - # estimated Q - S = env.observation_space.n - Q_est = agent._run_lsvi(bonus_factor=0.0)[0, :].reshape((S, -1)) - - # near optimal Q - agent_opt = ValueIterationAgent(env, gamma=0.99, horizon=20) - agent_opt.fit() - Q = agent_opt.Q[0, :, :] - - print(Q) - print("---") - print(Q_est) - - print("-------") - print(np.abs(Q - Q_est)) - # Check error - assert np.abs(Q - Q_est).mean() < 0.1 - - -def test_lsvi_optimism(): - env = GridWorld(nrows=2, ncols=2, walls=()) - - def feature_map_fn(_env): - return OneHotFeatureMap(_env.observation_space.n, _env.action_space.n) - - agent = LSVIUCBAgent( - env, - gamma=0.99, - feature_map_fn=feature_map_fn, - horizon=3, - bonus_scale_factor=3, - reg_factor=0.000001, - ) - agent.fit(budget=250) - - # near optimal Q - agent_opt = ValueIterationAgent(env, gamma=0.99, horizon=3) - agent_opt.fit() - Q = agent_opt.Q[0, :, :] - - # optimistic Q - S = env.observation_space.n - A = env.action_space.n - Q_optimistic = np.zeros((S, A)) - for ss in range(S): - Q_optimistic[ss, :] = agent._compute_q_vec( - agent.w_vec[0, :], ss, agent.bonus_scale_factor - ) - - print(Q) - print(Q_optimistic) - assert (Q_optimistic - Q).min() >= -1e-5 diff --git a/rlberry/agents/tests/test_mbqvi.py b/rlberry/agents/tests/test_mbqvi.py deleted file mode 100644 index cafdb5566..000000000 --- a/rlberry/agents/tests/test_mbqvi.py +++ /dev/null @@ -1,27 +0,0 @@ -import numpy as np -import pytest - -from rlberry.seeding import Seeder -from rlberry.agents.mbqvi import MBQVIAgent -from rlberry.envs.finite import FiniteMDP - - -@pytest.mark.parametrize("S, A", [(5, 2), (10, 4)]) -def test_mbqvi(S, A): - rng = Seeder(123).rng - - for sim in range(5): - # generate random MDP with deterministic transitions - R = rng.uniform(0.0, 1.0, (S, A)) - P = np.zeros((S, A, S)) - for ss in range(S): - for aa in range(A): - ns = rng.integers(0, S) - P[ss, aa, ns] = 1 - - # run MBQVI and check exactness of estimators - env = FiniteMDP(R, P) - agent = MBQVIAgent(env, n_samples=1) - agent.fit() - assert np.abs(R - agent.R_hat).max() < 1e-16 - assert np.abs(P - agent.P_hat).max() < 1e-16 diff --git a/rlberry/agents/tests/test_optql.py b/rlberry/agents/tests/test_optql.py deleted file mode 100644 index 35adf21d6..000000000 --- a/rlberry/agents/tests/test_optql.py +++ /dev/null @@ -1,9 +0,0 @@ -from rlberry.agents.optql import OptQLAgent -from rlberry.envs.finite import GridWorld - - -def test_optql(): - env = GridWorld(walls=(), nrows=5, ncols=5) - agent = OptQLAgent(env, horizon=11, 
gamma=0.99, bonus_scale_factor=0.1) - agent.fit(budget=50) - agent.policy(env.observation_space.sample()) diff --git a/rlberry/agents/tests/test_psrl.py b/rlberry/agents/tests/test_psrl.py deleted file mode 100644 index 325777f6d..000000000 --- a/rlberry/agents/tests/test_psrl.py +++ /dev/null @@ -1,29 +0,0 @@ -import pytest -from rlberry.agents.psrl import PSRLAgent -from rlberry.envs.finite import GridWorld - - -@pytest.mark.parametrize( - "gamma, stage_dependent, bernoullized_reward", - [ - (1.0, True, True), - (1.0, True, False), - (1.0, False, True), - (1.0, False, False), - (0.9, True, True), - (0.9, True, False), - (0.9, False, True), - (0.9, False, False), - ], -) -def test_ucbvi(gamma, stage_dependent, bernoullized_reward): - env = GridWorld(walls=(), nrows=5, ncols=5) - agent = PSRLAgent( - env, - horizon=11, - bernoullized_reward=bernoullized_reward, - stage_dependent=stage_dependent, - gamma=gamma, - ) - agent.fit(budget=50) - agent.policy(env.observation_space.sample()) diff --git a/rlberry/agents/tests/test_replay.py b/rlberry/agents/tests/test_replay.py index bad1a297e..167bf80c5 100644 --- a/rlberry/agents/tests/test_replay.py +++ b/rlberry/agents/tests/test_replay.py @@ -1,7 +1,7 @@ import pytest import numpy as np from rlberry.agents.utils import replay -from rlberry.envs.finite import GridWorld +from rlberry_research.envs.finite import GridWorld from gymnasium.wrappers import TimeLimit @@ -56,27 +56,31 @@ def test_replay_size(): @pytest.mark.parametrize("sampling_mode", ["uniform", "prioritized"]) -def test_replay_sampling(sampling_mode): +@pytest.mark.parametrize("max_replay_size", [128, 500]) +def test_replay_sampling(sampling_mode, max_replay_size): batch_size = 128 chunk_size = 256 # get replay buffer - buffer, _ = _get_filled_replay(max_replay_size=500) + buffer, _ = _get_filled_replay(max_replay_size=max_replay_size) # Sample batches, check shape and dtype for _ in range(10): batch = buffer.sample( batch_size=batch_size, chunk_size=chunk_size, sampling_mode=sampling_mode ) - for tag in buffer.tags: - assert batch.data[tag].shape[:2] == (batch_size, chunk_size) - assert batch.data[tag].dtype == buffer.dtypes[tag] - assert np.array_equal( - np.array(buffer.data[tag], dtype=buffer.dtypes[tag])[ - batch.info["indices"] - ], - batch.data[tag], - ) + if chunk_size > max_replay_size: + assert batch is None + else: + for tag in buffer.tags: + assert batch.data[tag].shape[:2] == (batch_size, chunk_size) + assert batch.data[tag].dtype == buffer.dtypes[tag] + assert np.array_equal( + np.array(buffer.data[tag], dtype=buffer.dtypes[tag])[ + batch.info["indices"] + ], + batch.data[tag], + ) def test_replay_priority_update(): diff --git a/rlberry/agents/tests/test_rlsvi.py b/rlberry/agents/tests/test_rlsvi.py deleted file mode 100644 index 0907d8d33..000000000 --- a/rlberry/agents/tests/test_rlsvi.py +++ /dev/null @@ -1,19 +0,0 @@ -import pytest -from rlberry.agents.rlsvi import RLSVIAgent -from rlberry.envs.finite import GridWorld - - -@pytest.mark.parametrize( - "gamma, stage_dependent", - [ - (1.0, True), - (1.0, False), - (0.9, True), - (0.9, False), - ], -) -def test_rlsvi(gamma, stage_dependent): - env = GridWorld(walls=(), nrows=5, ncols=5) - agent = RLSVIAgent(env, horizon=11, stage_dependent=stage_dependent, gamma=gamma) - agent.fit(budget=50) - agent.policy(env.observation_space.sample()) diff --git a/rlberry/agents/tests/test_tabular_rl.py b/rlberry/agents/tests/test_tabular_rl.py deleted file mode 100644 index ab7f618a3..000000000 --- 
a/rlberry/agents/tests/test_tabular_rl.py +++ /dev/null @@ -1,33 +0,0 @@ -import pytest -from rlberry.agents import QLAgent, SARSAAgent -from rlberry.envs import GridWorld - - -@pytest.mark.parametrize( - "exploration_type, exploration_rate", - [("epsilon", 0.5), ("boltzmann", 0.5), (None, None)], -) -def test_ql(exploration_type, exploration_rate): - env = GridWorld(walls=(), nrows=5, ncols=5) - agent = QLAgent( - env, exploration_type=exploration_type, exploration_rate=exploration_rate - ) - agent.fit(budget=50) - agent.policy(env.observation_space.sample()) - agent.reset() - assert not agent.Q.any() - - -@pytest.mark.parametrize( - "exploration_type, exploration_rate", - [("epsilon", 0.5), ("boltzmann", 0.5), (None, None)], -) -def test_sarsa(exploration_type, exploration_rate): - env = GridWorld(walls=(), nrows=5, ncols=5) - agent = SARSAAgent( - env, exploration_type=exploration_type, exploration_rate=exploration_rate - ) - agent.fit(budget=50) - agent.policy(env.observation_space.sample()) - agent.reset() - assert not agent.Q.any() diff --git a/rlberry/agents/tests/test_ucbvi.py b/rlberry/agents/tests/test_ucbvi.py deleted file mode 100644 index 641fe0c02..000000000 --- a/rlberry/agents/tests/test_ucbvi.py +++ /dev/null @@ -1,30 +0,0 @@ -import pytest -from rlberry.agents.ucbvi import UCBVIAgent -from rlberry.envs.finite import GridWorld - - -@pytest.mark.parametrize( - "gamma, stage_dependent, real_time_dp", - [ - (1.0, True, True), - (1.0, True, False), - (1.0, False, True), - (1.0, False, False), - (0.9, True, True), - (0.9, True, False), - (0.9, False, True), - (0.9, False, False), - ], -) -def test_ucbvi(gamma, stage_dependent, real_time_dp): - env = GridWorld(walls=(), nrows=5, ncols=5) - agent = UCBVIAgent( - env, - horizon=11, - stage_dependent=stage_dependent, - gamma=gamma, - real_time_dp=real_time_dp, - bonus_scale_factor=0.1, - ) - agent.fit(budget=50) - agent.policy(env.observation_space.sample()) diff --git a/rlberry/agents/torch/__init__.py b/rlberry/agents/torch/__init__.py deleted file mode 100644 index 896403dc6..000000000 --- a/rlberry/agents/torch/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Torch agents (in alphabetical order) -from .a2c import A2CAgent -from .dqn import DQNAgent -from .dqn import MunchausenDQNAgent -from .ppo import PPOAgent -from .reinforce import REINFORCEAgent -from .sac import SACAgent diff --git a/rlberry/agents/torch/a2c/__init__.py b/rlberry/agents/torch/a2c/__init__.py deleted file mode 100644 index 4581caf68..000000000 --- a/rlberry/agents/torch/a2c/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .a2c import A2CAgent diff --git a/rlberry/agents/torch/a2c/a2c.py b/rlberry/agents/torch/a2c/a2c.py deleted file mode 100644 index 9907677e1..000000000 --- a/rlberry/agents/torch/a2c/a2c.py +++ /dev/null @@ -1,338 +0,0 @@ -import torch -import torch.nn as nn - -import gymnasium.spaces as spaces -import numpy as np -from rlberry.agents import AgentWithSimplePolicy, AgentTorch -from rlberry.agents.utils.replay import ReplayBuffer -from rlberry.agents.torch.utils.training import optimizer_factory -from rlberry.agents.torch.utils.models import default_policy_net_fn -from rlberry.agents.torch.utils.models import default_value_net_fn -from rlberry.utils.torch import choose_device -from rlberry.utils.factory import load -from typing import Optional - -import rlberry - -logger = rlberry.logger - - -class A2CAgent(AgentTorch, AgentWithSimplePolicy): - """ - Advantage Actor Critic Agent. 
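A minimal usage sketch for the class removed in this file, using the import path that existed before this refactor; the environment id, the hyperparameters and the budget are arbitrary placeholders, and any Gymnasium environment with a Box observation space and discrete actions works the same way::

    import gymnasium as gym
    from rlberry.agents.torch import A2CAgent  # pre-refactor import path

    env = gym.make("CartPole-v1")
    agent = A2CAgent(env, batch_size=256, gamma=0.99)
    agent.fit(budget=10_000)              # budget = number of environment transitions
    observation, info = env.reset()
    action = agent.policy(observation)    # action sampled from the trained policy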
- - A2C, or Advantage Actor Critic, is a synchronous version of the A3C policy - gradient method. As an alternative to the asynchronous implementation of - A3C, A2C is a synchronous, deterministic implementation that waits for each - actor to finish its segment of experience before updating, averaging over - all of the actors. This more effectively uses GPUs due to larger batch sizes. - - Parameters - ---------- - env : Model - Online model with continuous (Box) state space and discrete actions - batch_size : int - Number of timesteps to wait before updating the policy. - gamma : double - Discount factor in [0, 1]. - entr_coef : double - Entropy coefficient. - learning_rate : double - Learning rate. - optimizer_type: str - Type of optimizer. 'ADAM' by defaut. - policy_net_fn : function(env, **kwargs) - Function that returns an instance of a policy network (pytorch). - If None, a default net is used. - value_net_fn : function(env, **kwargs) - Function that returns an instance of a value network (pytorch). - If None, a default net is used. - policy_net_kwargs : dict - kwargs for policy_net_fn - value_net_kwargs : dict - kwargs for value_net_fn - device : str - Device to put the tensors on - eval_interval : int, default = None - Interval (in number of transitions) between agent evaluations in fit(). - If None, never evaluate. - - References - ---------- - Mnih, V., Badia, A.P., Mirza, M., Graves, A., Lillicrap, T., Harley, T., - Silver, D. & Kavukcuoglu, K. (2016). - "Asynchronous methods for deep reinforcement learning." - In International Conference on Machine Learning (pp. 1928-1937). - """ - - name = "A2C" - - def __init__( - self, - env, - batch_size=256, - gamma=0.99, - entr_coef=0.01, - learning_rate=0.01, - optimizer_type="ADAM", - policy_net_fn=None, - value_net_fn=None, - policy_net_kwargs=None, - value_net_kwargs=None, - device="cuda:best", - eval_interval: Optional[int] = None, - **kwargs - ): - AgentWithSimplePolicy.__init__(self, env, **kwargs) - - self.batch_size = batch_size - self.gamma = gamma - self.entr_coef = entr_coef - self.learning_rate = learning_rate - self.device = choose_device(device) - self.eval_interval = eval_interval - - self.policy_net_kwargs = policy_net_kwargs or {} - self.value_net_kwargs = value_net_kwargs or {} - - if isinstance(policy_net_fn, str): - self.policy_net_fn = load(policy_net_fn) - elif policy_net_fn is None: - self.policy_net_fn = default_policy_net_fn - else: - self.policy_net_fn = policy_net_fn - - if isinstance(value_net_fn, str): - self.value_net_fn = load(value_net_fn) - elif value_net_fn is None: - self.value_net_fn = default_value_net_fn - else: - self.value_net_fn = value_net_fn - - self.optimizer_kwargs = {"optimizer_type": optimizer_type, "lr": learning_rate} - self.optimizer_type = optimizer_type - - # check environment - assert isinstance(self.env.observation_space, spaces.Box) - - # get horizon - if hasattr(self.env, "_max_episode_steps"): - max_episode_steps = self.env._max_episode_steps - else: - max_episode_steps = np.inf - self._max_episode_steps = max_episode_steps - - self._policy = None # categorical policy function - - # initialize - self.reset() - - def reset(self): - self._policy = self.policy_net_fn(self.env, **self.policy_net_kwargs).to( - self.device - ) - self._policy_optimizer = optimizer_factory( - self._policy.parameters(), **self.optimizer_kwargs - ) - - self.value_net = self.value_net_fn(self.env, **self.value_net_kwargs).to( - self.device - ) - - self.value_optimizer = optimizer_factory( - 
self.value_net.parameters(), **self.optimizer_kwargs - ) - - self._policy_old = self.policy_net_fn(self.env, **self.policy_net_kwargs).to( - self.device - ) - self._policy_old.load_state_dict(self._policy.state_dict()) - - self.mse_loss = nn.MSELoss() - - self.memory = ReplayBuffer(max_replay_size=self.batch_size, rng=self.rng) - self.memory.setup_entry("states", dtype=np.float32) - if self._policy.ctns_actions: - self.memory.setup_entry("actions", dtype=np.float32) - else: - self.memory.setup_entry("actions", dtype=int) - self.memory.setup_entry("rewards", dtype=np.float32) - self.memory.setup_entry("dones", dtype=bool) - - self.total_timesteps = 0 - self.total_episodes = 0 - - def policy(self, observation): - state = observation - assert self._policy is not None - state = torch.from_numpy(state).float().to(self.device) - action_dist = self._policy_old(state) - if self._policy.ctns_actions: - action = action_dist.sample().numpy() - else: - action = action_dist.sample().item() - return action - - def fit(self, budget: int, **kwargs): - """ - Train the agent using the provided environment. - - Parameters - ---------- - budget: int - Number of timesteps to train the agent for. - One step = one transition in the environment. - """ - del kwargs - timesteps_counter = 0 - episode_rewards = 0.0 - episode_timesteps = 0 - observation, info = self.env.reset() - while timesteps_counter < budget: - action = self._select_action(observation) - next_observation, reward, terminated, truncated, info = self.env.step( - action - ) - done = terminated or truncated - # if self._policy.ctns_actions: - # action = torch.from_numpy(action).float().to(self.device) - # store data - episode_rewards += reward - self.memory.append( - { - "states": observation, - "actions": action, - "rewards": reward, - "dones": done, - } - ) - - # counters and next obs - self.total_timesteps += 1 - timesteps_counter += 1 - episode_timesteps += 1 - observation = next_observation - - # update - if self.total_timesteps % self.batch_size == 0: - self._update() - - # eval - total_timesteps = self.total_timesteps - if ( - self.eval_interval is not None - and total_timesteps % self.eval_interval == 0 - ): - eval_rewards = self.eval( - eval_horizon=self._max_episode_steps, gamma=1.0 - ) - if self.writer: - memory_size = len(self.memory) - self.writer.add_scalar( - "eval_rewards", eval_rewards, total_timesteps - ) - self.writer.add_scalar("memory_size", memory_size, total_timesteps) - - # check if episode ended - if done: - self.total_episodes += 1 - self.memory.end_episode() - if self.writer: - self.writer.add_scalar( - "episode_rewards", episode_rewards, total_timesteps - ) - self.writer.add_scalar( - "total_episodes", self.total_episodes, total_timesteps - ) - episode_rewards = 0.0 - episode_timesteps = 0 - observation, info = self.env.reset() - - def _select_action(self, state): - state = torch.from_numpy(state).float().to(self.device) - action_dist = self._policy_old(state) - action = action_dist.sample() - if self._policy.ctns_actions: - action = action.numpy() - else: - action = action.item() - return action - - def _update(self): - # monte carlo estimate of rewards - rewards = [] - discounted_reward = 0 - - memory_data = self.memory.data - memory_states = memory_data["states"] - memory_actions = memory_data["actions"] - memory_rewards = memory_data["rewards"] - memory_dones = memory_data["dones"] - - for reward, is_terminal in zip( - reversed(memory_rewards), reversed(memory_dones) - ): - if is_terminal: - discounted_reward = 0 - 
discounted_reward = reward + (self.gamma * discounted_reward) - rewards.insert(0, discounted_reward) - - # convert to tensor - rewards = torch.FloatTensor(rewards).to(self.device) - memory_states_tensors = [ - torch.tensor(states).to(self.device).float() for states in memory_states - ] - memory_actions_tensors = [ - torch.tensor(actions).to(self.device) for actions in memory_actions - ] - - # convert list to tensor - old_states = torch.stack(memory_states_tensors).to(self.device).detach() - old_actions = torch.stack(memory_actions_tensors).to(self.device).detach() - - # evaluate old actions and values - action_dist = self._policy(old_states) - logprobs = action_dist.log_prob(old_actions) - state_values = torch.squeeze(self.value_net(old_states)) - dist_entropy = action_dist.entropy() - - # normalize the advantages - advantages = rewards - state_values.detach() - advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) - # find pg loss - pg_loss = -logprobs * advantages - loss = ( - pg_loss - + 0.5 * self.mse_loss(state_values, rewards) - - self.entr_coef * dist_entropy - ) - - # take gradient step - self._policy_optimizer.zero_grad() - self.value_optimizer.zero_grad() - - loss.mean().backward() - - self._policy_optimizer.step() - self.value_optimizer.step() - - # copy new weights into old policy - self._policy_old.load_state_dict(self._policy.state_dict()) - - # - # For hyperparameter optimization - # - @classmethod - def sample_parameters(cls, trial): - batch_size = trial.suggest_categorical("batch_size", [1, 4, 8, 16, 32]) - gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.99]) - learning_rate = trial.suggest_float("learning_rate", 1e-5, 1, log=True) - - entr_coef = trial.suggest_float("entr_coef", 1e-8, 0.1, log=True) - - return { - "batch_size": batch_size, - "gamma": gamma, - "learning_rate": learning_rate, - "entr_coef": entr_coef, - } diff --git a/rlberry/agents/torch/dqn/__init__.py b/rlberry/agents/torch/dqn/__init__.py deleted file mode 100644 index 7f799acbe..000000000 --- a/rlberry/agents/torch/dqn/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .dqn import DQNAgent -from .mdqn import MunchausenDQNAgent diff --git a/rlberry/agents/torch/dqn/dqn.py b/rlberry/agents/torch/dqn/dqn.py deleted file mode 100644 index 84219c8c8..000000000 --- a/rlberry/agents/torch/dqn/dqn.py +++ /dev/null @@ -1,513 +0,0 @@ -import inspect -from typing import Callable, Optional, Union - -from gymnasium import spaces -import numpy as np -import torch - -from rlberry import types -from rlberry.agents import AgentWithSimplePolicy, AgentTorch -from rlberry.agents.torch.utils.training import ( - loss_function_factory, - model_factory, - optimizer_factory, - size_model_config, -) -from rlberry.agents.torch.dqn.dqn_utils import polynomial_schedule, lambda_returns -from rlberry.agents.utils import replay -from rlberry.utils.torch import choose_device -from rlberry.utils.factory import load - - -import rlberry - -logger = rlberry.logger - - -def default_q_net_fn(env, **kwargs): - """ - Returns a default Q value network. - """ - del kwargs - model_config = { - "type": "MultiLayerPerceptron", - "layer_sizes": (64, 64), - "reshape": False, - } - model_config = size_model_config(env, **model_config) - return model_factory(**model_config) - - -class DQNAgent(AgentTorch, AgentWithSimplePolicy): - """DQN Agent based on PyTorch. - - Notes - ----- - Uses Q(lambda) for computing targets by default. To recover - the standard DQN, set :code:`lambda_ = 0.0` and :code:`chunk_size = 1`. 
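A sketch of the two configurations described in this note, using only constructor arguments documented below; the environment id is an arbitrary placeholder and the import path is the pre-refactor one::

    import gymnasium as gym
    from rlberry.agents.torch import DQNAgent  # pre-refactor import path

    env = gym.make("CartPole-v1")
    # default configuration: Q(lambda) targets over chunks of 8 transitions
    agent = DQNAgent(env, lambda_=0.5, chunk_size=8)
    # standard one-step DQN targets, as the note above explains
    vanilla = DQNAgent(env, lambda_=0.0, chunk_size=1)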
- - Parameters - ---------- - env: :class:`~rlberry.types.Env` - Environment, can be a tuple (constructor, kwargs) - gamma: float, default = 0.99 - Discount factor. - batch_size: int, default=32 - Batch size. - chunk_size: int, default=8 - Length of sub-trajectories sampled from the replay buffer. - lambda_: float, default=0.5 - Q(lambda) parameter. - target_update_parameter : int or float - If int: interval (in number total number of online updates) between updates of the target network. - If float: soft update coefficient - device: str - Torch device, see :func:`~rlberry.utils.torch.choose_device` - learning_rate : float, default = 1e-3 - Optimizer learning rate. - loss_function: {"l1", "l2", "smooth_l1"}, default: "l2" - Loss function used to compute Bellman error. - epsilon_init: float, default = 1.0 - Initial epsilon value for epsilon-greedy exploration. - epsilon_final: float, default = 0.1 - Final epsilon value for epsilon-greedy exploration. - epsilon_decay_interval : int - After :code:`epsilon_decay` timesteps, epsilon approaches :code:`epsilon_final`. - optimizer_type : {"ADAM", "RMS_PROP"} - Optimization algorithm. - q_net_constructor : Callable, str or None - Function/constructor that returns a torch module for the Q-network: - :code:`qnet = q_net_constructor(env, **kwargs)`. - - Module (Q-network) requirements: - - * Input shape = (batch_dim, chunk_size, obs_dims) - - * Ouput shape = (batch_dim, chunk_size, number_of_actions) - - Example: use `rlberry.agents.torch.utils.training.model_factory_from_env`, - and `q_net_kwargs` - parameter to modify the neural network:: - - model_configs = { - "type": "MultiLayerPerceptron", - "layer_sizes": (5, 5), - "reshape": False, - } - - agent = DQNAgent(env, - q_net_constructor=model_factory_from_env, - q_net_kwargs=model_configs - ) - If str then it should correspond to the full path to the constructor function, - e.g.:: - agent = DQNAgent(env, - q_net_constructor='rlberry.agents.torch.utils.training.model_factory_from_env', - q_net_kwargs=model_configs - ) - - If None then it is set to MultiLayerPerceptron with 2 hidden layers - of size 64 - - q_net_kwargs : optional, dict - Parameters for q_net_constructor. - use_double_dqn : bool, default = False - If True, use Double DQN. - use_prioritized_replay : bool, default = False - If True, use Prioritized Experience Replay. - train_interval: int - Update the model every :code:`train_interval` steps. - If -1, train only at the end of the episodes. - gradient_steps: int - How many gradient steps to do at each update. - If -1, take the number of timesteps since last update. - max_replay_size : int - Maximum number of transitions in the replay buffer. - learning_starts : int - How many steps of the model to collect transitions for before learning starts - eval_interval : int, default = None - Interval (in number of transitions) between agent evaluations in fit(). - If None, never evaluate. - - Attributes - ---------- - gamma : float, default: 0.99 - Discount factor used to discount future rewards in the Bellman equation. - batch_size : int, default: 32 - Batch size used during the training process. - chunk_size : int, default: 8 - Length of sub-trajectories sampled from the replay buffer. - lambda_ : float, default: 0.5 - Q(lambda) parameter used in Q(lambda) algorithm for computing targets. - target_update_parameter : int or float - The parameter that controls the update frequency of the target network. - If int: interval (in number of total online updates) between updates of the target network. 
- If float: soft update coefficient, which controls the rate at which the target network approaches - the online network. - device : str - Torch device on which the agent's neural networks are placed. Use "cuda:best" to choose the best - available GPU device. - learning_rate : float, default: 1e-3 - Learning rate used by the optimizer during neural network training. - epsilon_init : float, default: 1.0 - Initial epsilon value for epsilon-greedy exploration. Epsilon-greedy policy is used to balance - exploration and exploitation during training. - epsilon_final : float, default: 0.1 - Final epsilon value for epsilon-greedy exploration. Epsilon will approach this value as the agent - gains more experience. - epsilon_decay_interval : int - The number of timesteps after which the epsilon value will approach `epsilon_final`. - loss_function : {"l1", "l2", "smooth_l1"}, default: "l2" - The loss function used to compute the Bellman error during training. The available options are - Mean Absolute Error ("l1"), Mean Squared Error ("l2"), and Smooth L1 Loss ("smooth_l1"). - optimizer_type : {"ADAM", "RMS_PROP"} - The optimization algorithm used during neural network training. Choose between ADAM and RMS_PROP. - q_net_constructor : Callable, str or None - Function/constructor that returns a torch module for the Q-network. - Example: use `rlberry.agents.torch.utils.training.model_factory_from_env` and `q_net_kwargs` - parameter to modify the neural network. - q_net_kwargs : optional, dict - Parameters for `q_net_constructor`. - use_double_dqn : bool, default: False - If True, use Double DQN algorithm, which helps to reduce overestimation bias in Q-value estimates. - use_prioritized_replay : bool, default: False - If True, use Prioritized Experience Replay, which prioritizes transitions in the replay buffer - based on their TD-errors, to improve the learning process. - train_interval : int - The agent updates the model every `train_interval` steps. If -1, the agent only trains at the end - of each episode. - gradient_steps : int - The number of gradient steps to perform at each model update. If -1, the number of timesteps since - the last update will be used. - max_replay_size : int - The maximum number of transitions allowed in the replay buffer. - learning_starts : int - The number of steps of the model to collect transitions for before learning starts. - eval_interval : int, default: None - The interval (in number of transitions) between agent evaluations in the `fit()` method. If None, - the agent won't evaluate during training. 
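The float branch of `target_update_parameter` described above is plain Polyak averaging, matching the target-update code further down; a sketch assuming two placeholder `torch.nn.Module` instances `online_net` and `target_net` and a coefficient `tau` in (0, 1)::

    import torch.nn as nn

    online_net, target_net = nn.Linear(4, 2), nn.Linear(4, 2)   # placeholder networks
    tau = 0.005

    # move each target parameter a small step towards the online parameter
    for param, target_param in zip(online_net.parameters(), target_net.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)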
- """ - - name = "DQN" - - def __init__( - self, - env: types.Env, - gamma: float = 0.99, - batch_size: int = 32, - chunk_size: int = 8, - lambda_: float = 0.5, - target_update_parameter: Union[int, float] = 0.005, - device: str = "cuda:best", - learning_rate: float = 1e-3, - epsilon_init: float = 1.0, - epsilon_final: float = 0.1, - epsilon_decay_interval: int = 20_000, - loss_function: str = "l2", - optimizer_type: str = "ADAM", - q_net_constructor: Optional[Callable[..., torch.nn.Module]] = None, - q_net_kwargs: Optional[dict] = None, - use_double_dqn: bool = False, - use_prioritized_replay: bool = False, - train_interval: int = 10, - gradient_steps: int = -1, - max_replay_size: int = 200_000, - learning_starts: int = 5_000, - eval_interval: Optional[int] = None, - **kwargs, - ): - # For all parameters, define self.param = param - _, _, _, values = inspect.getargvalues(inspect.currentframe()) - - values.pop("self") - for arg, val in values.items(): - setattr(self, arg, val) - - AgentWithSimplePolicy.__init__(self, env, **kwargs) - env = self.env - assert isinstance(env.observation_space, spaces.Box) - assert isinstance(env.action_space, spaces.Discrete) - - # DQN parameters - - # Online and target Q networks, torch device - self._device = choose_device(device) - if isinstance(q_net_constructor, str): - q_net_ctor = load(q_net_constructor) - elif q_net_constructor is None: - q_net_ctor = default_q_net_fn - else: - q_net_ctor = q_net_constructor - q_net_kwargs = q_net_kwargs or dict() - self._qnet_online = q_net_ctor(env, **q_net_kwargs).to(self._device) - self._qnet_target = q_net_ctor(env, **q_net_kwargs).to(self._device) - - # Optimizer and loss - optimizer_kwargs = {"optimizer_type": optimizer_type, "lr": learning_rate} - self._optimizer = optimizer_factory( - self._qnet_online.parameters(), **optimizer_kwargs - ) - self._loss_function = loss_function_factory(loss_function, reduction="none") - - # Training params - self._train_interval = train_interval - self._gradient_steps = gradient_steps - self._learning_starts = learning_starts - self._learning_starts = learning_starts - self._eval_interval = eval_interval - - # Setup replay buffer - if hasattr(self.env, "_max_episode_steps"): - max_episode_steps = self.env._max_episode_steps - else: - max_episode_steps = np.inf - self._max_episode_steps = max_episode_steps - - self._replay_buffer = replay.ReplayBuffer( - max_replay_size=max_replay_size, - rng=self.rng, - max_episode_steps=self._max_episode_steps, - enable_prioritized=use_prioritized_replay, - ) - self._replay_buffer.setup_entry("observations", np.float32) - self._replay_buffer.setup_entry("next_observations", np.float32) - self._replay_buffer.setup_entry("actions", np.int32) - self._replay_buffer.setup_entry("rewards", np.float32) - self._replay_buffer.setup_entry("dones", bool) - - # Counters - self._total_timesteps = 0 - self._total_episodes = 0 - self._total_updates = 0 - self._timesteps_since_last_update = 0 - - # epsilon scheduling - self._epsilon_schedule = polynomial_schedule( - self.epsilon_init, - self.epsilon_final, - power=1.0, - transition_steps=self.epsilon_decay_interval, - transition_begin=0, - ) - - @property - def total_timesteps(self): - return self._total_timesteps - - def _must_update(self, is_end_of_episode): - """Returns true if the model must be updated in the current timestep, - and the number of gradient steps to take""" - total_timesteps = self._total_timesteps - n_gradient_steps = self._gradient_steps - - if total_timesteps < self._learning_starts: - 
return False, -1 - - if n_gradient_steps == -1: - n_gradient_steps = self._timesteps_since_last_update - - run_update = False - if self._train_interval == -1: - run_update = is_end_of_episode - else: - run_update = total_timesteps % self._train_interval == 0 - return run_update, n_gradient_steps - - def _update(self, n_gradient_steps): - """Update networks.""" - if self.use_prioritized_replay: - sampling_mode = "prioritized" - else: - sampling_mode = "uniform" - - for _ in range(n_gradient_steps): - # Sample a batch - sampled = self._replay_buffer.sample( - self.batch_size, self.chunk_size, sampling_mode=sampling_mode - ) - if not sampled: - return - - # Update counters - self._timesteps_since_last_update = 0 - self._total_updates += 1 - - batch = sampled.data - batch_info = sampled.info - assert batch["rewards"].shape == (self.batch_size, self.chunk_size) - - # Compute targets - batch_observations = torch.FloatTensor(batch["observations"]).to( - self._device - ) - batch_next_observations = torch.FloatTensor(batch["next_observations"]).to( - self._device - ) - batch_actions = torch.LongTensor(batch["actions"]).to(self._device) - - target_q_values_tp1 = self._qnet_target(batch_next_observations).detach() - # Check if double DQN - if self.use_double_dqn: - online_q_values_tp1 = self._qnet_online( - batch_next_observations - ).detach() - a_argmax = online_q_values_tp1.argmax(dim=-1).detach() - else: - a_argmax = target_q_values_tp1.argmax(dim=-1).detach() - - v_tp1 = ( - torch.gather(target_q_values_tp1, dim=-1, index=a_argmax[:, :, None])[ - :, :, 0 - ] - .cpu() - .numpy() - ) - - batch_lambda_returns = lambda_returns( - batch["rewards"], - self.gamma * (1.0 - np.array(batch["dones"], dtype=np.float32)), - v_tp1, - np.array(self.lambda_, dtype=np.float32), - ) - targets = torch.tensor(batch_lambda_returns).to(self._device) - - # Compute loss - batch_q_values = self._qnet_online(batch_observations) - batch_values = torch.gather( - batch_q_values, dim=-1, index=batch_actions[:, :, None] - )[ - :, :, 0 - ] # shape (batch, chunk) - - assert batch_values.shape == targets.shape - per_element_loss = self._loss_function(batch_values, targets) - per_batch_element_loss = per_element_loss.mean(dim=1) - weights = torch.FloatTensor(batch_info["weights"]).to(self._device) - loss = torch.sum(per_batch_element_loss * weights) / torch.sum(weights) - - self._optimizer.zero_grad() - loss.backward() - self._optimizer.step() - - if self.writer: - self.writer.add_scalar( - "losses/q_loss", loss.item(), self._total_updates - ) - - # update priorities - if self.use_prioritized_replay: - new_priorities = per_element_loss.abs().detach().cpu().numpy() + 1e-6 - self._replay_buffer.update_priorities( - batch_info["indices"], new_priorities - ) - - # target update - if self.target_update_parameter > 1: - if self._total_updates % self.target_update_parameter == 0: - self._qnet_target.load_state_dict(self._qnet_online.state_dict()) - else: - tau = self.target_update_parameter - for param, target_param in zip( - self._qnet_online.parameters(), self._qnet_target.parameters() - ): - target_param.data.copy_( - tau * param.data + (1 - tau) * target_param.data - ) - - def fit(self, budget: int, **kwargs): - """ - Train the agent using the provided environment. - - Parameters - ---------- - budget: int - Number of timesteps to train the agent for. - One step = one transition in the environment. 
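In the `use_double_dqn` branch of `_update` above, the greedy action is selected by the online network but evaluated by the target network; a small tensor-level sketch with arbitrary placeholder Q-values::

    import torch

    batch, chunk, n_actions = 32, 8, 4
    q_online_tp1 = torch.randn(batch, chunk, n_actions)   # online net at s_{t+1}
    q_target_tp1 = torch.randn(batch, chunk, n_actions)   # target net at s_{t+1}

    # pick the argmax with the online network...
    a_argmax = q_online_tp1.argmax(dim=-1)
    # ...and read its value from the target network
    v_tp1 = torch.gather(q_target_tp1, dim=-1, index=a_argmax[:, :, None])[:, :, 0]
    assert v_tp1.shape == (batch, chunk)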
- """ - del kwargs - timesteps_counter = 0 - episode_rewards = 0.0 - episode_timesteps = 0 - observation, info = self.env.reset() - while timesteps_counter < budget: - if self.total_timesteps < self._learning_starts: - action = self.env.action_space.sample() - else: - self._timesteps_since_last_update += 1 - action = self._policy(observation, evaluation=False) - next_observation, reward, terminated, truncated, info = self.env.step( - action - ) - done = terminated or truncated - - # store data - episode_rewards += reward - self._replay_buffer.append( - { - "observations": observation, - "actions": action, - "rewards": reward, - "dones": done, - "next_observations": next_observation, - } - ) - - # counters and next obs - self._total_timesteps += 1 - timesteps_counter += 1 - episode_timesteps += 1 - observation = next_observation - - # update - run_update, n_gradient_steps = self._must_update(done) - if run_update: - self._update(n_gradient_steps) - - # eval - total_timesteps = self._total_timesteps - if ( - self._eval_interval is not None - and total_timesteps % self._eval_interval == 0 - ): - eval_rewards = self.eval( - eval_horizon=self._max_episode_steps, gamma=1.0 - ) - if self.writer: - buffer_size = len(self._replay_buffer) - self.writer.add_scalar( - "eval_rewards", eval_rewards, total_timesteps - ) - self.writer.add_scalar("buffer_size", buffer_size, total_timesteps) - - # check if episode ended - if done: - self._total_episodes += 1 - self._replay_buffer.end_episode() - if self.writer: - self.writer.add_scalar( - "episode_rewards", episode_rewards, total_timesteps - ) - self.writer.add_scalar( - "total_episodes", self._total_episodes, total_timesteps - ) - episode_rewards = 0.0 - episode_timesteps = 0 - observation, info = self.env.reset() - - def _policy(self, observation, evaluation=False): - epsilon = self._epsilon_schedule(self.total_timesteps) - if (not evaluation) and self.rng.uniform() < epsilon: - if self.writer: - self.writer.add_scalar("epsilon", epsilon, self.total_timesteps) - return self.env.action_space.sample() - else: - with torch.no_grad(): - observation = ( - torch.FloatTensor(observation).to(self._device).unsqueeze(0) - ) - qvals_tensor = self._qnet_online(observation)[0] - action = qvals_tensor.argmax().item() - return action - - def policy(self, observation): - return self._policy(observation, evaluation=True) diff --git a/rlberry/agents/torch/dqn/dqn_utils.py b/rlberry/agents/torch/dqn/dqn_utils.py deleted file mode 100644 index 2d100b218..000000000 --- a/rlberry/agents/torch/dqn/dqn_utils.py +++ /dev/null @@ -1,142 +0,0 @@ -import numpy as np -import torch -import torch.nn.functional as F - - -from rlberry.utils.jit_setup import numba_jit - - -import rlberry - -logger = rlberry.logger - - -def stable_scaled_log_softmax(x, tau, dim=-1): - """Scaled log_softmax operation. - - Parameters - ---------- - x: tensor of floats, - inputs of the softmax (logits). - tau: float, - softmax temperature. - dim: int, - axis to perform the softmax operation. - Returns: - tau * log softmax(x/tau, dim=dim) - """ - max_x = torch.max(x, dim=dim, keepdim=True).values - y = x - max_x - return tau * F.log_softmax(y / tau, dim=dim) - - -def stable_softmax(x, tau, dim=-1): - """Stable softmax operation. - - Parameters - ---------- - x: tensor of floats, - inputs of the softmax (logits). - tau: float, - softmax temperature. - dim: int, - axis to perform the softmax operation. 
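Both helpers here shift the logits by their row-wise maximum before applying the (log-)softmax; since the shift is constant along the softmax axis it leaves the result unchanged and only keeps the exponentiated values in a safe numeric range. A quick check of that invariance with arbitrary logits and temperature::

    import torch
    import torch.nn.functional as F

    x, tau = torch.randn(2, 5), 0.03
    shifted = x - torch.max(x, dim=-1, keepdim=True).values
    assert torch.allclose(F.softmax(x / tau, dim=-1),
                          F.softmax(shifted / tau, dim=-1), atol=1e-4)
    assert torch.allclose(tau * F.log_softmax(x / tau, dim=-1),
                          tau * F.log_softmax(shifted / tau, dim=-1), atol=1e-4)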
- Returns: - softmax(x / tau, dim=dim) - """ - max_x = torch.max(x, dim=dim, keepdim=True).values - y = x - max_x - return F.softmax(y / tau, dim=dim) - - -def polynomial_schedule( - init_value: float, - end_value: float, - power: float, - transition_steps: int, - transition_begin: int = 0, -): - """Constructs a schedule with polynomial transition from init to end value. - - Notes - ----- - Function taken from: https://github.com/deepmind/optax/blob/master/optax/_src/schedule.py, - which is licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - - Modifications with respect to source: - - * Remove chex typing from the arguments. - * `import rlberry; logger=rlberry.logger` instead of :code:`logging.info()`. - * Changed documentation style. - - Parameters - ---------- - init_value: float - Initial value for the scalar to be annealed. - end_value: float - End value of the scalar to be annealed. - power: float - The power of the polynomial used to transition from init to end. - transition_steps: float - Number of steps over which annealing takes place, - the scalar starts changing at `transition_begin` steps and completes - the transition by `transition_begin + transition_steps` steps. - If `transition_steps <= 0`, then the entire annealing process is disabled - and the value is held fixed at `init_value`. - transition_begin: float - Must be positive. After how many steps to start annealing - (before this many steps the scalar value is held fixed at `init_value`). - - Returns - ------- - schedule: Callable[[int], float] - A function that maps step counts to values. - """ - if transition_steps <= 0: - logger.info( - "A polynomial schedule was set with a non-positive `transition_steps` " - "value; this results in a constant schedule with value `init_value`." - ) - return lambda count: init_value - - if transition_begin < 0: - logger.info( - "An exponential schedule was set with a negative `transition_begin` " - "value; this will result in `transition_begin` falling back to `0`." - ) - transition_begin = 0 - - def schedule(count): - count = np.clip(count - transition_begin, 0, transition_steps) - frac = 1 - count / transition_steps - return (init_value - end_value) * (frac**power) + end_value - - return schedule - - -@numba_jit -def lambda_returns(r_t, discount_t, v_tp1, lambda_): - """ - Computer lambda returns - - Parameters - ---------- - r_t: array - Array of shape (batch_dim, time_dim) containing the rewards. - discount_t: array - Array of shape (batch_dim, time_dim) containing the discounts (0.0 if terminal state). - v_tp1: array - Array of shape (batch_dim, time_dim) containing the values at timestep t+1 - lambda_ : float in [0, 1] - Lambda-returns parameter. 
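A usage sketch for the schedule returned by `polynomial_schedule`, restated without the logging and edge-case handling, with the epsilon-greedy defaults used by the Munchausen DQN agent further down (decay from 1.0 to 0.1 over 20,000 steps):

import numpy as np

def polynomial_schedule(init_value, end_value, power, transition_steps, transition_begin=0):
    # Trimmed-down restatement of the schedule above (logging and the
    # non-positive transition_steps edge case are omitted).
    def schedule(count):
        count = np.clip(count - transition_begin, 0, transition_steps)
        frac = 1.0 - count / transition_steps
        return (init_value - end_value) * (frac**power) + end_value
    return schedule

# Linear decay (power=1.0) over the first 20_000 timesteps, then held at 0.1.
epsilon = polynomial_schedule(1.0, 0.1, power=1.0, transition_steps=20_000)
print(epsilon(0), epsilon(10_000), epsilon(50_000))  # 1.0 0.55 0.1
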
- """ - returns = np.zeros_like(r_t) - aux = v_tp1[:, -1] - time_dim = v_tp1.shape[1] - for tt in range(time_dim): - i = time_dim - tt - 1 - aux = r_t[:, i] + discount_t[:, i] * ( - (1 - lambda_) * v_tp1[:, i] + lambda_ * aux - ) - returns[:, i] = aux - return returns diff --git a/rlberry/agents/torch/dqn/mdqn.py b/rlberry/agents/torch/dqn/mdqn.py deleted file mode 100644 index 746e01d24..000000000 --- a/rlberry/agents/torch/dqn/mdqn.py +++ /dev/null @@ -1,478 +0,0 @@ -import inspect - -import numpy as np -import torch -from gymnasium import spaces -from rlberry import types -from rlberry.agents import AgentWithSimplePolicy, AgentTorch -from rlberry.agents.torch.utils.training import ( - loss_function_factory, - model_factory, - optimizer_factory, - size_model_config, -) -from rlberry.agents.torch.dqn.dqn_utils import ( - lambda_returns, - polynomial_schedule, - stable_scaled_log_softmax, - stable_softmax, -) -from rlberry.agents.utils import replay -from rlberry.utils.torch import choose_device -from rlberry.utils.factory import load -from typing import Callable, Optional, Union - - -import rlberry - -logger = rlberry.logger - - -def default_q_net_fn(env, **kwargs): - """ - Returns a default Q value network. - """ - del kwargs - model_config = { - "type": "MultiLayerPerceptron", - "layer_sizes": (64, 64), - "reshape": False, - } - model_config = size_model_config(env, **model_config) - return model_factory(**model_config) - - -class MunchausenDQNAgent(AgentTorch, AgentWithSimplePolicy): - """Munchausen DQN Agent based on PyTorch. - - Notes - ----- - Uses Munchausen trick for DQN for computing targets by default. - Compared to DQN, the scaled log-policy was added to the immediate - reward. Slightly modifying DQN in that way provides an agent that - is competitive with distributional methods on Atari games, without - making use of distributional RL, n-step returns or prioritized replay. - See more: https://arxiv.org/pdf/2007.14430.pdf - - Parameters - ---------- - env: :class:`~rlberry.types.Env` - Environment, can be a tuple (constructor, kwargs) - gamma: float, default = 0.99 - Discount factor. - batch_size: int, default=32 - Batch size. - chunk_size: int, default=8 - Length of sub-trajectories sampled from the replay buffer. - lambda_: float, default=0.5 - Q(lambda) parameter. - tau: float, default=0.03 - softmax temperature for the policy - alpha: float, default=0.9 - Munchausen coefficient - target_update_parameter : int or float - If int: interval (in number total number of online updates) between updates of the target network. - If float: soft update coefficient - device: str - Torch device, see :func:`~rlberry.utils.torch.choose_device` - learning_rate : float, default = 1e-3 - Optimizer learning rate. - clip_value_min: float, default = -1, - minimum value for munchausen term - loss_function: {"l1", "l2", "smooth_l1"}, default: "l2" - Loss function used to compute Bellman error. - epsilon_init: float, default = 1.0 - Initial epsilon value for epsilon-greedy exploration. - epsilon_final: float, default = 0.1 - Final epsilon value for epsilon-greedy exploration. - epsilon_decay_interval : int - After :code:`epsilon_decay` timesteps, epsilon approaches :code:`epsilon_final`. - optimizer_type : {"ADAM", "RMS_PROP"} - Optimization algorithm. - q_net_constructor : Callable, str or None - Function/constructor that returns a torch module for the Q-network: - :code:`qnet = q_net_constructor(env, **kwargs)`. 
- - Module (Q-network) requirements: - - * Input shape = (batch_dim, chunk_size, obs_dims) - - * Ouput shape = (batch_dim, chunk_size, number_of_actions) - - Example: use `rlberry.agents.torch.utils.training.model_factory_from_env`, - and `q_net_kwargs` - parameter to modify the neural network:: - - model_configs = { - "type": "MultiLayerPerceptron", - "layer_sizes": (5, 5), - "reshape": False, - } - - agent = MunchausenDQNAgent(env, - q_net_constructor=model_factory_from_env, - q_net_kwargs=model_configs - ) - If str then it should correspond to the full path to the constructor function, - e.g.:: - agent = MunchausenDQNAgent(env, - q_net_constructor='rlberry.agents.torch.utils.training.model_factory_from_env', - q_net_kwargs=model_configs - ) - - If None then it is set to MultiLayerPerceptron with 2 hidden layers - of size 64 - - q_net_kwargs : optional, dict - Parameters for q_net_constructor. - use_prioritized_replay : bool, default = False - If True, use Prioritized Experience Replay. - train_interval: int - Update the model every :code:`train_interval` steps. - If -1, train only at the end of the episodes. - gradient_steps: int - How many gradient steps to do at each update. - If -1, take the number of timesteps since last update. - max_replay_size : int - Maximum number of transitions in the replay buffer. - learning_starts : int - How many steps of the model to collect transitions for before learning starts - eval_interval : int, default = None - Interval (in number of transitions) between agent evaluations in fit(). - If None, never evaluate. - """ - - name = "Munchausen DQN" - - def __init__( - self, - env: types.Env, - gamma: float = 0.99, - batch_size: int = 32, - chunk_size: int = 8, - lambda_: float = 0.5, - tau: float = 0.03, - alpha: float = 0.9, - target_update_parameter: Union[int, float] = 0.005, - # tardet_update_freq: int = 8000, - device: str = "cuda:best", - learning_rate: float = 5e-5, - clip_value_min: float = -1.0, - epsilon_init: float = 1.0, - epsilon_final: float = 0.1, - epsilon_decay_interval: int = 20_000, - loss_function: str = "l2", - optimizer_type: str = "ADAM", - q_net_constructor: Optional[Callable[..., torch.nn.Module]] = None, - q_net_kwargs: Optional[dict] = None, - use_prioritized_replay: bool = False, - train_interval: int = 4, - gradient_steps: int = -1, - max_replay_size: int = 1_000_000, - learning_starts: int = 5_000, - eval_interval: Optional[int] = None, - **kwargs, - ): - # For all parameters, define self.param = param - _, _, _, values = inspect.getargvalues(inspect.currentframe()) - values.pop("self") - for arg, val in values.items(): - setattr(self, arg, val) - - AgentWithSimplePolicy.__init__(self, env, **kwargs) - env = self.env - assert isinstance(env.observation_space, spaces.Box) - assert isinstance(env.action_space, spaces.Discrete) - - # M-DQN parameters - - # Online and target Q networks, torch device - self._device = choose_device(device) - if isinstance(q_net_constructor, str): - q_net_ctor = load(q_net_constructor) - elif q_net_constructor is None: - q_net_ctor = default_q_net_fn - else: - q_net_ctor = q_net_constructor - q_net_kwargs = q_net_kwargs or dict() - self._qnet_online = q_net_ctor(env, **q_net_kwargs).to(self._device) - self._qnet_target = q_net_ctor(env, **q_net_kwargs).to(self._device) - - # Optimizer and loss - optimizer_kwargs = {"optimizer_type": optimizer_type, "lr": learning_rate} - self._optimizer = optimizer_factory( - self._qnet_online.parameters(), **optimizer_kwargs - ) - self._loss_function = 
loss_function_factory(loss_function, reduction="none") - - # Training params - self._train_interval = train_interval - self._gradient_steps = gradient_steps - self._learning_starts = learning_starts - self._learning_starts = learning_starts - self._eval_interval = eval_interval - - # Setup replay buffer - if hasattr(self.env, "_max_episode_steps"): - max_episode_steps = self.env._max_episode_steps - else: - max_episode_steps = np.inf - self._max_episode_steps = max_episode_steps - - self._replay_buffer = replay.ReplayBuffer( - max_replay_size=max_replay_size, - rng=self.rng, - max_episode_steps=self._max_episode_steps, - enable_prioritized=use_prioritized_replay, - ) - self._replay_buffer.setup_entry("observations", np.float32) - self._replay_buffer.setup_entry("next_observations", np.float32) - self._replay_buffer.setup_entry("actions", np.int32) - self._replay_buffer.setup_entry("rewards", np.float32) - self._replay_buffer.setup_entry("dones", bool) - - # Counters - self._total_timesteps = 0 - self._total_episodes = 0 - self._total_updates = 0 - self._timesteps_since_last_update = 0 - - # epsilon scheduling - self._epsilon_schedule = polynomial_schedule( - self.epsilon_init, - self.epsilon_final, - power=1.0, - transition_steps=self.epsilon_decay_interval, - transition_begin=0, - ) - - @property - def total_timesteps(self): - return self._total_timesteps - - def _must_update(self, is_end_of_episode): - """Returns true if the model must be updated in the current timestep, - and the number of gradient steps to take""" - total_timesteps = self._total_timesteps - n_gradient_steps = self._gradient_steps - - if total_timesteps < self._learning_starts: - return False, -1 - - if n_gradient_steps == -1: - n_gradient_steps = self._timesteps_since_last_update - - run_update = False - if self._train_interval == -1: - run_update = is_end_of_episode - else: - run_update = total_timesteps % self._train_interval == 0 - return run_update, n_gradient_steps - - def _update(self, n_gradient_steps): - """Update networks.""" - if self.use_prioritized_replay: - sampling_mode = "prioritized" - else: - sampling_mode = "uniform" - - for _ in range(n_gradient_steps): - # Sample a batch - sampled = self._replay_buffer.sample( - self.batch_size, self.chunk_size, sampling_mode=sampling_mode - ) - if not sampled: - return - - # Update counters - self._timesteps_since_last_update = 0 - self._total_updates += 1 - - batch = sampled.data - batch_info = sampled.info - assert batch["rewards"].shape == (self.batch_size, self.chunk_size) - - # Get batched tensors - batch_observations = torch.FloatTensor(batch["observations"]).to( - self._device - ) - batch_rewards = torch.FloatTensor(batch["rewards"]).to(self._device) - batch_next_observations = torch.FloatTensor(batch["next_observations"]).to( - self._device - ) - batch_actions = torch.LongTensor(batch["actions"]).to(self._device) - batch_dones = torch.LongTensor(batch["dones"]).to(self._device) - - # Get target Q estimates - target_q_values_tp1 = self._qnet_target(batch_next_observations).detach() - target_q_values = self._qnet_target(batch_observations).detach() - - # Compute softmax policies for the current and next step - log_pi = stable_scaled_log_softmax(target_q_values, self.tau, -1) - log_pi_tp1 = stable_scaled_log_softmax(target_q_values_tp1, self.tau, -1) - pi_tp1 = stable_softmax(target_q_values_tp1, self.tau, -1) - - # Compute the "next step" part of the target - target_v_tp1 = ( - torch.sum((target_q_values_tp1 - log_pi_tp1) * pi_tp1, -1).cpu().numpy() - ) - - # 
Compute the Munchausen term - munchausen_term = torch.gather( - log_pi, dim=-1, index=batch_actions[:, :, None] - )[:, :, 0] - clipped_munchausen_term = torch.clip( - munchausen_term, self.clip_value_min, 0 - ) - final_munchausen_term = self.alpha * clipped_munchausen_term - - # Compute the final target - batch_lambda_returns = lambda_returns( - (batch_rewards + final_munchausen_term).cpu().numpy(), - self.gamma * (1.0 - np.array(batch["dones"], dtype=np.float32)), - target_v_tp1, - np.array(self.lambda_, dtype=np.float32), - ) - targets = torch.tensor(batch_lambda_returns, device=self._device) - - # Compute loss - batch_q_values = self._qnet_online(batch_observations) - batch_values = torch.gather( - batch_q_values, dim=-1, index=batch_actions[:, :, None] - )[ - :, :, 0 - ] # shape (batch, chunk) - - assert batch_values.shape == targets.shape - per_element_loss = self._loss_function(batch_values, targets) - per_batch_element_loss = per_element_loss.mean(dim=1) - weights = torch.FloatTensor(batch_info["weights"]).to(self._device) - loss = torch.sum(per_batch_element_loss * weights) / torch.sum(weights) - - self._optimizer.zero_grad() - loss.backward() - self._optimizer.step() - - if self.writer: - self.writer.add_scalar( - "losses/q_loss", loss.item(), self._total_updates - ) - - # update priorities - if self.use_prioritized_replay: - new_priorities = per_element_loss.abs().detach().cpu().numpy() + 1e-6 - self._replay_buffer.update_priorities( - batch_info["indices"], new_priorities - ) - - # target update - - if self.target_update_parameter > 1: - if self._total_updates % self.target_update_parameter == 0: - self._qnet_target.load_state_dict(self._qnet_online.state_dict()) - else: - tau = self.target_update_parameter - for param, target_param in zip( - self._qnet_online.parameters(), self._qnet_target.parameters() - ): - target_param.data.copy_( - tau * param.data + (1 - tau) * target_param.data - ) - - def fit(self, budget: int, **kwargs): - """ - Train the agent using the provided environment. - - Parameters - ---------- - budget: int - Number of timesteps to train the agent for. - One step = one transition in the environment. 
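The block above implements the Munchausen trick: the scaled log-policy of the taken action, clipped from below, is added to the immediate reward before the lambda-returns target is computed. A single-state sketch of that bonus, with the agent's default hyperparameters; the agent applies it batch-wise with `torch.gather`, and the numbers here are purely illustrative:

import torch
import torch.nn.functional as F

def munchausen_bonus(q_values, action, tau=0.03, alpha=0.9, clip_value_min=-1.0):
    # Scaled log-policy of the taken action under the target network:
    # tau * log softmax(q / tau). The term is always <= 0 and is clipped
    # from below so near-deterministic policies do not yield huge
    # negative bonuses.
    log_pi = tau * F.log_softmax(q_values / tau, dim=-1)
    return alpha * log_pi[action].clamp(min=clip_value_min, max=0.0)

q = torch.tensor([1.2, 0.8, 1.0])  # target-network Q-values for one state
reward, action = 0.5, 0
print(reward + munchausen_bonus(q, action))  # slightly below 0.5
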
- """ - del kwargs - timesteps_counter = 0 - episode_rewards = 0.0 - episode_timesteps = 0 - observation, info = self.env.reset() - while timesteps_counter < budget: - if self.total_timesteps < self._learning_starts: - action = self.env.action_space.sample() - else: - self._timesteps_since_last_update += 1 - action = self._policy(observation, evaluation=False) - next_observation, reward, terminated, truncated, info = self.env.step( - action - ) - done = terminated or truncated - - # store data - episode_rewards += reward - self._replay_buffer.append( - { - "observations": observation, - "actions": action, - "rewards": reward, - "dones": done, - "next_observations": next_observation, - } - ) - - # counters and next obs - self._total_timesteps += 1 - timesteps_counter += 1 - episode_timesteps += 1 - observation = next_observation - - # update - run_update, n_gradient_steps = self._must_update(done) - if run_update: - self._update(n_gradient_steps) - - # eval - total_timesteps = self._total_timesteps - if ( - self._eval_interval is not None - and total_timesteps % self._eval_interval == 0 - ): - eval_rewards = self.eval( - eval_horizon=self._max_episode_steps, gamma=1.0 - ) - if self.writer: - buffer_size = len(self._replay_buffer) - self.writer.add_scalar( - "eval_rewards", eval_rewards, total_timesteps - ) - self.writer.add_scalar("buffer_size", buffer_size, total_timesteps) - - # check if episode ended - if done: - self._total_episodes += 1 - self._replay_buffer.end_episode() - if self.writer: - self.writer.add_scalar( - "episode_rewards", episode_rewards, total_timesteps - ) - self.writer.add_scalar( - "total_episodes", self._total_episodes, total_timesteps - ) - episode_rewards = 0.0 - episode_timesteps = 0 - observation, info = self.env.reset() - - def _policy(self, observation, evaluation=False): - epsilon = self._epsilon_schedule(self.total_timesteps) - if (not evaluation) and self.rng.uniform() < epsilon: - if self.writer: - self.writer.add_scalar("epsilon", epsilon, self.total_timesteps) - return self.env.action_space.sample() - else: - with torch.no_grad(): - observation = ( - torch.FloatTensor(observation).to(self._device).unsqueeze(0) - ) - qvals_tensor = self._qnet_online(observation)[0] - action = qvals_tensor.argmax().item() - return action - - def policy(self, observation): - return self._policy(observation, evaluation=True) diff --git a/rlberry/agents/torch/ppo/__init__.py b/rlberry/agents/torch/ppo/__init__.py deleted file mode 100644 index b3f371adb..000000000 --- a/rlberry/agents/torch/ppo/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .ppo import PPOAgent diff --git a/rlberry/agents/torch/ppo/ppo.py b/rlberry/agents/torch/ppo/ppo.py deleted file mode 100644 index fdd27442b..000000000 --- a/rlberry/agents/torch/ppo/ppo.py +++ /dev/null @@ -1,843 +0,0 @@ -import numpy as np -import torch -import torch.nn as nn - -import gymnasium.spaces as spaces -import rlberry -from rlberry.agents import AgentWithSimplePolicy -from rlberry.agents import AgentTorch -from rlberry.envs.utils import process_env -from rlberry.agents.torch.utils.training import optimizer_factory -from rlberry.agents.torch.utils.models import default_policy_net_fn -from rlberry.agents.torch.utils.models import default_value_net_fn -from rlberry.utils.torch import choose_device -from rlberry.utils.factory import load -from rlberry.agents.torch.ppo.ppo_utils import ( - process_ppo_env, - lambda_returns, - RolloutBuffer, -) - -import dill -import pickle -import bz2 -import _pickle as cPickle -from pathlib import 
Path - - -logger = rlberry.logger - - -# Notes about VecEnvs: -# - reset() returns a numpy array of shape (n_envs, state_dim) -# - step() returns a tuple of arrays (states, rewards, dones, infos) -# - states: np.array (n_envs, state_dim) dtype varies -# - rewards: np.array (n_envs,) np.float64 -# - dones: np.array (n_envs,) bool -# - infos: list (n_envs,) dict -# - close() closes all environments - - -class PPOAgent(AgentTorch, AgentWithSimplePolicy): - """ - Proximal Policy Optimization Agent. - - Policy gradient methods for reinforcement learning, which alternate between - sampling data through interaction with the environment, and optimizing a - “surrogate” objective function using stochastic gradient ascent. - - Parameters - ---------- - env : rlberry Env - Environment with continuous (Box) observation space. - n_envs: int - Number of environments to be used. - n_steps : int - Number of transitions to collect in each environment per update. - batch_size : int - Size of mini batches during each PPO update epoch. It is recommended - that n_envs * n_steps is divisible by batch_size. - gamma : float - Discount factor in [0, 1]. - k_epochs : int - Number of PPO epochs per update. - clip_eps : float - PPO clipping range (epsilon). - target_kl: float - Target KL divergence. If KL divergence between the current policy and - the new policy is greater than target_kl, the update is stopped early. - Set to None to disable early stopping. - normalize_avantages : bool - Whether or not to normalize advantages. - gae_lambda : float - Lambda parameter for TD(lambda) and Generalized Advantage Estimation. - entr_coef : float - Entropy coefficient. - vf_coef : float - Value function loss coefficient. - value_loss: str - Type of value loss. 'mse' corresponds to mean squared error, - 'clipped' corresponds to the original PPO loss, and 'avec' - corresponds to the AVEC loss (Flet-Berliac et al. 2021). - max_grad_norm : float - Maximum norm of the gradient of both actor and critic networks. - learning_rate : float - Learning rate. - lr_schedule: str - Learning rate schedule. 'constant' corresponds to a constant learning - rate, and 'linear' corresponds to a linearly decreasing learning rate, - starting at learning_rate and ending at 0. WARNING: the schedule is - reset at each call to fit(). - optimizer_type: str - Type of optimizer. 'ADAM' by defaut. - policy_net_fn : function(env, **kwargs) - Function that returns an instance of a policy network (pytorch). - If None, a default net is used. - policy_net_kwargs : dict - kwargs for policy_net_fn - value_net_fn : function(env, **kwargs) - Function that returns an instance of a value network (pytorch). - If None, a default net is used. - value_net_kwargs : dict - kwargs for value_net_fn - eval_env : rlberry Env - Environment used for evaluation. If None, env is used. - n_eval_episodes : int - Number of episodes to be used for evaluation. - eval_horizon : int - Maximum number of steps per episode during evaluation. - eval_freq : int - Number of updates between evaluations. If None, no evaluation is - performed. - device: str - Device on which to put the tensors. 'cuda:best' by default. - - Attributes - ---------- - __value_losses__ : list - List of supported value loss types. ["clipped", "mse", "avec"] - __lr_schedule___ : list - List of supported learning rate schedule types. ["constant", "linear"] - copy_env : bool - If True, copy the environment to create multiple environments for parallel interaction. - n_envs : int - Number of environments used by the agent. 
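To make the divisibility recommendation concrete, here is the bookkeeping for one update with the constructor defaults shown further down (n_envs=1, n_steps=512, batch_size=64, k_epochs=10); early stopping on `target_kl` can end an update with fewer gradient steps:

n_envs, n_steps, batch_size, k_epochs = 1, 512, 64, 10
rollout_size = n_envs * n_steps                     # 512 transitions per update
minibatches_per_epoch = rollout_size // batch_size  # 8 minibatches
gradient_steps = minibatches_per_epoch * k_epochs   # at most 80 gradient steps
print(rollout_size, minibatches_per_epoch, gradient_steps)
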
- n_steps : int - Number of transitions to collect in each environment per update. - batch_size : int - Size of mini batches during each PPO update epoch. - gamma : float - Discount factor used to discount future rewards. - k_epochs : int - Number of PPO epochs per update. - clip_eps : float - PPO clipping range (epsilon). - target_kl: float - Target KL divergence for early stopping. If None, early stopping is disabled. - normalize_advantages : bool - Whether or not to normalize advantages. - gae_lambda : float - Lambda parameter for TD(lambda) and Generalized Advantage Estimation. - entr_coef : float - Entropy coefficient. Controls the contribution of entropy regularization to the policy's objective. - vf_coef : float - Value function loss coefficient. Controls the contribution of the value function loss to the total loss. - value_loss: str - Type of value loss used. Can be "mse", "clipped", or "avec". - max_grad_norm : float - Maximum norm of the gradient of both actor and critic networks. Used for gradient clipping. - learning_rate : float - Learning rate used by the optimizer during neural network training. - lr_schedule : str - Learning rate schedule used during training. Can be "constant" or "linear". - optimizer_type : str - Type of optimizer used during neural network training. - n_eval_episodes : int - Number of episodes used for evaluation. - eval_horizon : int - Maximum number of steps per episode during evaluation. - eval_freq : int - Number of updates between evaluations. If None, no evaluation is performed. - policy_net_fn : function(env, **kwargs) - Function that returns an instance of a policy network (PyTorch). - policy_net_kwargs : dict - Keyword arguments for `policy_net_fn`. - value_net_fn : function(env, **kwargs) - Function that returns an instance of a value network (PyTorch). - value_net_kwargs : dict - Keyword arguments for `value_net_fn`. - eval_env : rlberry.Env - The environment used for evaluation. If None, the same environment as env is used. - state_dim : int - Dimensionality of the continuous state space of the environment. - policy_net : torch.nn.Module - The policy network used by the agent. - value_net : torch.nn.Module - The value network used by the agent. - device : str - Torch device on which the agent's neural networks are placed. - optimizer_kwargs : dict - Keyword arguments for the optimizer used during neural network training. - - References - ---------- - Schulman, J., Wolski, F., Dhariwal, P., Radford, A. & Klimov, O. (2017). - "Proximal Policy Optimization Algorithms." - arXiv preprint arXiv:1707.06347. - - Schulman, J., Levine, S., Abbeel, P., Jordan, M., & Moritz, P. (2015). - "Trust region policy optimization." - In International Conference on Machine Learning (pp. 1889-1897). - - Flet-Berliac, Y., Ouhamma, R., Maillard, O.-A., Preux, P. (2021) - "Learning Value Functions in Deep Policy Gradients using Residual Variance." - In 9th International Conference on Learning Representations (ICLR). 
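The policy update described above optimizes the clipped surrogate objective. A self-contained sketch of that loss on a toy minibatch (the names are illustrative, not part of the agent's API):

import torch

def clipped_surrogate_loss(new_logprob, old_logprob, advantage, clip_eps=0.2):
    # Take the pessimistic (elementwise maximum) of the unclipped and clipped
    # objectives, so the update gains nothing from pushing the probability
    # ratio outside [1 - clip_eps, 1 + clip_eps].
    ratio = torch.exp(new_logprob - old_logprob)
    unclipped = -advantage * ratio
    clipped = -advantage * torch.clamp(ratio, 1 - clip_eps, 1 + clip_eps)
    return torch.mean(torch.max(unclipped, clipped))

# Toy minibatch: the third sample's ratio (about 1.65) lies outside the clip
# range, so the clipped branch is selected and its gradient is cut off.
new_lp = torch.tensor([-0.9, -1.2, -0.5], requires_grad=True)
old_lp = torch.tensor([-1.0, -1.1, -1.0])
adv = torch.tensor([1.0, -0.5, 2.0])
loss = clipped_surrogate_loss(new_lp, old_lp, adv)
loss.backward()
print(loss.item(), new_lp.grad)
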
- """ - - name = "PPO" - __value_losses__ = ["clipped", "mse", "avec"] - __lr_schedule___ = ["constant", "linear"] - - def __init__( - self, - env, - copy_env=True, - n_envs=1, - n_steps=512, - batch_size=64, - gamma=0.99, - k_epochs=10, - clip_eps=0.2, - target_kl=0.05, - normalize_advantages=True, - gae_lambda=0.95, - entr_coef=0.01, - vf_coef=0.5, - value_loss="mse", - max_grad_norm=0.5, - learning_rate=3e-4, - lr_schedule="constant", - optimizer_type="ADAM", - policy_net_fn=None, - policy_net_kwargs=None, - value_net_fn=None, - value_net_kwargs=None, - eval_env=None, - n_eval_episodes=10, - eval_horizon=int(1e5), - eval_freq=None, - device="cuda:best", - **kwargs - ): - kwargs.pop("eval_env", None) - AgentWithSimplePolicy.__init__( - self, None, **kwargs - ) # PPO handles the env internally - - # create environment - self.copy_env = copy_env - self.n_envs = n_envs - self.env = process_ppo_env(env, self.seeder, num_envs=n_envs, copy_env=copy_env) - eval_env = eval_env or env - self.eval_env = process_env(eval_env, self.seeder, copy_env=copy_env) - - # hyperparameters - value_loss, lr_schedule = value_loss.lower(), lr_schedule.lower() - assert value_loss in self.__value_losses__, "value_loss must be in {}".format( - self.__value_losses__ - ) - assert lr_schedule in self.__lr_schedule___, "lr_schedule must be in {}".format( - self.__lr_schedule___ - ) - - self.n_steps = n_steps - self.batch_size = batch_size - self.gamma = gamma - self.k_epochs = k_epochs - self.clip_eps = clip_eps - self.target_kl = target_kl - self.normalize_advantages = normalize_advantages - self.gae_lambda = gae_lambda - self.entr_coef = entr_coef - self.vf_coef = vf_coef - self.value_loss = value_loss - self.max_grad_norm = max_grad_norm - self.learning_rate = learning_rate - self.lr_schedule = lr_schedule - self.optimizer_type = optimizer_type - self.n_eval_episodes = n_eval_episodes - self.eval_horizon = eval_horizon - self.eval_freq = eval_freq - self.kwargs = kwargs - - self.state_dim = self.env.observation_space.shape[0] - - # policy network - self.policy_net_kwargs = policy_net_kwargs or {} - if isinstance(policy_net_fn, str): - self.policy_net_fn = load(policy_net_fn) - elif policy_net_fn is None: - self.policy_net_fn = default_policy_net_fn - else: - self.policy_net_fn = policy_net_fn - - # value network - self.value_net_kwargs = value_net_kwargs or {} - if isinstance(value_net_fn, str): - self.value_net_fn = load(value_net_fn) - elif value_net_fn is None: - self.value_net_fn = default_value_net_fn - else: - self.value_net_fn = value_net_fn - - self.device = choose_device(device) - - self.optimizer_kwargs = { - "optimizer_type": optimizer_type, - "lr": learning_rate, - "eps": 1e-5, - } - - # check environment - # TODO: should we restrict this to Box? - # what about the action space? - assert isinstance(self.env.observation_space, spaces.Box) - - # initialize - self.policy_net = self.value_net = None - self.reset() - - @classmethod - def from_config(cls, **kwargs): - kwargs["policy_net_fn"] = eval(kwargs["policy_net_fn"]) - kwargs["value_net_fn"] = eval(kwargs["value_net_fn"]) - return cls(**kwargs) - - def reset(self, **kwargs): - """ - Reset the agent. 
- """ - self.total_timesteps = 0 - self.total_episodes = 0 - - # Initialize rollout buffer - self.memory = RolloutBuffer(self.rng, self.n_steps) - self.memory.setup_entry("observations", dtype=np.float32) - self.memory.setup_entry("actions", dtype=self.env.single_action_space.dtype) - self.memory.setup_entry("rewards", dtype=np.float32) - self.memory.setup_entry("dones", dtype=bool) - self.memory.setup_entry("logprobs", dtype=np.float32) - self.memory.setup_entry("infos", dtype=dict) - - # Initialize neural networks and optimizers - # TODO: using a single env to configure the networks is a hack that - # should be fixed when model factories are revised - env = self.env.envs[0] - self.policy_net = self.policy_net_fn(env, **self.policy_net_kwargs).to( - self.device - ) - self.value_net = self.value_net_fn(env, **self.value_net_kwargs).to(self.device) - self.optimizer = optimizer_factory( - list(self.policy_net.parameters()) + list(self.value_net.parameters()), - **self.optimizer_kwargs - ) - - def policy(self, observation): - assert self.policy_net is not None - obs = torch.from_numpy(observation).float().to(self.device) - action = self.policy_net(obs).sample() - return action.cpu().numpy() - - def fit(self, budget: int, lr_scheduler=None, **kwargs): - """ - Train the agent using the provided environment. - - Parameters - ---------- - budget: int - Total number of steps to be performed in the environment. Parameters - are updated every n_steps interactions with the environment. - lr_scheduler: callable - A function that takes the current step and returns the current learning - rate. If None, a default scheduler is used. - """ - del kwargs - - if lr_scheduler is None: - lr_scheduler = self._get_lr_scheduler(budget) - - if len(self.memory) == 0: - timesteps_counter = 0 - else: # it's not the first "fit" on this agent, so there is a previous buffer to continue - timesteps_counter = len(self.memory) * self.n_envs - - episode_returns = np.zeros(self.n_envs, dtype=np.float32) - episode_lengths = np.zeros(self.n_envs, dtype=np.int32) - - next_obs, infos = self.env.reset() - next_obs = torch.Tensor(next_obs).to( - self.device - ) # should always be a torch tensor - next_done = np.zeros(self.n_envs, dtype=bool) # initialize done to False - while timesteps_counter < budget: - obs = next_obs - done = next_done - - # select action and take step - with torch.no_grad(): - action, logprobs = self._select_action(obs) - next_obs, reward, next_terminated, next_truncated, info = self.env.step( - action - ) - next_done = np.logical_or(next_terminated, next_truncated) - next_obs = torch.Tensor(next_obs).to(self.device) - - # end of episode logging - for i in range(self.n_envs): - if next_done[i]: - self.total_episodes += 1 - if self.writer and "episode" in info["final_info"][i]: - if "episode" in info["final_info"][i]: - r, l = ( - info["final_info"][i]["episode"]["r"], - info["final_info"][i]["episode"]["l"], - ) - else: - r, l = episode_returns[i], episode_lengths[i] - self.writer.add_scalar( - "episode_returns", r, self.total_timesteps - ) - self.writer.add_scalar( - "episode_lengths", l, self.total_timesteps - ) - self.writer.add_scalar( - "total_episodes", self.total_episodes, self.total_timesteps - ) - episode_returns[i], episode_lengths[i] = 0.0, 0 - - # append data to memory and update variables - self.memory.append( - { - "observations": obs.cpu().numpy(), - "actions": action, - "rewards": reward, - "dones": done, - "infos": info, - "logprobs": logprobs, - } - ) - self.total_timesteps += self.n_envs - 
timesteps_counter += self.n_envs - episode_returns += reward - episode_lengths += 1 - - # evaluation - if ( - self.writer - and self.eval_freq is not None - and self.total_timesteps % self.eval_freq == 0 - ): - evaluation = self.eval( - eval_horizon=self.eval_horizon, - n_simulations=self.n_eval_episodes, - gamma=1.0, - ) - self.writer.add_scalar("evaluation", evaluation, self.total_timesteps) - - # update with collected experience - if timesteps_counter % (self.n_envs * self.n_steps) == 0: - if self.lr_schedule != "constant": - lr = lr_scheduler(self.total_timesteps) - self.optimizer.param_groups[0]["lr"] = lr - self._update(next_obs=next_obs, next_done=next_done) - - def _get_lr_scheduler(self, budget): - """ - Returns a learning rate schedule for the policy and value networks. - """ - if self.lr_schedule == "constant": - return lambda t: self.learning_rate - elif self.lr_schedule == "linear": - return lambda t: self.learning_rate * (1 - t / float(budget)) - - def _select_action(self, obs): - """ - Select an action given the current state using the policy network. - Also returns the log probability of the selected action. - - Parameters - ---------- - obs: torch.Tensor - Observation tensor of shape (batch_size, obs_dim) - - Returns - ------- - A tuple (action, log_prob). - """ - action_dist = self.policy_net(obs) - action = action_dist.sample() - action_logprob = action_dist.log_prob(action) - return action.cpu().numpy(), action_logprob.cpu().numpy() - - def _update(self, next_obs=None, next_done=None): - """ - Performs a PPO update based on the data in `self.memory`. - - Parameters - ---------- - next_obs: torch.Tensor or None - Next observation tensor of shape (n_envs, obs_dim). Used to - bootstrap the value function. If None, the value function is - bootstrapped with zeros. - next_done: np.ndarray or None - Array of shape (n_envs,) indicating whether the next observation - is terminal. If None, this function assumes that they are not - terminal. - - Notes - ----- - This function assumes that the data in `self.memory` is complete, - and it will clear the memory during the update. - """ - assert ( - int(next_obs is None) + int(next_done is None) - ) % 2 == 0, "'next_obs' and 'next_done' should be both None or not None at the same time." - - # get batch data - batch = self.memory.get() - self.memory.clear() - - # get shapes - n_steps, n_envs, *obs_shape = batch["observations"].shape - _, _, *action_shape = batch["actions"].shape - - # create tensors from batch data - def _to_tensor(x): - return torch.from_numpy(x).to(self.device).detach() - - b_obs = _to_tensor(batch["observations"]) - - # create buffers - b_values = torch.zeros( - (n_steps, n_envs), dtype=torch.float32, device=self.device - ) - b_advantages = torch.zeros_like(b_values) - b_returns = torch.zeros_like(b_values) - - # compute values - # note: some implementations compute the value when collecting the data - # and use those stale values for the update. This can be better - # in architectures with a shared encoder, because you avoid two - # forward passes through the encoder. However, we choose to compute - # the values here, because it is easier to implement and it has no - # impact on performance in most cases. 
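As the comment above notes, `_update` computes values on the fly and then turns rewards into lambda-returns; the advantages are those returns minus the value estimates, the same quantity as GAE(gamma, lambda). A time-major numpy sketch of that computation, mirroring the `lambda_returns` helper from `ppo_utils`:

import numpy as np

def lambda_returns(r_t, terminal_tp1, v_tp1, gamma, lambda_):
    # Time-major (T, n_envs) restatement of the ppo_utils helper: bootstrap
    # from v_tp1 and mix n-step returns with weight lambda_.
    returns = np.zeros_like(r_t)
    aux = v_tp1[-1]
    for i in reversed(range(r_t.shape[0])):
        aux = r_t[i] + gamma * (1.0 - terminal_tp1[i]) * ((1 - lambda_) * v_tp1[i] + lambda_ * aux)
        returns[i] = aux
    return returns

T, n_envs = 4, 2
rng = np.random.default_rng(0)
rewards = rng.normal(size=(T, n_envs)).astype(np.float32)
next_dones = np.zeros((T, n_envs), dtype=np.float32)           # no terminals here
values = rng.normal(size=(T, n_envs)).astype(np.float32)       # V(s_t)
next_values = rng.normal(size=(T, n_envs)).astype(np.float32)  # V(s_{t+1})

returns = lambda_returns(rewards, next_dones, next_values, gamma=0.99, lambda_=0.95)
advantages = returns - values  # GAE(gamma, lambda) advantages used in the policy loss
print(returns.shape, advantages.shape)  # (4, 2) (4, 2)
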
- with torch.no_grad(): - b_values = self.value_net(b_obs).squeeze(-1) - if next_obs is not None: - b_next_value = self.value_net(next_obs).squeeze(-1) - - # compute returns and advantages - # using numpy and numba for speedup - rewards = np.copy(batch["rewards"]) - - next_dones = np.zeros_like(batch["dones"]) - next_dones[:-1] = batch["dones"][1:] - if next_obs is not None: - next_dones[-1] = next_done - - values = b_values.cpu().numpy() - next_values = np.zeros_like(values) - next_values[:-1] = values[1:] - if next_obs is not None: - next_values[-1] = b_next_value.cpu().numpy() - - returns = lambda_returns( - rewards, next_dones, next_values, self.gamma, self.gae_lambda - ) - advantages = returns - values - - # convert to tensor - b_actions = _to_tensor(batch["actions"]) - b_logprobs = _to_tensor(batch["logprobs"]) - b_returns = _to_tensor(returns) - b_advantages = _to_tensor(advantages) - - # flatten the batch - b_obs = b_obs.view(n_steps * n_envs, *obs_shape) - b_actions = b_actions.view(n_steps * n_envs, *action_shape) - b_logprobs = b_logprobs.view(n_steps * n_envs, *action_shape) - b_values = b_values.view(n_steps * n_envs) - b_returns = b_returns.view(n_steps * n_envs) - b_advantages = b_advantages.view(n_steps * n_envs) - - # run minibatch updates - clipped = [] # whether the policy loss was clipped - b_indices = np.arange(n_steps * n_envs) - for epoch in range(self.k_epochs): - self.rng.shuffle(b_indices) - for start in range(0, n_steps * n_envs, self.batch_size): - end = min(start + self.batch_size, n_steps * n_envs) - mb_indices = b_indices[start:end] - - mb_obs = b_obs[mb_indices] - mb_actions = b_actions[mb_indices] - mb_old_logprobs = b_logprobs[mb_indices] - mb_returns = b_returns[mb_indices] - mb_advantages = b_advantages[mb_indices] - - # normalize advantages - if self.normalize_advantages: - mb_advantages = (mb_advantages - mb_advantages.mean()) / ( - mb_advantages.std() + 1e-8 - ) - - # forward pass to values and logprobs - action_dist = self.policy_net(mb_obs) - mb_values = self.value_net(mb_obs).squeeze(-1) - - mb_logprobs = action_dist.log_prob(mb_actions) - mb_entropy = action_dist.entropy() - if len(mb_logprobs.shape) > 1: - # in continuous action spaces, the distribution returns one - # value per action dim, so we sum over them - mb_logprobs = torch.sum(mb_logprobs, dim=-1) - mb_old_logprobs = torch.sum(mb_old_logprobs, dim=-1) - mb_entropy = torch.sum(mb_entropy, dim=-1) - mb_logratio = mb_logprobs - mb_old_logprobs - mb_ratio = torch.exp(mb_logratio) - - # compute approximated kl divergence and whether the policy loss - # was clipped - with torch.no_grad(): - approx_kl = torch.mean((mb_ratio - 1) - mb_logratio) - clipped.append( - (torch.abs(mb_ratio - 1.0) > self.clip_eps) - .float() - .mean() - .item() - ) - - # policy loss - pg_loss1 = -mb_advantages * mb_ratio - pg_loss2 = -mb_advantages * torch.clamp( - mb_ratio, 1 - self.clip_eps, 1 + self.clip_eps - ) - pg_loss = torch.mean(torch.max(pg_loss1, pg_loss2)) - - # value loss - if self.value_loss == "mse": - v_loss = 0.5 * torch.mean((mb_values - mb_returns) ** 2) - elif self.value_loss == "avec": - v_loss = torch.var(mb_returns - mb_values) - elif self.value_loss == "clipped": - mb_old_values = b_values[ - mb_indices - ] # these are stale after the first minibatch - mb_clipped_values = mb_old_values + torch.clamp( - mb_values - mb_old_values, -self.clip_eps, self.clip_eps - ) - - v_loss_unclipped = (mb_values - mb_returns) ** 2 - v_loss_clipped = (mb_clipped_values - mb_returns) ** 2 - v_loss = 0.5 * 
torch.mean( - torch.max(v_loss_unclipped, v_loss_clipped) - ) - - # entropy loss - entropy_loss = torch.mean(mb_entropy) - - # total loss - loss = pg_loss + self.vf_coef * v_loss - self.entr_coef * entropy_loss - - # optimize - self.optimizer.zero_grad() - loss.backward() - if self.max_grad_norm is not None: - nn.utils.clip_grad_norm_( - list(self.policy_net.parameters()) - + list(self.value_net.parameters()), - self.max_grad_norm, - ) - self.optimizer.step() - - if self.target_kl and approx_kl > self.target_kl: - break - - # compute explained variance - y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy() - var_y = np.var(y_true) - explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y - - # log metrics - # note: this approach only logs the last batch of the last - # epoch, which is not ideal. However, it is the way it is - # done in most implementations of PPO. - if self.writer: - self.writer.add_scalar( - "fit/policy_loss", - pg_loss.item(), - self.total_timesteps, - ) - self.writer.add_scalar( - "fit/value_loss", - v_loss.item(), - self.total_timesteps, - ) - self.writer.add_scalar( - "fit/entropy_loss", - entropy_loss.item(), - self.total_episodes, - ) - self.writer.add_scalar( - "fit/approx_kl", - approx_kl.item(), - self.total_episodes, - ) - self.writer.add_scalar( - "fit/clipfrac", - np.mean(clipped), - self.total_episodes, - ) - self.writer.add_scalar( - "fit/explained_variance", - explained_var, - self.total_episodes, - ) - self.writer.add_scalar( - "fit/learning_rate", - self.optimizer.param_groups[0]["lr"], - ) - - # - # For hyperparameter optimization - # - @classmethod - def sample_parameters(cls, trial): - batch_size = trial.suggest_categorical("batch_size", [1, 4, 8, 16, 32]) - gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.99]) - learning_rate = trial.suggest_float("learning_rate", 1e-5, 1, log=True) - entr_coef = trial.suggest_float("entr_coef", 1e-8, 0.1, log=True) - - clip_eps = trial.suggest_categorical("clip_eps", [0.1, 0.2, 0.3]) - - k_epochs = trial.suggest_categorical("k_epochs", [1, 5, 10, 20]) - - return { - "batch_size": batch_size, - "gamma": gamma, - "learning_rate": learning_rate, - "entr_coef": entr_coef, - "clip_eps": clip_eps, - "k_epochs": k_epochs, - } - - ##### Overwrite some inherited functions - - def save(self, filename): - """ - Overwrite the 'save' and 'load' functions to not store the env if it's a "vectorized env" (can't be managed with pickle) - - ----- documentation from original save ----- - - Save agent object. By default, the agent is pickled. - - If overridden, the load() method must also be overriden. - - Before saving, consider setting writer to None if it can't be pickled (tensorboard writers - keep references to files and cannot be pickled). - - Note: dill[1]_ is used when pickle fails - (see https://stackoverflow.com/a/25353243, for instance). - Pickle is tried first, since it is faster. - - Parameters - ---------- - filename: Path or str - File in which to save the Agent. - - Returns - ------- - pathlib.Path - If save() is successful, a Path object corresponding to the filename is returned. - Otherwise, None is returned. - .. warning:: The returned filename might differ from the input filename: For instance, - the method can append the correct suffix to the name before saving. - - References - ---------- - .. 
[1] https://github.com/uqfoundation/dill - """ - # remove writer if not pickleable - if not dill.pickles(self.writer): - self.set_writer(None) - # save - filename = Path(filename).with_suffix(".pickle") - filename.parent.mkdir(parents=True, exist_ok=True) - - dict_to_save = dict(self.__dict__) - del dict_to_save["env"] - del dict_to_save["eval_env"] - - try: - if not self.compress_pickle: - with filename.open("wb") as ff: - pickle.dump(dict_to_save, ff) - else: - with bz2.BZ2File(filename, "wb") as ff: - cPickle.dump(dict_to_save, ff) - except Exception: - try: - if not self.compress_pickle: - with filename.open("wb") as ff: - dill.dump(dict_to_save, ff) - else: - with bz2.BZ2File(filename, "wb") as ff: - dill.dump(dict_to_save, ff) - except Exception as ex: - logger.warning("Agent instance cannot be pickled: " + str(ex)) - return None - - return filename - - @classmethod - def load(cls, filename, **kwargs): - """ - Overwrite the 'save' and 'load' functions to not store the env if it's a "vectorized env" (can't be managed with pickle) - - ----- documentation from original load ----- - Load agent object. - If overridden, save() method must also be overriden. - - Parameters - ---------- - **kwargs: dict - Arguments to required by the __init__ method of the Agent subclass. - """ - filename = Path(filename).with_suffix(".pickle") - obj = cls(**kwargs) - - try: - if not obj.compress_pickle: - with filename.open("rb") as ff: - tmp_dict = pickle.load(ff) - else: - with bz2.BZ2File(filename, "rb") as ff: - tmp_dict = cPickle.load(ff) - except Exception: - if not obj.compress_pickle: - with filename.open("rb") as ff: - tmp_dict = dill.load(ff) - else: - with bz2.BZ2File(filename, "rb") as ff: - tmp_dict = dill.load(ff) - - temp_env = obj.__dict__["env"] - temp_eval_env = obj.__dict__["eval_env"] - - obj.__dict__.clear() - obj.__dict__.update(tmp_dict) - - obj.__dict__["env"] = temp_env - obj.__dict__["eval_env"] = temp_eval_env - - return obj diff --git a/rlberry/agents/torch/ppo/ppo_utils.py b/rlberry/agents/torch/ppo/ppo_utils.py deleted file mode 100644 index ec7f6df2f..000000000 --- a/rlberry/agents/torch/ppo/ppo_utils.py +++ /dev/null @@ -1,193 +0,0 @@ -import copy -import logging - -import gymnasium as gym -import numpy as np - -from rlberry.envs.utils import process_env -from rlberry.utils.jit_setup import numba_jit - - -logger = logging.getLogger(__name__) - - -def process_ppo_env(env, seeder, num_envs=1, asynchronous=False, copy_env=True): - """ - Process environment for PPO. It's the only agent that supports vectorized - environments. - - Parameters - ---------- - env : gym.Env - Environment to be processed. - seeder : rlberry.Seeder - Seeder object. - num_envs : int - Number of environments to be used. - asynchronous : bool - If True, the environments are run asynchronously. - - Returns - ------- - vec_env : gymnasium.vector.VectorEnv - Vectorized environment. - """ - vec_env_cls = ( - gym.vector.AsyncVectorEnv if asynchronous else gym.vector.SyncVectorEnv - ) - return vec_env_cls( - [lambda: process_env(env, seeder, copy_env=copy_env) for _ in range(num_envs)] - ) - - -@numba_jit -def lambda_returns(r_t, terminal_tp1, v_tp1, gamma, lambda_): - """ - Compute lambda returns. - - Parameters - ---------- - r_t: array - Array of shape (time_dim, batch_dim) containing the rewards. - terminal_tp1: array - Array of shape (time_dim, batch_dim) containing the discounts (0.0 if terminal state). 
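`process_ppo_env` builds one seeded copy of the environment per worker and hands the resulting thunks to gymnasium's vector API. The core pattern, with `CartPole-v1` as an illustrative environment:

import gymnasium as gym

num_envs = 4
vec_env = gym.vector.SyncVectorEnv(
    [lambda: gym.make("CartPole-v1") for _ in range(num_envs)]
)

obs, infos = vec_env.reset(seed=0)       # obs has shape (num_envs, obs_dim)
actions = vec_env.action_space.sample()  # one action per sub-environment
obs, rewards, terminated, truncated, infos = vec_env.step(actions)
print(obs.shape, rewards.shape)          # (4, 4) (4,)
vec_env.close()
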
- v_tp1: array - Array of shape (time_dim, batch_dim) containing the values at timestep t+1 - lambda_ : float in [0, 1] - Lambda-returns parameter. - """ - T = v_tp1.shape[0] - returns = np.zeros_like(r_t) - aux = v_tp1[-1].astype(np.float32) - for tt in range(T): - i = T - tt - 1 - returns[i] = r_t[i] + gamma * (1 - terminal_tp1[i]) * ( - (1 - lambda_) * v_tp1[i] + lambda_ * aux - ) - aux = returns[i] - return returns - - -class RolloutBuffer: - """ - Rollout buffer that allows sampling data with shape (batch_size, - num_trajectories, ...). - Parameters - ---------- - rng: numpy.random.Generator - Numpy random number generator. - See https://numpy.org/doc/stable/reference/random/generator.html - max_episode_steps: int, optional - Maximum length of an episode - """ - - def __init__(self, rng, num_rollout_steps): - self._rng = rng - self._num_rollout_steps = num_rollout_steps - self._curr_step = 0 - self._tags = [] - self._data = dict() - self._dtypes = dict() - - @property - def data(self): - """Dict containing all stored data.""" - return self._data - - @property - def tags(self): - """Tags identifying the entries in the replay buffer.""" - return self._tags - - @property - def dtypes(self): - """Dict containing the data types for each tag.""" - return self._dtypes - - @property - def num_rollout_steps(self): - """Number of steps to take in each environment per policy rollout.""" - return self._num_rollout_steps - - @property - def num_envs(self): - return self._num_envs - - def __len__(self): - return self._curr_step - - def full(self): - """Returns True if the buffer is full.""" - return len(self) == self.num_rollout_steps - - def clear(self): - """Clear data in replay.""" - self._curr_step = 0 - for tag in self._data: - self._data[tag] = None - - def setup_entry(self, tag, dtype): - """Configure replay buffer to store data. - Parameters - ---------- - tag : str - Tag that identifies the entry (e.g "observation", "reward") - dtype : obj - Data type of the entry (e.g. `np.float32`). Type is not - checked in :meth:`append`, but it is used to construct the numpy - arrays returned by the :meth:`sample`method. - """ - assert len(self) == 0, "Cannot setup entry on non-empty buffer." - if tag in self._data: - raise ValueError(f"Entry {tag} already added to replay buffer.") - self._tags.append(tag) - self._dtypes[tag] = dtype - self._data[tag] = None - - def append(self, data): - """ - Stores data from an environment step in the buffer. - - Parameters - ---------- - data : dict - Dictionary containing scalar values, whose keys must be in self.tags. - """ - assert set(data.keys()) == set(self.tags), "Data keys must be in self.tags" - assert len(self) < self.num_rollout_steps, "Buffer is full." - for tag in self.tags: - # - if self._data[tag] is None: - if isinstance(data[tag], np.ndarray): - # if data[tag].dtype != self._dtypes[tag]: - # logger.warning( - # f"Data type for tag {tag} is {data[tag].dtype}, " - # f"but it was configured as {self._dtypes[tag]}.") - shape = data[tag].shape - self._data[tag] = np.zeros( - (self.num_rollout_steps, *shape), dtype=self._dtypes[tag] - ) - elif isinstance(data[tag], float) or isinstance(data[tag], int): - self._data[tag] = np.zeros( - self.num_rollout_steps, dtype=self._dtypes[tag] - ) - else: - self._data[tag] = [None] * self.num_rollout_steps - self._data[tag][self._curr_step] = data[tag] - self._curr_step += 1 - - def get(self): - """ - Returns the collected data. 
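A usage sketch for the rollout-buffer interface above, assuming the `RolloutBuffer` class defined above is in scope: entries are declared with `setup_entry`, filled one step at a time with `append`, and read back as `(T, ...)` arrays with `get`.

import numpy as np

rng = np.random.default_rng(0)
buffer = RolloutBuffer(rng, 3)  # 3 rollout steps per update
buffer.setup_entry("observations", np.float32)
buffer.setup_entry("rewards", np.float32)

for step in range(3):
    buffer.append(
        {
            "observations": np.array([0.1 * step, -0.2 * step], dtype=np.float32),
            "rewards": float(step),
        }
    )

assert buffer.full()
batch = buffer.get()                # dict of arrays of shape (T, ...)
print(batch["observations"].shape)  # (3, 2)
print(batch["rewards"])             # [0. 1. 2.]
buffer.clear()                      # ready for the next rollout
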
If the appended data for a given tag is a - numpy array, the returned data will be a numpy array of shape: - - (T, *S), where T is the number of rollout steps, and S is the shape of - the data that was appended. - - Otherwise, the returned data will be a list of length T. - - Returns - ------- - Returns a dict with the collected data. - """ - return copy.deepcopy(self._data) diff --git a/rlberry/agents/torch/reinforce/__init__.py b/rlberry/agents/torch/reinforce/__init__.py deleted file mode 100644 index a0efecff6..000000000 --- a/rlberry/agents/torch/reinforce/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .reinforce import REINFORCEAgent diff --git a/rlberry/agents/torch/reinforce/reinforce.py b/rlberry/agents/torch/reinforce/reinforce.py deleted file mode 100644 index f9f0c2e2f..000000000 --- a/rlberry/agents/torch/reinforce/reinforce.py +++ /dev/null @@ -1,270 +0,0 @@ -import torch -import inspect -import numpy as np - -import gymnasium.spaces as spaces -from rlberry.agents import AgentWithSimplePolicy, AgentTorch -from rlberry.agents.utils.memories import Memory -from rlberry.agents.torch.utils.training import optimizer_factory -from rlberry.agents.torch.utils.models import default_policy_net_fn -from rlberry.utils.torch import choose_device - -import rlberry - -logger = rlberry.logger - - -class REINFORCEAgent(AgentTorch, AgentWithSimplePolicy): - """ - REINFORCE with entropy regularization. - - Parameters - ---------- - env : Model - Online model with continuous (Box) state space and discrete actions - batch_size : int, default: 8 - Number of episodes used for the update of the policy netowrk. - horizon : int, default: 256 - Episode length: one transition per episode steps. So total number of transitions used for one policy update is batch_size * horizon. - gamma : double - Discount factor in [0, 1]. - entr_coef : double - Entropy coefficient. - learning_rate : double - Learning rate. - normalize: bool - If True normalize rewards - optimizer_type: str - Type of optimizer. 'ADAM' by defaut. - policy_net_fn : function(env, **kwargs) - Function that returns an instance of a policy network (pytorch). - If None, a default net is used. - policy_net_kwargs : dict - kwargs for policy_net_fn - use_bonus_if_available : bool, default = False - If true, check if environment info has entry 'exploration_bonus' - and add it to the reward. See also UncertaintyEstimatorWrapper. - device: str - Device to put the tensors on - - Attributes - ---------- - device : str - Torch device on which the agent's neural networks are placed. - batch_size : int, default: 8 - Number of episodes used for the update of the policy netowrk. - horizon : int, default: 256 - Episode length: one transition per episode steps. - gamma : float, default: 0.99 - Discount factor used to discount future rewards in the Bellman equation. - state_dim : int - Dimensionality of the continuous state space of the environment. - action_dim : int - Number of discrete actions available in the environment. - policy_net_fn : function(env, **kwargs) - Function that returns an instance of a policy network (PyTorch). - policy_net_kwargs : dict - Keyword arguments for `policy_net_fn`. - optimizer_kwargs : dict - Keyword arguments for the optimizer used during neural network training. - policy_net : torch.nn.Module - The policy network used by the agent. - policy_optimizer : torch.optim.Optimizer - The optimizer used for training the policy network. - memory : Memory - The memory buffer used to store the agent's experiences. 
- episode : int - A counter that keeps track of the number of episodes. - - References - ---------- - Williams, Ronald J., - "Simple statistical gradient-following algorithms for connectionist - reinforcement learning." - ReinforcementLearning.Springer,Boston,MA,1992.5-3 - """ - - name = "REINFORCE" - - def __init__( - self, - env, - batch_size=8, - horizon=256, - gamma=0.99, - entr_coef=0.01, - learning_rate=0.0001, - normalize=True, - optimizer_type="ADAM", - policy_net_fn=None, - policy_net_kwargs=None, - use_bonus_if_available=False, - device="cuda:best", - **kwargs - ): - # For all parameters, define self.param = param - _, _, _, values = inspect.getargvalues(inspect.currentframe()) - values.pop("self") - for arg, val in values.items(): - setattr(self, arg, val) - - AgentWithSimplePolicy.__init__(self, env, **kwargs) - - self.device = choose_device(device) - - self.state_dim = self.env.observation_space.shape[0] - self.action_dim = self.env.action_space.n - - self.policy_net_kwargs = policy_net_kwargs or {} - - # - self.policy_net_fn = policy_net_fn or default_policy_net_fn - - self.optimizer_kwargs = {"optimizer_type": optimizer_type, "lr": learning_rate} - - # check environment - assert isinstance(self.env.observation_space, spaces.Box) - assert isinstance(self.env.action_space, spaces.Discrete) - - self.policy_net = None # policy network - - # initialize - self.reset() - - def reset(self, **kwargs): - self.policy_net = self.policy_net_fn(self.env, **self.policy_net_kwargs).to( - self.device - ) - - self.policy_optimizer = optimizer_factory( - self.policy_net.parameters(), **self.optimizer_kwargs - ) - - self.memory = Memory() - - self.episode = 0 - - def policy(self, observation): - state = observation - assert self.policy_net is not None - state = torch.from_numpy(state).float().to(self.device) - action_dist = self.policy_net(state) - action = action_dist.sample().item() - return action - - def fit(self, budget: int, **kwargs): - """ - Train the agent using the provided environment. - - Parameters - ---------- - budget: int - number of episodes. Each episode runs for self.horizon unless it - enconters a terminal state in which case it stops early. 
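The `_update` method below turns the stored rewards into Monte-Carlo returns by walking the batch backwards and resetting the running return at episode boundaries. A standalone restatement of that computation:

import numpy as np

def discounted_returns(rewards, is_terminals, gamma):
    # Walk the stored batch backwards, resetting the running return at each
    # episode boundary, as the _update method below does.
    returns = []
    g = 0.0
    for reward, terminal in zip(reversed(rewards), reversed(is_terminals)):
        if terminal:
            g = 0.0
        g = reward + gamma * g
        returns.insert(0, g)
    return np.array(returns, dtype=np.float32)

# Two episodes of lengths 2 and 1, stored back to back in the memory buffer.
rewards = [1.0, 1.0, 2.0]
is_terminals = [False, True, True]
print(discounted_returns(rewards, is_terminals, gamma=0.99))  # [1.99 1.   2.  ]
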
- """ - del kwargs - n_episodes_to_run = budget - count = 0 - while count < n_episodes_to_run: - self._run_episode() - count += 1 - - def _run_episode(self): - # interact for H steps - episode_rewards = 0 - observation, info = self.env.reset() - for _ in range(self.horizon): - # running policy - action = self.policy(observation) - next_observation, reward, terminated, truncated, info = self.env.step( - action - ) - done = terminated or truncated - - # check whether to use bonus - bonus = 0.0 - if self.use_bonus_if_available: - if info is not None and "exploration_bonus" in info: - bonus = info["exploration_bonus"] - - # save in batch - self.memory.states.append(observation) - self.memory.actions.append(action) - self.memory.rewards.append(reward + bonus) # add bonus here - self.memory.is_terminals.append(done) - episode_rewards += reward - - if done: - break - - # update observation - observation = next_observation - - # update - self.episode += 1 - - # - if self.writer is not None: - self.writer.add_scalar("episode_rewards", episode_rewards, self.episode) - - # - if self.episode % self.batch_size == 0: - self._update() - self.memory.clear_memory() - - return episode_rewards - - def _normalize(self, x): - return (x - x.mean()) / (x.std() + 1e-5) - - def _update(self): - # monte carlo estimate of rewards - rewards = [] - discounted_reward = 0 - for reward, is_terminal in zip( - reversed(self.memory.rewards), reversed(self.memory.is_terminals) - ): - if is_terminal: - discounted_reward = 0 - discounted_reward = reward + (self.gamma * discounted_reward) - rewards.insert(0, discounted_reward) - - # convert list to tensor - states = torch.FloatTensor(np.array(self.memory.states)).to(self.device) - actions = torch.LongTensor(self.memory.actions).to(self.device) - rewards = torch.FloatTensor(rewards).to(self.device) - if self.normalize: - rewards = self._normalize(rewards) - - # evaluate logprobs - action_dist = self.policy_net(states) - logprobs = action_dist.log_prob(actions) - dist_entropy = action_dist.entropy() - - # compute loss - loss = -logprobs * rewards - self.entr_coef * dist_entropy - - # take gradient step - self.policy_optimizer.zero_grad() - - loss.mean().backward() - - self.policy_optimizer.step() - - # - # For hyperparameter optimization - # - @classmethod - def sample_parameters(cls, trial): - batch_size = trial.suggest_categorical("batch_size", [1, 4, 8, 16, 32]) - gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.99]) - learning_rate = trial.suggest_float("learning_rate", 1e-5, 1, log=True) - - entr_coef = trial.suggest_float("entr_coef", 1e-8, 0.1, log=True) - - return { - "batch_size": batch_size, - "gamma": gamma, - "learning_rate": learning_rate, - "entr_coef": entr_coef, - } diff --git a/rlberry/agents/torch/sac/__init__.py b/rlberry/agents/torch/sac/__init__.py deleted file mode 100644 index 5a7ff1963..000000000 --- a/rlberry/agents/torch/sac/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .sac import SACAgent diff --git a/rlberry/agents/torch/sac/sac.py b/rlberry/agents/torch/sac/sac.py deleted file mode 100644 index 1828dc566..000000000 --- a/rlberry/agents/torch/sac/sac.py +++ /dev/null @@ -1,543 +0,0 @@ -import time - -import gymnasium.spaces as spaces -import numpy as np -import rlberry -import torch -import torch.nn as nn -import torch.optim as optim -from rlberry.agents import AgentTorch, AgentWithSimplePolicy -from rlberry.agents.torch.sac.sac_utils import default_policy_net_fn, default_q_net_fn -from rlberry.agents.torch.utils.training import 
optimizer_factory -from rlberry.agents.utils.replay import ReplayBuffer -from rlberry.utils.factory import load -from rlberry.utils.torch import choose_device - -logger = rlberry.logger - - -class SACAgent(AgentTorch, AgentWithSimplePolicy): - """ - Soft Actor Critic Agent. - - SAC, or SOFT Actor Critic, an offpolicy actor-critic deep RL algorithm - based on the maximum entropy reinforcement learning framework. In this - framework, the actor aims to maximize expected reward while also - maximizing entropy. - - Parameters - ---------- - env : Model - Online model with continuous (Box) state space and continuous actions - batch_size : int - Number of episodes to wait before updating the policy. - gamma : double - Discount factor in [0, 1]. - learning_rate : double - Learning rate. - buffer_capacity : int - Capacity of the replay buffer - optimizer_type: str - Type of optimizer. 'ADAM' by defaut. - tau : double - Target smoothing coefficient - policy frequency - Policy training frequency (Delayed TD3 update) - alpha - Entropy regularization coefficient - autotunealpha - Automatic tuning of alpha - learning start - Timesteps done before training starts - policy_net_fn : function(env, **kwargs) - Function that returns an instance of a policy network (pytorch). - If None, a default net is used. - policy_net_kwargs : dict - kwargs for policy_net_fn - q_net_constructor : Callable, str or None - Function/constructor that returns a torch module for the Q-network - q_net_kwargs : optional, dict - Parameters for q_net_constructor. - device : str - Device to put the tensors on - writer_frequency : int - Frequency of tensorboard logging - - References - ---------- - Haarnoja, Tuomas, et al. "Soft actor-critic algorithms and applications." - arXiv preprint arXiv:1812.05905 (2018). 
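The docstring above describes SAC informally; in standard notation (consistent with the Haarnoja et al. reference and with the `_update` method below), the maximum-entropy objective and the soft critic target are:

```latex
J(\pi) = \sum_{t} \mathbb{E}_{(s_t, a_t) \sim \rho_\pi}
         \Big[ r(s_t, a_t) + \alpha \, \mathcal{H}\big(\pi(\cdot \mid s_t)\big) \Big]

y = r + \gamma \, (1 - d) \,
    \Big( \min_{i \in \{1, 2\}} Q_{\theta_i'}(s', a') - \alpha \log \pi_\phi(a' \mid s') \Big),
\qquad a' \sim \pi_\phi(\cdot \mid s')
```

Here d is the done flag and the primed parameters are the target critics; this matches `next_q_value = rewards + (1 - dones) * gamma * (min(Q1', Q2') - alpha * log_pi)` in `_update`.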
- """ - - name = "SAC" - - def __init__( - self, - env, - batch_size=256, - gamma=0.99, - q_learning_rate=1e-3, - policy_learning_rate=3e-4, - buffer_capacity: int = int(1e6), - optimizer_type="ADAM", - tau=0.005, - policy_frequency=2, - alpha=0.2, - autotune_alpha=True, - learning_start=5e3, - policy_net_fn=None, - policy_net_kwargs=None, - q_net_constructor=None, - q_net_kwargs=None, - writer_frequency=100, - device="cuda:best", - **kwargs - ): - AgentWithSimplePolicy.__init__(self, env, **kwargs) - - # check environment - assert isinstance(self.env.observation_space, spaces.Box) - assert isinstance(self.env.action_space, spaces.Box) - - # Setup cuda device - self.device = choose_device(device) - - # Hyperparameters - self.batch_size = batch_size - self.gamma = gamma - self.q_learning_rate = q_learning_rate - self.policy_learning_rate = policy_learning_rate - self.buffer_capacity = buffer_capacity - self.learning_start = learning_start - self.policy_frequency = policy_frequency - self.tau = tau - self.optimizer_type = optimizer_type - - # Setup Actor - self.policy_net_kwargs = policy_net_kwargs or {} - self.policy_net_fn = policy_net_fn or default_policy_net_fn - self.policy_optimizer_kwargs = { - "optimizer_type": self.optimizer_type, - "lr": policy_learning_rate, - } - - # Setup Q networks and their targets - if isinstance(q_net_constructor, str): - q_net_constructor = load(q_net_constructor) - elif q_net_constructor is None: - q_net_constructor = default_q_net_fn - else: - q_net_constructor = q_net_constructor - q_net_kwargs = q_net_kwargs or {} - self.q_net_kwargs = q_net_kwargs - self.q_net_constructor = q_net_constructor - self.q1 = q_net_constructor(self.env, **q_net_kwargs).to(self.device) - self.q2 = q_net_constructor(self.env, **q_net_kwargs).to(self.device) - self.q1_target = q_net_constructor(self.env, **q_net_kwargs).to(self.device) - self.q2_target = q_net_constructor(self.env, **q_net_kwargs).to(self.device) - self.q_optimizer_kwargs = { - "optimizer_type": self.optimizer_type, - "lr": q_learning_rate, - } - - # Setup tensorboard writer - self.writer_frequency = writer_frequency - - # Setup Actor action scaling - self.action_scale = torch.tensor( - (self.env.action_space.high - self.env.action_space.low) / 2.0, - dtype=torch.float32, - ).to(self.device) - self.action_bias = torch.tensor( - (self.env.action_space.high + self.env.action_space.low) / 2.0, - dtype=torch.float32, - ).to(self.device) - - # Autotune alpha or use a fixed default value - self.autotune_alpha = autotune_alpha - if not self.autotune_alpha: - self.alpha = alpha - - # initialize - self.reset() - - def reset(self, **kwargs): - """ - Reset the agent. - This function resets the agent by initializing the necessary components and parameters for training. 
- """ - - # Initialize the rollout buffer - self.memory = ReplayBuffer(max_replay_size=self.buffer_capacity, rng=self.rng) - self.memory.setup_entry("states", dtype=np.float32) - self.memory.setup_entry("next_states", dtype=np.float32) - self.memory.setup_entry("actions", dtype=np.float32) - self.memory.setup_entry("rewards", dtype=np.float32) - self.memory.setup_entry("dones", dtype=np.float32) - - # Intialize the Actor - self.cont_policy = self.policy_net_fn(self.env, **self.policy_net_kwargs).to( - self.device - ) - self.policy_optimizer = optimizer_factory( - self.cont_policy.parameters(), **self.policy_optimizer_kwargs - ) - self.cont_policy.load_state_dict(self.cont_policy.state_dict()) - - # Intialize the Q networks and their targets - self.q1 = self.q_net_constructor(self.env, **self.q_net_kwargs) - self.q2 = self.q_net_constructor(self.env, **self.q_net_kwargs) - self.q1_target = self.q_net_constructor(self.env, **self.q_net_kwargs) - self.q2_target = self.q_net_constructor(self.env, **self.q_net_kwargs) - self.q1.to(self.device) - self.q2.to(self.device) - self.q1_target.to(self.device) - self.q2_target.to(self.device) - self.q1_optimizer = optimizer_factory( - self.q1.parameters(), **self.q_optimizer_kwargs - ) - self.q2_optimizer = optimizer_factory( - self.q2.parameters(), **self.q_optimizer_kwargs - ) - self.q1_target_optimizer = optimizer_factory( - self.q1.parameters(), **self.q_optimizer_kwargs - ) - self.q2_target_optimizer = optimizer_factory( - self.q2.parameters(), **self.q_optimizer_kwargs - ) - # Define the loss - self.mse_loss = nn.MSELoss() - - # Automatic entropy tuning - if self.autotune_alpha: - self.target_entropy = -torch.prod( - torch.Tensor(self.env.action_space.shape).to(self.device) - ).item() - self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) - self.alpha = self.log_alpha.exp().item() - self.a_optimizer = optim.Adam([self.log_alpha], lr=self.q_learning_rate) - - # initialize episode, steps and time counters - self.total_episodes = 0 - self.total_timesteps = 0 - self.time = time.time() - - def policy(self, state): - assert self.cont_policy is not None - state = np.array([state]) - state = torch.FloatTensor(state).to(self.device) - - # Get the mean and log standard deviation of the action distribution from the policy network - action_dist = self.cont_policy(state) - mean, log_std = action_dist - - # Compute the standard deviation and - # create a normal distribution with the computed mean and standard deviation - std = log_std.exp() - action_dist = torch.distributions.Normal(mean, std) - - # Sample an action using the reparameterization trick - x_t = action_dist.rsample() - y_t = torch.tanh(x_t) - - # Apply scaling and bias to the action - action = y_t * self.action_scale + self.action_bias - return action.detach().cpu().numpy()[0] - - def fit(self, budget: int, **kwargs): - """ - Train the agent using the provided environment. - - Parameters - ---------- - budget: int - number of episodes. Each episode runs for self.horizon unless it - enconters a terminal state in which case it stops early. 
- """ - - # Intialize environment and get first observation - state, _ = self.env.reset() - - while self.total_timesteps < budget: - # Select action - if self.total_timesteps < self.learning_start: - # In order to improve exploration, before "learning_start" - # actions are sampled from a uniform random distribution over valid actions - action = np.array(self.env.action_space.sample()) - else: - # SAC action selection - tensor_state = np.array([state]) - action, _ = self._select_action(tensor_state) - action = action.detach().cpu().numpy()[0] - - # Step through the environment - next_state, reward, next_terminated, next_truncated, info = self.env.step( - action - ) - done = np.logical_or(next_terminated, next_truncated) - - # End of episode logging - if "episode" in info.keys(): - self.writer.add_scalar( - "episode/episode_rewards", - info["episode"]["r"], - self.total_timesteps, - ) - self.writer.add_scalar( - "episode/episode_length", info["episode"]["l"], self.total_timesteps - ) - - # Add experience to replay buffer - self.memory.append( - { - "states": state, - "next_states": next_state, - "actions": action, - "rewards": reward, - "dones": done, - } - ) - - # Update current state - state = next_state - - # Reset the environment if episode is over - if done: - state, _ = self.env.reset() - self.memory.end_episode() - - # Learning starts when there are enough samples in replay buffer - if self.total_timesteps > self.learning_start: - self._update() - - self.total_timesteps += 1 - - def _select_action(self, state): - """ - Select an action to take based on the current state. - - This function selects an action to take based on the current state. - The action is sampled from a squashed Gaussian distribution defined by the policy network. - - Parameters - ---------- - state: numpy.ndarray or torch.Tensor - The current state of the environment - - Returns - ------- - action torch.Tensor - The selected action - log_prob torch.Tensor - The log probability of the selected action - """ - - # Convert the state to a torch.Tensor if it's not already - state = torch.FloatTensor(state).to(self.device) - - # Get the mean and log standard deviation of the action distribution from the policy network - action_dist = self.cont_policy(state) - mean, log_std = action_dist - - # Compute the standard deviation and - # create a normal distribution with the computed mean and standard deviation - std = log_std.exp() - action_dist = torch.distributions.Normal(mean, std) - - # Sample an action using the reparameterization trick - x_t = action_dist.rsample() - y_t = torch.tanh(x_t) - - # Apply scaling and bias to the action - # and compute the log probability of the selected action - action = y_t * self.action_scale + self.action_bias - log_prob = action_dist.log_prob(x_t) - - # Enforce Action Bound - log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + 1e-6) - log_prob = log_prob.sum(1, keepdim=True) - return action, log_prob - - def _update(self): - """ - Perform an update step for the SAC agent. - - It updates the Q-networks and the policy network based on the collected - experiences from the replay buffer. 
- """ - - # Sample a batch from replay buffer - memory_data = self.memory.sample(self.batch_size, 1).data - states = ( - torch.tensor(memory_data["states"]) - .view(self.batch_size, -1) - .to(self.device) - ) - next_state = ( - torch.tensor(memory_data["next_states"]) - .view(self.batch_size, -1) - .to(self.device) - ) - actions = ( - torch.tensor(memory_data["actions"]) - .view(self.batch_size, -1) - .to(self.device) - ) - rewards = ( - torch.tensor(memory_data["rewards"]) - .view(self.batch_size, -1) - .to(self.device) - ) - dones = ( - torch.tensor(memory_data["dones"]).view(self.batch_size, -1).to(self.device) - ) - - with torch.no_grad(): - # Select action using the current policy - next_state_actions, next_state_log_pi = self._select_action( - next_state.detach().cpu().numpy() - ) - # Compute the next state's Q-values - q1_next_target = self.q1_target( - torch.cat([next_state, next_state_actions], dim=-1) - ) - q2_next_target = self.q2_target( - torch.cat([next_state, next_state_actions], dim=-1) - ) - # Compute Q targets: - # - Compute the minimum Q-values between Q1 and Q2 - # - Entropy regularization term is subtracted from the Q-values - # This term encourages exploration by penalizing overly certain or deterministic actions. - min_q_next_target = ( - torch.min(q1_next_target, q2_next_target) - - self.alpha * next_state_log_pi - ) - # Compute the target Q-values using the Bellman equation with entropy regularization - next_q_value = rewards.flatten() + (1 - dones.flatten()) * self.gamma * ( - min_q_next_target - ).view(-1) - - # Compute Q loss - q1_v = self.q1(torch.cat([states, actions], dim=-1)) - q2_v = self.q2(torch.cat([states, actions], dim=-1)) - q1_loss_v = self.mse_loss(q1_v.squeeze(), next_q_value) - q2_loss_v = self.mse_loss(q2_v.squeeze(), next_q_value) - q_loss_v = q1_loss_v + q2_loss_v - - # Update Q networks - self.q1_optimizer.zero_grad() - self.q2_optimizer.zero_grad() - q_loss_v.backward() - self.q1_optimizer.step() - self.q2_optimizer.step() - - act_loss = None - alpha_loss = None - state_log_pi = None - # TD3 Delayed update of the policy network - if self.total_timesteps % self.policy_frequency == 0: - # Compensate for the delay by doing more than one update - for _ in range(self.policy_frequency): - # Select action using the current policy - state_action, state_log_pi = self._select_action( - states.detach().cpu().numpy() - ) - # Compute the next state's Q-values - q_out_v1 = self.q1(torch.cat([states, state_action], dim=-1)) - q_out_v2 = self.q2(torch.cat([states, state_action], dim=-1)) - # Select the minimum Q to reduce over estimation and improve stability - q_out_v = torch.min(q_out_v1, q_out_v2) - # Compute policy loss: - # - Maximize the expected return of the policy : improves action selection - # - Maximize the entropy of the policy : improves exploration - # Alpha is used to balance the trade-off between exploration and exploitation - act_loss = ((self.alpha * state_log_pi) - q_out_v).mean() - - # Update policy network - self.policy_optimizer.zero_grad() - act_loss.backward() - self.policy_optimizer.step() - - # Update alpha if autotuning is enabled - if self.autotune_alpha: - with torch.no_grad(): - state_action, state_log_pi = self._select_action( - states.detach().cpu().numpy() - ) - alpha_loss = ( - -self.log_alpha * (state_log_pi + self.target_entropy) - ).mean() - - self.a_optimizer.zero_grad() - alpha_loss.backward() - self.a_optimizer.step() - self.alpha = self.log_alpha.exp().item() - - # Target Q networks update by polyak averaging - for 
param, target_param in zip( - self.q1.parameters(), self.q1_target.parameters() - ): - target_param.data.copy_( - self.tau * param.data + (1 - self.tau) * target_param.data - ) - for param, target_param in zip( - self.q2.parameters(), self.q2_target.parameters() - ): - target_param.data.copy_( - self.tau * param.data + (1 - self.tau) * target_param.data - ) - - # Log metrics - if ( - self.writer is not None - and self.total_timesteps % self.writer_frequency == 0 - ): - self.writer.add_scalar( - "fit/loss_q1", float(q1_loss_v.detach()), self.total_timesteps - ) - self.writer.add_scalar( - "fit/loss_q2", float(q2_loss_v.detach()), self.total_timesteps - ) - self.writer.add_scalar( - "fit/value_q1", float(q1_v.mean().detach()), self.total_timesteps - ) - self.writer.add_scalar( - "fit/value_q2", float(q2_v.mean().detach()), self.total_timesteps - ) - if act_loss: - self.writer.add_scalar( - "fit/loss_act", float(act_loss.detach()), self.total_timesteps - ) - self.writer.add_scalar( - "fit/alpha", float(self.alpha), self.total_timesteps - ) - self.writer.add_scalar( - "episode/SPS", - int(self.total_timesteps / (time.time() - self.time)), - self.total_timesteps, - ) - if self.autotune_alpha and alpha_loss: - self.writer.add_scalar( - "fit/alpha_loss", float(alpha_loss.detach()), self.total_timesteps - ) - - # - # For hyperparameter optimization - # - @classmethod - def sample_parameters(cls, trial): - batch_size = trial.suggest_categorical("batch_size", [128, 256, 512, 1024]) - gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.99]) - q_learning_rate = trial.suggest_loguniform("q_learning_rate", 1e-5, 1) - policy_learning_rate = trial.suggest_loguniform( - "policy_learning_rate", 1e-6, 1e-1 - ) - policy_frequency = trial.suggest_categorical("policy_frequency", [1, 2, 3, 5]) - - return { - "batch_size": batch_size, - "gamma": gamma, - "q_learning_rate": q_learning_rate, - "policy_learning_rate": policy_learning_rate, - "policy_frequency": policy_frequency, - } diff --git a/rlberry/agents/torch/sac/sac_utils.py b/rlberry/agents/torch/sac/sac_utils.py deleted file mode 100644 index f644dfdf1..000000000 --- a/rlberry/agents/torch/sac/sac_utils.py +++ /dev/null @@ -1,38 +0,0 @@ -from rlberry.agents.torch.utils.training import model_factory - - -def default_q_net_fn(env, **kwargs): - """ - Returns a default Q value network. - """ - model_config = { - "type": "MultiLayerPerceptron", - "layer_sizes": (256, 256), - "reshape": True, - "in_size": env.observation_space.shape[0] + env.action_space.shape[0], - "out_size": 1, - } - if kwargs: - for k, v in kwargs.items(): - model_config[k] = v - return model_factory(**model_config) - - -def default_policy_net_fn(env, **kwargs): - """ - Returns a default Q value network. 
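The two loops above implement the soft (Polyak) target update theta_target <- tau * theta + (1 - tau) * theta_target. A minimal standalone sketch of the same update on a toy pair of networks (not part of the patch):

```python
# Illustrative sketch of the Polyak target update used for q1/q2 above.
import torch.nn as nn


def soft_update(net: nn.Module, target: nn.Module, tau: float = 0.005) -> None:
    for param, target_param in zip(net.parameters(), target.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)


q = nn.Linear(4, 1)
q_target = nn.Linear(4, 1)
q_target.load_state_dict(q.state_dict())  # targets start as exact copies
soft_update(q, q_target, tau=0.005)
```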
- """ - model_config = { - "type": "MultiLayerPerceptron", - "in_size": env.observation_space.shape[0], - "layer_sizes": [256, 256], - "out_size": env.action_space.shape[0], - "reshape": True, - "is_policy": True, - "ctns_actions": True, - "squashed_policy": True, - } - if kwargs: - for k, v in kwargs.items(): - model_config[k] = v - return model_factory(**model_config) diff --git a/rlberry/agents/torch/tests/__init__.py b/rlberry/agents/torch/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/agents/torch/tests/test_a2c.py b/rlberry/agents/torch/tests/test_a2c.py deleted file mode 100644 index 057649705..000000000 --- a/rlberry/agents/torch/tests/test_a2c.py +++ /dev/null @@ -1,122 +0,0 @@ -from rlberry.envs import Wrapper -from rlberry.agents.torch import A2CAgent -from rlberry.manager import ExperimentManager, evaluate_agents -from rlberry.envs.benchmarks.ball_exploration import PBall2D -from gymnasium import make - - -def test_a2c(): - env = "CartPole-v1" - mdp = make(env) - env_ctor = Wrapper - env_kwargs = dict(env=mdp) - - a2crlberry_stats = ExperimentManager( - A2CAgent, - (env_ctor, env_kwargs), - fit_budget=int(100), - eval_kwargs=dict(eval_horizon=2), - init_kwargs=dict(batch_size=100), - n_fit=1, - agent_name="A2C_rlberry_" + env, - ) - - a2crlberry_stats.fit() - - output = evaluate_agents([a2crlberry_stats], n_simulations=2, plot=False) - a2crlberry_stats.clear_output_dir() - env = "Pendulum-v1" - mdp = make(env) - env_ctor = Wrapper - env_kwargs = dict(env=mdp) - - a2crlberry_stats = ExperimentManager( - A2CAgent, - (env_ctor, env_kwargs), - fit_budget=int(100), - eval_kwargs=dict(eval_horizon=2), - init_kwargs=dict(batch_size=100), - n_fit=1, - agent_name="A2C_rlberry_" + env, - ) - - a2crlberry_stats.fit() - - output = evaluate_agents([a2crlberry_stats], n_simulations=2, plot=False) - a2crlberry_stats.clear_output_dir() - - env = "Acrobot-v1" - mdp = make(env) - env_ctor = Wrapper - env_kwargs = dict(env=mdp) - - a2crlberry_stats = ExperimentManager( - A2CAgent, - (env_ctor, env_kwargs), - fit_budget=int(100), - eval_kwargs=dict(eval_horizon=2), - init_kwargs=dict(batch_size=100), - n_fit=1, - agent_name="A2C_rlberry_" + env, - ) - - a2crlberry_stats.fit() - - output = evaluate_agents([a2crlberry_stats], n_simulations=2, plot=False) - a2crlberry_stats.clear_output_dir() - - env_ctor = PBall2D - env_kwargs = dict() - - a2crlberry_stats = ExperimentManager( - A2CAgent, - (env_ctor, env_kwargs), - fit_budget=int(100), - eval_kwargs=dict(eval_horizon=2), - init_kwargs=dict(batch_size=100), - n_fit=1, - agent_name="A2C_rlberry_" + "PBall2D", - ) - - a2crlberry_stats.fit() - - output = evaluate_agents([a2crlberry_stats], n_simulations=2, plot=False) - a2crlberry_stats.clear_output_dir() - - # test also non default - env = "CartPole-v1" - mdp = make(env) - env_ctor = Wrapper - env_kwargs = dict(env=mdp) - - a2crlberry_stats = ExperimentManager( - A2CAgent, - (env_ctor, env_kwargs), - fit_budget=int(100), - eval_kwargs=dict(eval_horizon=2), - init_kwargs=dict( - batch_size=100, - policy_net_fn="rlberry.agents.torch.utils.training.model_factory_from_env", - policy_net_kwargs=dict( - type="MultiLayerPerceptron", - layer_sizes=(256,), - reshape=False, - is_policy=True, - ), - value_net_fn="rlberry.agents.torch.utils.training.model_factory_from_env", - value_net_kwargs=dict( - type="MultiLayerPerceptron", - layer_sizes=[ - 512, - ], - reshape=False, - out_size=1, - ), - ), - n_fit=1, - agent_name="A2C_rlberry_" + env, - ) - a2crlberry_stats.fit() 
- - output = evaluate_agents([a2crlberry_stats], n_simulations=2, plot=False) - a2crlberry_stats.clear_output_dir() diff --git a/rlberry/agents/torch/tests/test_dqn.py b/rlberry/agents/torch/tests/test_dqn.py deleted file mode 100644 index 5b6848fb7..000000000 --- a/rlberry/agents/torch/tests/test_dqn.py +++ /dev/null @@ -1,138 +0,0 @@ -import pytest -from rlberry.envs import gym_make -from rlberry.agents.torch.dqn import DQNAgent -from rlberry.agents.torch.utils.training import model_factory -from rlberry.manager import ExperimentManager -import os -import pathlib - -import tempfile - - -@pytest.mark.parametrize( - "use_double_dqn, use_prioritized_replay", [(False, False), (True, True)] -) -def test_dqn_agent(use_double_dqn, use_prioritized_replay): - env = gym_make("CartPole-v1") - agent = DQNAgent( - env, - learning_starts=5, - eval_interval=75, - train_interval=2, - gradient_steps=-1, - use_double_dqn=use_double_dqn, - use_prioritized_replay=use_prioritized_replay, - ) - agent.fit(budget=500) - - model_configs = { - "type": "MultiLayerPerceptron", - "layer_sizes": (5, 5), - "reshape": False, - } - - def mlp(env, **kwargs): - """ - Returns a default Q value network. - """ - kwargs["in_size"] = env.observation_space.shape[0] - kwargs["out_size"] = env.action_space.n - return model_factory(**kwargs) - - new_agent = DQNAgent( - env, q_net_constructor=mlp, q_net_kwargs=model_configs, learning_starts=100 - ) - new_agent.fit(budget=2000) - - -def test_dqn_classic_env(): - env = gym_make("CartPole-v1") - agent = DQNAgent( - env, - learning_starts=5, - eval_interval=75, - train_interval=2, - gradient_steps=-1, - use_double_dqn=True, - use_prioritized_replay=True, - ) - agent.fit(budget=200) - - with tempfile.TemporaryDirectory() as tmpdirname: - saving_path = tmpdirname + "/agent_test_dqn_classic_env.pickle" - - # test the save function - agent.save(saving_path) - assert os.path.exists(saving_path) - - # test the loading function - test_load_env = gym_make("CartPole-v1") - loaded_agent = DQNAgent.load(saving_path, **dict(env=test_load_env)) - assert loaded_agent - - # test the agent - observation, info = test_load_env.reset() - for tt in range(100): - action = loaded_agent.policy(observation) - next_observation, reward, terminated, truncated, info = test_load_env.step( - action - ) - done = terminated or truncated - if done: - next_observation, info = test_load_env.reset() - observation = next_observation - - -def test_dqn_experiment_manager_classic_env(): - # saving_path = "rlberry/agents/torch/tests/agentmanager_test_dqn_classic_env" - - with tempfile.TemporaryDirectory() as tmpdirname: - saving_path = tmpdirname + "/agentmanager_test_dqn_classic_env" - - test_experiment_manager = ExperimentManager( - DQNAgent, # The Agent class. - ( - gym_make, - dict( - id="CartPole-v1", - ), - ), # The Environment to solve. - init_kwargs=dict( # Where to put the agent's hyperparameters - learning_starts=5, - eval_interval=75, - train_interval=2, - gradient_steps=-1, - use_double_dqn=True, - use_prioritized_replay=True, - chunk_size=1, - ), - fit_budget=200, # The number of interactions between the agent and the environment during training. - eval_kwargs=dict( - eval_horizon=50 - ), # The number of interactions between the agent and the environment during evaluations. - n_fit=1, # The number of agents to train. Usually, it is good to do more than 1 because the training is stochastic. - agent_name="test_dqn_classic_env", # The agent's name. 
- output_dir=saving_path, - ) - - test_experiment_manager.fit(budget=200) - - # test the save function - test_experiment_manager.save() - assert os.path.exists(saving_path) - - # test the loading function - test_load_env = gym_make("CartPole-v1") - path_to_load = next(pathlib.Path(saving_path).glob("**/*.pickle")) - loaded_experiment_manager = ExperimentManager.load(path_to_load) - assert loaded_experiment_manager - - # test the agent - state, info = test_load_env.reset() - for tt in range(50): - action = loaded_experiment_manager.get_agent_instances()[0].policy(state) - next_s, _, terminated, truncated, test = test_load_env.step(action) - done = terminated or truncated - if done: - break - state = next_s diff --git a/rlberry/agents/torch/tests/test_factory.py b/rlberry/agents/torch/tests/test_factory.py deleted file mode 100644 index f1dddb92b..000000000 --- a/rlberry/agents/torch/tests/test_factory.py +++ /dev/null @@ -1,23 +0,0 @@ -import pytest -from rlberry.agents.torch.utils.training import model_factory - - -@pytest.mark.parametrize( - "ntype", - [ - "MultiLayerPerceptron", - "ConvolutionalNetwork", - "DuelingNetwork", - "Table", - ], -) -def test_dqn_agent(ntype): - if ntype == "MultiLayerPerceptron": - nkwargs = {"in_size": 5, "layer_sizes": [5, 5]} - elif ntype == "ConvolutionalNetwork": - nkwargs = dict(in_channels=10, in_height=20, in_width=30, out_size=15) - elif ntype == "DuelingNetwork": - nkwargs = {"in_size": 5, "out_size": 3} - elif ntype == "Table": - nkwargs = dict(state_size=5, action_size=3) - network = model_factory(ntype, **nkwargs) diff --git a/rlberry/agents/torch/tests/test_mdqn.py b/rlberry/agents/torch/tests/test_mdqn.py deleted file mode 100644 index b327b8599..000000000 --- a/rlberry/agents/torch/tests/test_mdqn.py +++ /dev/null @@ -1,40 +0,0 @@ -import pytest -from rlberry.envs import gym_make -from rlberry.agents.torch.dqn import MunchausenDQNAgent -from rlberry.agents.torch.utils.training import model_factory - - -@pytest.mark.parametrize("use_prioritized_replay", [(False), (True)]) -def test_mdqn_agent(use_prioritized_replay): - env = gym_make("CartPole-v1") - agent = MunchausenDQNAgent( - env, - learning_starts=5, - batch_size=5, - eval_interval=2, - train_interval=2, - gradient_steps=-1, - use_prioritized_replay=use_prioritized_replay, - ) - agent.fit(budget=50) - - model_configs = { - "type": "MultiLayerPerceptron", - "layer_sizes": (5, 5), - "reshape": False, - } - - def mlp(env, **kwargs): - """ - Returns a default Q value network. 
- """ - kwargs["in_size"] = env.observation_space.shape[0] - kwargs["out_size"] = env.action_space.n - return model_factory(**kwargs) - - new_agent = MunchausenDQNAgent( - env, q_net_constructor=mlp, q_net_kwargs=model_configs, learning_starts=100 - ) - new_agent.fit(budget=200) - observation, info = env.reset() - new_agent.policy(observation) diff --git a/rlberry/agents/torch/tests/test_ppo.py b/rlberry/agents/torch/tests/test_ppo.py deleted file mode 100644 index ed31465cb..000000000 --- a/rlberry/agents/torch/tests/test_ppo.py +++ /dev/null @@ -1,201 +0,0 @@ -# from rlberry.envs import gym_make -# from rlberry.agents.torch.ppo import PPOAgent - - -# env = (gym_make, dict(id="Acrobot-v1")) -# # env = gym_make(id="Acrobot-v1") -# ppo = PPOAgent(env) -# ppo.fit(4096) - -import pytest -from rlberry.envs import Wrapper -from rlberry.agents.torch import PPOAgent -from rlberry.manager import ExperimentManager, evaluate_agents -from rlberry.envs.benchmarks.ball_exploration import PBall2D -from gymnasium import make -from rlberry.agents.torch.utils.training import model_factory_from_env -import sys - - -@pytest.mark.timeout(300) -@pytest.mark.xfail(sys.platform == "win32", reason="bug with windows???") -def test_ppo(): - env = "CartPole-v1" - mdp = make(env) - env_ctor = Wrapper - env_kwargs = dict(env=mdp) - - pporlberry_stats = ExperimentManager( - PPOAgent, - (env_ctor, env_kwargs), - fit_budget=int(132), - eval_kwargs=dict(eval_horizon=2), - init_kwargs=dict(batch_size=24, n_steps=96, device="cpu"), - n_fit=1, - agent_name="PPO_rlberry_" + env, - ) - - pporlberry_stats.fit() - - output = evaluate_agents([pporlberry_stats], n_simulations=2, plot=False) - pporlberry_stats.clear_output_dir() - - env = "Pendulum-v1" - mdp = make(env) - env_ctor = Wrapper - env_kwargs = dict(env=mdp) - - pporlberry_stats = ExperimentManager( - PPOAgent, - (env_ctor, env_kwargs), - fit_budget=int(132), - eval_kwargs=dict(eval_horizon=2), - init_kwargs=dict(batch_size=24, n_steps=96, device="cpu"), - n_fit=1, - agent_name="PPO_rlberry_" + env, - ) - - pporlberry_stats.fit() - - output = evaluate_agents([pporlberry_stats], n_simulations=2, plot=False) - pporlberry_stats.clear_output_dir() - - env = "Acrobot-v1" - mdp = make(env) - env_ctor = Wrapper - env_kwargs = dict(env=mdp) - - pporlberry_stats = ExperimentManager( - PPOAgent, - (env_ctor, env_kwargs), - fit_budget=int(132), - eval_kwargs=dict(eval_horizon=2), - init_kwargs=dict(batch_size=24, n_steps=96, device="cpu"), - n_fit=1, - agent_name="PPO_rlberry_" + env, - ) - - pporlberry_stats.fit() - - output = evaluate_agents([pporlberry_stats], n_simulations=2, plot=False) - pporlberry_stats.clear_output_dir() - - env_ctor = PBall2D - env_kwargs = dict() - pporlberry_stats = ExperimentManager( - PPOAgent, - (env_ctor, env_kwargs), - fit_budget=int(132), - eval_kwargs=dict(eval_horizon=2), - init_kwargs=dict(batch_size=24, n_steps=96, device="cpu"), - n_fit=1, - agent_name="PPO_rlberry_" + "PBall2D", - ) - - pporlberry_stats.fit() - - output = evaluate_agents([pporlberry_stats], n_simulations=2, plot=False) - pporlberry_stats.clear_output_dir() - - # test also non default - env = "CartPole-v1" - mdp = make(env) - env_ctor = Wrapper - env_kwargs = dict(env=mdp) - - pporlberry_stats = ExperimentManager( - PPOAgent, - (env_ctor, env_kwargs), - fit_budget=int(132), - eval_kwargs=dict(eval_horizon=2), - init_kwargs=dict( - batch_size=24, - n_steps=96, - policy_net_fn="rlberry.agents.torch.utils.training.model_factory_from_env", - policy_net_kwargs=dict( - 
type="MultiLayerPerceptron", - layer_sizes=(256,), - reshape=False, - is_policy=True, - ), - value_net_fn="rlberry.agents.torch.utils.training.model_factory_from_env", - value_net_kwargs=dict( - type="MultiLayerPerceptron", - layer_sizes=[ - 512, - ], - reshape=False, - out_size=1, - ), - ), - n_fit=1, - agent_name="PPO_rlberry_" + env, - ) - pporlberry_stats.fit() - - pporlberry_stats = ExperimentManager( - PPOAgent, - (env_ctor, env_kwargs), - fit_budget=int(132), - eval_kwargs=dict(eval_horizon=2), - init_kwargs=dict( - batch_size=24, - n_steps=96, - policy_net_fn=model_factory_from_env, - policy_net_kwargs=dict( - type="MultiLayerPerceptron", - layer_sizes=(256,), - reshape=False, - is_policy=True, - ), - value_net_fn=model_factory_from_env, - value_net_kwargs=dict( - type="MultiLayerPerceptron", - layer_sizes=[ - 512, - ], - reshape=False, - out_size=1, - ), - ), - n_fit=1, - agent_name="PPO_rlberry_" + env, - ) - pporlberry_stats.fit() - - output = evaluate_agents([pporlberry_stats], n_simulations=2, plot=False) - pporlberry_stats.clear_output_dir() - - env_ctor = PBall2D - env_kwargs = dict() - pporlberry_stats = ExperimentManager( - PPOAgent, - (env_ctor, env_kwargs), - fit_budget=int(132), - eval_kwargs=dict(eval_horizon=2), - init_kwargs=dict(batch_size=24, n_steps=96, device="cpu"), - n_fit=1, - agent_name="PPO_rlberry_" + "PBall2D", - ) - - pporlberry_stats.fit() - - output = evaluate_agents([pporlberry_stats], n_simulations=2, plot=False) - pporlberry_stats.clear_output_dir() - - pporlberry_stats = ExperimentManager( - PPOAgent, - (env_ctor, env_kwargs), - fit_budget=int(132), - eval_kwargs=dict(eval_horizon=2), - init_kwargs=dict( - batch_size=24, n_steps=96, normalize_advantages=True, device="cpu" - ), - n_fit=1, - agent_name="PPO_rlberry_" + "PBall2D", - ) - - pporlberry_stats.fit() - - output = evaluate_agents([pporlberry_stats], n_simulations=2, plot=False) - pporlberry_stats.clear_output_dir() diff --git a/rlberry/agents/torch/tests/test_reinforce.py b/rlberry/agents/torch/tests/test_reinforce.py deleted file mode 100644 index 5df650288..000000000 --- a/rlberry/agents/torch/tests/test_reinforce.py +++ /dev/null @@ -1,49 +0,0 @@ -from rlberry.agents.torch import REINFORCEAgent -from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env -from rlberry.exploration_tools.discrete_counter import DiscreteCounter -from rlberry.wrappers.uncertainty_estimator_wrapper import UncertaintyEstimatorWrapper - - -def test_reinforce_agent(): - _env = get_benchmark_env(level=1) - n_episodes = 50 - horizon = 30 - - # - def uncertainty_estimator_fn(observation_space, action_space): - counter = DiscreteCounter(observation_space, action_space, n_bins_obs=20) - return counter - - env = UncertaintyEstimatorWrapper( - _env, uncertainty_estimator_fn, bonus_scale_factor=1.0 - ) - # - agent = REINFORCEAgent( - env, - horizon=horizon, - gamma=0.99, - learning_rate=0.001, - use_bonus_if_available=True, - ) - agent.fit(budget=n_episodes) - agent.policy(env.observation_space.sample()) - - -def test_reinforce_agent_partial_fit(): - env = get_benchmark_env(level=1) - n_episodes = 10 - horizon = 30 - - agent = REINFORCEAgent( - env, - horizon=horizon, - gamma=0.99, - learning_rate=0.001, - use_bonus_if_available=False, - ) - agent.fit(budget=n_episodes // 2) - agent.policy(env.observation_space.sample()) - assert agent.episode == 5 - agent.fit(budget=n_episodes // 2) - assert agent.episode == 10 - agent.policy(env.observation_space.sample()) diff --git 
a/rlberry/agents/torch/tests/test_sac.py b/rlberry/agents/torch/tests/test_sac.py deleted file mode 100644 index db5f5067f..000000000 --- a/rlberry/agents/torch/tests/test_sac.py +++ /dev/null @@ -1,68 +0,0 @@ -import sys - -import pytest -from gymnasium import make -from rlberry.agents.torch.sac import SACAgent -from rlberry.envs import Wrapper -from rlberry.manager import AgentManager, evaluate_agents - - -@pytest.mark.timeout(300) -@pytest.mark.xfail(sys.platform == "win32", reason="bug with windows???") -def test_sac(): - env = "Pendulum-v1" - mdp = make(env) - env_ctor = Wrapper - env_kwargs = dict(env=mdp) - - sacrlberry_stats = AgentManager( - SACAgent, - (env_ctor, env_kwargs), - fit_budget=int(132), - eval_kwargs=dict(eval_horizon=2), - init_kwargs=dict(batch_size=24, device="cpu"), - n_fit=1, - agent_name="SAC_rlberry_" + env, - ) - - sacrlberry_stats.fit() - - output = evaluate_agents([sacrlberry_stats], n_simulations=2, plot=False) - sacrlberry_stats.clear_output_dir() - - # test also non default - env = "Pendulum-v1" - mdp = make(env) - env_ctor = Wrapper - env_kwargs = dict(env=mdp) - - sacrlberry_stats = AgentManager( - SACAgent, - (env_ctor, env_kwargs), - fit_budget=int(1024), - eval_kwargs=dict(eval_horizon=2), - init_kwargs=dict( - learning_start=int(512), - autotune_alpha=False, - batch_size=24, - policy_net_kwargs=dict( - type="MultiLayerPerceptron", - layer_sizes=(256,), - reshape=False, - is_policy=True, - ), - q_net_kwargs=dict( - type="MultiLayerPerceptron", - layer_sizes=[ - 512, - ], - reshape=False, - out_size=1, - ), - ), - n_fit=1, - agent_name="SAC_rlberry_" + env, - ) - sacrlberry_stats.fit() - output = evaluate_agents([sacrlberry_stats], n_simulations=2, plot=False) - sacrlberry_stats.clear_output_dir() diff --git a/rlberry/agents/torch/tests/test_torch_atari.py b/rlberry/agents/torch/tests/test_torch_atari.py deleted file mode 100644 index bb7629c78..000000000 --- a/rlberry/agents/torch/tests/test_torch_atari.py +++ /dev/null @@ -1,287 +0,0 @@ -from rlberry.manager import ExperimentManager -from rlberry.agents.torch.dqn.dqn import DQNAgent -from rlberry.envs.gym_make import atari_make - -from rlberry.agents.torch import PPOAgent -from rlberry.agents.torch.utils.training import model_factory_from_env -import pathlib -import numpy as np -import pytest -import os - -import tempfile - - -def test_forward_dqn(): - mlp_configs = { - "type": "MultiLayerPerceptron", # A network architecture - "layer_sizes": [32], # Network dimensions - "reshape": False, - "is_policy": False, # The network should output a distribution - # over actions - } - - cnn_configs = { - "type": "ConvolutionalNetwork", # A network architecture - "activation": "RELU", - "in_channels": 4, - "in_height": 84, - "in_width": 84, - "head_mlp_kwargs": mlp_configs, - "transpose_obs": False, - "is_policy": False, # The network should output a distribution - } - - tuned_agent = ExperimentManager( - DQNAgent, # The Agent class. - ( - atari_make, - # uncomment when rlberry will manage vectorized env - # dict(id="ALE/Breakout-v5", n_envs=3), - dict(id="ALE/Breakout-v5", n_envs=1), - ), # The Environment to solve. - init_kwargs=dict( # Where to put the agent's hyperparameters - q_net_constructor="rlberry.agents.torch.utils.training.model_factory_from_env", - q_net_kwargs=cnn_configs, - max_replay_size=100, - batch_size=32, - learning_starts=100, - gradient_steps=1, - epsilon_final=0.01, - learning_rate=1e-4, # Size of the policy gradient descent steps. 
- chunk_size=5, - ), - fit_budget=200, # The number of interactions between the agent and the environment during training. - eval_kwargs=dict( - eval_horizon=10 - ), # The number of interactions between the agent and the environment during evaluations. - n_fit=1, # The number of agents to train. Usually, it is good to do more than 1 because the training is stochastic. - agent_name="DQN_test", # The agent's name. - ) - - tuned_agent.fit() - - -def test_forward_empty_input_dim(): - mlp_configs = { - "type": "MultiLayerPerceptron", # A network architecture - "layer_sizes": [32], # Network dimensions - "reshape": False, - "is_policy": False, # The network should output a distribution - # over actions - } - - cnn_configs = { - "type": "ConvolutionalNetwork", # A network architecture - "activation": "RELU", - "head_mlp_kwargs": mlp_configs, - "transpose_obs": False, - "is_policy": False, # The network should output a distribution - } - - tuned_agent = ExperimentManager( - DQNAgent, # The Agent class. - ( - atari_make, - # uncomment when rlberry will manage vectorized env - # dict(id="ALE/Breakout-v5", n_envs=3), - dict(id="ALE/Breakout-v5", n_envs=1), - ), # The Environment to solve. - init_kwargs=dict( # Where to put the agent's hyperparameters - q_net_constructor="rlberry.agents.torch.utils.training.model_factory_from_env", - q_net_kwargs=cnn_configs, - max_replay_size=100, - batch_size=32, - learning_starts=100, - gradient_steps=1, - epsilon_final=0.01, - learning_rate=1e-4, # Size of the policy gradient descent steps. - chunk_size=5, - ), - fit_budget=10, # The number of interactions between the agent and the environment during training. - eval_kwargs=dict( - eval_horizon=10 - ), # The number of interactions between the agent and the environment during evaluations. - n_fit=1, # The number of agents to train. Usually, it is good to do more than 1 because the training is stochastic. - agent_name="DQN_test", # The agent's name. - ) - - tuned_agent.fit() - - -@pytest.mark.parametrize("num_envs", [1, 3]) -def test_ppo_vectorized_atari_env(num_envs): - policy_mlp_configs = { - "type": "MultiLayerPerceptron", # A network architecture - "layer_sizes": [32], # Network dimensions - "reshape": False, - "is_policy": True, # The network should output a distribution - # over actions - } - - critic_mlp_configs = { - "type": "MultiLayerPerceptron", - "layer_sizes": [32], - "reshape": False, - "out_size": 1, # The critic network is an approximator of - # a value function V: States -> |R - } - - policy_configs = { - "type": "ConvolutionalNetwork", # A network architecture - "activation": "RELU", - "in_channels": 4, - "in_height": 84, - "in_width": 84, - "head_mlp_kwargs": policy_mlp_configs, - "transpose_obs": False, - "is_policy": True, # The network should output a distribution - } - - critic_configs = { - "type": "ConvolutionalNetwork", - "layer_sizes": "RELU", - "in_channels": 4, - "in_height": 84, - "in_width": 84, - "head_mlp_kwargs": critic_mlp_configs, - "transpose_obs": False, - "out_size": 1, - } - - agent = PPOAgent( - ( - atari_make, - dict(id="ALE/Freeway-v5"), - ), - optimizer_type="ADAM", # What optimizer to use for policy gradient descent steps. - learning_rate=1e-4, # Size of the policy gradient descent steps. - policy_net_fn=model_factory_from_env, # A policy network constructor - policy_net_kwargs=policy_configs, # Policy network's architecure - value_net_fn=model_factory_from_env, # A Critic network constructor - value_net_kwargs=critic_configs, # Critic network's architecure. 
- n_envs=num_envs, - n_steps=64, - batch_size=128, - # **dict(eval_env=(atari_make,dict(id="ALE/Freeway-v5",n_envs=1))) - ) - agent.fit(budget=500) - - with tempfile.TemporaryDirectory() as tmpdirname: - saving_path = tmpdirname + "/agent_test_ppo_vect_env.pickle" - - # test the save function - agent.save(saving_path) - assert os.path.exists(saving_path) - - # test the loading function - test_load_env = atari_make("ALE/Freeway-v5") - test_load_env.reset() - loaded_agent = PPOAgent.load( - saving_path, **dict(env=test_load_env), copy_env=False - ) - assert loaded_agent - - # test the agent - observation, info = test_load_env.reset() - for tt in range(100): - action = loaded_agent.policy(observation) - next_observation, reward, terminated, truncated, info = test_load_env.step( - action - ) - done = terminated or truncated - if done: - next_observation, info = test_load_env.reset() - observation = next_observation - - -@pytest.mark.parametrize("num_envs", [1, 3]) -def test_ppo_experiment_manager_vectorized_atari_env(num_envs): - with tempfile.TemporaryDirectory() as tmpdirname: - saving_path = tmpdirname + "/agentmanager_test_ppo_vectorized_env" - - policy_mlp_configs = { - "type": "MultiLayerPerceptron", # A network architecture - "layer_sizes": [32], # Network dimensions - "reshape": False, - "is_policy": True, # The network should output a distribution - # over actions - } - - critic_mlp_configs = { - "type": "MultiLayerPerceptron", - "layer_sizes": [32], - "reshape": False, - "out_size": 1, # The critic network is an approximator of - # a value function V: States -> |R - } - - policy_configs = { - "type": "ConvolutionalNetwork", # A network architecture - "activation": "RELU", - "in_channels": 4, - "in_height": 84, - "in_width": 84, - "head_mlp_kwargs": policy_mlp_configs, - "transpose_obs": False, - "is_policy": True, # The network should output a distribution - } - - critic_configs = { - "type": "ConvolutionalNetwork", - "layer_sizes": "RELU", - "in_channels": 4, - "in_height": 84, - "in_width": 84, - "head_mlp_kwargs": critic_mlp_configs, - "transpose_obs": False, - "out_size": 1, - } - - test_experiment_manager = ExperimentManager( - PPOAgent, # The Agent class. - ( - atari_make, - dict(id="ALE/Atlantis-v5"), - ), # The Environment to solve. - init_kwargs=dict( # Where to put the agent's hyperparameters - optimizer_type="ADAM", # What optimizer to use for policy gradient descent steps. - learning_rate=1e-4, # Size of the policy gradient descent steps. - policy_net_fn=model_factory_from_env, # A policy network constructor - policy_net_kwargs=policy_configs, # Policy network's architecure - value_net_fn=model_factory_from_env, # A Critic network constructor - value_net_kwargs=critic_configs, # Critic network's architecure. - n_envs=num_envs, - n_steps=64, - batch_size=128, - ), - fit_budget=200, # The number of interactions between the agent and the environment during training. - eval_kwargs=dict( - eval_horizon=50 - ), # The number of interactions between the agent and the environment during evaluations. - n_fit=1, # The number of agents to train. Usually, it is good to do more than 1 because the training is stochastic. - agent_name="test_ppo_vectorized_env", # The agent's name. 
- output_dir=saving_path, - # eval_env = (atari_make,dict(id="ALE/Atlantis-v5",n_envs=1)) - ) - test_experiment_manager.fit(budget=500) - - # test the save function - test_experiment_manager.save() - assert os.path.exists(saving_path) - - # test the loading function - test_load_env = atari_make("ALE/Atlantis-v5") - test_load_env.reset() - path_to_load = next(pathlib.Path(saving_path).glob("**/*.pickle")) - loaded_experiment_manager = ExperimentManager.load(path_to_load) - assert loaded_experiment_manager - - # test the agent - obs, infos = test_load_env.reset() - for tt in range(50): - actions = loaded_experiment_manager.get_agent_instances()[0].policy(obs) - obs, reward, terminated, truncated, info = test_load_env.step(actions) - done = np.logical_or(terminated, truncated) - if done: - break diff --git a/rlberry/agents/torch/tests/test_torch_models.py b/rlberry/agents/torch/tests/test_torch_models.py deleted file mode 100644 index 9bc692294..000000000 --- a/rlberry/agents/torch/tests/test_torch_models.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -TODO: Test attention modules -""" - -import torch -from rlberry.agents.torch.utils.models import MultiLayerPerceptron -from rlberry.agents.torch.utils.models import ConvolutionalNetwork, DuelingNetwork - - -def test_mlp(): - model = MultiLayerPerceptron( - in_size=5, layer_sizes=[10, 10, 10], out_size=10, reshape=False - ) - x = torch.rand(1, 5) - y = model.forward(x) - assert y.shape[1] == 10 - - -def test_mlp_policy(): - model = MultiLayerPerceptron( - in_size=5, layer_sizes=[10, 10, 10], out_size=10, reshape=False, is_policy=True - ) - x = torch.rand(1, 5) - scores = model.action_scores(x) - assert scores.shape[1] == 10 - - -def test_cnn(): - model = ConvolutionalNetwork(in_channels=10, in_height=20, in_width=30, out_size=15) - x = torch.rand(1, 10, 20, 30) - y = model.forward(x) - assert y.shape[1] == 15 - - -def test_dueling_network(): - model = DuelingNetwork(in_size=10, out_size=15) - x = torch.rand(1, 10) - y = model.forward(x) - - -def test_cnn_policy(): - model = ConvolutionalNetwork( - in_channels=10, in_height=20, in_width=30, out_size=15, is_policy=True - ) - x = torch.rand(1, 10, 20, 30) - scores = model.action_scores(x) - assert scores.shape[1] == 15 diff --git a/rlberry/agents/torch/tests/test_torch_training.py b/rlberry/agents/torch/tests/test_torch_training.py deleted file mode 100644 index fe5fb722c..000000000 --- a/rlberry/agents/torch/tests/test_torch_training.py +++ /dev/null @@ -1,32 +0,0 @@ -import torch -from rlberry.agents.torch.utils.training import loss_function_factory, optimizer_factory -from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env -from rlberry.agents.torch.utils.models import default_policy_net_fn - -# loss_function_factory -assert isinstance(loss_function_factory("l2"), torch.nn.MSELoss) -assert isinstance(loss_function_factory("l1"), torch.nn.L1Loss) -assert isinstance(loss_function_factory("smooth_l1"), torch.nn.SmoothL1Loss) -assert isinstance(loss_function_factory("bce"), torch.nn.BCELoss) - -# optimizer_factory -env = get_benchmark_env(level=1) -assert ( - optimizer_factory(default_policy_net_fn(env).parameters(), "ADAM").defaults["lr"] - == 0.001 -) -assert optimizer_factory(default_policy_net_fn(env).parameters(), "ADAM").defaults[ - "betas" -] == (0.9, 0.999) -assert ( - optimizer_factory(default_policy_net_fn(env).parameters(), "RMS_PROP").defaults[ - "lr" - ] - == 0.01 -) -assert ( - optimizer_factory(default_policy_net_fn(env).parameters(), "RMS_PROP").defaults[ - "alpha" - ] 
- == 0.99 -) diff --git a/rlberry/agents/torch/utils/__init__.py b/rlberry/agents/torch/utils/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/agents/torch/utils/models.py b/rlberry/agents/torch/utils/models.py deleted file mode 100644 index 709493995..000000000 --- a/rlberry/agents/torch/utils/models.py +++ /dev/null @@ -1,534 +0,0 @@ -# -# Simple MLP and CNN models -# -from functools import partial - - -from gymnasium import spaces -from gymnasium.vector.sync_vector_env import SyncVectorEnv -from gymnasium.vector.async_vector_env import AsyncVectorEnv -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.distributions import Categorical, Normal - -from rlberry.agents.torch.utils.training import model_factory, activation_factory - - -def default_twinq_net_fn(env): - """ - Returns a default Twinq network - """ - assert isinstance(env.action_space, spaces.Discrete) - if isinstance(env.observation_space, spaces.Box): - obs_shape = env.observation_space.shape - elif isinstance(env.observation_space, spaces.Tuple): - obs_shape = env.observation_space.spaces[0].shape - else: - raise ValueError( - "Incompatible observation space: {}".format(env.observation_space) - ) - # Assume CHW observation space - - if len(obs_shape) == 1: - model_config = { - "type": "MultiLayerPerceptron", - "in_size": int(obs_shape[0]) + int(env.action_space.n), - "layer_sizes": [64, 64], - } - else: - raise ValueError( - "Incompatible observation shape: {}".format(env.observation_space.shape) - ) - - model_config["out_size"] = 1 - - q1 = model_factory(**model_config) - q2 = model_factory(**model_config) - - return (q1, q2) - - -def default_policy_net_fn(env): - """ - Returns a default policy network. 
- """ - - # remove potential wrappers - while type(env) in [SyncVectorEnv, AsyncVectorEnv]: - env = env.envs[0] - - if isinstance(env.observation_space, spaces.Box): - obs_shape = env.observation_space.shape - elif isinstance(env.observation_space, spaces.Tuple): - obs_shape = env.observation_space.spaces[0].shape - else: - raise ValueError( - "Incompatible observation space: {}".format(env.observation_space) - ) - - if len(obs_shape) == 3: - if obs_shape[0] < obs_shape[1] and obs_shape[0] < obs_shape[2]: - # Assume CHW observation space - model_config = { - "type": "ConvolutionalNetwork", - "is_policy": True, - "in_channels": int(obs_shape[0]), - "in_height": int(obs_shape[1]), - "in_width": int(obs_shape[2]), - } - elif obs_shape[2] < obs_shape[0] and obs_shape[2] < obs_shape[1]: - # Assume WHC observation space - model_config = { - "type": "ConvolutionalNetwork", - "is_policy": True, - "transpose_obs": True, - "in_channels": int(obs_shape[2]), - "in_height": int(obs_shape[1]), - "in_width": int(obs_shape[0]), - } - elif len(obs_shape) == 2: - model_config = { - "type": "ConvolutionalNetwork", - "is_policy": True, - "in_channels": int(1), - "in_height": int(obs_shape[0]), - "in_width": int(obs_shape[1]), - } - elif len(obs_shape) == 1: - model_config = { - "type": "MultiLayerPerceptron", - "in_size": int(obs_shape[0]), - "layer_sizes": [64, 64], - "reshape": False, - "is_policy": True, - } - else: - raise ValueError( - "Incompatible observation shape: {}".format(env.observation_space.shape) - ) - - if isinstance(env.action_space, spaces.Discrete): - model_config["out_size"] = env.action_space.n - model_config["ctns_actions"] = False - elif isinstance(env.action_space, spaces.Tuple): - model_config["out_size"] = env.action_space.spaces[0].n - model_config["ctns_actions"] = False - elif isinstance(env.action_space, spaces.Box): - model_config["out_size"] = env.action_space.shape[0] - model_config["ctns_actions"] = True - - return model_factory(**model_config) - - -def default_value_net_fn(env): - """ - Returns a default value network. 
- """ - - # remove potential wrappers - while type(env) in [SyncVectorEnv, AsyncVectorEnv]: - env = env.envs[0] - - if isinstance(env.observation_space, spaces.Box): - obs_shape = env.observation_space.shape - elif isinstance(env.observation_space, spaces.Tuple): - obs_shape = env.observation_space.spaces[0].shape - else: - raise ValueError( - "Incompatible observation space: {}".format(env.observation_space) - ) - # Assume CHW observation space - if len(obs_shape) == 3: - model_config = { - "type": "ConvolutionalNetwork", - "in_channels": int(obs_shape[0]), - "in_height": int(obs_shape[1]), - "in_width": int(obs_shape[2]), - } - elif len(obs_shape) == 2: - model_config = { - "type": "ConvolutionalNetwork", - "in_channels": int(1), - "in_height": int(obs_shape[0]), - "in_width": int(obs_shape[1]), - } - elif len(obs_shape) == 1: - model_config = { - "type": "MultiLayerPerceptron", - "in_size": int(obs_shape[0]), - "layer_sizes": [64, 64], - } - else: - raise ValueError( - "Incompatible observation shape: {}".format(env.observation_space.shape) - ) - - model_config["out_size"] = 1 - - return model_factory(**model_config) - - -class Net(nn.Module): - def __init__(self, obs_size, hidden_size, n_actions): - super(Net, self).__init__() - self.net = nn.Sequential( - nn.Linear(obs_size, hidden_size), - nn.ReLU(), - nn.Linear(hidden_size, n_actions), - ) - - def forward(self, x): - return self.net(x) - - -class BaseModule(torch.nn.Module): - """ - Base torch.nn.Module implementing basic features: - - initialization factory - - normalization parameters - """ - - def __init__(self, activation_type="RELU", reset_type="xavier"): - super().__init__() - self.activation = activation_factory(activation_type) - self.reset_type = reset_type - - def _init_weights(self, m, param=None, put_bias_to_zero=False): - if hasattr(m, "weight"): - if self.reset_type == "xavier": - torch.nn.init.xavier_uniform_(m.weight.data) - elif self.reset_type == "zeros": - torch.nn.init.constant_(m.weight.data, 0.0) - elif self.reset_type == "orthogonal": - torch.nn.init.orthogonal_(m.weight.data, gain=param) - else: - raise ValueError("Unknown reset type") - if put_bias_to_zero: - if hasattr(m, "bias") and m.bias is not None: - torch.nn.init.constant_(m.bias.data, 0.0) - - def reset(self): - self.apply(self._init_weights) - - -class Table(torch.nn.Module): - """Torch module for a policy for discrete state-action spaces. - - Parameters - ---------- - state_size: int - Number of states - action_size: int - Number of actions - """ - - def __init__(self, state_size, action_size): - super().__init__() - self.policy = nn.Embedding.from_pretrained( - torch.zeros(state_size, action_size), freeze=False - ) - self.softmax = nn.Softmax(dim=-1) - - def forward(self, x): - action_probs = self.softmax(self.action_scores(x)) - return Categorical(action_probs) - - def action_scores(self, x): - return self.policy(x.long()) - - -class MultiLayerPerceptron(BaseModule): - """Torch module for an MLP. - - Parameters - ---------- - in_size: int - Input size - layer_sizes: Sequence[int] - Dimensions of each hidden layer. - reshape: bool, default = True - If True, input tensors are reshaped to (batch_size, dim) - out_size: int, optional - Output size. If None, the output size is given by the last - element of layer_sizes. - activation: {"RELU", "TANH", "ELU"} - Activation function. - is_policy: bool, default=False - If true, the :meth:`forward` method returns a distribution over the - output. 
- ctns_actions: bool, default=False - If true, the :meth:`forward` method returns a normal distribution - corresponding to the output. Otherwise, a categorical distribution - is returned. - std0: float, default=1.0 - Initial standard deviation for the normal distribution. Only used - if ctns_actions and is_policy are True. - reset_type: {"xavier", "orthogonal", "zeros"}, default="orthogonal" - Type of weight initialization. - pred_init_scale: float, default="auto" - Scale of the initial weights of the output layer. If "auto", the - scale is set to 0.01 for policy networks and 1.0 otherwise. - """ - - def __init__( - self, - in_size=None, - layer_sizes=None, - reshape=False, - out_size=None, - activation="RELU", - is_policy=False, - ctns_actions=False, - std0=1.0, - reset_type="orthogonal", - pred_init_scale="auto", - squashed_policy=False, - **kwargs - ): - super().__init__(reset_type=reset_type, **kwargs) - - self.reshape = reshape - self.layer_sizes = layer_sizes or [64, 64] - self.layer_sizes = list(self.layer_sizes) - self.out_size = out_size - self.activation = activation_factory(activation) - self.is_policy = is_policy - self.ctns_actions = ctns_actions - self.std0 = std0 - self.squashed_policy = squashed_policy - - # Set pred_init_scale - if pred_init_scale == "auto": - self.pred_init_scale = 0.01 if is_policy else 1.0 - else: - self.pred_init_scale = pred_init_scale - - # Instantiate parameters - sizes = [in_size] + self.layer_sizes - self.layers = nn.ModuleList( - [nn.Linear(sizes[i], sizes[i + 1]) for i in range(len(sizes) - 1)] - ) - if out_size: - if squashed_policy: - self.fc_mean = nn.Linear(256, out_size) - self.fc_logstd = nn.Linear(256, out_size) - if ctns_actions: - self.logstd = nn.Parameter(np.log(std0) * torch.ones(out_size)) - self.predict = nn.Linear(sizes[-1], out_size) - - # Initialize parameters - self.reset() - - def reset(self): - self.apply(partial(self._init_weights, param=np.log(2))) - if self.out_size: - if self.ctns_actions: - self.logstd.data.fill_(np.log(self.std0)) - self.apply( - partial(self._init_weights, param=np.log(2), put_bias_to_zero=True) - ) - self._init_weights(self.predict, param=self.pred_init_scale) - - def forward(self, x): - if self.reshape: - x = x.reshape(x.shape[0], -1) # We expect a batch of vectors - for layer in self.layers: - x = self.activation(layer(x.float())) - if self.squashed_policy: - mean = self.fc_mean(x) - log_std = self.fc_logstd(x) - log_std = torch.tanh(log_std) - LOG_STD_MAX = 2 - LOG_STD_MIN = -5 - log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) - return (mean, log_std) - if self.out_size: - x = self.predict(x) - if self.is_policy: - if self.ctns_actions: - std = torch.exp(self.logstd.expand_as(x)) - dist = Normal(x, std) - else: - action_probs = F.softmax(x, dim=-1) - dist = Categorical(action_probs) - return dist - return x - - def action_scores(self, x): - if self.is_policy: - if self.reshape: - x = x.reshape(x.shape[0], -1) # We expect a batch of vectors - for layer in self.layers: - x = self.activation(layer(x.float())) - if self.out_size: - action_scores = self.predict(x) - return action_scores - - -class DuelingNetwork(BaseModule): - """Torch module for a DQN dueling network based on a MultiLayerPerceptron. - - Parameters - ----------- - in_size: int - Input size - base_module_kwargs: dict - Parameters for :func:`~rlberry.agents.torch.utils.training.model_factory` - to build shared (MLP) architecture for the advantage and value nets. 
- value_kwargs: dict - Parameters for :func:`~rlberry.agents.torch.utils.training.model_factory` - to build value network (MLP). - advantage_kwargs: dict - Parameters for :func:`~rlberry.agents.torch.utils.training.model_factory` - to build advantage network (MLP). - out_size: int - Output size. - """ - - def __init__( - self, - in_size=None, - base_module_kwargs=None, - value_kwargs=None, - advantage_kwargs=None, - out_size=None, - ): - super().__init__() - self.out_size = out_size - base_module_kwargs = base_module_kwargs or {} - base_module_kwargs["in_size"] = in_size - self.base_module = model_factory(**base_module_kwargs) - value_kwargs = value_kwargs or {} - value_kwargs["in_size"] = self.base_module.layer_sizes[-1] - value_kwargs["out_size"] = 1 - self.value = model_factory(**value_kwargs) - advantage_kwargs = advantage_kwargs or {} - advantage_kwargs["in_size"] = self.base_module.layer_sizes[-1] - advantage_kwargs["out_size"] = out_size - self.advantage = model_factory(**advantage_kwargs) - - def forward(self, x): - x = self.base_module(x) - value = self.value(x).expand(-1, self.out_size) - advantage = self.advantage(x) - return ( - value + advantage - advantage.mean(1).unsqueeze(1).expand(-1, self.out_size) - ) - - -class ConvolutionalNetwork(nn.Module): - """Torch module for a CNN. - - Expects inputs of shape BCHW, where - B = batch size; - C = number of channels; - H = height; - W = width. - - For the CNN forward, if the tensor has more than 4 dimensions (not BCHW), it keeps the 3 last dimension as CHW and merge all first ones into 1 (Batch). Go through the CNN + MLP, then split the first dimension as before. - - Parameters - ---------- - activation: {"RELU", "TANH", "ELU"} - Activation function. - in_channels: int - Number of input channels C - in_height: int - Input height H - in_width: int - Input width W - head_mlp_kwargs: dict, optional - Parameters to build an MLP - (:class:`~rlberry.agents.torch.utils.models.MultiLayerPerceptron`) - using the factory - :func:`~rlberry.agents.torch.utils.training.model_factory` - - """ - - def __init__( - self, - activation="RELU", - in_channels=None, - in_height=None, - in_width=None, - head_mlp_kwargs=None, - out_size=None, - is_policy=False, - transpose_obs=False, - **kwargs - ): - super().__init__() - self.activation = activation_factory(activation) - self.conv1 = nn.Conv2d(in_channels, 16, kernel_size=2, stride=2) - self.conv2 = nn.Conv2d(16, 32, kernel_size=2, stride=2) - self.conv3 = nn.Conv2d(32, 64, kernel_size=2, stride=2) - - # MLP Head - self.head_mlp_kwargs = head_mlp_kwargs or {} - self.head_mlp_kwargs["in_size"] = self._get_conv_out_size( - [in_channels, in_height, in_width] - ) # Number of Linear input connections depends on output of conv layers - self.head_mlp_kwargs["out_size"] = out_size - self.head_mlp_kwargs["is_policy"] = is_policy - self.head = model_factory(**self.head_mlp_kwargs) - - self.is_policy = is_policy - self.transpose_obs = transpose_obs - - def _get_conv_out_size(self, shape): - """ - Computes the output dimensions of the convolution network. 
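The core of the deleted `DuelingNetwork.forward` is the standard dueling recombination Q(s, a) = V(s) + A(s, a) - mean_a A(s, a). A tiny sketch with placeholder tensors standing in for the value and advantage heads:

```python
import torch

batch, n_actions = 4, 3
value = torch.randn(batch, 1)               # V(s), from the value head
advantage = torch.randn(batch, n_actions)   # A(s, a), from the advantage head

q_values = value + advantage - advantage.mean(dim=1, keepdim=True)
assert q_values.shape == (batch, n_actions)
# Subtracting the mean advantage makes the V/A decomposition identifiable.
```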
- Shape : dimension of the input of the CNN - """ - conv_result = self.activation((self.conv1(torch.zeros(1, *shape)))) - conv_result = self.activation((self.conv2(conv_result))) - conv_result = self.activation((self.conv3(conv_result))) - return int(np.prod(conv_result.size())) - - def convolutions(self, x): - x = x.float() - # if there is no batch (CHW), add one dimension to specify batch of 1 (and get format BCHW) - if len(x.shape) == 3: - x = x.unsqueeze(0) - if self.transpose_obs: - x = torch.transpose(x, -1, -3) - x = self.activation((self.conv1(x))) - x = self.activation((self.conv2(x))) - x = self.activation((self.conv3(x))) - x = x.view(x.size(0), -1) # flatten - return x - - def forward(self, x): - """ - Forward convolutional network - - Parameters - ---------- - x: torch.tensor - Tensor of shape BCHW (Batch,Chanel,Height,Width : if more than 4 dimensions, merge all the first in batch dimension) - """ - flag_view_to_change = False - - if len(x.shape) > 4: - flag_view_to_change = True - dim_to_retore = x.shape[:-3] - inputview_size = tuple((-1,)) + tuple(x.shape[-3:]) - outputview_size = tuple(dim_to_retore) + tuple( - (self.head_mlp_kwargs["out_size"],) - ) - x = x.view(inputview_size) - - conv_result = self.convolutions(x) - output_result = self.head( - conv_result.view(conv_result.size()[0], -1) - ) # give the 'conv_result' flattenned in 2 dimensions (batch and other) to the MLP (head) - - if flag_view_to_change: - output_result = output_result.view(outputview_size) - - return output_result - - def action_scores(self, x): - return self.head.action_scores(self.convolutions(x)) diff --git a/rlberry/agents/torch/utils/training.py b/rlberry/agents/torch/utils/training.py deleted file mode 100644 index ed338b3bb..000000000 --- a/rlberry/agents/torch/utils/training.py +++ /dev/null @@ -1,148 +0,0 @@ -import numpy as np -import torch -from gymnasium import spaces -from torch import nn as nn -from torch.nn import functional as F - - -def loss_function_factory(loss_function, **kwargs): - if loss_function == "l2": - return torch.nn.MSELoss(**kwargs) - elif loss_function == "l1": - return torch.nn.L1Loss(**kwargs) - elif loss_function == "smooth_l1": - return torch.nn.SmoothL1Loss(**kwargs) - elif loss_function == "bce": - return torch.nn.BCELoss(**kwargs) - else: - raise ValueError("Unknown loss function : {}".format(loss_function)) - - -def optimizer_factory(params, optimizer_type="ADAM", **kwargs): - if optimizer_type == "ADAM": - return torch.optim.Adam(params=params, **kwargs) - elif optimizer_type == "RMS_PROP": - return torch.optim.RMSprop(params=params, **kwargs) - else: - raise ValueError("Unknown optimizer type: {}".format(optimizer_type)) - - -def model_factory_from_env(env, **kwargs): - """Returns a torch module after setting up input/output dimensions according to an env. - - Parameters - ---------- - env: gym.Env - Environment - **kwargs: Dict - Parameters to be updated, used to call :func:`~rlberry.agents.torch.utils.training.model_factory`. - """ - kwargs = size_model_config(env, **kwargs) - return model_factory(**kwargs) - - -def model_factory(type="MultiLayerPerceptron", **kwargs) -> nn.Module: - """Build a neural net of a given type. - - Parameters - ---------- - type: {"MultiLayerPerceptron", - "ConvolutionalNetwork", - "DuelingNetwork", - "Table"}, default = "MultiLayerPerceptron" - Type of neural network. 
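The `forward` of the deleted `ConvolutionalNetwork` handles inputs with more than four dimensions by folding every leading dimension into the batch, running the conv stack plus MLP head on BCHW, and then restoring the leading dimensions on the output. A hedged sketch of that reshape trick, with a small `nn.Sequential` standing in for the actual conv + head (sizes illustrative):

```python
import torch
import torch.nn as nn

C, H, W, out_size = 3, 16, 16, 5
# stand-in for the conv stack + MLP head of ConvolutionalNetwork
net = nn.Sequential(
    nn.Conv2d(C, 8, kernel_size=3, stride=2), nn.ReLU(),
    nn.Flatten(),
    nn.LazyLinear(out_size),
)

x = torch.randn(2, 7, C, H, W)          # e.g. (n_envs, time, C, H, W)
lead = x.shape[:-3]                     # leading dimensions to restore later
out = net(x.reshape(-1, C, H, W))       # merge everything but CHW into the batch
out = out.view(*lead, out_size)         # back to (2, 7, out_size)
assert out.shape == (2, 7, out_size)
```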
- **kwargs: dict - Parameters that vary according to each neural net type, see - - * :class:`~rlberry.agents.torch.utils.models.MultiLayerPerceptron` - - * :class:`~rlberry.agents.torch.utils.models.ConvolutionalNetwork` - - * :class:`~rlberry.agents.torch.utils.models.DuelingNetwork` - - * :class:`~rlberry.agents.torch.utils.models.Table` - """ - from rlberry.agents.torch.utils.models import ( - MultiLayerPerceptron, - DuelingNetwork, - ConvolutionalNetwork, - Table, - ) - - if type == "MultiLayerPerceptron": - return MultiLayerPerceptron(**kwargs) - elif type == "DuelingNetwork": - return DuelingNetwork(**kwargs) - elif type == "ConvolutionalNetwork": - return ConvolutionalNetwork(**kwargs) - elif type == "Table": - return Table(**kwargs) - else: - raise ValueError("Unknown model type") - - -def size_model_config(env, **model_config): - """ - Setup input/output dimensions for the configuration of - a model depending on the environment observation/action spaces. - - Parameters - ---------- - env : gym.Env - An environment. - model_config : dict - Parameters to be updated, used to call :func:`~rlberry.agents.torch.utils.training.model_factory`. - If "out_size" is not given in model_config, assumes - that the output dimension of the neural net is equal to the number - of actions in the environment. - """ - - if isinstance(env.observation_space, spaces.Box): - obs_shape = env.observation_space.shape - elif isinstance(env.observation_space, spaces.Tuple): - obs_shape = env.observation_space.spaces[0].shape - elif isinstance(env.observation_space, spaces.Discrete): - return model_config - - # Assume CHW observation space - if "type" in model_config and model_config["type"] == "ConvolutionalNetwork": - if "transpose_obs" in model_config and not model_config["transpose_obs"]: - # Assume CHW observation space - if "in_channels" not in model_config: - model_config["in_channels"] = int(obs_shape[0]) - if "in_height" not in model_config: - model_config["in_height"] = int(obs_shape[1]) - if "in_width" not in model_config: - model_config["in_width"] = int(obs_shape[2]) - else: - # Assume WHC observation space to transpose - if "in_channels" not in model_config: - model_config["in_channels"] = int(obs_shape[2]) - if "in_height" not in model_config: - model_config["in_height"] = int(obs_shape[1]) - if "in_width" not in model_config: - model_config["in_width"] = int(obs_shape[0]) - else: - model_config["in_size"] = int(np.prod(obs_shape)) - - if "out_size" not in model_config: - if isinstance(env.action_space, spaces.Discrete): - model_config["out_size"] = env.action_space.n - elif isinstance(env.action_space, spaces.Tuple): - model_config["out_size"] = env.action_space.spaces[0].n - return model_config - - -def activation_factory(activation_type): - if activation_type == "RELU": - return F.relu - elif activation_type == "TANH": - return torch.tanh - elif activation_type == "ELU": - return nn.ELU() - else: - raise ValueError("Unknown activation_type: {}".format(activation_type)) - - -def trainable_parameters(model): - return sum(p.numel() for p in model.parameters() if p.requires_grad) diff --git a/rlberry/agents/ucbvi/__init__.py b/rlberry/agents/ucbvi/__init__.py deleted file mode 100644 index 031f77a6f..000000000 --- a/rlberry/agents/ucbvi/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .ucbvi import UCBVIAgent diff --git a/rlberry/agents/ucbvi/ucbvi.py b/rlberry/agents/ucbvi/ucbvi.py deleted file mode 100644 index d5dfc4e67..000000000 --- a/rlberry/agents/ucbvi/ucbvi.py +++ /dev/null @@ -1,332 +0,0 @@ 
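For the common Box-observation / Discrete-action case, the deleted `size_model_config` + `model_factory` pair amounts to: flatten the observation shape into `in_size`, use the number of actions as `out_size`, and build an MLP. A hedged equivalent using gymnasium and a plain `nn.Sequential` instead of the rlberry factory (the environment id and layer sizes are illustrative):

```python
import numpy as np
import gymnasium as gym
import torch.nn as nn

env = gym.make("CartPole-v1")
in_size = int(np.prod(env.observation_space.shape))   # flattened Box observation
out_size = int(env.action_space.n)                     # one output per discrete action

model = nn.Sequential(
    nn.Linear(in_size, 64), nn.ReLU(),
    nn.Linear(64, 64), nn.ReLU(),
    nn.Linear(64, out_size),
)
# model_factory_from_env(env, type="MultiLayerPerceptron") produced an
# equivalent network, with hidden sizes taken from ``layer_sizes``.
```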
-import numpy as np - -import gymnasium.spaces as spaces -from rlberry.agents import AgentWithSimplePolicy -from rlberry.agents.ucbvi.utils import ( - update_value_and_get_action, - update_value_and_get_action_sd, -) -from rlberry.exploration_tools.discrete_counter import DiscreteCounter -from rlberry.agents.dynprog.utils import ( - backward_induction_sd, - backward_induction_reward_sd, -) -from rlberry.agents.dynprog.utils import backward_induction_in_place - -import rlberry - -logger = rlberry.logger - - -class UCBVIAgent(AgentWithSimplePolicy): - """ - UCBVI [1]_ with custom exploration bonus. - - Notes - ----- - The recommended policy after all the episodes is computed without - exploration bonuses. - - Parameters - ---------- - env : gym.Env - Environment with discrete states and actions. - gamma : double, default: 1.0 - Discount factor in [0, 1]. If gamma is 1.0, the problem is set to - be finite-horizon. - horizon : int - Horizon of the objective function. If None and gamma<1, set to - 1/(1-gamma). - bonus_scale_factor : double, default: 1.0 - Constant by which to multiply the exploration bonus, controls - the level of exploration. - bonus_type : {"simplified_bernstein"} - Type of exploration bonus. Currently, only "simplified_bernstein" - is implemented. If `reward_free` is true, this parameter is ignored - and the algorithm uses 1/n bonuses. - reward_free : bool, default: False - If true, ignores rewards and uses only 1/n bonuses. - stage_dependent : bool, default: False - If true, assume that transitions and rewards can change with the stage h. - real_time_dp : bool, default: False - If true, uses real-time dynamic programming [2]_ instead of full backward induction - for the sampling policy. - - References - ---------- - .. [1] Azar et al., 2017 - Minimax Regret Bounds for Reinforcement Learning - https://arxiv.org/abs/1703.05449 - - .. [2] Efroni, Yonathan, et al. - Tight regret bounds for model-based reinforcement learning with greedy policies. - Advances in Neural Information Processing Systems. 2019. - https://papers.nips.cc/paper/2019/file/25caef3a545a1fff2ff4055484f0e758-Paper.pdf - """ - - name = "UCBVI" - - def __init__( - self, - env, - gamma=1.0, - horizon=100, - bonus_scale_factor=1.0, - bonus_type="simplified_bernstein", - reward_free=False, - stage_dependent=False, - real_time_dp=False, - **kwargs - ): - # init base class - AgentWithSimplePolicy.__init__(self, env, **kwargs) - - self.gamma = gamma - self.horizon = horizon - self.bonus_scale_factor = bonus_scale_factor - self.bonus_type = bonus_type - self.reward_free = reward_free - self.stage_dependent = stage_dependent - self.real_time_dp = real_time_dp - - # check environment - assert isinstance(self.env.observation_space, spaces.Discrete) - assert isinstance(self.env.action_space, spaces.Discrete) - - # other checks - assert gamma >= 0 and gamma <= 1.0 - if self.horizon is None: - assert gamma < 1.0, "If no horizon is given, gamma must be smaller than 1." - self.horizon = int(np.ceil(1.0 / (1.0 - gamma))) - - # maximum value - r_range = self.env.reward_range[1] - self.env.reward_range[0] - if r_range == np.inf or r_range == 0.0: - logger.warning( - "{}: Reward range is zero or infinity. ".format(self.name) - + "Setting it to 1." 
- ) - r_range = 1.0 - - self.v_max = np.zeros(self.horizon) - self.v_max[-1] = r_range - for hh in reversed(range(self.horizon - 1)): - self.v_max[hh] = r_range + self.gamma * self.v_max[hh + 1] - - # initialize - self.reset() - - def reset(self, **kwargs): - H = self.horizon - S = self.env.observation_space.n - A = self.env.action_space.n - - if self.stage_dependent: - shape_hsa = (H, S, A) - shape_hsas = (H, S, A, S) - else: - shape_hsa = (S, A) - shape_hsas = (S, A, S) - - # visit counter - self.N_sa = np.zeros(shape_hsa) - # bonus - self.B_sa = np.zeros((H, S, A)) - - # MDP estimator - self.R_hat = np.zeros(shape_hsa) - self.P_hat = np.ones(shape_hsas) * 1.0 / S - - # Value functions - self.V = np.ones((H, S)) - self.Q = np.zeros((H, S, A)) - # for rec. policy - self.V_policy = np.zeros((H, S)) - self.Q_policy = np.zeros((H, S, A)) - - # Init V and bonus - for hh in range(self.horizon): - self.B_sa[hh, :, :] = self.v_max[hh] - self.V[hh, :] = self.v_max[hh] - - # ep counter - self.episode = 0 - - # useful object to compute total number of visited states & entropy of visited states - self.counter = DiscreteCounter( - self.env.observation_space, self.env.action_space - ) - - # update name - if self.real_time_dp: - self.name = "UCBVI-RTDP" - - def policy(self, observation): - state = observation - assert self.Q_policy is not None - return self.Q_policy[0, state, :].argmax() - - def _get_action(self, state, hh=0): - """Sampling policy.""" - if not self.real_time_dp: - assert self.Q is not None - return self.Q[hh, state, :].argmax() - else: - if self.stage_dependent: - update_fn = update_value_and_get_action_sd - else: - update_fn = update_value_and_get_action - return update_fn( - state, - hh, - self.V, - self.R_hat, - self.P_hat, - self.B_sa, - self.gamma, - self.v_max, - ) - - def _compute_bonus(self, n, hh): - # reward-free - if self.reward_free: - bonus = 1.0 / n - return bonus - - # not reward-free - if self.bonus_type == "simplified_bernstein": - bonus = self.bonus_scale_factor * np.sqrt(1.0 / n) + self.v_max[hh] / n - bonus = min(bonus, self.v_max[hh]) - return bonus - else: - raise ValueError( - "Error: bonus type {} not implemented".format(self.bonus_type) - ) - - def _update(self, state, action, next_state, reward, hh): - if self.stage_dependent: - self.N_sa[hh, state, action] += 1 - - nn = self.N_sa[hh, state, action] - prev_r = self.R_hat[hh, state, action] - prev_p = self.P_hat[hh, state, action, :] - - self.R_hat[hh, state, action] = ( - 1.0 - 1.0 / nn - ) * prev_r + reward * 1.0 / nn - - self.P_hat[hh, state, action, :] = (1.0 - 1.0 / nn) * prev_p - self.P_hat[hh, state, action, next_state] += 1.0 / nn - - self.B_sa[hh, state, action] = self._compute_bonus(nn, hh) - - else: - self.N_sa[state, action] += 1 - - nn = self.N_sa[state, action] - prev_r = self.R_hat[state, action] - prev_p = self.P_hat[state, action, :] - - self.R_hat[state, action] = (1.0 - 1.0 / nn) * prev_r + reward * 1.0 / nn - - self.P_hat[state, action, :] = (1.0 - 1.0 / nn) * prev_p - self.P_hat[state, action, next_state] += 1.0 / nn - - self.B_sa[hh, state, action] = self._compute_bonus(nn, hh) - - def _run_episode(self): - # interact for H steps - episode_rewards = 0 - observation, info = self.env.reset() - for hh in range(self.horizon): - action = self._get_action(observation, hh) - next_observation, reward, terminated, truncated, info = self.env.step( - action - ) - done = terminated or truncated - episode_rewards += reward # used for logging only - - self.counter.update(observation, action) - - if 
self.reward_free: - reward = 0.0 # set to zero before update if reward_free - - self._update(observation, action, next_observation, reward, hh) - - observation = next_observation - if done: - break - - # run backward induction - if not self.real_time_dp: - if self.stage_dependent: - backward_induction_sd( - self.Q, - self.V, - self.R_hat + self.B_sa, - self.P_hat, - self.gamma, - self.v_max[0], - ) - else: - backward_induction_reward_sd( - self.Q, - self.V, - self.R_hat + self.B_sa, - self.P_hat, - self.gamma, - self.v_max[0], - ) - - # update info - self.episode += 1 - - # writer - if self.writer is not None: - self.writer.add_scalar("episode_rewards", episode_rewards, self.episode) - self.writer.add_scalar( - "n_visited_states", self.counter.get_n_visited_states(), self.episode - ) - - # return sum of rewards collected in the episode - return episode_rewards - - def fit(self, budget: int, **kwargs): - """ - - Train the agent using the provided environment. - - Parameters - ---------- - budget: int - number of episodes - **kwargs - Extra arguments. Not used for this agent. - """ - del kwargs - n_episodes_to_run = budget - count = 0 - while count < n_episodes_to_run: - self._run_episode() - count += 1 - - # compute Q function for the recommended policy - if self.stage_dependent: - backward_induction_sd( - self.Q_policy, - self.V_policy, - self.R_hat, - self.P_hat, - self.gamma, - self.v_max[0], - ) - else: - backward_induction_in_place( - self.Q_policy, - self.V_policy, - self.R_hat, - self.P_hat, - self.horizon, - self.gamma, - self.v_max[0], - ) diff --git a/rlberry/agents/ucbvi/utils.py b/rlberry/agents/ucbvi/utils.py deleted file mode 100644 index 9e4823475..000000000 --- a/rlberry/agents/ucbvi/utils.py +++ /dev/null @@ -1,83 +0,0 @@ -from rlberry.utils.jit_setup import numba_jit - - -@numba_jit -def update_value_and_get_action(state, hh, V, R_hat, P_hat, B_sa, gamma, v_max): - """ - state : int - hh : int - V : np.ndarray - shape (H, S) - R_hat : np.ndarray - shape (S, A) - P_hat : np.ndarray - shape (S, A, S) - B_sa : np.ndarray - shape (H, S, A) - gamma : double - v_max : np.ndarray - shape (H,) - """ - H = V.shape[0] - S, A = R_hat.shape[-2:] - best_action = 0 - max_val = 0 - previous_value = V[hh, state] - - for aa in range(A): - q_aa = R_hat[state, aa] + B_sa[hh, state, aa] - - if hh < H - 1: - for sn in range(S): - q_aa += gamma * P_hat[state, aa, sn] * V[hh + 1, sn] - - if aa == 0 or q_aa > max_val: - max_val = q_aa - best_action = aa - - V[hh, state] = max_val - V[hh, state] = min(v_max[hh], V[hh, state]) - V[hh, state] = min(previous_value, V[hh, state]) - - return best_action - - -@numba_jit -def update_value_and_get_action_sd(state, hh, V, R_hat, P_hat, B_sa, gamma, v_max): - """ - state : int - hh : int - V : np.ndarray - shape (H, S) - R_hat : np.ndarray - shape (H, S, A) - P_hat : np.ndarray - shape (H, S, A, S) - B_sa : np.ndarray - shape (S, A) - gamma : double - v_max : np.ndarray - shape (H,) - """ - H = V.shape[0] - S, A = R_hat.shape[-2:] - best_action = 0 - max_val = 0 - previous_value = V[hh, state] - - for aa in range(A): - q_aa = R_hat[hh, state, aa] + B_sa[hh, state, aa] - - if hh < H - 1: - for sn in range(S): - q_aa += gamma * P_hat[hh, state, aa, sn] * V[hh + 1, sn] - - if aa == 0 or q_aa > max_val: - max_val = q_aa - best_action = aa - - V[hh, state] = max_val - V[hh, state] = min(v_max[hh], V[hh, state]) - V[hh, state] = min(previous_value, V[hh, state]) - - return best_action diff --git a/rlberry/agents/utils/memories.py 
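The heart of the deleted UCBVI agent is the incremental empirical model update (`_update`) combined with the simplified Bernstein bonus (`_compute_bonus`). A hedged numpy sketch of those two steps for a single (s, a) visit; the sizes and the `v_max` values are illustrative placeholders:

```python
import numpy as np

S, A, H = 5, 2, 10
N_sa = np.zeros((S, A))
R_hat = np.zeros((S, A))
P_hat = np.full((S, A, S), 1.0 / S)
v_max = np.linspace(1.0, 10.0, H)[::-1]   # illustrative per-stage value upper bounds
bonus_scale = 1.0

def update(s, a, s_next, reward, hh):
    N_sa[s, a] += 1
    n = N_sa[s, a]
    R_hat[s, a] += (reward - R_hat[s, a]) / n      # running mean of rewards
    P_hat[s, a, :] *= 1.0 - 1.0 / n                # running mean of transitions
    P_hat[s, a, s_next] += 1.0 / n
    bonus = bonus_scale * np.sqrt(1.0 / n) + v_max[hh] / n
    return min(bonus, v_max[hh])                   # clipped exploration bonus

print(update(s=0, a=1, s_next=3, reward=0.5, hh=2))
```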
b/rlberry/agents/utils/memories.py deleted file mode 100644 index 1677efd19..000000000 --- a/rlberry/agents/utils/memories.py +++ /dev/null @@ -1,59 +0,0 @@ -import numpy as np -from collections import namedtuple - -Transition = namedtuple( - "Transition", ("state", "action", "reward", "next_state", "terminal", "info") -) - - -class ReplayMemory(object): - """ - Container that stores and samples transitions. - """ - - def __init__(self, capacity=10000, **kwargs): - self.capacity = int(capacity) - self.memory = [] - self.position = 0 - - def push(self, item): - """Saves a thing.""" - if len(self.memory) < self.capacity: - self.memory.append(item) - else: - self.memory[self.position] = item - # Faster than append and pop - self.position = (self.position + 1) % self.capacity - - def _encode_sample(self, idxes): - return [self.memory[idx] for idx in idxes] - - def sample(self, batch_size): - batch_size = min(batch_size, len(self)) - idxes = np.random.choice(len(self.memory), size=batch_size) - return self._encode_sample(idxes), idxes - - def __len__(self): - return len(self.memory) - - def is_full(self): - return len(self.memory) == self.capacity - - def is_empty(self): - return len(self.memory) == 0 - - -class Memory: - def __init__(self): - self.actions = [] - self.states = [] - self.logprobs = [] - self.rewards = [] - self.is_terminals = [] - - def clear_memory(self): - del self.actions[:] - del self.states[:] - del self.logprobs[:] - del self.rewards[:] - del self.is_terminals[:] diff --git a/rlberry/colab_utils/__init__.py b/rlberry/colab_utils/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/colab_utils/display_setup.py b/rlberry/colab_utils/display_setup.py deleted file mode 100644 index 302e589eb..000000000 --- a/rlberry/colab_utils/display_setup.py +++ /dev/null @@ -1,37 +0,0 @@ -# -# Code to visualize the environments. -# - -import base64 -from pyvirtualdisplay import Display -from IPython import display as ipythondisplay - -# from IPython.display import clear_output -from pathlib import Path - - -def show_video(filename=None, directory="./videos"): - """ - Either show all videos in a directory (if filename is None) or - show video corresponding to filename. - """ - html = [] - if filename is not None: - files = Path("./").glob(filename) - else: - files = Path(directory).glob("*.mp4") - for mp4 in files: - video_b64 = base64.b64encode(mp4.read_bytes()) - html.append( - """""".format( - mp4, video_b64.decode("ascii") - ) - ) - ipythondisplay.display(ipythondisplay.HTML(data="
".join(html))) - - -display = Display(visible=0, size=(1400, 900)) -display.start() diff --git a/rlberry/envs/__init__.py b/rlberry/envs/__init__.py index 96d4442f0..dd360af93 100644 --- a/rlberry/envs/__init__.py +++ b/rlberry/envs/__init__.py @@ -1,6 +1,5 @@ from .gym_make import gym_make, atari_make from .basewrapper import Wrapper -from .classic_control import Acrobot, MountainCar, Pendulum, SpringCartPole -from .finite import Chain, FiniteMDP, GridWorld from .interface import Model from .pipeline import PipelineEnv +from .finite_mdp import FiniteMDP diff --git a/rlberry/envs/bandits/__init__.py b/rlberry/envs/bandits/__init__.py deleted file mode 100644 index ca602a3af..000000000 --- a/rlberry/envs/bandits/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .bandit_base import AdversarialBandit, Bandit -from .corrupted_bandits import CorruptedLaws, CorruptedNormalBandit -from .stochastic_bandits import BernoulliBandit, NormalBandit diff --git a/rlberry/envs/bandits/bandit_base.py b/rlberry/envs/bandits/bandit_base.py deleted file mode 100644 index 95ceeb1a2..000000000 --- a/rlberry/envs/bandits/bandit_base.py +++ /dev/null @@ -1,115 +0,0 @@ -from collections import deque - - -from rlberry.envs.interface import Model -import rlberry.spaces as spaces - -import rlberry - -logger = rlberry.logger - - -class Bandit(Model): - """ - Base class for a stochastic multi-armed bandit. - - Parameters - ---------- - laws: list of laws. - laws of the arms. can either be a frozen scipy law or any class that - has a method .rvs(). - - **kwargs: keywords arguments - additional arguments sent to :class:`~rlberry.envs.interface.Model` - - Attributes - ---------- - laws: list - laws of the arms. can either be a frozen scipy law or any class that - has a method .rvs(). - n_arms: int - Number of arms. - action_space: spaces.Discrete - Action space when viewing the bandit as a single-state MDP. - rewards: list - For each arm, pre-sample 10 times. - n_rewards: list - Reward counter per arm. - """ - - name = "" - - def __init__(self, laws=[], **kwargs): - Model.__init__(self, **kwargs) - self.laws = laws - self.n_arms = len(self.laws) - self.action_space = spaces.Discrete(self.n_arms) - - # Pre-sample 10 samples - self.rewards = [ - deque(self.laws[action].rvs(size=10, random_state=self.rng)) - for action in range(self.n_arms) - ] - self.n_rewards = [10] * self.n_arms - - def step(self, action): - """ - Sample the reward associated to the action. - """ - # test that the action exists - assert action < self.n_arms - - reward = self.laws[action].rvs(random_state=self.rng, size=1)[0] - terminated = True - truncated = False - - return 0, reward, terminated, truncated, {} - - def reset(self, seed=None, option=None): - """ - Reset the environment to a default state. - """ - return 0, {} - - -class AdversarialBandit(Model): - """ - Base class for a adversarial multi-armed bandit with oblivious - opponent, i.e all rewards are fixed in advance at the start of the run. - - Parameters - ---------- - rewards: list of rewards, shape (T, A). - Possible rewards up to horizon T for each of the A arms. - - **kwargs: keywords arguments - additional arguments sent to :class:`~rlberry.envs.interface.Model` - - """ - - name = "" - - def __init__(self, rewards=[], **kwargs): - Model.__init__(self, **kwargs) - self.n_arms = rewards.shape[1] - self.rewards = deque(rewards) - self.action_space = spaces.Discrete(self.n_arms) - - def step(self, action): - """ - Sample the reward associated to the action. 
- """ - # test that the action exists - assert action < self.n_arms - - rewards = self.rewards.popleft() - reward = rewards[action] - terminated = True - truncated = False - return 0, reward, terminated, truncated, {} - - def reset(self, seed=None, option=None): - """ - Reset the environment to a default state. - """ - return 0, {} diff --git a/rlberry/envs/bandits/corrupted_bandits.py b/rlberry/envs/bandits/corrupted_bandits.py deleted file mode 100644 index 2ac703588..000000000 --- a/rlberry/envs/bandits/corrupted_bandits.py +++ /dev/null @@ -1,90 +0,0 @@ -import numpy as np -from scipy import stats - -from rlberry.envs.bandits import Bandit - - -class CorruptedLaws: - """ - Class for corrupted laws. - - Parameters - ---------- - law: law - Can either be a frozen scipy law or any class that - has a method .rvs() to sample according to the given law. - - - cor_prop: float in (0,1/2) - Proportion of corruption - - cor_law: law - Laws of corruption. - """ - - def __init__(self, law, cor_prop, cor_law): - self.law = law - self.cor_prop = cor_prop - self.cor_law = cor_law - - def rvs(self, size, random_state): - is_corrupted = random_state.binomial(1, self.cor_prop, size=size) - cor_sample = self.cor_law.rvs(size=size, random_state=random_state) - noncor_sample = self.law.rvs(size=size, random_state=random_state) - return is_corrupted * cor_sample + (1 - is_corrupted) * noncor_sample - - def mean(self): - return ( - 1 - self.cor_prop - ) * self.law.mean() + self.cor_prop * self.cor_law.mean() - - -class CorruptedNormalBandit(Bandit): - """ - Class for Bandits corrupted by nature. - - Parameters - ---------- - means: array-like of size n_arms, default=array([0,1]) - means of the law of inliers of each of the arms. - - stds: array-like of size n_arms or None, default=None - stds of the law of inliers of each of the arms. If None, use array with - all ones. - - cor_prop: float in (0,1/2), default=0.05 - proportion of corruption - - cor_laws: list of scipy frozen laws or None, default=None - laws of corruption on each arm. If None, all the arms are corrupted by - a normal of mean 1000 and std 1. - """ - - def __init__( - self, - means=np.array([0, 1]), - stds=None, - cor_prop=0.05, - cor_laws=None, - ): - laws = self.make_laws(means, stds, cor_prop, cor_laws) - Bandit.__init__(self, laws=laws) - - def make_laws(self, means, stds, cor_prop, cor_laws): - if cor_laws is not None: - self.cor_laws = cor_laws - else: - self.cor_laws = [stats.norm(loc=1000) for a in range(len(means))] - if stds is None: - self.stds = np.ones(len(means)) - else: - self.stds = stds - assert len(means) == len(self.stds) - assert cor_prop <= 0.5 - inlier_laws = [ - stats.norm(loc=means[a], scale=self.stds[a]) for a in range(len(means)) - ] - return [ - CorruptedLaws(inlier_laws[a], cor_prop, self.cor_laws[a]) - for a in range(len(means)) - ] diff --git a/rlberry/envs/bandits/stochastic_bandits.py b/rlberry/envs/bandits/stochastic_bandits.py deleted file mode 100644 index e4ecf4f88..000000000 --- a/rlberry/envs/bandits/stochastic_bandits.py +++ /dev/null @@ -1,58 +0,0 @@ -import numpy as np -from scipy import stats - -from rlberry.envs.bandits import Bandit - - -class NormalBandit(Bandit): - """ - Class for Normal Bandits - - Parameters - ---------- - means: array-like of size n_arms, default=array([0,1]) - means of the law of each of the arms. - - stds: array-like of size n_arms or None, default=None - stds of the law of each of the arms. If None, use array with - all ones. 
- - """ - - def __init__( - self, - means=np.array([0, 1]), - stds=None, - ): - laws = self.make_laws(means, stds) - Bandit.__init__(self, laws=laws) - - def make_laws(self, means, stds): - if stds is None: - self.stds = np.ones(len(means)) - else: - self.stds = stds - assert len(means) == len(self.stds) - return [stats.norm(loc=means[a], scale=self.stds[a]) for a in range(len(means))] - - -class BernoulliBandit(Bandit): - """ - Class for Bernoulli Bandits - - Parameters - ---------- - p: array-like of size n_arms, default=array([0.1,0.9]) - means of the law of inliers of each of the arms. - - """ - - def __init__( - self, - p=np.array([0.1, 0.9]), - ): - laws = self.make_laws(p) - Bandit.__init__(self, laws=laws) - - def make_laws(self, p): - return [stats.binom(n=1, p=p[a]) for a in range(len(p))] diff --git a/rlberry/envs/benchmarks/__init__.py b/rlberry/envs/benchmarks/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/envs/benchmarks/ball_exploration/__init__.py b/rlberry/envs/benchmarks/ball_exploration/__init__.py deleted file mode 100644 index 390066140..000000000 --- a/rlberry/envs/benchmarks/ball_exploration/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .pball import PBall, PBall2D, SimplePBallND diff --git a/rlberry/envs/benchmarks/ball_exploration/ball2d.py b/rlberry/envs/benchmarks/ball_exploration/ball2d.py deleted file mode 100644 index bb1b43d7e..000000000 --- a/rlberry/envs/benchmarks/ball_exploration/ball2d.py +++ /dev/null @@ -1,220 +0,0 @@ -""" -This files provides a set of 2D environments with increasing difficulty -of exploration. - -The difficulty is ranked by the level. - -Important: - * To create instances, use the function get_benchmark_env(level). - * The horizon H is also set as an attribute of the environment. -""" - -import numpy as np - -from rlberry.wrappers.autoreset import AutoResetWrapper -from rlberry.envs.benchmarks.ball_exploration.pball import PBall2D - - -def get_benchmark_env(level=1): - if level == 0: - env = _get_autoreset_env(BallLevel0()) - return env - elif level == 1: - env = _get_autoreset_env(BallLevel1()) - return env - elif level == 2: - env = _get_autoreset_env(BallLevel2()) - return env - elif level == 3: - env = _get_autoreset_env(BallLevel3()) - return env - elif level == 4: - env = _get_autoreset_env(BallLevel4()) - return env - elif level == 5: - env = _get_autoreset_env(BallLevel5()) - return env - else: - raise NotImplementedError("Invalid benchmark level.") - - -def _get_autoreset_env(env): - horizon = env.horizon - return AutoResetWrapper(env, horizon) - - -# -# Level 0 (reward free!) 
-# -class BallLevel0(PBall2D): - """ - Reward-free (0 reward) - """ - - def __init__(self): - self.horizon = 30 - # - self.p = 2 - self.action_list = [ - np.array([0.0, 0.0]), - 0.05 * np.array([1.0, 0.0]), - -0.05 * np.array([1.0, 0.0]), - 0.05 * np.array([0.0, 1.0]), - -0.05 * np.array([0.0, 1.0]), - ] - - self.reward_amplitudes = [] - self.reward_smoothness = [] - self.reward_centers = [] - self.A = np.eye(2) - self.B = np.eye(2) - self.sigma = 0.01 - self.sigma_init = 0.001 - self.mu_init = np.array([0.0, 0.0]) - - PBall2D.__init__( - self, - self.p, - self.action_list, - self.reward_amplitudes, - self.reward_smoothness, - self.reward_centers, - self.A, - self.B, - self.sigma, - self.sigma_init, - self.mu_init, - ) - self.name = "Ball Exploration Benchmark - Level 0 (Reward-Free)" - - -# -# Level 1 -# - - -class BallLevel1(PBall2D): - """ - Dense rewards - """ - - def __init__(self): - self.horizon = 30 - # - self.p = 2 - self.action_list = [ - np.array([0.0, 0.0]), - 0.05 * np.array([1.0, 0.0]), - -0.05 * np.array([1.0, 0.0]), - 0.05 * np.array([0.0, 1.0]), - -0.05 * np.array([0.0, 1.0]), - ] - - self.reward_amplitudes = np.array([1.0]) - self.reward_smoothness = np.array([0.5 * np.sqrt(2)]) - self.reward_centers = [np.array([0.5, 0.5])] - self.A = np.eye(2) - self.B = np.eye(2) - self.sigma = 0.01 - self.sigma_init = 0.001 - self.mu_init = np.array([0.0, 0.0]) - - PBall2D.__init__( - self, - self.p, - self.action_list, - self.reward_amplitudes, - self.reward_smoothness, - self.reward_centers, - self.A, - self.B, - self.sigma, - self.sigma_init, - self.mu_init, - ) - self.name = "Ball Exploration Benchmark - Level 1" - - -# -# Level 2 -# - - -class BallLevel2(BallLevel1): - """ - Sparse rewards - """ - - def __init__(self): - BallLevel1.__init__(self) - self.reward_amplitudes = np.array([1.0]) - self.reward_smoothness = np.array([0.2]) - self.reward_centers = [np.array([0.5, 0.5])] - self.name = "Ball Exploration Benchmark - Level 2" - - -# -# Level 3 -# - - -class BallLevel3(BallLevel2): - """ - Sparse rewards, noisier - """ - - def __init__(self): - BallLevel2.__init__(self) - self.sigma = 0.025 - self.name = "Ball Exploration Benchmark - Level 3" - - -# -# Level 4 -# - - -class BallLevel4(BallLevel1): - """ - Far sparse reward (as lvl 2) + dense suboptimal rewards - """ - - def __init__(self): - BallLevel1.__init__(self) - - self.reward_amplitudes = np.array([1.0, 0.1]) - self.reward_smoothness = np.array([0.2, 0.5 * np.sqrt(2)]) - self.reward_centers = [ - np.array([-0.5, -0.5]), # far sparse - np.array([0.5, 0.5]), - ] # dense - self.name = "Ball Exploration Benchmark - Level 4" - - -# -# Level 5 -# - - -class BallLevel5(BallLevel4): - """ - Far sparse reward (as lvl 2) + dense suboptimal rewards, noisier - """ - - def __init__(self): - BallLevel4.__init__(self) - self.sigma = 0.025 - self.name = "Ball Exploration Benchmark - Level 5" - - -# if __name__ == '__main__': -# env = get_benchmark_env(1) -# env.enable_rendering() -# for ii in range(100): -# # env.step(1) -# # env.step(3) -# # env.step(env.action_space.sample()) -# # env.step(0) -# env.step(4) - -# env.render() diff --git a/rlberry/envs/benchmarks/ball_exploration/pball.py b/rlberry/envs/benchmarks/ball_exploration/pball.py deleted file mode 100644 index 4f7e9c479..000000000 --- a/rlberry/envs/benchmarks/ball_exploration/pball.py +++ /dev/null @@ -1,482 +0,0 @@ -import numpy as np - - -import rlberry.spaces as spaces -from rlberry.envs.interface import Model -from rlberry.rendering import Scene, GeometricPrimitive, 
RenderInterface2D - -import rlberry - -logger = rlberry.logger - - -def projection_to_pball(x, p): - """ - Solve the problem: - min_z ||x-z||_2^2 - s.t. ||z||_p <= 1 - for p = 2 or p = np.inf - - If p is not 2 or np.inf, it returns x/norm_p(x) if norm_p(x) > 1 - - WARNING: projection_to_pball is not actually a projection for p!=2 - or p=!np.inf - """ - if np.linalg.norm(x, ord=p) <= 1.0: - return x - - if p == 2: - z = x / np.linalg.norm(x, ord=p) - return z - - if p == np.inf: - z = np.minimum(1.0, np.maximum(x, -1.0)) - return z - - # below it is not a projection - return x / np.linalg.norm(x, ord=p) - - -class PBall(Model): - """ - Parametric family of environments whose state space is a unit sphere - according to the p-norm in R^d. - - Note: - The projection function is only a true projection for - p in {2, infinity}. - - ---------------------------------------------------------------------- - State space: - x in R^d: norm_p (x) <= 1 - - implemented as rlberry.spaces.Box representing [0, 1]^d - ---------------------------------------------------------------------- - Action space: - {u_1, ..., u_m} such that u_i in R^d' for i = 1, ..., m - - implemented as rlberry.spaces.Discrete(m) - ---------------------------------------------------------------------- - Reward function (independent of the actions): - r(x) = sum_{i=1}^n b_i max( 0, 1 - norm_p( x - x_i )/c_i ) - - requirements: - c_i >= 0 - b_i in [0, 1] - ---------------------------------------------------------------------- - Transitions: - x_{t+1} = A x_t + B u_t + N - - where - A: square matrix of size d - B: matrix of size (d, d') - N: d-dimensional Gaussian noise with zero mean and covariance - matrix sigma*I - ---------------------------------------------------------------------- - Initial state: - d-dimensional Gaussian with mean mu_init and covariance matrix - sigma_init*I - ---------------------------------------------------------------------- - - Default parameters are provided for a 2D environment, PBall2D - """ - - name = "LP-Ball" - - def __init__( - self, - p, - action_list, - reward_amplitudes, - reward_smoothness, - reward_centers, - A, - B, - sigma, - sigma_init, - mu_init, - ): - """ - Parameters - ----------- - p : int - parameter of the p-norm - action_list : list - list of actions {u_1, ..., u_m}, each action u_i is a - d'-dimensional array - reward_amplitudes: list - list of reward amplitudes: {b_1, ..., b_n} - reward_smoothness : list - list of reward smoothness: {c_1, ..., c_n} - reward_centers : list - list of reward centers: {x_1, ..., x_n} - A : numpy.ndarray - array A of size (d, d) - B : numpy.ndarray - array B of size (d, d') - sigma : double - transition noise sigma - sigma_init : double - initial state noise sigma_init - mu_init : numpy.ndarray - array of size (d,) containing the mean of the initial state - """ - Model.__init__(self) - - assert p >= 1, "PBall requires p>=1" - if p not in [2, np.inf]: - logger.warning( - "For p!=2 or p!=np.inf, PBall \ -does not make true projections onto the lp ball." 
- ) - self.p = p - self.d, self.dp = B.shape # d and d' - self.m = len(action_list) - self.action_list = action_list - self.reward_amplitudes = reward_amplitudes - self.reward_smoothness = reward_smoothness - self.reward_centers = reward_centers - self.A = A - self.B = B - self.sigma = sigma - self.sigma_init = sigma_init - self.mu_init = mu_init - - # State and action spaces - low = -1.0 * np.ones(self.d, dtype=np.float64) - high = np.ones(self.d, dtype=np.float64) - self.observation_space = spaces.Box(low, high) - self.action_space = spaces.Discrete(self.m) - - # reward range - assert len(self.reward_amplitudes) == len(self.reward_smoothness) - assert len(self.reward_amplitudes) == len(self.reward_centers) - if len(self.reward_amplitudes) > 0: - assert ( - self.reward_amplitudes.max() <= 1.0 - and self.reward_amplitudes.min() >= 0.0 - ), "reward amplitudes b_i must be in [0, 1]" - assert ( - self.reward_smoothness.min() > 0.0 - ), "reward smoothness c_i must be > 0" - self.reward_range = (0, 1.0) - - # - self.name = "Lp-Ball" - - # Initalize state - self.reset() - - def reset(self, state=None, seed=None, options=None): - if state is not None: - self.state = state - else: - self.state = self.mu_init + self.sigma_init * self.seeder.rng.normal( - size=self.d - ) - # projection to unit ball - self.state = projection_to_pball(self.state, self.p) - return self.state.copy(), {} - - def sample(self, state, action): - assert self.action_space.contains(action) - assert self.observation_space.contains(state) - - # next state - action_vec = self.action_list[action] - next_s = ( - self.A.dot(state) - + self.B.dot(action_vec) - + self.sigma * self.rng.normal(size=self.d) - ) - next_s = projection_to_pball(next_s, self.p) - - # done and reward - terminated = False - truncated = False - reward = self.compute_reward_at(state) - - return next_s, reward, terminated, truncated, {} - - def step(self, action): - next_s, reward, terminated, truncated, info = self.sample(self.state, action) - self.state = next_s.copy() - return next_s, reward, terminated, truncated, info - - def compute_reward_at(self, x): - reward = 0.0 - for ii, b_ii in enumerate(self.reward_amplitudes): - c_ii = self.reward_smoothness[ii] - x_ii = self.reward_centers[ii] - dist = np.linalg.norm(x - x_ii, ord=self.p) - reward += b_ii * max(0.0, 1.0 - dist / c_ii) - return reward - - def get_reward_lipschitz_constant(self): - ratios = self.reward_amplitudes / self.reward_smoothness - Lr = ratios.max() - return Lr - - def get_transitions_lipschitz_constant(self): - """ - note: considers a fixed action, returns Lipschitz constant - w.r.t. to states. - - If p!=1, p!=2 or p!=np.inf, returns an upper bound on the induced norm - """ - if self.p == 1: - order = np.inf - else: - order = self.p / (self.p - 1.0) - - if order in [1, 2]: - return np.linalg.norm(self.A, ord=order) - - # If p!=1, p!=2 or p!=np.inf, return upper bound on the induced norm. - return np.power(self.d, 1.0 / self.p) * np.linalg.norm(self.A, ord=np.inf) - - -class PBall2D(RenderInterface2D, PBall): - """ - Parametric family of environments whose state space is a unit sphere - according to the p-norm in R^d. - - Parameters - ---------- - p : int, default = 2 - value of p for which p-norm Sphere is considered. - - action_list : list, default = [array([0.05, 0.]), array([- 0.05, - 0.]), - array([0., 0.05]), array([- 0., - 0.05])] - list of actions described as segment in 2D. - - reward_amplitudes: array, default = array([1.]). - See reward function. 
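One step of the deleted `PBall` environment computes x' = proj_p(A x + B u + sigma * N(0, I)) and the state-dependent reward r(x) = sum_i b_i * max(0, 1 - ||x - x_i||_p / c_i). A hedged numpy sketch of a single transition with the default 2D parameters (the `project` helper mirrors `projection_to_pball`, which is exact only for p = 2 or p = inf):

```python
import numpy as np

rng = np.random.default_rng(0)
p, A, B, sigma = 2, np.eye(2), np.eye(2), 0.01
amplitudes, smoothness, centers = np.array([1.0]), np.array([0.25]), [np.array([0.75, 0.0])]

def project(x, p):
    norm = np.linalg.norm(x, ord=p)
    return x if norm <= 1.0 else x / norm          # exact projection for p = 2

def reward(x):
    return sum(
        b * max(0.0, 1.0 - np.linalg.norm(x - c, ord=p) / s)
        for b, s, c in zip(amplitudes, smoothness, centers)
    )

x = np.zeros(2)
u = 0.05 * np.array([1.0, 0.0])                    # one of the four default actions
x_next = project(A @ x + B @ u + sigma * rng.normal(size=2), p)
print(x_next, reward(x))
```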
- - reward_smoothness: array, default = array([0.25]) - See reward function. - - reward_centers: list of arrays, default = [array([0.75, 0.])] - See reward function. - - A: 2D array, default = array([[1., 0.], [0., 1.]]) - See Transition function. - - B: 2D array, default = array([[1., 0.], [0., 1.]]) - See Transition function. - - sigma: float, default = 0.01 - See Transition function. - - sigma_init: float, default = 0.001 - See Initial state. - - mu_init: array of length 2, default = array([0., 0.]) - See Initial state. - - Note: - The projection function is only a true projection for - p in {2, infinity}. - - ---------------------------------------------------------------------- - State space: - x in R^d: norm_p (x) <= 1 - - implemented as rlberry.spaces.Box representing [0, 1]^2 - ---------------------------------------------------------------------- - Action space: - {u_1, ..., u_m} such that u_i in R^2 for i = 1, ..., m - - implemented as rlberry.spaces.Discrete(m) - ---------------------------------------------------------------------- - Reward function (independent of the actions): - r(x) = sum_{i=1}^n b_i max( 0, 1 - norm_p( x - x_i )/c_i ) - - requirements: - c_i >= 0 - b_i in [0, 1] - ---------------------------------------------------------------------- - Transitions: - x_{t+1} = A x_t + B u_t + N - - where - A: square matrix of size 2 - B: matrix of size (2, 2) - N: d-dimensional Gaussian noise with zero mean and covariance - matrix sigma*I - ---------------------------------------------------------------------- - Initial state: - 2-dimensional Gaussian with mean mu_init and covariance matrix - sigma_init*I - ---------------------------------------------------------------------- - - """ - - def __init__( - self, - p=2, - action_list=[ - 0.05 * np.array([1, 0]), - -0.05 * np.array([1, 0]), - 0.05 * np.array([0, 1]), - -0.05 * np.array([0, 1]), - ], - reward_amplitudes=np.array([1.0]), - reward_smoothness=np.array([0.25]), - reward_centers=[np.array([0.75, 0.0])], - A=np.eye(2), - B=np.eye(2), - sigma=0.01, - sigma_init=0.001, - mu_init=np.array([0.0, 0.0]), - ): - # Initialize PBall - PBall.__init__( - self, - p, - action_list, - reward_amplitudes, - reward_smoothness, - reward_centers, - A, - B, - sigma, - sigma_init, - mu_init, - ) - - # Render interface - RenderInterface2D.__init__(self) - - # rendering info - self.set_clipping_area((-1, 1, -1, 1)) - self.set_refresh_interval(50) # in milliseconds - - def step(self, action): - # save state for rendering - if self.is_render_enabled(): - self.append_state_for_rendering(self.state.copy()) - return PBall.step(self, action) - - # - # Code for rendering - # - - def _get_ball_shape(self, xcenter, radius): - shape = GeometricPrimitive("POLYGON") - n_points = 200 - theta_vals = np.linspace(0.0, 2 * np.pi, n_points) - for theta in theta_vals: - pp = np.array([2.0 * np.cos(theta), 2.0 * np.sin(theta)]) - pp = xcenter + radius * projection_to_pball(pp, self.p) - # project to the main ball after translation - pp = projection_to_pball(pp, self.p) - shape.add_vertex((pp[0], pp[1])) - return shape - - def get_background(self): - bg = Scene() - - # ball shape - contour = self._get_ball_shape(np.zeros(2), 1.0) - contour.set_color((0.0, 0.0, 0.5)) - bg.add_shape(contour) - - # reward position - for ii, ampl in enumerate(self.reward_amplitudes): - contour = self._get_ball_shape( - self.reward_centers[ii], self.reward_smoothness[ii] - ) - ampl = 1.0 - ampl # dark violet = more reward - contour.set_color((0.5, 0.0, 0.5 * (1.0 + ampl))) - 
bg.add_shape(contour) - - return bg - - def get_scene(self, state): - scene = Scene() - - agent = GeometricPrimitive("QUADS") - agent.set_color((0.75, 0.0, 0.5)) - size = 0.05 - x = state[0] - y = state[1] - agent.add_vertex((x - size / 4.0, y - size)) - agent.add_vertex((x + size / 4.0, y - size)) - agent.add_vertex((x + size / 4.0, y + size)) - agent.add_vertex((x - size / 4.0, y + size)) - - agent.add_vertex((x - size, y - size / 4.0)) - agent.add_vertex((x + size, y - size / 4.0)) - agent.add_vertex((x + size, y + size / 4.0)) - agent.add_vertex((x - size, y + size / 4.0)) - - scene.add_shape(agent) - return scene - - -class SimplePBallND(PBall): - """ - PBall environment in d dimensions with simple dynamics. - """ - - def __init__( - self, - p=2, - dim=2, - action_amplitude=0.05, - r_smoothness=0.25, - sigma=0.01, - sigma_init=0.001, - mu_init=None, - ): - # Action list - action_list = [] - for dd in range(dim): - aux = np.zeros(dim) - aux[dd] = action_amplitude - action_list.append(aux) - action_list.append(-1 * aux) - - # Rewards - reward_amplitudes = np.array([1.0]) - reward_smoothness = np.array([r_smoothness]) - reward_centers = [np.zeros(dim)] - reward_centers[0][0] = 0.8 - - # Transitions - A = np.eye(dim) - B = np.eye(dim) - - # Initial position - if mu_init is None: - mu_init = np.zeros(dim) - - # Initialize PBall - PBall.__init__( - self, - p, - action_list, - reward_amplitudes, - reward_smoothness, - reward_centers, - A, - B, - sigma, - sigma_init, - mu_init, - ) - - -# if __name__ == '__main__': -# env = PBall2D(p=5) -# print(env.get_transitions_lipschitz_constant()) -# print(env.get_reward_lipschitz_constant()) - -# env.enable_rendering() - -# for ii in range(100): -# env.step(1) -# env.step(3) - -# env.render() diff --git a/rlberry/envs/benchmarks/generalization/__init__.py b/rlberry/envs/benchmarks/generalization/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/envs/benchmarks/generalization/twinrooms.py b/rlberry/envs/benchmarks/generalization/twinrooms.py deleted file mode 100644 index f0619e96b..000000000 --- a/rlberry/envs/benchmarks/generalization/twinrooms.py +++ /dev/null @@ -1,185 +0,0 @@ -import numpy as np -import rlberry.spaces as spaces -from rlberry.envs import Model -from rlberry.rendering import Scene, GeometricPrimitive, RenderInterface2D -from rlberry.rendering.common_shapes import circle_shape - -import rlberry - -logger = rlberry.logger - - -class TwinRooms(RenderInterface2D, Model): - """ - Two continuous grid worlds, side by side, separated by a wall. - Both are identical (or almost identical), and the agent has equal probability to - start in any of the two rooms. - - It can be used to test the generalization capability of agents: - a policy learned in one of the rooms can be used to learn faster - a policy in the other room. - - There are 4 actions, one for each direction (left/right/up/down). - - Parameters - ---------- - noise_room1: double, default: 0.01 - Noise in the transitions of the first room. - noise_room2: double, default: 0.01 - Noise in the transitions of the second room. - - Notes - ----- - The function env.sample() does not handle conversions to array states - when array_observation is True. Only the functions env.reset() and - env.step() are covered. 
- """ - - name = "TwinRooms" - - def __init__(self, noise_room1=0.01, noise_room2=0.01): - Model.__init__(self) - RenderInterface2D.__init__(self) - - self.noise_room1 = noise_room1 - self.noise_room2 = noise_room2 - - self.observation_space = spaces.Box( - low=np.array([0.0, 0.0]), - high=np.array([2.0, 1.0]), - ) - self.action_space = spaces.Discrete(4) - self.reward_range = (0.0, 1.0) - - self.room_noises = [noise_room1, noise_room2] - - # environment parameters - self.action_displacement = 0.1 - self.wall_eps = 0.05 - - # base reward position - self.base_reward_pos = np.array([0.8, 0.8]) - - # rendering info - self.set_clipping_area((0, 2, 0, 1)) - self.set_refresh_interval(100) # in milliseconds - self.renderer_type = "opengl" - - # reset - self.reset() - - def reset(self, seed=None, options=None): - self.current_room = self.seeder.rng.integers(2) - if self.current_room == 0: - self.state = np.array([0.1, 0.1]) - else: - self.state = np.array([1.1, 0.1]) - return self.state.copy(), {} - - def _reward_fn(self, state): - # max reward at (x, y) = reward_pos - reward_pos = self.base_reward_pos - if self.current_room == 1: - reward_pos = reward_pos + np.array([1.0, 0.0]) - xr, yr = reward_pos - - dist = np.sqrt((state[0] - xr) ** 2.0 + (state[1] - yr) ** 2.0) - reward = max(0.0, 1.0 - dist / 0.1) - return reward - - def _clip_to_room(self, state): - state[1] = max(0.0, state[1]) - state[1] = min(1.0, state[1]) - if self.current_room == 0: - state[0] = max(0.0, state[0]) - state[0] = min(1.0 - self.wall_eps, state[0]) - else: - state[0] = max(1.0 + self.wall_eps, state[0]) - state[0] = min(2.0, state[0]) - return state - - def step(self, action): - assert self.action_space.contains(action), "Invalid action!" - - # save state for rendering - if self.is_render_enabled(): - self.append_state_for_rendering(self.state) - - next_state, reward, terminated, truncated, info = self.sample( - self.state, action - ) - self.state = next_state - return self.state.copy(), reward, terminated, truncated, info - - def sample(self, state, action): - delta = self.action_displacement - if action == 0: - displacement = np.array([delta, 0.0]) - elif action == 1: - displacement = np.array([-delta, 0.0]) - elif action == 2: - displacement = np.array([0.0, delta]) - elif action == 3: - displacement = np.array([0.0, -delta]) - else: - raise ValueError("Invalid action") - - next_state = ( - state - + displacement - + self.room_noises[self.current_room] * self.rng.normal(size=2) - ) - - # clip to room - next_state = self._clip_to_room(next_state) - - reward = self._reward_fn(state) - terminated = False - truncated = False - info = {} - - return next_state, reward, terminated, truncated, info - - # - # Code for rendering - # - - def get_background(self): - """ - Returne a scene (list of shapes) representing the background - """ - bg = Scene() - - # wall - eps = self.wall_eps - shape = GeometricPrimitive("POLYGON") - shape.set_color((0.25, 0.25, 0.25)) - shape.add_vertex((1 - eps, 0)) - shape.add_vertex((1 - eps, 1)) - shape.add_vertex((1 + eps, 1)) - shape.add_vertex((1 + eps, 0)) - bg.add_shape(shape) - - # rewards - for x, y in [ - self.base_reward_pos, - self.base_reward_pos + np.array([1.0, 0.0]), - ]: - reward = circle_shape((x, y), 0.1, n_points=50) - reward.type = "POLYGON" - reward.set_color((0.0, 0.5, 0.0)) - bg.add_shape(reward) - - return bg - - def get_scene(self, state): - """ - Return scene (list of shapes) representing a given state - """ - x, y = state - scene = Scene() - agent = circle_shape((x, y), 
0.02, n_points=5) - agent.type = "POLYGON" - agent.set_color((0.75, 0.0, 0.5)) - scene.add_shape(agent) - return scene diff --git a/rlberry/envs/benchmarks/grid_exploration/__init__.py b/rlberry/envs/benchmarks/grid_exploration/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/envs/benchmarks/grid_exploration/apple_gold.py b/rlberry/envs/benchmarks/grid_exploration/apple_gold.py deleted file mode 100644 index 4a4599156..000000000 --- a/rlberry/envs/benchmarks/grid_exploration/apple_gold.py +++ /dev/null @@ -1,180 +0,0 @@ -import numpy as np -import rlberry.spaces as spaces -from rlberry.envs.finite import GridWorld -from rlberry.rendering import Scene, GeometricPrimitive - -import rlberry - -logger = rlberry.logger - - -class AppleGold(GridWorld): - """ - AppleGold with six rooms: this is merely a slightly modified - version of SixRoom. - - Parameters - ---------- - reward_free : bool, default=False - If true, no rewards are given to the agent. - array_observation: - If true, the observations are converted to an array (x, y) - instead of a discrete index. - - Notes - ----- - The function env.sample() does not handle conversions to array states - when array_observation is True. Only the functions env.reset() and - env.step() are covered. - - Reference - --------- - .. seaalso:: - Guo et al.: Self-Imitation Learning via - Trajectory-Conditioned Policy - for Hard-Exploration Tasks - arXiv preprint arXiv:1907.10247 - """ - - name = "AppleGold" - - def __init__(self, reward_free=False, array_observation=False): - self.reward_free = reward_free - self.array_observation = array_observation - - # Common parameters - nrows = 13 - ncols = 17 - start_coord = (5, 1) - terminal_states = ((7, 7),) - success_probability = 0.95 - # - walls = () - for ii in range(13): - walls += ((ii, 0),) - walls += ((ii, 16),) - for jj in range(17): - walls += ((0, jj),) - walls += ((12, jj),) - for ii in range(13): - if ii not in [1, 11]: - walls += ((ii, 6),) - walls += ((ii, 10),) - walls += ((11, 6),) - for jj in range(17): - if jj not in [1, 15]: - walls += ((6, jj),) - - # Default reward according to the difficulty - default_reward = 0 - - # Rewards according to the difficulty - if self.reward_free: - reward_at = {} - else: - reward_at = {(7, 7): 10.0, (8, 2): 1.0, (10, 3): 1.0} - for jj in range(7, 16): - for ii in range(1, 12): - if (ii, jj) not in walls and (ii, jj) != (7, 7): - reward_at[(ii, jj)] = -0.05 - - # Init base class - GridWorld.__init__( - self, - nrows=nrows, - ncols=ncols, - start_coord=start_coord, - terminal_states=terminal_states, - success_probability=success_probability, - reward_at=reward_at, - walls=walls, - default_reward=default_reward, - ) - - # spaces - if self.array_observation: - self.observation_space = spaces.Box(0.0, 1.0, shape=(2,)) - - def _convert_index_to_float_coord(self, state_index): - yy, xx = self.index2coord[state_index] - - # centering - xx = xx + 0.5 - yy = yy + 0.5 - # map to [0, 1] - xx = xx / self.ncols - yy = yy / self.nrows - return np.array([xx, yy]) - - def reset(self, seed=None, options=None): - self.state = self.coord2index[self.start_coord] - state_to_return = self.state - if self.array_observation: - state_to_return = self._convert_index_to_float_coord(self.state) - if self.render_mode == "human": - self.render() - return state_to_return, {} - - def step(self, action): - assert self.action_space.contains(action), "Invalid action!" 
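When `array_observation=True`, the deleted grid environments convert a discrete cell into the centre of that cell in [0, 1] x [0, 1] via `_convert_index_to_float_coord`. A hedged sketch of that mapping using the AppleGold grid size (13 x 17):

```python
import numpy as np

nrows, ncols = 13, 17                      # AppleGold layout

def coord_to_float(row, col):
    # centre of the cell, rescaled to the unit square, returned as (x, y)
    return np.array([(col + 0.5) / ncols, (row + 0.5) / nrows])

print(coord_to_float(5, 1))                # start cell (5, 1) -> approx [0.088, 0.423]
```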
- - # save state for rendering - if self.is_render_enabled(): - self.append_state_for_rendering(self.state) - - # take step - next_state, reward, terminated, truncated, info = self.sample( - self.state, action - ) - self.state = next_state - - state_to_return = self.state - if self.array_observation: - state_to_return = self._convert_index_to_float_coord(self.state) - if self.render_mode == "human": - self.render() - return state_to_return, reward, terminated, truncated, info - - def get_background(self): - """ - Returne a scene (list of shapes) representing the background - """ - bg = Scene() - - # walls - for wall in self.walls: - y, x = wall - shape = GeometricPrimitive("POLYGON") - shape.set_color((0.25, 0.25, 0.25)) - shape.add_vertex((x, y)) - shape.add_vertex((x + 1, y)) - shape.add_vertex((x + 1, y + 1)) - shape.add_vertex((x, y + 1)) - bg.add_shape(shape) - - # rewards - for y, x in self.reward_at: - rwd = self.reward_at[(y, x)] - if rwd == -0.05: - rock = GeometricPrimitive("POLYGON") - rock.set_color((0.6, 0.6, 0.6)) - rock.add_vertex((x, y)) - rock.add_vertex((x + 1, y)) - rock.add_vertex((x + 1, y + 1)) - rock.add_vertex((x, y + 1)) - bg.add_shape(rock) - else: - flag = GeometricPrimitive("POLYGON") - if rwd == 10: - flag.set_color((0.0, 0.5, 0.0)) - elif rwd == 1: - flag.set_color((0.0, 0.0, 0.5)) - - x += 0.5 - y += 0.25 - flag.add_vertex((x, y)) - flag.add_vertex((x + 0.25, y + 0.5)) - flag.add_vertex((x - 0.25, y + 0.5)) - bg.add_shape(flag) - - return bg diff --git a/rlberry/envs/benchmarks/grid_exploration/four_room.py b/rlberry/envs/benchmarks/grid_exploration/four_room.py deleted file mode 100644 index b4e2d67a5..000000000 --- a/rlberry/envs/benchmarks/grid_exploration/four_room.py +++ /dev/null @@ -1,130 +0,0 @@ -import numpy as np -import rlberry.spaces as spaces -from rlberry.envs.finite import GridWorld - -import rlberry - -logger = rlberry.logger - - -class FourRoom(GridWorld): - """ - GridWorld with four rooms. - - Parameters - ---------- - reward_free : bool, default=False - If true, no rewards are given to the agent. - difficulty: int, {0, 1 or 2} - Difficulty 0: reward in one location - Difficulty 1: easy suboptimal reward, hard optimal reward - Difficulty 2: easy suboptimal reward, hard optimal reward, - negative rewards by default. - Note: this parameter is ignored if reward_free is True. - array_observation: - If true, the observations are converted to an array (x, y) - instead of a discrete index. - - Notes - ----- - The function env.sample() does not handle conversions to array states - when array_observation is True. Only the functions env.reset() and - env.step() are covered. 
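The FourRoom docstring above describes three difficulty levels; the constructor that follows maps them to a default step reward and a `reward_at` dictionary for the underlying GridWorld. A hedged summary of that mapping as a small helper (the function name is illustrative):

```python
def four_room_rewards(difficulty, reward_free=False):
    """Return (default_reward, reward_at) as configured by FourRoom.__init__."""
    default_reward = -0.005 if difficulty == 2 else 0.0
    if reward_free:
        reward_at = {}
    elif difficulty == 0:
        reward_at = {(8, 0): 1.0}            # single goal reward
    else:                                    # difficulty 1 or 2
        reward_at = {(8, 0): 1.0, (3, 3): 0.1}   # easy suboptimal + hard optimal
    return default_reward, reward_at

print(four_room_rewards(2))   # (-0.005, {(8, 0): 1.0, (3, 3): 0.1})
```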
- """ - - name = "FourRoom" - - def __init__(self, reward_free=False, difficulty=0, array_observation=False): - self.reward_free = reward_free - self.difficulty = difficulty - self.array_observation = array_observation - - if difficulty not in [0, 1, 2]: - raise ValueError("FourRoom difficulty must be in [0, 1, 2]") - - # Common parameters - nrows = 9 - ncols = 9 - start_coord = (0, 0) - terminal_states = ((8, 0),) - success_probability = 0.95 - # - walls = () - for ii in range(9): - if ii not in [2, 6]: - walls += ((ii, 4),) - for jj in range(9): - if jj != 7: - walls += ((4, jj),) - - # Default reward according to the difficulty - if difficulty in [0, 1]: - default_reward = 0.0 - elif difficulty == 2: - default_reward = -0.005 - - # Rewards according to the difficulty - if self.reward_free: - reward_at = {} - else: - if difficulty == 0: - reward_at = {(8, 0): 1.0} - elif difficulty in [1, 2]: - reward_at = { - (8, 0): 1.0, - (3, 3): 0.1, - } - - # Init base class - GridWorld.__init__( - self, - nrows=nrows, - ncols=ncols, - start_coord=start_coord, - terminal_states=terminal_states, - success_probability=success_probability, - reward_at=reward_at, - walls=walls, - default_reward=default_reward, - ) - - # spaces - if self.array_observation: - self.observation_space = spaces.Box(0.0, 1.0, shape=(2,)) - - def _convert_index_to_float_coord(self, state_index): - yy, xx = self.index2coord[state_index] - - # centering - xx = xx + 0.5 - yy = yy + 0.5 - # map to [0, 1] - xx = xx / self.ncols - yy = yy / self.nrows - return np.array([xx, yy]) - - def reset(self, seed=None, options=None): - self.state = self.coord2index[self.start_coord] - state_to_return = self.state - if self.array_observation: - state_to_return = self._convert_index_to_float_coord(self.state) - return state_to_return, {} - - def step(self, action): - assert self.action_space.contains(action), "Invalid action!" - - # save state for rendering - if self.is_render_enabled(): - self.append_state_for_rendering(self.state) - - # take step - next_state, reward, terminated, truncated, info = self.sample( - self.state, action - ) - self.state = next_state - - state_to_return = self.state - if self.array_observation: - state_to_return = self._convert_index_to_float_coord(self.state) - - return state_to_return, reward, terminated, truncated, info diff --git a/rlberry/envs/benchmarks/grid_exploration/nroom.py b/rlberry/envs/benchmarks/grid_exploration/nroom.py deleted file mode 100644 index 51cc0f279..000000000 --- a/rlberry/envs/benchmarks/grid_exploration/nroom.py +++ /dev/null @@ -1,305 +0,0 @@ -import math -import numpy as np -import rlberry.spaces as spaces -from rlberry.envs.finite import GridWorld -from rlberry.rendering import Scene, GeometricPrimitive - -import rlberry - -logger = rlberry.logger - - -# def get_nroom_state_coord(state_index, nroom_env): -# yy, xx = nroom_env.index2coord[state_index] -# # centering -# xx = xx + 0.5 -# yy = yy + 0.5 -# # map to [0, 1] -# xx = xx / nroom_env.ncols -# yy = yy / nroom_env.nrows -# return np.array([xx, yy]) - - -class NRoom(GridWorld): - """ - GridWorld with N rooms of size L x L. The agent starts in the middle room. - - There is one small and easy reward in the first room, - one big reward in the last room and zero reward elsewhere. - - There is a 5% error probability in the transitions when taking an action. - - Parameters - ---------- - nrooms : int - Number of rooms. - reward_free : bool, default=False - If true, no rewards are given to the agent. 
- array_observation: - If true, the observations are converted to an array (x, y) - instead of a discrete index. - The underlying discrete space is saved in env.discrete_observation_space. - room_size : int - Dimension (L) of each room (L x L). - success_probability : double, default: 0.95 - Sucess probability of an action. A failure is going to the wrong direction. - remove_walls : bool, default: False - If True, remove walls. Useful for debug. - initial_state_distribution: {'center', 'uniform'} - If 'center', always start at the center. - If 'uniform', start anywhere with uniform probability. - include_traps: bool, default: False - If true, each room will have a terminal state (a "trap"). - Notes - ----- - The function env.sample() does not handle conversions to array states - when array_observation is True. Only the functions env.reset() and - env.step() are covered. - """ - - name = "N-Room" - - def __init__( - self, - nrooms=7, - reward_free=False, - array_observation=False, - room_size=5, - success_probability=0.95, - remove_walls=False, - initial_state_distribution="center", - include_traps=False, - ): - assert nrooms > 0, "nrooms must be > 0" - assert initial_state_distribution in ("center", "uniform") - - self.reward_free = reward_free - self.array_observation = array_observation - self.nrooms = nrooms - self.room_size = room_size - self.success_probability = success_probability - self.remove_walls = remove_walls - self.initial_state_distribution = initial_state_distribution - self.include_traps = include_traps - - # Max number of rooms/columns per row - self.max_rooms_per_row = 5 - - # Room size (default = 5x5) - self.room_size = room_size - - # Grid size - self.room_nrows = math.ceil(nrooms / self.max_rooms_per_row) - if self.room_nrows > 1: - self.room_ncols = self.max_rooms_per_row - else: - self.room_ncols = nrooms - nrows = self.room_size * self.room_nrows + (self.room_nrows - 1) - ncols = self.room_size * self.room_ncols + (self.room_ncols - 1) - - # # walls - walls = [] - for room_col in range(self.room_ncols - 1): - col = (room_col + 1) * (self.room_size + 1) - 1 - for jj in range(nrows): - if (jj % (self.room_size + 1)) != (self.room_size // 2): - walls.append((jj, col)) - - for room_row in range(self.room_nrows - 1): - row = (room_row + 1) * (self.room_size + 1) - 1 - for jj in range(ncols): - walls.append((row, jj)) - - # process each room - start_coord = None - terminal_state = None - self.traps = [] - count = 0 - for room_r in range(self.room_nrows): - if room_r % 2 == 0: - cols_iterator = range(self.room_ncols) - else: - cols_iterator = reversed(range(self.room_ncols)) - for room_c in cols_iterator: - # existing rooms - if count < self.nrooms: - # remove top wall - if ((room_c == self.room_ncols - 1) and (room_r % 2 == 0)) or ( - (room_c == 0) and (room_r % 2 == 1) - ): - if room_r != self.room_nrows - 1: - wall_to_remove = self._convert_room_coord_to_global( - room_r, room_c, self.room_size, self.room_size // 2 - ) - if wall_to_remove in walls: - walls.remove(wall_to_remove) - # rooms to remove - else: - for ii in range(-1, self.room_size + 1): - for jj in range(-1, self.room_size + 1): - wall_to_include = self._convert_room_coord_to_global( - room_r, room_c, ii, jj - ) - if ( - wall_to_include[0] >= 0 - and wall_to_include[0] < nrows - and wall_to_include[1] >= 0 - and wall_to_include[1] < ncols - and (wall_to_include not in walls) - ): - walls.append(wall_to_include) - pass - - # start coord - if count == nrooms // 2: - start_coord = 
self._convert_room_coord_to_global( - room_r, room_c, self.room_size // 2, self.room_size // 2 - ) - # terminal state - if count == nrooms - 1: - terminal_state = self._convert_room_coord_to_global( - room_r, room_c, self.room_size // 2, self.room_size // 2 - ) - # trap - if include_traps: - self.traps.append( - self._convert_room_coord_to_global( - room_r, - room_c, - self.room_size // 2 + 1, - self.room_size // 2 + 1, - ) - ) - count += 1 - - terminal_states = (terminal_state,) + tuple(self.traps) - - if self.reward_free: - reward_at = {} - else: - reward_at = { - terminal_state: 1.0, - start_coord: 0.01, - (self.room_size // 2, self.room_size // 2): 0.1, - } - - # Check remove_walls - if remove_walls: - walls = () - - # Init base class - GridWorld.__init__( - self, - nrows=nrows, - ncols=ncols, - start_coord=start_coord, - terminal_states=terminal_states, - success_probability=success_probability, - reward_at=reward_at, - walls=walls, - default_reward=0.0, - ) - - # Check initial distribution - if initial_state_distribution == "uniform": - distr = np.ones(self.observation_space.n) / self.observation_space.n - self.set_initial_state_distribution(distr) - - # spaces - if self.array_observation: - self.discrete_observation_space = self.observation_space - self.observation_space = spaces.Box(0.0, 1.0, shape=(2,)) - - def _convert_room_coord_to_global( - self, room_row, room_col, room_coord_row, room_coord_col - ): - col_offset = (self.room_size + 1) * room_col - row_offset = (self.room_size + 1) * room_row - - row = room_coord_row + row_offset - col = room_coord_col + col_offset - return (row, col) - - def _convert_index_to_float_coord(self, state_index): - yy, xx = self.index2coord[state_index] - - # centering - xx = xx + 0.5 - yy = yy + 0.5 - # map to [0, 1] - xx = xx / self.ncols - yy = yy / self.nrows - return np.array([xx, yy]) - - def reset(self, seed=None, options=None): - self.state, info = GridWorld.reset(self, seed=seed, options=options) - state_to_return = self.state - if self.array_observation: - state_to_return = self._convert_index_to_float_coord(self.state) - return state_to_return, info - - def step(self, action): - assert self.action_space.contains(action), "Invalid action!" 
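A quick construction sketch for the NRoom environment above, using a few of the options documented in its docstring (illustration only, not diff content; pre-removal import path):

    from rlberry.envs.benchmarks.grid_exploration.nroom import NRoom

    # Five 3x3 rooms, uniform initial state, one trap (extra terminal state) per
    # room; observations stay discrete indices because array_observation is False.
    env = NRoom(
        nrooms=5,
        room_size=3,
        initial_state_distribution="uniform",
        include_traps=True,
    )
    obs, info = env.reset()
    obs, reward, terminated, truncated, info = env.step(env.action_space.sample())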
- - # save state for rendering - if self.is_render_enabled(): - self.append_state_for_rendering(self.state) - - # take step - next_state, reward, terminated, truncated, info = self.sample( - self.state, action - ) - self.state = next_state - - state_to_return = self.state - if self.array_observation: - state_to_return = self._convert_index_to_float_coord(self.state) - - return state_to_return, reward, terminated, truncated, info - - def get_background(self): - """ - Returne a scene (list of shapes) representing the background - """ - bg = Scene() - - # traps - for y, x in self.traps: - shape = GeometricPrimitive("POLYGON") - shape.set_color((0.5, 0.0, 0.0)) - shape.add_vertex((x, y)) - shape.add_vertex((x + 1, y)) - shape.add_vertex((x + 1, y + 1)) - shape.add_vertex((x, y + 1)) - bg.add_shape(shape) - - # walls - for wall in self.walls: - y, x = wall - shape = GeometricPrimitive("POLYGON") - shape.set_color((0.25, 0.25, 0.25)) - shape.add_vertex((x, y)) - shape.add_vertex((x + 1, y)) - shape.add_vertex((x + 1, y + 1)) - shape.add_vertex((x, y + 1)) - bg.add_shape(shape) - - # rewards - for y, x in self.reward_at: - flag = GeometricPrimitive("POLYGON") - rwd = self.reward_at[(y, x)] - if rwd == 1.0: - flag.set_color((0.0, 0.5, 0.0)) - elif rwd == 0.1: - flag.set_color((0.0, 0.0, 0.5)) - else: - flag.set_color((0.5, 0.0, 0.0)) - - x += 0.5 - y += 0.25 - flag.add_vertex((x, y)) - flag.add_vertex((x + 0.25, y + 0.5)) - flag.add_vertex((x - 0.25, y + 0.5)) - bg.add_shape(flag) - - return bg diff --git a/rlberry/envs/benchmarks/grid_exploration/six_room.py b/rlberry/envs/benchmarks/grid_exploration/six_room.py deleted file mode 100644 index 4af6fdb28..000000000 --- a/rlberry/envs/benchmarks/grid_exploration/six_room.py +++ /dev/null @@ -1,151 +0,0 @@ -import numpy as np -import rlberry.spaces as spaces -from rlberry.envs.finite import GridWorld -from rlberry.rendering import Scene, GeometricPrimitive - -import rlberry - -logger = rlberry.logger - - -class SixRoom(GridWorld): - """ - GridWorld with six rooms. - - Parameters - ---------- - reward_free : bool, default=False - If true, no rewards are given to the agent. - array_observation: - If true, the observations are converted to an array (x, y) - instead of a discrete index. - - Notes - ----- - The function env.sample() does not handle conversions to array states - when array_observation is True. Only the functions env.reset() and - env.step() are covered. 
- """ - - name = "SixRoom" - - def __init__(self, reward_free=False, array_observation=False): - self.reward_free = reward_free - self.array_observation = array_observation - - # Common parameters - nrows = 11 - ncols = 17 - start_coord = (0, 0) - terminal_states = ((10, 0),) - success_probability = 0.95 - # - walls = () - for ii in range(11): - if ii not in [2, 8]: - walls += ((ii, 5),) - walls += ((ii, 11),) - for jj in range(17): - if jj != 15: - walls += ((5, jj),) - - # Default reward according to the difficulty - default_reward = -0.001 - - # Rewards according to the difficulty - if self.reward_free: - reward_at = {} - else: - reward_at = { - (10, 0): 10.0, - (4, 4): 0.1, - } - - # Init base class - GridWorld.__init__( - self, - nrows=nrows, - ncols=ncols, - start_coord=start_coord, - terminal_states=terminal_states, - success_probability=success_probability, - reward_at=reward_at, - walls=walls, - default_reward=default_reward, - ) - - # spaces - if self.array_observation: - self.observation_space = spaces.Box(0.0, 1.0, shape=(2,)) - - def _convert_index_to_float_coord(self, state_index): - yy, xx = self.index2coord[state_index] - - # centering - xx = xx + 0.5 - yy = yy + 0.5 - # map to [0, 1] - xx = xx / self.ncols - yy = yy / self.nrows - return np.array([xx, yy]) - - def reset(self, seed=None, options=None): - self.state = self.coord2index[self.start_coord] - state_to_return = self.state - if self.array_observation: - state_to_return = self._convert_index_to_float_coord(self.state) - return state_to_return, {} - - def step(self, action): - assert self.action_space.contains(action), "Invalid action!" - - # save state for rendering - if self.is_render_enabled(): - self.append_state_for_rendering(self.state) - - # take step - next_state, reward, terminated, truncated, info = self.sample( - self.state, action - ) - self.state = next_state - - state_to_return = self.state - if self.array_observation: - state_to_return = self._convert_index_to_float_coord(self.state) - - return state_to_return, reward, terminated, truncated, info - - def get_background(self): - """ - Returne a scene (list of shapes) representing the background - """ - bg = Scene() - - # walls - for wall in self.walls: - y, x = wall - shape = GeometricPrimitive("POLYGON") - shape.set_color((0.25, 0.25, 0.25)) - shape.add_vertex((x, y)) - shape.add_vertex((x + 1, y)) - shape.add_vertex((x + 1, y + 1)) - shape.add_vertex((x, y + 1)) - bg.add_shape(shape) - - # rewards - for y, x in self.reward_at: - flag = GeometricPrimitive("POLYGON") - rwd = self.reward_at[(y, x)] - if rwd == 10: - flag.set_color((0.0, 0.5, 0.0)) - else: - flag.set_color((0.0, 0.0, 0.5)) - - x += 0.5 - y += 0.25 - flag.add_vertex((x, y)) - flag.add_vertex((x + 0.25, y + 0.5)) - flag.add_vertex((x - 0.25, y + 0.5)) - bg.add_shape(flag) - - return bg diff --git a/rlberry/envs/bullet3/data/__init__.py b/rlberry/envs/bullet3/data/__init__.py deleted file mode 100644 index fa3615af0..000000000 --- a/rlberry/envs/bullet3/data/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -import os - - -def getDataPath(): - resdir = os.path.join(os.path.dirname(__file__)) - return resdir diff --git a/rlberry/envs/bullet3/data/mjcf/pendulum.xml b/rlberry/envs/bullet3/data/mjcf/pendulum.xml deleted file mode 100644 index e27fc88f4..000000000 --- a/rlberry/envs/bullet3/data/mjcf/pendulum.xml +++ /dev/null @@ -1,28 +0,0 @@ - - - - - - - - - diff --git a/rlberry/envs/bullet3/data/pendulum.urdf b/rlberry/envs/bullet3/data/pendulum.urdf deleted file mode 100644 index 
af450f425..000000000 --- a/rlberry/envs/bullet3/data/pendulum.urdf +++ /dev/null @@ -1,51 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/rlberry/envs/bullet3/pybullet_envs/__init__.py b/rlberry/envs/bullet3/pybullet_envs/__init__.py deleted file mode 100644 index 093f8d9eb..000000000 --- a/rlberry/envs/bullet3/pybullet_envs/__init__.py +++ /dev/null @@ -1,40 +0,0 @@ -import gymnasium as gym -from gym.envs.registration import registry, make, spec - - -def register(id, *args, **kvargs): - if id in registry.env_specs: - return - else: - return gym.envs.registration.register(id, *args, **kvargs) - - -# ------------bullet------------- - -register( - id="PendulumBulletEnv-v0", - entry_point="rlberry.envs.bullet3.pybullet_envs.gym_pendulum_envs:PendulumBulletEnv", - max_episode_steps=1000, - reward_threshold=950.0, -) - -register( - id="PendulumSwingupBulletEnv-v0", - entry_point="rlberry.envs.bullet3.pybullet_envs.gym_pendulum_envs:PendulumSwingupBulletEnv", - max_episode_steps=1000, - reward_threshold=800.0, -) - -register( - id="DiscretePendulumBulletEnv-v0", - entry_point="rlberry.envs.bullet3.pybullet_envs.gym_pendulum_envs:DiscretePendulumBulletEnv", - max_episode_steps=1000, - reward_threshold=950.0, -) - -register( - id="DiscretePendulumSwingupBulletEnv-v0", - entry_point="rlberry.envs.bullet3.pybullet_envs.gym_pendulum_envs:DiscretePendulumSwingupBulletEnv", - max_episode_steps=1000, - reward_threshold=800.0, -) diff --git a/rlberry/envs/bullet3/pybullet_envs/gym_pendulum_envs.py b/rlberry/envs/bullet3/pybullet_envs/gym_pendulum_envs.py deleted file mode 100644 index 32ce80c6a..000000000 --- a/rlberry/envs/bullet3/pybullet_envs/gym_pendulum_envs.py +++ /dev/null @@ -1,80 +0,0 @@ -from gym import spaces -from pybullet_envs.env_bases import MJCFBaseBulletEnv -from pybullet_envs.gym_pendulum_envs import InvertedPendulumBulletEnv -from pybullet_envs.scene_abstract import SingleRobotEmptyScene - -from rlberry.envs.bullet3.pybullet_envs.robot_pendula import Pendulum, PendulumSwingup -import numpy as np - - -class PendulumBulletEnv(InvertedPendulumBulletEnv): - """Simple pendulum""" - - def __init__(self): - self.robot = Pendulum() - MJCFBaseBulletEnv.__init__(self, self.robot) - self.stateId = -1 - - def create_single_player_scene(self, bullet_client): - return SingleRobotEmptyScene( - bullet_client, gravity=9.81, timestep=0.02, frame_skip=1 - ) - - def step(self, a): - self.robot.apply_action(a) - self.scene.global_step() - state = self.robot.calc_state() # sets self.pos_x self.pos_y - if self.robot.swingup: - reward = np.cos(self.robot.theta) - done = False - else: - reward = 1.0 - done = np.abs(self.robot.theta) > 0.2 - self.rewards = [float(reward)] - self.HUD(state, a, done) - return state, sum(self.rewards), done, {} - - -class PendulumSwingupBulletEnv(PendulumBulletEnv): - def __init__(self): - self.robot = PendulumSwingup() - MJCFBaseBulletEnv.__init__(self, self.robot) - self.stateId = -1 - - -class DiscretePendulumBulletEnv(PendulumBulletEnv): - """pybullet's InvertedPendulum with discrete actions""" - - def __init__(self): - super().__init__() - self.continuous_action_space = self.action_space - self.action_space = spaces.Discrete(3) - - def step(self, a): - if a == 0: - return super().step(self.continuous_action_space.low) - elif a == 1: - return super().step(self.continuous_action_space.high) - elif a == 2: - return super().step(np.zeros(self.continuous_action_space.shape)) - else: - raise IndexError - - 
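The discrete pendulum variants in this file map Discrete(3) actions onto the low, high, and zero points of the wrapped continuous action space. The same pattern as a standalone gymnasium ActionWrapper, for illustration only (the wrapper name is hypothetical and not part of the diff):

    import numpy as np
    import gymnasium as gym


    class ThreeActionDiscretizer(gym.ActionWrapper):
        """Map Discrete(3) actions onto {low, high, 0} of a 1-D Box action space."""

        def __init__(self, env):
            super().__init__(env)
            self.continuous_action_space = env.action_space
            self.action_space = gym.spaces.Discrete(3)

        def action(self, a):
            # 0 -> minimum torque, 1 -> maximum torque, 2 -> no torque
            if a == 0:
                return self.continuous_action_space.low
            if a == 1:
                return self.continuous_action_space.high
            return np.zeros(self.continuous_action_space.shape)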
-class DiscretePendulumSwingupBulletEnv(PendulumSwingupBulletEnv): - """pybullet's InvertedPendulumSwingup with discrete actions""" - - def __init__(self): - super().__init__() - self.continuous_action_space = self.action_space - self.action_space = spaces.Discrete(3) - - def step(self, a): - if a == 0: - return super().step(self.continuous_action_space.low) - elif a == 1: - return super().step(self.continuous_action_space.high) - elif a == 2: - return super().step(np.zeros(self.continuous_action_space.shape)) - else: - raise IndexError diff --git a/rlberry/envs/bullet3/pybullet_envs/robot_bases.py b/rlberry/envs/bullet3/pybullet_envs/robot_bases.py deleted file mode 100644 index d2dc50e75..000000000 --- a/rlberry/envs/bullet3/pybullet_envs/robot_bases.py +++ /dev/null @@ -1,123 +0,0 @@ -import os -import pybullet -from pybullet_envs.robot_bases import MJCFBasedRobot, URDFBasedRobot - -# Use our custom data -from rlberry.envs.bullet3 import data - - -class MJCFBasedRobot2(MJCFBasedRobot): - def reset(self, bullet_client): - self._p = bullet_client - # print("Created bullet_client with id=", self._p._client) - if self.doneLoading == 0: - self.ordered_joints = [] - self.doneLoading = 1 - if self.self_collision: - self.objects = self._p.loadMJCF( - os.path.join(data.getDataPath(), "mjcf", self.model_xml), - flags=pybullet.URDF_USE_SELF_COLLISION - | pybullet.URDF_USE_SELF_COLLISION_EXCLUDE_ALL_PARENTS - | pybullet.URDF_GOOGLEY_UNDEFINED_COLORS, - ) - ( - self.parts, - self.jdict, - self.ordered_joints, - self.robot_body, - ) = self.addToScene(self._p, self.objects) - else: - self.objects = self._p.loadMJCF( - os.path.join( - data.getDataPath(), - "mjcf", - self.model_xml, - flags=pybullet.URDF_GOOGLEY_UNDEFINED_COLORS, - ) - ) - ( - self.parts, - self.jdict, - self.ordered_joints, - self.robot_body, - ) = self.addToScene(self._p, self.objects) - self.robot_specific_reset(self._p) - - s = ( - self.calc_state() - ) # optimization: calc_state() can calculate something in self.* for calc_potential() to use - - return s - - -class URDFBasedRobot2(URDFBasedRobot): - def __init__( - self, - model_urdf, - robot_name, - action_dim, - obs_dim, - basePosition=[0, 0, 0], - baseOrientation=[0, 0, 0, 1], - fixed_base=False, - self_collision=False, - ): - super().__init__( - model_urdf, - robot_name, - action_dim, - obs_dim, - basePosition, - baseOrientation, - fixed_base, - self_collision, - ) - self.doneLoading = 0 - - def reset(self, bullet_client): - self._p = bullet_client - if self.doneLoading == 0: - self.ordered_joints = [] - self.doneLoading = 1 - if self.self_collision: - ( - self.parts, - self.jdict, - self.ordered_joints, - self.robot_body, - ) = self.addToScene( - self._p, - self._p.loadURDF( - os.path.join(data.getDataPath(), self.model_urdf), - basePosition=self.basePosition, - baseOrientation=self.baseOrientation, - useFixedBase=self.fixed_base, - flags=pybullet.URDF_USE_SELF_COLLISION - | pybullet.URDF_GOOGLEY_UNDEFINED_COLORS, - ), - ) - else: - ( - self.parts, - self.jdict, - self.ordered_joints, - self.robot_body, - ) = self.addToScene( - self._p, - self._p.loadURDF( - os.path.join(data.getDataPath(), self.model_urdf), - basePosition=self.basePosition, - baseOrientation=self.baseOrientation, - useFixedBase=self.fixed_base, - flags=pybullet.URDF_GOOGLEY_UNDEFINED_COLORS, - ), - ) - - self.robot_specific_reset(self._p) - - s = ( - self.calc_state() - ) # optimization: calc_state() can calculate something in self.* for calc_potential() to use - self.potential = self.calc_potential() - - 
return s diff --git a/rlberry/envs/bullet3/pybullet_envs/robot_pendula.py b/rlberry/envs/bullet3/pybullet_envs/robot_pendula.py deleted file mode 100644 index aca31c7c6..000000000 --- a/rlberry/envs/bullet3/pybullet_envs/robot_pendula.py +++ /dev/null @@ -1,46 +0,0 @@ -import gymnasium as gym -import numpy as np - -from rlberry.envs.bullet3.pybullet_envs.robot_bases import URDFBasedRobot2 - - -class Pendulum(URDFBasedRobot2): - swingup = False - - def __init__(self): - # MJCFBasedRobot2.__init__(self, 'pendulum.xml', 'pole', action_dim=1, obs_dim=2) - URDFBasedRobot2.__init__(self, "pendulum.urdf", "pole", action_dim=1, obs_dim=2) - self.action_space = gym.spaces.Box(shape=(1,), low=-20, high=20) - - def robot_specific_reset(self, bullet_client): - self._p = bullet_client - self.pole = self.parts["pole"] - self.j1 = self.jdict["hinge"] - u = self.np_random.uniform(low=-0.1, high=0.1) - self.j1.reset_current_position(u if not self.swingup else np.pi + u, 0) - self.j1.set_motor_torque(0) - - def apply_action(self, a): - assert np.isfinite(a).all() - if not np.isfinite(a).all(): - print("a is inf") - a[0] = 0 - self.j1.set_motor_torque( - np.clip(a[0], self.action_space.low, self.action_space.high) - ) - - def calc_state(self): - self.theta, theta_dot = self.j1.current_position() - if not np.isfinite(self.theta): - print("theta is inf") - self.theta = 0 - - if not np.isfinite(theta_dot): - print("theta_dot is inf") - theta_dot = 0 - - return np.array([self.theta, theta_dot]) - - -class PendulumSwingup(Pendulum): - swingup = True diff --git a/rlberry/envs/classic_control/SpringCartPole.py b/rlberry/envs/classic_control/SpringCartPole.py deleted file mode 100644 index 4bfd5f634..000000000 --- a/rlberry/envs/classic_control/SpringCartPole.py +++ /dev/null @@ -1,604 +0,0 @@ -""" -SpringCartPole environment introduced in J-F. Hren PhD thesis. -""" - -import numpy as np -import rlberry.spaces as spaces -from rlberry.envs.interface import Model -from rlberry.rendering import Scene, GeometricPrimitive, RenderInterface2D -from rlberry.rendering.common_shapes import bar_shape, circle_shape - - -class SpringCartPole(RenderInterface2D, Model): - """ - SpringCartPole is an extension of the CartPole environment proposed in - PhD thesis J-F. Hren. It consists of two carts connected by a spring. - - Parameters - ---------- - dt : float, default=0.02 - Time step of the simulation. - obs_trans : bool, default=True - If True, state has dimension 10: - State: - 'Cart position1', 'Cart velocity1', 'Pole cos1', 'Pole sin1', 'Pole angular velocity1', - 'Cart position2', 'Cart velocity2', 'Pole cos2', 'Pole sin2', 'Pole angular velocity2' - If False, state has dimension 8: - State: - 'Cart position1', 'Cart velocity1', 'Pole angle1', 'Pole angular velocity1', - 'Cart position2', 'Cart velocity2', 'Pole angle2', 'Pole angular velocity2' - swing_up : bool, default=False - If True, the pole starting position is at the bottom - If False, the pole starting position is at the top - random_init : bool, default=True - If True, the noise is added to the carts and poles starting positions - - Notes - ----- - State: - The state consists of the position of cart 1, its speed, the angle - of pole 1 (expressed in radians or in a tuple of cos() and sin()) and - its angular speed, and the same set of values for cart 2 and pole 2. - For both poles, the angle of 0 corresponds to the vertical position, - the positive angles correspond to a counterclockwise rotation. 
- - Actions: - The action is either 0, 1, 2, or 3, corresponding to the four possible - actions: - LL = 0, move cart 1 to the left, cart 2 to the left - RR = 1, move cart 1 to the right, cart 2 to the right - LR = 2, move cart 1 to the left, cart 2 to the right - RL = 3, move cart 1 to the right, cart 2 to the left - The magnitude of actions is fixed to 2.0. - - Reward: - If spring is not deformed (its length is within [self.min_spring_length, self.max_spring_length]) - and the carts stay on the track (|Cart Position| <= self.track_length / 2) - then the reward is ((1 + Pole cos1) + (1 + Pole cos2)) / 4, else reward is 0. - - Reference: - .. seealso:: - J-F. Hren: Planification optimiste pour systèmes déterministes, PhD thesis - .. warning:: - This version of the domain uses the Runge-Kutta method for integrating - the system dynamics and is more realistic than Euler method - """ - - name = "SpringCartPole" - - ACT_RNG = 2.0 - AVAIL_TORQUES = [ - np.array([-2.0, -2.0]), - np.array([2.0, 2.0]), - np.array([-2.0, 2.0]), - np.array([2.0, -2.0]), - ] - - book_or_nips = "book" - action_arrow = None - domain_fig = None - actions_num = 4 - - def __init__(self, dt=0.02, obs_trans=True, swing_up=False, random_init=True): - Model.__init__(self) - RenderInterface2D.__init__(self) - - self.dt = dt - self.gravity = 9.81 - self.track_length = 2.0 - self.L = 0.5 * self.track_length - self.pole_length = 1.0 - self.l = 0.5 * self.pole_length - self.masspole = 0.1 - self.masscart = 1.0 - self.cart_friction = 5e-4 - self.pole_friction = 2e-6 - self.spring = 2.0 - self.normal_spring_length = 0.5 - self.min_spring_length = 0.1 - self.max_spring_length = 1.5 - self.max_velocity = 15.0 - self.ang_velocity = 10.0 - self.force_mag = self.ACT_RNG - self.swing_up = swing_up - self.random_init = random_init - self.obs_trans = obs_trans - - if self.obs_trans: - self.obs_shape = 10 - else: - self.obs_shape = 8 - - # init base classes - self.reward_range = (0.0, 1.0) - - # rendering info - boundy = self.pole_length * 2 + 0.2 - boundx = self.track_length + self.pole_length * 2 + 0.2 - # (left, right, bottom, top) - self.set_clipping_area((-boundx, boundx, -boundy, boundy)) - self.set_refresh_interval(10) # in milliseconds - - # observation and action spaces - if self.obs_trans: - high = np.array( - [ - self.track_length, - np.finfo(np.float32).max, - 1, - 1, - np.finfo(np.float32).max, - self.track_length, - np.finfo(np.float32).max, - 1, - 1, - np.finfo(np.float32).max, - ] - ) - else: - high = np.array( - [ - self.track_length, - np.finfo(np.float32).max, - 2 * np.pi, - np.finfo(np.float32).max, - self.track_length, - np.finfo(np.float32).max, - 2 * np.pi, - np.finfo(np.float32).max, - ] - ) - low = -high - self.observation_space = spaces.Box(low=low, high=high) - self.action_space = spaces.Discrete(4) - - # initialize - self.state = None # state in pos or angles - self.state_ = None # state in angles - self.reset() - - def transform_states(self, state): - """Transform state with dim=8 to the state with dim=10""" - assert state.shape[-1] == 8, "State has wrong shape, should be 8" - shape = list(state.shape) - shape[-1] = 10 - state_ = np.zeros(shape) - state_[..., :2] = state[..., :2] - state_[..., 4:7] = state[..., 3:6] - state_[..., -1] = state[..., -1] - theta1 = state[..., 2] - theta2 = state[..., 6] - state_[..., 2] = np.cos(theta1) - state_[..., 3] = np.sin(theta1) - state_[..., 7] = np.cos(theta2) - state_[..., 8] = np.sin(theta2) - return state_ - - # def trigonometric2angle(self, costheta, sintheta): - # C = 
costheta**2 + sintheta**2 - # costheta, sintheta = costheta / C, sintheta / C - # theta = np.arctan2(sintheta / C, costheta / C) - # return theta - - def reset(self): - if self.random_init: - rand_state = self.rng.uniform(low=-0.1, high=0.1, size=(8,)) - else: - rand_state = np.zeros((8,)) - rand_state[4] += self.normal_spring_length - if self.swing_up: - rand_state[2] += np.pi - rand_state[6] += np.pi - if self.obs_trans: - self.state = self.transform_states(rand_state) - else: - self.state = rand_state - self.state_ = rand_state - return self.state, {} - - def _reward(self): - state = self.state - if state.shape[-1] == 10: - ( - _, - _, - cos1, - sin1, - _, - _, - _, - cos2, - sin2, - _, - ) = np.split(state, 10, axis=-1) - C1 = np.sqrt(cos1**2 + sin1**2) - C2 = np.sqrt(cos2**2 + sin2**2) - cos1 = cos1 / C1 - sin1 = sin1 / C1 - cos2 = cos2 / C2 - sin2 = sin2 / C2 - else: - _, _, theta1, _, _, _, theta2, _ = np.split(state, 8, axis=-1) - cos1 = np.cos(theta1) - sin1 = np.sin(theta1) - cos2 = np.cos(theta2) - sin2 = np.sin(theta2) - - bad_condition = self._terminal() - - pos_reward = (1 + cos1) / 4 + (1 + cos2) / 4 - neg_reward = 0.0 - - return np.where(bad_condition, neg_reward, pos_reward) - - def bound_states(self, state): - assert state.shape[-1] == 8, "state must be of shape (8,)" - x1, x1dot, theta1, theta1dot, x2, x2dot, theta2, theta2dot = np.split( - state, 8, axis=-1 - ) - theta1 = np.asarray(wrap(theta1, -np.pi, np.pi)) - theta2 = np.asarray(wrap(theta2, -np.pi, np.pi)) - x1dot = np.asarray(bound(x1dot, [-self.max_velocity, self.max_velocity])) - x2dot = np.asarray(bound(x2dot, -self.max_velocity, self.max_velocity)) - theta1dot = np.asarray(bound(theta1dot, -self.ang_velocity, self.ang_velocity)) - theta2dot = np.asarray(bound(theta2dot, -self.max_velocity, self.ang_velocity)) - state = np.concatenate( - [x1, x1dot, theta1, theta1dot, x2, x2dot, theta2, theta2dot], axis=-1 - ) - return state - - def step(self, action): - assert self.action_space.contains(action), "%r (%s) invalid" % ( - action, - type(action), - ) - - # save state for rendering - if self.is_render_enabled(): - self.append_state_for_rendering(np.array(self.state_)) - - s = self.state_ - torque = self.AVAIL_TORQUES[action] - - # # Add noise to the force action - # if self.torque_noise_max > 0: - # torque += self.rng.uniform(-self.torque_noise_max, self.torque_noise_max) - - # Now, augment the state with our force action so it can be passed to - # _dsdt - s_augmented = np.append(s, torque) - - try: - from scipy.integrate import solve_ivp - - ns = solve_ivp(lambda t, y: self._dsdt(y, t), [0, self.dt], s_augmented) - ns = ns.y[:, -1] # final timestep - except: - print("Can't import scipy library, use rk4 function") - ns = rk4(self._dsdt, s_augmented, [0, self.dt]) - # only care about final timestep of integration returned by integrator - ns = ns[-1] - - ns = ns[:-2] # omit action - - ns = self.bound_states(ns) - self.state_ = ns - if self.obs_trans: - self.state = self.transform_states(ns) - else: - self.state = ns - terminated = self._terminal() - truncated = False - reward = self._reward()[0] - return self.state, reward, terminated, truncated, {} - - def _terminal(self): - s = self.state_ - x1 = s[0] - x2 = s[4] - bad_condition = False - bad_condition += np.abs(x1) > self.L - bad_condition += np.abs(x2) > self.L - bad_condition += x2 <= x1 - bad_condition += np.abs(x1 - x2) < self.min_spring_length - bad_condition += np.abs(x1 - x2) > self.max_spring_length - - return bool(bad_condition) - - def _dsdt(self, sa, 
t): - assert sa.shape[-1] == 10, "state + action must be of shape (10,)" - x1, x1dot, theta1, theta1dot, x2, x2dot, theta2, theta2dot, a1, a2 = np.split( - sa, 10, axis=-1 - ) - cos1 = np.cos(theta1) - sin1 = np.sin(theta1) - cos2 = np.cos(theta2) - sin2 = np.sin(theta2) - # x1 - size [N, 1] or [L, N, 1] - - f1 = a1 + self.spring * (self.normal_spring_length - np.abs(x1 - x2)) - f2 = a2 + self.spring * (self.normal_spring_length - np.abs(x1 - x2)) - - a11 = 4 * self.l / 3 - a22 = -self.masscart - self.masspole - - a121 = -cos1 - a122 = -cos2 - a211 = self.l * self.masspole * cos1 - a212 = self.l * self.masspole * cos2 - - b11 = ( - self.gravity * sin1 - - self.pole_friction * theta1dot / self.l / self.masspole - ) - b12 = ( - self.gravity * sin2 - - self.pole_friction * theta2dot / self.l / self.masspole - ) - - b21 = ( - self.l * self.masspole * sin1 * theta1dot**2 - - f1 - + self.cart_friction * np.sign(x1dot) - ) - b22 = ( - self.l * self.masspole * sin2 * theta2dot**2 - - f2 - + self.cart_friction * np.sign(x2dot) - ) - - theta1acc = (a121 * b21 - a22 * b11) / (a121 * a211 - a11 * a22) - theta2acc = (a122 * b22 - a22 * b12) / (a122 * a212 - a11 * a22) - - x1acc = (b11 - a11 * theta1acc) / a121 - x2acc = (b12 - a11 * theta2acc) / a122 - - a1dot = np.zeros_like(a1) - a2dot = np.zeros_like(a2) - - return np.concatenate( - [ - x1dot, - x1acc, - theta1dot, - theta1acc, - x2dot, - x2acc, - theta2dot, - theta2acc, - a1dot, - a2dot, - ], - axis=-1, - ) - - # - # Below: code for rendering - # - - def get_background(self): - bg = Scene() - return bg - - def get_scene(self, state): - scene = Scene() - SCALE = 3 - - assert state.shape[-1] == 8, "state must be of shape (8,)" - - x1 = state[0] - x2 = state[4] - theta1 = state[2] - theta2 = state[6] - - cartx1 = x1 * SCALE # MIDDLE OF CART 1 - - cartx2 = x2 * SCALE # MIDDLE OF CART 2 - - cartwidth = 0.05 * SCALE - - c1p1 = ( - cartx1 - cartwidth / 2, - 0, - ) - c1p2 = ( - cartx1 + cartwidth / 2, - 0, - ) - - c2p1 = ( - cartx2 - cartwidth / 2, - 0, - ) - c2p2 = ( - cartx2 + cartwidth / 2, - 0, - ) - - p1 = ( - cartx1 - np.sin(theta1) * self.pole_length * SCALE, - np.cos(theta1) * self.pole_length * SCALE, - ) - - p01 = (cartx1, 0) - p02 = (cartx2, 0) - - p2 = ( - cartx2 - np.sin(theta2) * self.pole_length * SCALE, - np.cos(theta2) * self.pole_length * SCALE, - ) - - cart1 = bar_shape(c1p1, c1p2, 0.02 * SCALE) - cart1.set_color((255 / 255, 100 / 255, 0 / 255)) - - cart2 = bar_shape(c2p1, c2p2, 0.02 * SCALE) - cart2.set_color((255 / 255, 100 / 255, 0 / 255)) - - pole1 = bar_shape(p01, p1, 0.01 * SCALE) - pole1.set_color((255 / 255, 215 / 255, 0 / 255)) - - pole2 = bar_shape(p02, p2, 0.01 * SCALE) - pole2.set_color((255 / 255, 215 / 255, 0 / 255)) - - spring = bar_shape( - p01, - p02, - 0.03 - * np.sqrt(self.normal_spring_length) - * SCALE - / np.sqrt(cartx2 - cartx1), - ) - spring.set_color((50 / 255, 50 / 255, 50 / 255)) - - joint1 = circle_shape(p01, 0.03) - joint1.set_color((0 / 255, 255 / 255, 0 / 255)) - - joint2 = circle_shape(p02, 0.03) - joint2.set_color((0 / 255, 255 / 255, 0 / 255)) - - track_line = GeometricPrimitive("LINES") - track_line.add_vertex((-self.track_length / 2 * SCALE, -0.02 * SCALE)) - track_line.add_vertex((self.track_length / 2 * SCALE, -0.02 * SCALE)) - - axis1 = GeometricPrimitive("LINES") - axis1.add_vertex((cartx1, 0)) - axis1.add_vertex((cartx1, self.pole_length * SCALE)) - axis1.set_color((250 / 255, 250 / 255, 250 / 255)) - - axis2 = GeometricPrimitive("LINES") - axis2.add_vertex((cartx2, 0)) - axis2.add_vertex((cartx2, 
self.pole_length * SCALE)) - axis2.set_color((250 / 255, 250 / 255, 250 / 255)) - - scene.add_shape(cart1) - scene.add_shape(cart2) - scene.add_shape(pole1) - scene.add_shape(pole2) - scene.add_shape(joint1) - scene.add_shape(joint2) - scene.add_shape(spring) - scene.add_shape(track_line) - - return scene - - -def wrap(x, m, M): - """Wraps ``x`` so m <= x <= M; but unlike ``bound()`` which - truncates, ``wrap()`` wraps x around the coordinate system defined - by m, M. - For example, m = -180, M = 180 (degrees), x = 360 --> returns 0. - - Parameters - ---------- - x: a scalar - m: - minimum possible value in range - M: - maximum possible value in range - - Returns - ------- - x: - a scalar, wrapped - """ - diff = M - m - while x > M: - x = x - diff - while x < m: - x = x + diff - return x - - -def bound(x, m, M=None): - """Either have m as scalar, so bound(x,m,M) which returns m <= x <= M *OR* - have m as length 2 vector, bound(x,m, ) returns m[0] <= x <= m[1]. - - Parameters - ---------- - x: - scalar - - Returns - ------- - x: - scalar, bound between min (m) and Max (M) - """ - if M is None: - M = m[1] - m = m[0] - # bound x between min (m) and Max (M) - return np.clip(x, m, M) - - -def rk4(derivs, y0, t, *args, **kwargs): - """ - Integrate 1D or ND system of ODEs using 4-th order Runge-Kutta. - This is a toy implementation which may be useful if you find - yourself stranded on a system w/o scipy. Otherwise use - :func:`scipy.integrate`. - - Parameters: - ----------- - derivs: - the derivative of the system and has the signature - ``dy = derivs(yi, ti)`` - y0: - initial state vector - t: - sample times - args: - additional arguments passed to the derivative function - kwargs: - additional keyword arguments passed to the derivative function - - Returns - ------- - yout: - Runge-Kutta approximation of the ODE - - Examples - -------- - Example 1:: - ## 2D system - def derivs6(x,t): - d1 = x[0] + 2*x[1] - d2 = -3*x[0] + 4*x[1] - return (d1, d2) - dt = 0.0005 - t = arange(0.0, 2.0, dt) - y0 = (1,2) - yout = rk4(derivs6, y0, t) - - Example 2:: - ## 1D system - alpha = 2 - def derivs(x,t): - return -alpha*x + exp(-t) - y0 = 1 - yout = rk4(derivs, y0, t) - - If you have access to scipy, you should probably be using the - scipy.integrate tools rather than this function. 
- """ - - try: - Ny = len(y0) - except TypeError: - yout = np.zeros((len(t),), np.float_) - else: - yout = np.zeros((len(t), Ny), np.float_) - - yout[0] = y0 - - for i in np.arange(len(t) - 1): - thist = t[i] - dt = t[i + 1] - thist - dt2 = dt / 2.0 - y0 = yout[i] - - k1 = np.asarray(derivs(y0, thist, *args, **kwargs)) - k2 = np.asarray(derivs(y0 + dt2 * k1, thist + dt2, *args, **kwargs)) - k3 = np.asarray(derivs(y0 + dt2 * k2, thist + dt2, *args, **kwargs)) - k4 = np.asarray(derivs(y0 + dt * k3, thist + dt, *args, **kwargs)) - yout[i + 1] = y0 + dt / 6.0 * (k1 + 2 * k2 + 2 * k3 + k4) - return yout diff --git a/rlberry/envs/classic_control/__init__.py b/rlberry/envs/classic_control/__init__.py deleted file mode 100644 index a6cd76c14..000000000 --- a/rlberry/envs/classic_control/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .mountain_car import MountainCar -from .acrobot import Acrobot -from .pendulum import Pendulum -from .SpringCartPole import SpringCartPole diff --git a/rlberry/envs/classic_control/acrobot.py b/rlberry/envs/classic_control/acrobot.py deleted file mode 100644 index 2404b66e1..000000000 --- a/rlberry/envs/classic_control/acrobot.py +++ /dev/null @@ -1,394 +0,0 @@ -""" -Acrobot environment adapted from OpenAI gym [1]. (updated to gymnasium template [2]) - -Modifications: -* define reward_range -* render function follows the rlberry rendering interface. - -[1] https://github.com/openai/gym/blob/master/gym/ -[2] https://gymnasium.farama.org/api/env/ -envs/classic_control/acrobot.py -""" - -import numpy as np -import rlberry.spaces as spaces -from rlberry.envs.interface import Model -from rlberry.rendering import Scene, GeometricPrimitive, RenderInterface2D -from rlberry.rendering.common_shapes import bar_shape, circle_shape - -__copyright__ = "Copyright 2013, RLPy http://acl.mit.edu/RLPy" -__credits__ = [ - "Alborz Geramifard", - "Robert H. Klein", - "Christoph Dann", - "William Dabney", - "Jonathan P. How", -] -__license__ = "BSD 3-Clause" -__author__ = "Christoph Dann " - - -# SOURCE: -# https://github.com/rlpy/rlpy/blob/master/rlpy/Domains/Acrobot.py - - -class Acrobot(RenderInterface2D, Model): - """ - Acrobot is a 2-link pendulum with only the second joint actuated. - Initially, both links point downwards. The goal is to swing the - end-effector at a height at least the length of one link above the base. - Both links can swing freely and can pass by each other, i.e., they don't - collide when they have the same angle. - - Notes - ----- - State: - The state consists of the sin() and cos() of the two rotational joint - angles and the joint angular velocities: - [cos(theta1) sin(theta1) cos(theta2) sin(theta2) thetaDot1 thetaDot2]. - For the first link, an angle of 0 corresponds to the link pointing - downwards. - The angle of the second link is relative to the angle of the first link. - An angle of 0 corresponds to having the same angle between the two links. - A state of [1, 0, 1, 0, ..., ...] means that both links point downwards. - - Actions: - The action is either applying +1, 0 or -1 torque on the joint between - the two pendulum links. - .. note:: - The dynamics equations were missing some terms in the NIPS paper which - are present in the book. R. Sutton confirmed in personal correspondence - that the experimental results shown in the paper and the book were - generated with the equations shown in the book. - However, there is the option to run the domain with the paper equations - by setting book_or_nips = 'nips' - - Reference: - .. seealso:: - R. 
Sutton: Generalization in Reinforcement Learning: - Successful Examples Using Sparse Coarse Coding (NIPS 1996) - .. seealso:: - R. Sutton and A. G. Barto: - Reinforcement learning: An introduction. - Cambridge: MIT press, 1998. - .. warning:: - This version of the domain uses the Runge-Kutta method for integrating - the system dynamics and is more realistic, but also considerably harder - than the original version which employs Euler integration, - see the AcrobotLegacy class. - """ - - name = "Acrobot" - - dt = 0.2 - - LINK_LENGTH_1 = 1.0 # [m] - LINK_LENGTH_2 = 1.0 # [m] - LINK_MASS_1 = 1.0 #: [kg] mass of link 1 - LINK_MASS_2 = 1.0 #: [kg] mass of link 2 - LINK_COM_POS_1 = 0.5 #: [m] position of the center of mass of link 1 - LINK_COM_POS_2 = 0.5 #: [m] position of the center of mass of link 2 - LINK_MOI = 1.0 #: moments of inertia for both links - - MAX_VEL_1 = 4 * np.pi - MAX_VEL_2 = 9 * np.pi - - AVAIL_TORQUE = [-1.0, 0.0, +1] - - torque_noise_max = 0.0 - - #: use dynamics equations from the nips paper or the book - book_or_nips = "book" - action_arrow = None - domain_fig = None - actions_num = 3 - - def __init__(self): - # init base classes - Model.__init__(self) - RenderInterface2D.__init__(self) - self.reward_range = (-1.0, 0.0) - - # rendering info - bound = self.LINK_LENGTH_1 + self.LINK_LENGTH_2 + 0.2 - # (left, right, bottom, top) - self.set_clipping_area((-bound, bound, -bound, bound)) - self.set_refresh_interval(10) # in milliseconds - - # observation and action spaces - high = np.array([1.0, 1.0, 1.0, 1.0, self.MAX_VEL_1, self.MAX_VEL_2]) - low = -high - self.observation_space = spaces.Box(low=low, high=high) - self.action_space = spaces.Discrete(3) - - # initialize - self.state = None - self.reset() - - def reset(self, seed=None, options=None): - self.state = self.rng.uniform(low=-0.1, high=0.1, size=(4,)) - return self._get_ob(), {} - - def step(self, action): - assert self.action_space.contains(action), "%r (%s) invalid" % ( - action, - type(action), - ) - - # save state for rendering - if self.is_render_enabled(): - self.append_state_for_rendering(np.array(self.state)) - - s = self.state - torque = self.AVAIL_TORQUE[action] - - # Add noise to the force action - if self.torque_noise_max > 0: - torque += self.rng.uniform(-self.torque_noise_max, self.torque_noise_max) - - # Now, augment the state with our force action so it can be passed to - # _dsdt - s_augmented = np.append(s, torque) - - ns = rk4(self._dsdt, s_augmented, [0, self.dt]) - # only care about final timestep of integration returned by integrator - ns = ns[-1] - ns = ns[:4] # omit action - # ODEINT IS TOO SLOW! 
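The Acrobot docstring above describes a 6-D observation [cos(theta1), sin(theta1), cos(theta2), sin(theta2), thetaDot1, thetaDot2] and a goal of raising the end-effector one link length above the base. As an illustrative helper (not diff content; the function name is hypothetical), the corresponding tip height can be recovered from that observation, and the environment terminates once it exceeds 1.0:

    import numpy as np

    def tip_height(obs, l1=1.0, l2=1.0):
        """Height of the Acrobot end-effector above the pivot, from the 6-D observation."""
        theta1 = np.arctan2(obs[1], obs[0])
        theta2 = np.arctan2(obs[3], obs[2])
        return -l1 * np.cos(theta1) - l2 * np.cos(theta1 + theta2)

    # The environment's terminal test is equivalent to: tip_height(obs) > 1.0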
- # ns_continuous = integrate.odeint(self._dsdt, self.s_continuous, - # [0, self.dt]) - # self.s_continuous = ns_continuous[-1] # We only care about the state - # at the ''final timestep'', self.dt - - ns[0] = wrap(ns[0], -np.pi, np.pi) - ns[1] = wrap(ns[1], -np.pi, np.pi) - ns[2] = bound(ns[2], -self.MAX_VEL_1, self.MAX_VEL_1) - ns[3] = bound(ns[3], -self.MAX_VEL_2, self.MAX_VEL_2) - self.state = ns - terminated = self._terminal() - truncated = False - reward = -1.0 if not terminated else 0.0 - return self._get_ob(), reward, terminated, truncated, {} - - def _get_ob(self): - s = self.state - return np.array( - [np.cos(s[0]), np.sin(s[0]), np.cos(s[1]), np.sin(s[1]), s[2], s[3]] - ) - - def _terminal(self): - s = self.state - return bool(-np.cos(s[0]) - np.cos(s[1] + s[0]) > 1.0) - - def _dsdt(self, s_augmented, t): - m1 = self.LINK_MASS_1 - m2 = self.LINK_MASS_2 - l1 = self.LINK_LENGTH_1 - lc1 = self.LINK_COM_POS_1 - lc2 = self.LINK_COM_POS_2 - I1 = self.LINK_MOI - I2 = self.LINK_MOI - g = 9.8 - a = s_augmented[-1] - s = s_augmented[:-1] - theta1 = s[0] - theta2 = s[1] - dtheta1 = s[2] - dtheta2 = s[3] - d1 = ( - m1 * lc1**2 - + m2 * (l1**2 + lc2**2 + 2 * l1 * lc2 * np.cos(theta2)) - + I1 - + I2 - ) - d2 = m2 * (lc2**2 + l1 * lc2 * np.cos(theta2)) + I2 - phi2 = m2 * lc2 * g * np.cos(theta1 + theta2 - np.pi / 2.0) - phi1 = ( - -m2 * l1 * lc2 * dtheta2**2 * np.sin(theta2) - - 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * np.sin(theta2) - + (m1 * lc1 + m2 * l1) * g * np.cos(theta1 - np.pi / 2) - + phi2 - ) - if self.book_or_nips == "nips": - # the following line is consistent with the description in the - # paper - ddtheta2 = (a + d2 / d1 * phi1 - phi2) / (m2 * lc2**2 + I2 - d2**2 / d1) - else: - # the following line is consistent with the java implementation - # and the book - ddtheta2 = ( - a - + d2 / d1 * phi1 - - m2 * l1 * lc2 * dtheta1**2 * np.sin(theta2) - - phi2 - ) / (m2 * lc2**2 + I2 - d2**2 / d1) - ddtheta1 = -(d2 * ddtheta2 + phi1) / d1 - return (dtheta1, dtheta2, ddtheta1, ddtheta2, 0.0) - - # - # Below: code for rendering - # - - def get_background(self): - bg = Scene() - return bg - - def get_scene(self, state): - scene = Scene() - - p0 = (0.0, 0.0) - - p1 = ( - self.LINK_LENGTH_1 * np.sin(state[0]), - -self.LINK_LENGTH_1 * np.cos(state[0]), - ) - p2 = ( - p1[0] + self.LINK_LENGTH_2 * np.sin(state[0] + state[1]), - p1[1] - self.LINK_LENGTH_2 * np.cos(state[0] + state[1]), - ) - - link1 = bar_shape(p0, p1, 0.1) - link1.set_color((255 / 255, 140 / 255, 0 / 255)) - - link2 = bar_shape(p1, p2, 0.1) - link2.set_color((210 / 255, 105 / 255, 30 / 255)) - - joint1 = circle_shape(p0, 0.075) - joint1.set_color((255 / 255, 215 / 255, 0 / 255)) - - joint2 = circle_shape(p1, 0.075) - joint2.set_color((255 / 255, 215 / 255, 0 / 255)) - - goal_line = GeometricPrimitive("LINES") - goal_line.add_vertex((-5, 1)) - goal_line.add_vertex((5, 1)) - - scene.add_shape(link1) - scene.add_shape(link2) - scene.add_shape(joint1) - scene.add_shape(joint2) - scene.add_shape(goal_line) - - return scene - - -def wrap(x, m, M): - """Wraps ``x`` so m <= x <= M; but unlike ``bound()`` which - truncates, ``wrap()`` wraps x around the coordinate system defined - by m, M. - For example, m = -180, M = 180 (degrees), x = 360 --> returns 0. 
- - Parameters - ---------- - x: a scalar - m: - minimum possible value in range - M: - maximum possible value in range - - Returns - ------- - x: - a scalar, wrapped - """ - diff = M - m - while x > M: - x = x - diff - while x < m: - x = x + diff - return x - - -def bound(x, m, M=None): - """Either have m as scalar, so bound(x,m,M) which returns m <= x <= M *OR* - have m as length 2 vector, bound(x,m, ) returns m[0] <= x <= m[1]. - - Parameters - ---------- - x: - scalar - - Returns - ------- - x: - scalar, bound between min (m) and Max (M) - """ - if M is None: - M = m[1] - m = m[0] - # bound x between min (m) and Max (M) - return min(max(x, m), M) - - -def rk4(derivs, y0, t, *args, **kwargs): - """ - Integrate 1D or ND system of ODEs using 4-th order Runge-Kutta. - This is a toy implementation which may be useful if you find - yourself stranded on a system w/o scipy. Otherwise use - :func:`scipy.integrate`. - - Parameters: - ----------- - derivs: - the derivative of the system and has the signature - ``dy = derivs(yi, ti)`` - y0: - initial state vector - t: - sample times - args: - additional arguments passed to the derivative function - kwargs: - additional keyword arguments passed to the derivative function - - Returns - ------- - yout: - Runge-Kutta approximation of the ODE - - Examples - -------- - Example 1:: - ## 2D system - def derivs6(x,t): - d1 = x[0] + 2*x[1] - d2 = -3*x[0] + 4*x[1] - return (d1, d2) - dt = 0.0005 - t = arange(0.0, 2.0, dt) - y0 = (1,2) - yout = rk4(derivs6, y0, t) - - Example 2:: - ## 1D system - alpha = 2 - def derivs(x,t): - return -alpha*x + exp(-t) - y0 = 1 - yout = rk4(derivs, y0, t) - - If you have access to scipy, you should probably be using the - scipy.integrate tools rather than this function. - """ - - try: - Ny = len(y0) - except TypeError: - yout = np.zeros((len(t),), np.float_) - else: - yout = np.zeros((len(t), Ny), np.float_) - - yout[0] = y0 - - for i in np.arange(len(t) - 1): - thist = t[i] - dt = t[i + 1] - thist - dt2 = dt / 2.0 - y0 = yout[i] - - k1 = np.asarray(derivs(y0, thist, *args, **kwargs)) - k2 = np.asarray(derivs(y0 + dt2 * k1, thist + dt2, *args, **kwargs)) - k3 = np.asarray(derivs(y0 + dt2 * k2, thist + dt2, *args, **kwargs)) - k4 = np.asarray(derivs(y0 + dt * k3, thist + dt, *args, **kwargs)) - yout[i + 1] = y0 + dt / 6.0 * (k1 + 2 * k2 + 2 * k3 + k4) - return yout diff --git a/rlberry/envs/classic_control/mountain_car.py b/rlberry/envs/classic_control/mountain_car.py deleted file mode 100644 index ff3cb1335..000000000 --- a/rlberry/envs/classic_control/mountain_car.py +++ /dev/null @@ -1,202 +0,0 @@ -""" -Mountain Car environment adapted from OpenAI gym [1]. (updated to gymnasium template [2]) - -* default reward is 0 (instead of -1) -* reward in goal state is 1 (instead of 0) -* also implemented as a generative model (in addition to an online model) -* render function follows the rlberry rendering interface. - -[1] https://github.com/openai/gym/blob/master/gym/envs/ -[2] https://gymnasium.farama.org/api/env/ -classic_control/mountain_car.py -""" - -import math - -import numpy as np - -import rlberry.spaces as spaces -from rlberry.envs.interface import Model -from rlberry.rendering import Scene, GeometricPrimitive, RenderInterface2D - - -class MountainCar(RenderInterface2D, Model): - """ - The agent (a car) is started at the bottom of a valley. For any given - state the agent may choose to accelerate to the left, right or cease - any acceleration. 
- - Notes - ----- - Source: - The environment appeared first in Andrew Moore's PhD Thesis (1990). - - Observation: - Type: Box(2) - Num Observation Min Max - 0 Car Position -1.2 0.6 - 1 Car Velocity -0.07 0.07 - - Actions: - Type: Discrete(3) - Num Action - 0 Accelerate to the Left - 1 Don't accelerate - 2 Accelerate to the Right - - Note: This does not affect the amount of velocity affected by the - gravitational pull acting on the car. - - Reward: - Reward of 1 is awarded if the agent reached the flag (position = 0.5) - on top of the mountain. - Reward of 0 is awarded if the position of the agent is less than 0.5. - - Starting State: - The position of the car is assigned a uniform random value in - [-0.6 , -0.4]. - The starting velocity of the car is always assigned to 0. - - Episode Termination: - The car position is more than 0.5 - """ - - name = "MountainCar" - - def __init__(self, goal_velocity=0): - # init base classes - Model.__init__(self) - RenderInterface2D.__init__(self) - - self.min_position = -1.2 - self.max_position = 0.6 - self.max_speed = 0.07 - self.goal_position = 0.5 - self.goal_velocity = goal_velocity - - self.force = 0.001 - self.gravity = 0.0025 - - self.low = np.array([self.min_position, -self.max_speed]) - self.high = np.array([self.max_position, self.max_speed]) - - self.action_space = spaces.Discrete(3) - self.observation_space = spaces.Box(self.low, self.high) - - self.reward_range = (0.0, 1.0) - - # rendering info - self.set_clipping_area((-1.2, 0.6, -0.2, 1.1)) - self.set_refresh_interval(10) # in milliseconds - - # initial reset - self.reset() - - def step(self, action): - assert self.action_space.contains(action), "%r (%s) invalid" % ( - action, - type(action), - ) - - # save state for rendering - if self.is_render_enabled(): - self.append_state_for_rendering(np.array(self.state)) - - next_state, reward, terminated, truncated, info = self.sample( - self.state, action - ) - self.state = next_state.copy() - - return next_state, reward, terminated, truncated, info - - def reset(self, seed=None, options=None): - self.state = np.array([self.rng.uniform(low=-0.6, high=-0.4), 0]) - return self.state.copy(), {} - - def sample(self, state, action): - if not isinstance(state, np.ndarray): - state = np.array(state) - assert self.observation_space.contains( - state - ), "Invalid state as argument of reset()." 
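The sample() method above provides the generative-model interface mentioned in the module docstring: transitions can be queried from arbitrary states without stepping the environment's own state. A short sketch (illustration only, not diff content; import path taken from the removed classic_control/__init__.py):

    import numpy as np
    from rlberry.envs.classic_control import MountainCar

    env = MountainCar()
    state = np.array([-0.5, 0.0])  # (position, velocity) inside the observation space
    # action 2 = accelerate to the right; sample() does not modify env.state
    next_state, reward, terminated, truncated, info = env.sample(state, 2)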
- assert self.action_space.contains(action), "%r (%s) invalid" % ( - action, - type(action), - ) - - position = state[0] - velocity = state[1] - velocity += (action - 1) * self.force + math.cos(3 * position) * (-self.gravity) - velocity = np.clip(velocity, -self.max_speed, self.max_speed) - position += velocity - position = np.clip(position, self.min_position, self.max_position) - if position == self.min_position and velocity < 0: - velocity = 0 - - terminated = bool( - position >= self.goal_position and velocity >= self.goal_velocity - ) - truncated = False - done = terminated or truncated - reward = 0.0 - if done: - reward = 1.0 - - next_state = np.array([position, velocity]) - return next_state, reward, terminated, truncated, {} - - @staticmethod - def _height(xs): - return np.sin(3 * xs) * 0.45 + 0.55 - - # - # Below: code for rendering - # - - def get_background(self): - bg = Scene() - mountain = GeometricPrimitive("TRIANGLE_FAN") - flag = GeometricPrimitive("TRIANGLES") - mountain.set_color((0.6, 0.3, 0.0)) - flag.set_color((0.0, 0.5, 0.0)) - - # Mountain - mountain.add_vertex((-0.3, -1.0)) - mountain.add_vertex((0.6, -1.0)) - - n_points = 50 - obs_range = self.observation_space.high[0] - self.observation_space.low[0] - eps = obs_range / (n_points - 1) - for ii in reversed(range(n_points)): - x = self.observation_space.low[0] + ii * eps - y = self._height(x) - mountain.add_vertex((x, y)) - mountain.add_vertex((-1.2, -1.0)) - - # Flag - goal_x = self.goal_position - goal_y = self._height(goal_x) - flag.add_vertex((goal_x, goal_y)) - flag.add_vertex((goal_x + 0.025, goal_y + 0.075)) - flag.add_vertex((goal_x - 0.025, goal_y + 0.075)) - - bg.add_shape(mountain) - bg.add_shape(flag) - - return bg - - def get_scene(self, state): - scene = Scene() - - agent = GeometricPrimitive("QUADS") - agent.set_color((0.0, 0.0, 0.0)) - size = 0.025 - x = state[0] - y = self._height(x) - agent.add_vertex((x - size, y - size)) - agent.add_vertex((x + size, y - size)) - agent.add_vertex((x + size, y + size)) - agent.add_vertex((x - size, y + size)) - - scene.add_shape(agent) - return scene diff --git a/rlberry/envs/classic_control/pendulum.py b/rlberry/envs/classic_control/pendulum.py deleted file mode 100644 index 972db1ceb..000000000 --- a/rlberry/envs/classic_control/pendulum.py +++ /dev/null @@ -1,132 +0,0 @@ -""" -Pendulum environment adapted from OpenAI gym [1]. (updated to gymnasium template [2]) - -Modifications: -* render function follows the rlberry rendering interface - -[1] https://github.com/openai/gym/blob/master/gym/ -[2] https://gymnasium.farama.org/api/env/ -envs/classic_control/pendulum.py -""" - -import numpy as np -import rlberry.spaces as spaces -from rlberry.envs.interface import Model -from rlberry.rendering import Scene, RenderInterface2D -from rlberry.rendering.common_shapes import bar_shape, circle_shape - - -class Pendulum(RenderInterface2D, Model): - """ - The inverted pendulum swingup problem is a classic problem - in the control literature. In this version of the problem, - the pendulum starts in a random position, and the goal - is to swing it up so it stays upright. 
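A minimal interaction sketch for the Pendulum environment described above (illustration only, not diff content); it assumes the Box torque action space and the [cos(theta), sin(theta), theta_dot] observation defined in the class body:

    from rlberry.envs.classic_control import Pendulum

    env = Pendulum()
    obs, info = env.reset()  # obs = [cos(theta), sin(theta), theta_dot]
    for _ in range(50):
        torque = env.action_space.sample()  # shape-(1,) array in [-max_torque, max_torque]
        obs, reward, terminated, truncated, info = env.step(torque)  # reward = -cost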
- """ - - name = "Pendulum" - - def __init__(self): - # init base classes - Model.__init__(self) - RenderInterface2D.__init__(self) - - # environment parameters - self.max_speed = 8.0 - self.max_torque = 2.0 - self.dt = 0.5 - self.gravity = 10.0 - self.mass = 1.0 - self.length = 1.0 - - # rendering info - self.set_clipping_area((-2.2, 2.2, -2.2, 2.2)) - self.set_refresh_interval(10) - - # observation and action spaces - high = np.array([1.0, 1.0, self.max_speed]) - low = -high - self.action_space = spaces.Box( - low=-self.max_torque, high=self.max_torque, shape=(1,) - ) - self.observation_space = spaces.Box(low=low, high=high) - - # initialize - self.reset() - - def reset(self, seed=None, options=None): - high = np.array([np.pi, 1]) - low = -high - self.state = self.rng.uniform(low=low, high=high) - self.last_action = None - return self._get_ob(), {} - - def step(self, action): - assert self.action_space.contains(action), "%r (%s) invalid" % ( - action, - type(action), - ) - - # save state for rendering - if self.is_render_enabled(): - self.append_state_for_rendering(np.array(self.state)) - - theta, thetadot = self.state - gravity = self.gravity - mass = self.mass - length = self.length - dt = self.dt - - action = np.clip(action, -self.max_torque, self.max_torque)[0] - self.last_action = action # for rendering - costs = ( - angle_normalize(theta) ** 2 + 0.1 * thetadot**2 + 0.001 * (action**2) - ) - - # compute the next state after action - newthetadot = ( - thetadot - + ( - -3 * gravity / (2 * length) * np.sin(theta + np.pi) - + 3.0 / (mass * length**2) * action - ) - * dt - ) - newtheta = theta + newthetadot * dt - newthetadot = np.clip(newthetadot, -self.max_speed, self.max_speed) - - self.state = np.array([newtheta, newthetadot]) - return self._get_ob(), -costs, False, False, {} - - def _get_ob(self): - theta, thetadot = self.state - return np.array([np.cos(theta), np.sin(theta), thetadot]) - - # - # Below code for rendering - # - - def get_background(self): - bg = Scene() - return bg - - def get_scene(self, state): - scene = Scene() - - p0 = (0.0, 0.0) - p1 = (self.length * np.sin(state[0]), -self.length * np.cos(state[0])) - - link = bar_shape(p0, p1, 0.1) - link.set_color((255 / 255, 105 / 255, 30 / 255)) - - joint = circle_shape(p0, 0.075) - joint.set_color((255 / 255, 215 / 255, 0 / 255)) - - scene.add_shape(link) - scene.add_shape(joint) - - return scene - - -def angle_normalize(x): - return ((x + np.pi) % (2 * np.pi)) - np.pi diff --git a/rlberry/envs/finite/__init__.py b/rlberry/envs/finite/__init__.py deleted file mode 100644 index 036e4520a..000000000 --- a/rlberry/envs/finite/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .finite_mdp import FiniteMDP -from .gridworld import GridWorld -from .chain import Chain diff --git a/rlberry/envs/finite/chain.py b/rlberry/envs/finite/chain.py deleted file mode 100644 index da333d713..000000000 --- a/rlberry/envs/finite/chain.py +++ /dev/null @@ -1,132 +0,0 @@ -import numpy as np - -from rlberry.envs.finite import FiniteMDP -from rlberry.rendering import RenderInterface2D, Scene, GeometricPrimitive - - -class Chain(RenderInterface2D, FiniteMDP): - """ - Simple chain environment. - Reward 0.05 in initial state, reward 1.0 in final state. 
- - Parameters - ---------- - L : int - length of the chain - fail_prob : double - fail probability - """ - - name = "Chain" - - def __init__(self, L=5, fail_prob=0.1): - assert L >= 2 - self.L = L - self.fail_prob = fail_prob - - # transition probabilities - P = np.zeros((L, 2, L)) - for ss in range(L): - for _ in range(2): - if ss == 0: - P[ss, 0, ss] = 1.0 - fail_prob # action 0 = don't move - P[ss, 1, ss + 1] = 1.0 - fail_prob # action 1 = right - P[ss, 0, ss + 1] = fail_prob - P[ss, 1, ss] = fail_prob - elif ss == L - 1: - P[ss, 0, ss - 1] = 1.0 - fail_prob # action 0 = left - P[ss, 1, ss] = 1.0 - fail_prob # action 1 = don't move - P[ss, 0, ss] = fail_prob - P[ss, 1, ss - 1] = fail_prob - else: - P[ss, 0, ss - 1] = 1.0 - fail_prob # action 0 = left - P[ss, 1, ss + 1] = 1.0 - fail_prob # action 1 = right - P[ss, 0, ss + 1] = fail_prob - P[ss, 1, ss - 1] = fail_prob - - # mean reward - S = L - A = 2 - R = np.zeros((S, A)) - R[L - 1, :] = 1.0 - R[0, :] = 0.05 - - # init base classes - FiniteMDP.__init__(self, R, P, initial_state_distribution=0) - RenderInterface2D.__init__(self) - self.reward_range = (0.0, 1.0) - - # rendering info - self.set_clipping_area((0, L, 0, 1)) - self.set_refresh_interval(100) # in milliseconds - - def step(self, action): - assert action in self._actions, "Invalid action!" - - # save state for rendering - if self.is_render_enabled(): - self.append_state_for_rendering(self.state) - - # take step - next_state, reward, terminated, truncated, info = self.sample( - self.state, action - ) - - self.state = next_state - return next_state, reward, terminated, truncated, info - - # - # Code for rendering - # - - def get_background(self): - """ - Returne a scene (list of shapes) representing the background - """ - bg = Scene() - colors = [(0.8, 0.8, 0.8), (0.9, 0.9, 0.9)] - for ii in range(self.L): - shape = GeometricPrimitive("QUADS") - shape.add_vertex((ii, 0)) - shape.add_vertex((ii + 1, 0)) - shape.add_vertex((ii + 1, 1)) - shape.add_vertex((ii, 1)) - shape.set_color(colors[ii % 2]) - bg.add_shape(shape) - - flag = GeometricPrimitive("TRIANGLES") - flag.set_color((0.0, 0.5, 0.0)) - x = self.L - 0.5 - y = 0.25 - flag.add_vertex((x, y)) - flag.add_vertex((x + 0.25, y + 0.5)) - flag.add_vertex((x - 0.25, y + 0.5)) - bg.add_shape(flag) - - return bg - - def get_scene(self, state): - """ - Return scene (list of shapes) representing a given state - """ - scene = Scene() - - agent = GeometricPrimitive("QUADS") - agent.set_color((0.75, 0.0, 0.5)) - - size = 0.25 - x = state + 0.5 - y = 0.5 - - agent.add_vertex((x - size / 4.0, y - size)) - agent.add_vertex((x + size / 4.0, y - size)) - agent.add_vertex((x + size / 4.0, y + size)) - agent.add_vertex((x - size / 4.0, y + size)) - - agent.add_vertex((x - size, y - size / 4.0)) - agent.add_vertex((x + size, y - size / 4.0)) - agent.add_vertex((x + size, y + size / 4.0)) - agent.add_vertex((x - size, y + size / 4.0)) - - scene.add_shape(agent) - return scene diff --git a/rlberry/envs/finite/gridworld.py b/rlberry/envs/finite/gridworld.py deleted file mode 100644 index ce585317d..000000000 --- a/rlberry/envs/finite/gridworld.py +++ /dev/null @@ -1,490 +0,0 @@ -import matplotlib -import numpy as np - -import matplotlib.pyplot as plt -from matplotlib import cm - -from rlberry.envs.finite import FiniteMDP -from rlberry.envs.finite import gridworld_utils -from rlberry.rendering import Scene, GeometricPrimitive, RenderInterface2D -from rlberry.rendering.common_shapes import circle_shape - - -import rlberry - -logger = rlberry.logger - - 
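# --- Illustrative usage sketch (not part of the patch) -------------------------
# The Chain environment deleted above is moved out of rlberry; the updated test
# imports later in this patch pull it from the rlberry-research package instead.
# A minimal sketch under that assumption, reusing the constructor arguments and
# the gymnasium-style step API shown in the deleted code:
from rlberry_research.envs.finite import Chain

env = Chain(L=5, fail_prob=0.1)              # 5-state chain, 10% slip probability
observation, info = env.reset()
for _ in range(10):
    action = env.action_space.sample()       # 0: left (or stay), 1: right (or stay)
    observation, reward, terminated, truncated, info = env.step(action)
# -------------------------------------------------------------------------------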
-class GridWorld(RenderInterface2D, FiniteMDP): - """ - Simple GridWorld environment. - - Parameters - ----------- - nrows : int - number of rows - ncols : int - number of columns - start_coord : tuple - tuple with coordinates of initial position - terminal_states : tuple - ((row_0, col_0), (row_1, col_1), ...) = coordinates of - terminal states - success_probability : double - probability of moving in the chosen direction - reward_at: dict - dictionary, keys = tuple containing coordinates, values = reward - at each coordinate - walls : tuple - ((row_0, col_0), (row_1, col_1), ...) = coordinates of walls - default_reward : double - reward received at states not in 'reward_at' - - """ - - name = "GridWorld" - - def __init__( - self, - nrows=5, - ncols=5, - start_coord=(0, 0), - terminal_states=None, - success_probability=0.9, - reward_at=None, - walls=((1, 1), (2, 2)), - default_reward=0.0, - ): - # Grid dimensions - self.nrows = nrows - self.ncols = ncols - - # Reward parameters - self.default_reward = default_reward - - # Default config - if reward_at is not None: - self.reward_at = reward_at - else: - self.reward_at = {(nrows - 1, ncols - 1): 1} - if walls is not None: - self.walls = walls - else: - self.walls = () - if terminal_states is not None: - self.terminal_states = terminal_states - else: - self.terminal_states = () - - # Probability of going left/right/up/down when choosing the - # correspondent action - # The remaining probability mass is distributed uniformly to other - # available actions - self.success_probability = success_probability - - # Start coordinate - self.start_coord = tuple(start_coord) - - # Actions (string to index & index to string) - self.a_str2idx = {"left": 0, "right": 1, "down": 2, "up": 3} - self.a_idx2str = {0: "left", 1: "right", 2: "down", 3: "up"} - - # -------------------------------------------- - # The variables below are defined in _build() - # -------------------------------------------- - - # Mappings (state index) <-> (state coordinate) - self.index2coord = {} - self.coord2index = {} - - # MDP parameters for base class - self.P = None - self.R = None - self.Ns = None - self.Na = 4 - - # Build - self._build() - init_state_idx = self.coord2index[start_coord] - FiniteMDP.__init__( - self, self.R, self.P, initial_state_distribution=init_state_idx - ) - RenderInterface2D.__init__(self) - self.reset() - self.reward_range = (self.R.min(), self.R.max()) - - # rendering info - self.set_clipping_area((0, self.ncols, 0, self.nrows)) - self.set_refresh_interval(100) # in milliseconds - self.renderer_type = "pygame" - - @classmethod - def from_layout( - cls, layout: str = gridworld_utils.DEFAULT_LAYOUT, success_probability=0.95 - ): - """ - Create GridWorld instance from a layout. 
- - Layout symbols: - - '#' : wall - 'r' : reward of 1, terminal state - 'R' : reward of 1, non-terminal state - 'T' : terminal state - 'I' : initial state (if several, start uniformly among I) - 'O' : empty state - any other char : empty state - - Layout example: - - IOOOO # OOOOO O OOOOR - OOOOO # OOOOO # OOOOO - OOOOO O OOOOO # OOOOO - OOOOO # OOOOO # OOOOO - IOOOO # OOOOO # OOOOr - """ - info = gridworld_utils.get_layout_info(layout) - nrows = info["nrows"] - ncols = info["ncols"] - walls = info["walls"] - reward_at = info["reward_at"] - terminal_states = info["terminal_states"] - initial_states_coord = info["initial_states"] - - # Init base class - env = cls( - nrows=nrows, - ncols=ncols, - terminal_states=terminal_states, - success_probability=success_probability, - reward_at=reward_at, - walls=walls, - default_reward=0.0, - ) - - # Set initial distribution - distr = np.zeros(env.observation_space.n) - for init_coord in initial_states_coord: - init_index = env.coord2index[init_coord] - distr[init_index] = 1.0 - distr = distr / distr.sum() - env.set_initial_state_distribution(distr) - - return env - - def is_terminal(self, state): - state_coord = self.index2coord[state] - return state_coord in self.terminal_states - - def reward_fn(self, state, action, next_state): - row, col = self.index2coord[state] - if (row, col) in self.reward_at: - return self.reward_at[(row, col)] - if (row, col) in self.walls: - return 0.0 - return self.default_reward - - def _build(self): - self._build_state_mappings_and_states() - self._build_transition_probabilities() - self._build_mean_rewards() - - def _build_state_mappings_and_states(self): - index = 0 - for rr in range(self.nrows): - for cc in range(self.ncols): - if (rr, cc) in self.walls: - self.coord2index[(rr, cc)] = -1 - else: - self.coord2index[(rr, cc)] = index - self.index2coord[index] = (rr, cc) - index += 1 - states = np.arange(index).tolist() - self.Ns = len(states) - - def _build_mean_rewards(self): - S = self.Ns - A = self.Na - self.R = np.zeros((S, A)) - for ss in range(S): - for aa in range(A): - mean_r = 0 - for ns in range(S): - mean_r += self.reward_fn(ss, aa, ns) * self.P[ss, aa, ns] - self.R[ss, aa] = mean_r - - def _build_transition_probabilities(self): - Ns = self.Ns - Na = self.Na - self.P = np.zeros((Ns, Na, Ns)) - for s in range(Ns): - s_coord = self.index2coord[s] - neighbors = self._get_neighbors(*s_coord) - valid_neighbors = [neighbors[nn][0] for nn in neighbors if neighbors[nn][1]] - n_valid = len(valid_neighbors) - for a in range(Na): # each action corresponds to a direction - for nn in neighbors: - next_s_coord = neighbors[nn][0] - if next_s_coord in valid_neighbors: - next_s = self.coord2index[next_s_coord] - if a == nn: # action is successful - self.P[s, a, next_s] = self.success_probability + ( - 1 - self.success_probability - ) * (n_valid == 1) - elif neighbors[a][0] not in valid_neighbors: - self.P[s, a, s] = 1.0 - else: - if n_valid > 1: - self.P[s, a, next_s] = ( - 1.0 - self.success_probability - ) / (n_valid - 1) - - def _get_neighbors(self, row, col): - aux = {} - aux["left"] = (row, col - 1) # left - aux["right"] = (row, col + 1) # right - aux["up"] = (row - 1, col) # up - aux["down"] = (row + 1, col) # down - neighbors = {} - for direction_str in aux: - direction = self.a_str2idx[direction_str] - next_s = aux[direction_str] - neighbors[direction] = (next_s, self._is_valid(*next_s)) - return neighbors - - def get_transition_support(self, state): - row, col = self.index2coord[state] - neighbors = [(row, col - 1), 
(row, col + 1), (row - 1, col), (row + 1, col)] - return [ - self.coord2index[coord] for coord in neighbors if self._is_valid(*coord) - ] - - def _is_valid(self, row, col): - if (row, col) in self.walls: - return False - elif row < 0 or row >= self.nrows: - return False - elif col < 0 or col >= self.ncols: - return False - return True - - def _build_ascii(self): - grid = [[""] * self.ncols for rr in range(self.nrows)] - grid_idx = [[""] * self.ncols for rr in range(self.nrows)] - for rr in range(self.nrows): - for cc in range(self.ncols): - if (rr, cc) in self.walls: - grid[rr][cc] = "x " - else: - grid[rr][cc] = "o " - grid_idx[rr][cc] = str(self.coord2index[(rr, cc)]).zfill(3) - - for rr, cc in self.reward_at: - rwd = self.reward_at[(rr, cc)] - if rwd > 0: - grid[rr][cc] = "+ " - if rwd < 0: - grid[rr][cc] = "-" - - grid[self.start_coord[0]][self.start_coord[1]] = "I " - - # current position of the agent - x, y = self.index2coord[self.state] - grid[x][y] = "A " - - # - grid_ascii = "" - for rr in range(self.nrows + 1): - if rr < self.nrows: - grid_ascii += str(rr).zfill(2) + 2 * " " + " ".join(grid[rr]) + "\n" - else: - grid_ascii += 3 * " " + " ".join( - [str(jj).zfill(2) for jj in range(self.ncols)] - ) - - self.grid_ascii = grid_ascii - self.grid_idx = grid_idx - return self.grid_ascii - - def display_values(self, values): - assert len(values) == self.Ns - grid_values = [["X".ljust(9)] * self.ncols for ii in range(self.nrows)] - for s_idx in range(self.Ns): - v = values[s_idx] - row, col = self.index2coord[s_idx] - grid_values[row][col] = ("%0.2f" % v).ljust(9) - - grid_values_ascii = "" - for rr in range(self.nrows + 1): - if rr < self.nrows: - grid_values_ascii += ( - str(rr).zfill(2) + 2 * " " + " ".join(grid_values[rr]) + "\n" - ) - else: - grid_values_ascii += 4 * " " + " ".join( - [str(jj).zfill(2).ljust(9) for jj in range(self.ncols)] - ) - logger.info(grid_values_ascii) - - def print_transition_at(self, row, col, action): - s_idx = self.coord2index[(row, col)] - if s_idx < 0: - logger.info("wall!") - return - a_idx = self.a_str2idx[action] - for next_s_idx, prob in enumerate(self.P[s_idx, a_idx]): - if prob > 0: - logger.info( - "to (%d, %d) with prob %f" - % (self.index2coord[next_s_idx] + (prob,)) - ) - - def render_ascii(self): - print(self._build_ascii()) - - def step(self, action): - assert self.action_space.contains(action), "Invalid action!" - - # save state for rendering - if self.is_render_enabled(): - self.append_state_for_rendering(self.state) - - # take step - next_state, reward, terminated, truncated, info = self.sample( - self.state, action - ) - self.state = next_state - return next_state, reward, terminated, truncated, info - - # - # Code for rendering - # - def get_layout_array(self, state_data=None, fill_walls_with=np.nan): - """ - Returns an array 'layout' of shape (nrows, ncols) such that: - - layout[row, col] = state_data[self.coord2idx[row, col]] - - If (row, col) is a wall: - - layout[row, col] = fill_walls_with - - Parameters - ---------- - state_data : np.array, default = None - Array of shape (self.observation_space.n,) - fill_walls_with : float, default: np.nan - Value to set in the layout in the coordinates corresponding to walls. - - Returns - ------- - Gridworld layout array of shape (nrows, ncols). 
- """ - layout = np.zeros((self.nrows, self.ncols)) - if state_data is not None: - assert state_data.shape == (self.observation_space.n,) - data_rows = [self.index2coord[idx][0] for idx in self.index2coord] - data_cols = [self.index2coord[idx][1] for idx in self.index2coord] - layout[data_rows, data_cols] = state_data - else: - state_rr, state_cc = self.index2coord[self.state] - layout[state_rr, state_cc] = 1.0 - - walls_rows = [ww[0] for ww in self.walls] - walls_cols = [ww[1] for ww in self.walls] - layout[walls_rows, walls_cols] = fill_walls_with - return layout - - def get_layout_img( - self, state_data=None, colormap_name="cool", wall_color=(0.0, 0.0, 0.0) - ): - """ - Returns an image array representing the value of `state_data` on - the gridworld layout. - - Parameters - ---------- - state_data : np.array, default = None - Array of shape (self.observation_space.n,) - colormap_name : str, default = 'cool' - Colormap name. - See https://matplotlib.org/tutorials/colors/colormaps.html - wall_color : tuple - RGB color for walls. - Returns - ------- - Gridworld image array of shape (nrows, ncols, 3). - """ - # map data to [0.0, 1.0] - if state_data is not None: - state_data = state_data - state_data.min() - if state_data.max() > 0.0: - state_data = state_data / state_data.max() - - colormap_fn = plt.get_cmap(colormap_name) - layout = self.get_layout_array(state_data, fill_walls_with=np.nan) - norm = matplotlib.colors.Normalize(vmin=0.0, vmax=1.0) - scalar_map = cm.ScalarMappable(norm=norm, cmap=colormap_fn) - img = np.zeros(layout.shape + (3,)) - for rr in range(layout.shape[0]): - for cc in range(layout.shape[1]): - if np.isnan(layout[rr, cc]): - img[self.nrows - 1 - rr, cc, :] = wall_color - else: - img[self.nrows - 1 - rr, cc, :3] = scalar_map.to_rgba( - layout[rr, cc] - )[:3] - return img - - def get_background(self): - """ - Return a scene (list of shapes) representing the background - """ - bg = Scene() - - # walls - for wall in self.walls: - y, x = wall - shape = GeometricPrimitive("POLYGON") - shape.set_color((0.25, 0.25, 0.25)) - shape.add_vertex((x, y)) - shape.add_vertex((x + 1, y)) - shape.add_vertex((x + 1, y + 1)) - shape.add_vertex((x, y + 1)) - bg.add_shape(shape) - - # rewards - for y, x in self.reward_at: - flag = GeometricPrimitive("POLYGON") - rwd = self.reward_at[(y, x)] - color = 0.5 * np.abs(rwd) / self.reward_range[1] - if rwd > 0: - flag.set_color((0.0, color, 0.0)) - if rwd < 0: - flag.set_color((color, 0.0, 0.0)) - - x += 0.5 - y += 0.25 - flag.add_vertex((x, y)) - flag.add_vertex((x + 0.25, y + 0.5)) - flag.add_vertex((x - 0.25, y + 0.5)) - bg.add_shape(flag) - - return bg - - def get_scene(self, state): - """ - Return scene (list of shapes) representing a given state - """ - y, x = self.index2coord[state] - x = x + 0.5 # centering - y = y + 0.5 # centering - - scene = Scene() - - agent = circle_shape((x, y), 0.25, n_points=5) - agent.type = "POLYGON" - agent.set_color((0.75, 0.0, 0.5)) - - scene.add_shape(agent) - return scene diff --git a/rlberry/envs/finite/gridworld_utils.py b/rlberry/envs/finite/gridworld_utils.py deleted file mode 100644 index ce0390f10..000000000 --- a/rlberry/envs/finite/gridworld_utils.py +++ /dev/null @@ -1,70 +0,0 @@ -import numpy as np - -WALL_SYMBOL = "#" -REWARD_TERMINAL_SYMBOL = "r" -REWARD_SYMBOL = "R" -TERMINAL_STATE_SYMBOL = "T" -INITIAL_STATE_SYMBOL = "I" - - -# spaces are ignored -DEFAULT_LAYOUT = """ -IOOOO # OOOOO O OOOOR -OOOOO # OOOOO # OOOOO -OOOOO O OOOOO # OOOOO -OOOOO # OOOOO # OOOOO -IOOOO # OOOOO # OOOOr -""" - - 
-def _preprocess_layout(layout): - layout = layout.replace(" ", "") # remove spaces - # remove first and last line breaks - if layout[0] == "\n": - layout = layout[1:] - if layout[-1] == "\n": - layout = layout[:-1] - - # make sure all lines have the same length - lines = layout.split("\n") - len_lines = [len(line) for line in lines] - max_len = np.max(len_lines) - # below, also reverse lines (so that render is not inversed in the y-direction) - adjusted_lines = [ - line.ljust(max_len, "O") for line in reversed(lines) - ] # fill with empty state - layout = "\n".join(adjusted_lines) - return layout - - -def get_layout_info(layout): - layout = _preprocess_layout(layout) - lines = layout.split("\n") - nrows = len(lines) - ncols = len(lines[0]) - walls = [] - initial_states = [] - terminal_states = [] - reward_at = dict() - for rr in range(nrows): - line = lines[rr] - for cc in range(ncols): - symbol = line[cc] - state_coord = (rr, cc) - if symbol == WALL_SYMBOL: - walls.append(state_coord) - if symbol == TERMINAL_STATE_SYMBOL or symbol == REWARD_TERMINAL_SYMBOL: - terminal_states.append(state_coord) - if symbol == REWARD_SYMBOL or symbol == REWARD_TERMINAL_SYMBOL: - reward_at[state_coord] = 1.0 - if symbol == INITIAL_STATE_SYMBOL: - initial_states.append(state_coord) - info = dict( - nrows=nrows, - ncols=ncols, - initial_states=tuple(initial_states), - terminal_states=tuple(terminal_states), - walls=tuple(walls), - reward_at=reward_at, - ) - return info diff --git a/rlberry/envs/finite/finite_mdp.py b/rlberry/envs/finite_mdp.py similarity index 100% rename from rlberry/envs/finite/finite_mdp.py rename to rlberry/envs/finite_mdp.py diff --git a/rlberry/envs/tests/test_bandits.py b/rlberry/envs/tests/test_bandits.py deleted file mode 100644 index 0e35fccf4..000000000 --- a/rlberry/envs/tests/test_bandits.py +++ /dev/null @@ -1,61 +0,0 @@ -import numpy as np -from rlberry.seeding import safe_reseed -from rlberry.seeding import Seeder -from rlberry.envs.bandits import ( - AdversarialBandit, - BernoulliBandit, - NormalBandit, - CorruptedNormalBandit, -) - - -TEST_SEED = 42 - - -def test_bernoulli(): - env = BernoulliBandit(p=[0.05, 0.95]) - safe_reseed(env, Seeder(TEST_SEED)) - - sample = [env.step(1)[1] for f in range(1000)] - - safe_reseed(env, Seeder(TEST_SEED)) - - sample2 = [env.step(1)[1] for f in range(1000)] - - assert np.abs(np.mean(sample) - 0.95) < 0.1 - assert np.mean(sample) == np.mean(sample2), "Not reproducible" - - -def test_normal(): - env = NormalBandit(means=[0, 1]) - safe_reseed(env, Seeder(TEST_SEED)) - - sample = [env.step(1)[1] for f in range(1000)] - safe_reseed(env, Seeder(TEST_SEED)) - - sample2 = [env.step(1)[1] for f in range(1000)] - - assert np.abs(np.mean(sample) - 1) < 0.1 - assert np.abs(sample[0] - sample2[0]) < 0.01, "Not reproducible" - - -def test_cor_normal(): - env = CorruptedNormalBandit(means=[0, 1], cor_prop=0.1) - safe_reseed(env, Seeder(TEST_SEED)) - - sample = [env.step(1)[1] for f in range(1000)] - assert np.abs(np.median(sample) - 1) < 0.5 - - -def test_adversarial(): - r1 = np.concatenate((2 * np.ones((500, 1)), np.ones((500, 1))), axis=1) - - r2 = np.concatenate((np.ones((500, 1)), 2 * np.ones((500, 1))), axis=1) - - rewards = np.concatenate((r1, r2)) - - env = AdversarialBandit(rewards=rewards) - safe_reseed(env, Seeder(TEST_SEED)) - - sample = [env.step(1)[1] for f in range(1000)] - assert np.abs(np.mean(sample) - 1.5) < 1e-10 diff --git a/rlberry/envs/tests/test_env_seeding.py b/rlberry/envs/tests/test_env_seeding.py index 26682e286..336dae652 
100644 --- a/rlberry/envs/tests/test_env_seeding.py +++ b/rlberry/envs/tests/test_env_seeding.py @@ -3,13 +3,13 @@ import rlberry.seeding as seeding from copy import deepcopy -from rlberry.envs.classic_control import MountainCar, Acrobot, Pendulum -from rlberry.envs.finite import Chain -from rlberry.envs.finite import GridWorld -from rlberry.envs.benchmarks.grid_exploration.four_room import FourRoom -from rlberry.envs.benchmarks.grid_exploration.six_room import SixRoom -from rlberry.envs.benchmarks.grid_exploration.apple_gold import AppleGold -from rlberry.envs.benchmarks.ball_exploration import PBall2D, SimplePBallND +from rlberry_research.envs.classic_control import MountainCar, Acrobot, Pendulum +from rlberry_research.envs.finite import Chain +from rlberry_research.envs.finite import GridWorld +from rlberry_research.envs.benchmarks.grid_exploration.four_room import FourRoom +from rlberry_research.envs.benchmarks.grid_exploration.six_room import SixRoom +from rlberry_research.envs.benchmarks.grid_exploration.apple_gold import AppleGold +from rlberry_research.envs.benchmarks.ball_exploration import PBall2D, SimplePBallND classes = [ MountainCar, diff --git a/rlberry/envs/tests/test_gym_make.py b/rlberry/envs/tests/test_gym_make.py index 9ad80d2ee..e7d53be85 100644 --- a/rlberry/envs/tests/test_gym_make.py +++ b/rlberry/envs/tests/test_gym_make.py @@ -23,11 +23,11 @@ def test_atari_make(): def test_rendering_with_atari_make(): from rlberry.manager import ExperimentManager - from rlberry.agents.torch import PPOAgent + from rlberry_research.agents.torch import PPOAgent from gymnasium.wrappers.record_video import RecordVideo import os from rlberry.envs.gym_make import atari_make - from rlberry.agents.torch.utils.training import model_factory_from_env + from rlberry_research.agents.torch.utils.training import model_factory_from_env import tempfile with tempfile.TemporaryDirectory() as tmpdirname: diff --git a/rlberry/envs/tests/test_instantiation.py b/rlberry/envs/tests/test_instantiation.py deleted file mode 100644 index d66722484..000000000 --- a/rlberry/envs/tests/test_instantiation.py +++ /dev/null @@ -1,252 +0,0 @@ -import numpy as np -import pytest - -from rlberry.envs import gym_make, PipelineEnv -from rlberry.envs.classic_control import MountainCar, Acrobot, Pendulum -from rlberry.envs.finite import Chain -from rlberry.envs.finite import GridWorld -from rlberry.envs.benchmarks.ball_exploration import PBall2D, SimplePBallND -from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env -from rlberry.envs.benchmarks.grid_exploration.four_room import FourRoom -from rlberry.envs.benchmarks.grid_exploration.six_room import SixRoom -from rlberry.envs.benchmarks.grid_exploration.nroom import NRoom -from rlberry.envs.benchmarks.grid_exploration.apple_gold import AppleGold -from rlberry.rendering.render_interface import RenderInterface2D - -classes = [ - MountainCar, - GridWorld, - Chain, - PBall2D, - SimplePBallND, - Acrobot, - Pendulum, - FourRoom, - SixRoom, - AppleGold, - NRoom, -] - - -@pytest.mark.parametrize("ModelClass", classes) -def test_instantiation(ModelClass): - env = ModelClass() - - if env.is_online(): - for _ in range(2): - state, info = env.reset() - for _ in range(50): - assert env.observation_space.contains(state) - action = env.action_space.sample() - next_s, _, _, _, _ = env.step(action) - state = next_s - - if env.is_generative(): - for _ in range(100): - state = env.observation_space.sample() - action = env.action_space.sample() - next_s, _, _, _, _ = 
env.sample(state, action) - assert env.observation_space.contains(next_s) - - -@pytest.mark.parametrize("ModelClass", classes) -def test_rendering_calls(ModelClass): - env = ModelClass() - if isinstance(env, RenderInterface2D): - _ = env.get_background() - _ = env.get_scene(env.observation_space.sample()) - - -def test_gridworld_aux_functions(): - env = GridWorld( - nrows=5, ncols=8, walls=((1, 1),), reward_at={(4, 4): 1, (4, 3): -1} - ) - env.log() # from FiniteMDP - env.render_ascii() # from GridWorld - vals = np.arange(env.observation_space.n) - env.display_values(vals) - env.print_transition_at(0, 0, "up") - - layout = env.get_layout_array(vals, fill_walls_with=np.inf) - for rr in range(env.nrows): - for cc in range(env.ncols): - if (rr, cc) in env.walls: - assert layout[rr, cc] == np.inf - else: - assert layout[rr, cc] == vals[env.coord2index[(rr, cc)]] - - -def test_gridworld_from_layout(): - layout = """ - IOOOO # OOOOO O OOOOR - OOOOO # OOOOO # OOOOO - OOOOO O OOOOO # OOTOO - OOOOO # OOOOO # OOOOO - IOOOO # OOOOO # OOOOr""" - env = GridWorld.from_layout(layout) - env.reset() - - -def test_ball2d_benchmark_instantiation(): - for level in [0, 1, 2, 3, 4, 5]: - env = get_benchmark_env(level) - for aa in range(env.action_space.n): - env.step(aa) - env.sample(env.observation_space.sample(), aa) - - -@pytest.mark.parametrize("p", [1, 2, 3, 4, 5, np.inf]) -def test_pball_env(p): - env = PBall2D(p=p) - env.get_reward_lipschitz_constant() - env.get_transitions_lipschitz_constant() - - -@pytest.mark.parametrize( - "reward_free, difficulty, array_observation", - [ - (True, 0, False), - (False, 0, False), - (False, 0, True), - (False, 1, False), - (False, 1, True), - (False, 2, False), - (False, 2, True), - ], -) -def test_four_room(reward_free, difficulty, array_observation): - env = FourRoom( - reward_free=reward_free, - difficulty=difficulty, - array_observation=array_observation, - ) - - initial_state, info = env.reset() - next_state, reward, _, _, _ = env.step(1) - - assert env.observation_space.contains(initial_state) - assert env.observation_space.contains(next_state) - - if reward_free: - assert env.reward_at == {} - - if difficulty == 2: - assert reward < 0.0 - - if array_observation: - assert isinstance(initial_state, np.ndarray) - assert isinstance(next_state, np.ndarray) - - -@pytest.mark.parametrize( - "reward_free, array_observation", - [ - (False, False), - (False, True), - (True, False), - (True, True), - ], -) -def test_six_room(reward_free, array_observation): - env = SixRoom(reward_free=reward_free, array_observation=array_observation) - - initial_state, info = env.reset() - next_state, reward, _, _, _ = env.step(1) - - assert env.observation_space.contains(initial_state) - assert env.observation_space.contains(next_state) - - if reward_free: - assert env.reward_at == {} - - if array_observation: - assert isinstance(initial_state, np.ndarray) - assert isinstance(next_state, np.ndarray) - - -@pytest.mark.parametrize( - "reward_free, array_observation", - [ - (False, False), - (False, True), - (True, False), - (True, True), - ], -) -def test_apple_gold(reward_free, array_observation): - env = AppleGold(reward_free=reward_free, array_observation=array_observation) - - initial_state, info = env.reset() - next_state, reward, _, _, _ = env.step(1) - assert env.observation_space.contains(initial_state) - assert env.observation_space.contains(next_state) - - if reward_free: - assert env.reward_at == {} - - if array_observation: - assert isinstance(initial_state, np.ndarray) - assert 
isinstance(next_state, np.ndarray) - - -@pytest.mark.parametrize( - "reward_free, array_observation, initial_state_distribution", - [ - (False, False, "center"), - (False, True, "center"), - (True, False, "center"), - (True, True, "center"), - (True, False, "uniform"), - ], -) -def test_n_room(reward_free, array_observation, initial_state_distribution): - env = NRoom( - reward_free=reward_free, - array_observation=array_observation, - initial_state_distribution=initial_state_distribution, - ) - - initial_state, info = env.reset() - next_state, reward, _, _, _ = env.step(1) - - if initial_state_distribution == "uniform": - assert env.initial_state_distribution[0] == 1.0 / env.observation_space.n - - assert env.observation_space.contains(initial_state) - assert env.observation_space.contains(next_state) - - if reward_free: - assert env.reward_at == {} - - if array_observation: - assert isinstance(initial_state, np.ndarray) - assert isinstance(next_state, np.ndarray) - - -def test_pipeline(): - from rlberry.wrappers import RescaleRewardWrapper - from rlberry.wrappers.discretize_state import DiscretizeStateWrapper - - env_ctor, env_kwargs = PipelineEnv, { - "env_ctor": gym_make, - "env_kwargs": {"id": "Acrobot-v1"}, - "wrappers": [(RescaleRewardWrapper, {"reward_range": (0, 1)})], - } - env = env_ctor(**env_kwargs) - _, reward, _, _, _ = env.step(0) - assert (reward <= 1) and (reward >= 0) - - env_ctor, env_kwargs = PipelineEnv, { - "env_ctor": gym_make, - "env_kwargs": {"id": "Acrobot-v1"}, - "wrappers": [ - (RescaleRewardWrapper, {"reward_range": (0, 1)}), - (DiscretizeStateWrapper, {"n_bins": 10}), - ], - } - env = env_ctor(**env_kwargs) - # check that wrapped in the right order - assert isinstance( - env.env, RescaleRewardWrapper - ), "the environments in Pipeline env may not be wrapped in order" - assert isinstance(env.env.env, DiscretizeStateWrapper) diff --git a/rlberry/envs/tests/test_spring_env.py b/rlberry/envs/tests/test_spring_env.py deleted file mode 100644 index 9809cd55b..000000000 --- a/rlberry/envs/tests/test_spring_env.py +++ /dev/null @@ -1,104 +0,0 @@ -import numpy as np -from rlberry.envs import SpringCartPole -from rlberry.envs.classic_control.SpringCartPole import rk4 - - -# # actions -# LL = 0 -# RR = 1 -# LR = 2 -# RL = 3 - -# action_dict = {0: "LL", 1: "RR", 2: "LR", 3: "RL"} - - -HORIZON = 50 - - -def test_spring_cartpole(): - # test 1 - default - env = SpringCartPole() - - _, info = env.reset() - for _ in range(2): - action = np.random.randint(0, env.action_space.n) - next_observation, reward, terminated, truncated, info = env.step(action) - done = terminated or truncated - # if done: - # next_observation,info = env.reset() - # observation = next_observation - - # test 2 - obs_trans = True and random_init = False - env = SpringCartPole(obs_trans=True, random_init=False) - - _, info = env.reset() - for _ in range(2): - action = np.random.randint(0, env.action_space.n) - next_observation, reward, terminated, truncated, info = env.step(action) - done = terminated or truncated - # if done: - # next_observation,info = env.reset() - # observation = next_observation - - # # test 3 - swingup = False and random_init = False - # env = SpringCartPole(dt=0.01, swing_up=False, random_init=False) - # # env.enable_rendering() - - # observation,info = env.reset() - # for tt in range(5): - # if observation[2] > 0: - # if observation[6] > 0: - # action = LL - # else: - # action = LR - # else: - # if observation[6] > 0: - # action = RL - # else: - # action = RR - # # print("Time: ", 
tt, "Action: ", action_dict[action], "Angle1: ", observation[2], "Angle2: ", observation[6]) - # next_observation, reward, terminated, truncated, info= env.step(action) - # done = terminated or truncated - # if done: - # next_observation,info = env.reset() - # observation = next_observation - - # test 4 - swingup = False and rendering = True - - env = SpringCartPole(dt=0.02, swing_up=False, obs_trans=True) - env.enable_rendering() - - _, info = env.reset() - action = 0 - for _ in range(2 * HORIZON): - next_observation, reward, terminated, truncated, info = env.step(action) - done = terminated or truncated - if done: - action += 1 - if action >= 4: - action = 0 - next_observation, info = env.reset() - _ = next_observation - - _ = env.get_video() - - -def test_rk4(): - """ - Test of the rk4 utils defined in speingcartpole - """ - - ## 2D system - def derivs6(x, t): - d1 = x[0] + 2 * x[1] - d2 = -3 * x[0] + 4 * x[1] - return (d1, d2) - - dt = 0.0005 - t = np.arange(0.0, 2.0, dt) - y0 = (1, 2) - yout = rk4(derivs6, y0, t) - assert np.abs(yout[0][0] - 1) < 1e-2 - assert np.abs(yout[0][1] - 2) < 1e-2 - assert np.abs(yout[-1][0] + 238.087) < 1e-2 - assert np.abs(yout[-1][1] + 220.827) < 1e-2 diff --git a/rlberry/experiment/tests/room.yaml b/rlberry/experiment/tests/room.yaml index 8667278d5..702fbaa38 100644 --- a/rlberry/experiment/tests/room.yaml +++ b/rlberry/experiment/tests/room.yaml @@ -1,4 +1,4 @@ -constructor: 'rlberry.envs.benchmarks.grid_exploration.nroom.NRoom' +constructor: 'rlberry_research.envs.benchmarks.grid_exploration.nroom.NRoom' params: reward_free: false array_observation: true diff --git a/rlberry/experiment/tests/rsucbvi.yaml b/rlberry/experiment/tests/rsucbvi.yaml index 4c9273e1b..c35777881 100644 --- a/rlberry/experiment/tests/rsucbvi.yaml +++ b/rlberry/experiment/tests/rsucbvi.yaml @@ -1,4 +1,4 @@ -agent_class: 'rlberry.agents.kernel_based.rs_ucbvi.RSUCBVIAgent' +agent_class: 'rlberry_research.agents.kernel_based.rs_ucbvi.RSUCBVIAgent' init_kwargs: gamma: 1.0 lp_metric: 2 diff --git a/rlberry/experiment/tests/test_experiment_generator.py b/rlberry/experiment/tests/test_experiment_generator.py index 2c5297198..be361ea57 100644 --- a/rlberry/experiment/tests/test_experiment_generator.py +++ b/rlberry/experiment/tests/test_experiment_generator.py @@ -1,5 +1,5 @@ from rlberry.experiment import experiment_generator -from rlberry.agents.kernel_based.rs_ucbvi import RSUCBVIAgent +from rlberry_research.agents.kernel_based.rs_ucbvi import RSUCBVIAgent import numpy as np diff --git a/rlberry/experiment/yaml_utils.py b/rlberry/experiment/yaml_utils.py index 581b254ec..ff26c21dc 100644 --- a/rlberry/experiment/yaml_utils.py +++ b/rlberry/experiment/yaml_utils.py @@ -30,7 +30,7 @@ def read_agent_config(config_path): Example: ``` myagent.yaml - agent_class: 'rlberry.agents.kernel_based.rs_ucbvi.RSUCBVIAgent' + agent_class: 'rlberry_research.agents.kernel_based.rs_ucbvi.RSUCBVIAgent' gamma: 1.0 lp_metric: 2 min_dist: 0.0 @@ -76,7 +76,7 @@ def read_env_config(config_path): Example: ``` env.yaml - constructor: 'rlberry.envs.benchmarks.grid_exploration.nroom.NRoom' + constructor: 'rlberry_research.envs.benchmarks.grid_exploration.nroom.NRoom' params: reward_free: false array_observation: true diff --git a/rlberry/exploration_tools/__init__.py b/rlberry/exploration_tools/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/exploration_tools/discrete_counter.py b/rlberry/exploration_tools/discrete_counter.py deleted file mode 100644 index 549a39955..000000000 
--- a/rlberry/exploration_tools/discrete_counter.py +++ /dev/null @@ -1,100 +0,0 @@ -import numpy as np -from rlberry.exploration_tools.uncertainty_estimator import UncertaintyEstimator -from rlberry.exploration_tools.typing import preprocess_args -from rlberry.spaces import Discrete -from rlberry.utils.space_discretizer import Discretizer - - -class DiscreteCounter(UncertaintyEstimator): - """ - Parameters - ---------- - observation_space : spaces.Box or spaces.Discrete - action_space : spaces.Box or spaces.Discrete - n_bins_obs: int - number of bins to discretize observation space - n_bins_actions: int - number of bins to discretize action space - rate_power : float - Returns bonuses in 1/n ** rate_power. - """ - - def __init__( - self, - observation_space, - action_space, - n_bins_obs=10, - n_bins_actions=10, - rate_power=0.5, - **kwargs - ): - UncertaintyEstimator.__init__(self, observation_space, action_space) - - self.rate_power = rate_power - - self.continuous_state = False - self.continuous_action = False - - if isinstance(observation_space, Discrete): - self.n_states = observation_space.n - else: - self.continuous_state = True - self.state_discretizer = Discretizer(self.observation_space, n_bins_obs) - self.n_states = self.state_discretizer.discrete_space.n - - if isinstance(action_space, Discrete): - self.n_actions = action_space.n - else: - self.continuous_action = True - self.action_discretizer = Discretizer(self.action_space, n_bins_actions) - self.n_actions = self.action_discretizer.discrete_space.n - - self.N_sa = np.zeros((self.n_states, self.n_actions)) - - def _preprocess(self, state, action): - if self.continuous_state: - state = self.state_discretizer.discretize(state) - if self.continuous_action: - action = self.action_discretizer.discretize(action) - return state, action - - def reset(self): - self.N_sa = np.zeros((self.n_states, self.n_actions)) - - @preprocess_args(expected_type="numpy") - def update(self, state, action, next_state=None, reward=None, **kwargs): - state, action = self._preprocess(state, action) - self.N_sa[state, action] += 1 - - @preprocess_args(expected_type="numpy") - def measure(self, state, action, **kwargs): - state, action = self._preprocess(state, action) - n = np.maximum(1.0, self.N_sa[state, action]) - return np.power(1.0 / n, self.rate_power) - - def count(self, state, action): - state, action = self._preprocess(state, action) - return self.N_sa[state, action] - - def get_n_visited_states(self): - """ - Returns the number of different states sent to the .update() function. - For continuous state spaces, counts the number of different discretized states. - """ - n_visited_states = (self.N_sa.sum(axis=1) > 0).sum() - return n_visited_states - - def get_entropy(self): - """ - Returns the entropy of the empirical distribution over states, induced by the state counts. - Uses log2. 
- """ - visited = self.N_sa.sum(axis=1) > 0 - if visited.sum() == 0.0: - return 0.0 - # number of visits of visited states only - n_visits = self.N_sa[visited, :].sum(axis=1) - # empirical distribution - dist = n_visits / n_visits.sum() - entropy = (-dist * np.log2(dist)).sum() - return entropy diff --git a/rlberry/exploration_tools/online_discretization_counter.py b/rlberry/exploration_tools/online_discretization_counter.py deleted file mode 100644 index 575114df5..000000000 --- a/rlberry/exploration_tools/online_discretization_counter.py +++ /dev/null @@ -1,189 +0,0 @@ -import numpy as np -from rlberry.utils.jit_setup import numba_jit -from rlberry.exploration_tools.uncertainty_estimator import UncertaintyEstimator -from rlberry.exploration_tools.typing import preprocess_args -from gymnasium.spaces import Box, Discrete -from rlberry.utils.metrics import metric_lp - -import rlberry - -logger = rlberry.logger - - -@numba_jit -def map_to_representative( - state, - lp_metric, - representative_states, - n_representatives, - min_dist, - scaling, - accept_new_repr, -): - """ - Map state to representative state. - """ - dist_to_closest = np.inf - argmin = -1 - for ii in range(n_representatives): - dist = metric_lp(state, representative_states[ii, :], lp_metric, scaling) - if dist < dist_to_closest: - dist_to_closest = dist - argmin = ii - - max_representatives = representative_states.shape[0] - if ( - dist_to_closest > min_dist - and n_representatives < max_representatives - and accept_new_repr - ): - new_index = n_representatives - representative_states[new_index, :] = state - return new_index, 0.0 - return argmin, dist_to_closest - - -class OnlineDiscretizationCounter(UncertaintyEstimator): - """ - Note: currently, only implemented for continuous (Box) states and - discrete actions. - - Parameters - ---------- - observation_space : spaces.Box - action_space : spaces.Discrete - lp_metric: int - The metric on the state space is the one induced by the p-norm, - where p = lp_metric. Default = 2, for the Euclidean metric. - scaling: numpy.ndarray - Must have the same size as state array, used to scale the states - before computing the metric. - If None, set to: - - (env.observation_space.high - env.observation_space.low) if high - and low are bounded - - np.ones(env.observation_space.shape[0]) if high or low are - unbounded - min_dist: double - Minimum distance between two representative states - max_repr: int - Maximum number of representative states. - If None, it is set to (sqrt(d)/min_dist)**d, where d - is the dimension of the state space - rate_power : float - returns bonuses in n^power. 
- """ - - def __init__( - self, - observation_space, - action_space, - lp_metric=2, - min_dist=0.1, - max_repr=1000, - scaling=None, - rate_power=1, - **kwargs - ): - UncertaintyEstimator.__init__(self, observation_space, action_space) - - assert isinstance(action_space, Discrete) - assert isinstance(observation_space, Box) - - self.lp_metric = lp_metric - self.min_dist = min_dist - self.max_repr = max_repr - self.state_dim = self.observation_space.shape[0] - self.n_actions = self.action_space.n - self.rate_power = rate_power - - # compute scaling, if it is None - if scaling is None: - # if high and low are bounded - if self.observation_space.is_bounded(): - scaling = self.observation_space.high - self.observation_space.low - # if high or low are unbounded - else: - scaling = np.ones(self.state_dim) - else: - assert scaling.ndim == 1 - assert scaling.shape[0] == self.state_dim - self.scaling = scaling - - # initialize - self.n_representatives = None - self.representative_states = None - self.N_sa = None - self.reset() - - def reset(self): - self.n_representatives = 0 - self.representative_states = np.zeros((self.max_repr, self.state_dim)) - self.N_sa = np.zeros((self.max_repr, self.n_actions)) - - self._overflow_warning = False - - def _get_representative_state(self, state, accept_new_repr=True): - state_idx, dist_to_closest = map_to_representative( - state, - self.lp_metric, - self.representative_states, - self.n_representatives, - self.min_dist, - self.scaling, - accept_new_repr, - ) - # check if new representative state - if state_idx == self.n_representatives: - self.n_representatives += 1 - - if self.n_representatives >= self.max_repr and (not self._overflow_warning): - logger.warning( - "OnlineDiscretizationCounter reached \ -the maximum number of representative states." - ) - self._overflow_warning = True - - return state_idx, dist_to_closest - - @preprocess_args(expected_type="numpy") - def update(self, state, action, next_state=None, reward=None, **kwargs): - state_idx, _ = self._get_representative_state(state) - self.N_sa[state_idx, action] += 1 - - @preprocess_args(expected_type="numpy") - def measure(self, state, action, **kwargs): - n = np.maximum(1.0, self.count(state, action)) - return np.power(1 / n, self.rate_power) - - def count(self, state, action): - state_idx, dist_to_closest = self._get_representative_state( - state, accept_new_repr=False - ) - # if state is too far from the closest representative, - # its count is zero. - if dist_to_closest > self.min_dist: - return 0.0 - return self.N_sa[state_idx, action] - - def get_n_visited_states(self): - """ - Returns the number of different states sent to the .update() function. - For continuous state spaces, counts the number of different discretized states. - """ - n_visited_states = (self.N_sa.sum(axis=1) > 0).sum() - return n_visited_states - - def get_entropy(self): - """ - Returns the entropy of the empirical distribution over states, induced by the state counts. - Uses log2. 
- """ - visited = self.N_sa.sum(axis=1) > 0 - if visited.sum() == 0.0: - return 0.0 - # number of visits of visited states only - n_visits = self.N_sa[visited, :].sum(axis=1) - # empirical distribution - dist = n_visits / n_visits.sum() - entropy = (-dist * np.log2(dist)).sum() - return entropy diff --git a/rlberry/exploration_tools/tests/__init__.py b/rlberry/exploration_tools/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/exploration_tools/tests/test_discrete_counter.py b/rlberry/exploration_tools/tests/test_discrete_counter.py deleted file mode 100644 index ad6c1f2bb..000000000 --- a/rlberry/exploration_tools/tests/test_discrete_counter.py +++ /dev/null @@ -1,113 +0,0 @@ -import pytest -import numpy as np -from rlberry.envs import GridWorld -from rlberry.envs import MountainCar -from rlberry.envs.benchmarks.grid_exploration.nroom import NRoom -from rlberry.exploration_tools.discrete_counter import DiscreteCounter -from rlberry.exploration_tools.online_discretization_counter import ( - OnlineDiscretizationCounter, -) - - -@pytest.mark.parametrize("rate_power", [0.5, 1]) -def test_discrete_env(rate_power): - env = GridWorld() - counter = DiscreteCounter( - env.observation_space, env.action_space, rate_power=rate_power - ) - - for N in range(10, 20): - assert counter.get_n_visited_states() == 0 - assert counter.get_entropy() == 0.0 - - for ss in range(env.observation_space.n): - for aa in range(env.action_space.n): - for _ in range(N): - ns, rr, _, _, _ = env.sample(ss, aa) - counter.update(ss, aa, ns, rr) - assert counter.N_sa[ss, aa] == N - assert counter.count(ss, aa) == N - if rate_power == pytest.approx(1): - assert np.allclose(counter.measure(ss, aa), 1.0 / N) - elif rate_power == pytest.approx(0.5): - assert np.allclose(counter.measure(ss, aa), np.sqrt(1.0 / N)) - - assert counter.get_n_visited_states() == env.observation_space.n - assert np.allclose(counter.get_entropy(), np.log2(env.observation_space.n)) - - counter.reset() - - -@pytest.mark.parametrize("rate_power", [0.5, 1]) -def test_continuous_state_env(rate_power): - env = MountainCar() - counter = DiscreteCounter( - env.observation_space, env.action_space, rate_power=rate_power - ) - - for N in [10, 20]: - for _ in range(10): - ss = env.observation_space.sample() - aa = env.action_space.sample() - for _ in range(N): - ns, rr, _, _, _ = env.sample(ss, aa) - counter.update(ss, aa, ns, rr) - - dss = counter.state_discretizer.discretize(ss) - assert counter.N_sa[dss, aa] == N - assert counter.count(ss, aa) == N - if rate_power == pytest.approx(1): - assert np.allclose(counter.measure(ss, aa), 1.0 / N) - elif rate_power == pytest.approx(0.5): - assert np.allclose(counter.measure(ss, aa), np.sqrt(1.0 / N)) - counter.reset() - - -@pytest.mark.parametrize("rate_power", [True, False]) -def test_continuous_state_env_2(rate_power): - env = MountainCar() - counter = OnlineDiscretizationCounter( - env.observation_space, env.action_space, rate_power=rate_power - ) - - for N in [10, 20]: - for _ in range(10): - ss = env.observation_space.sample() - aa = env.action_space.sample() - for nn in range(N): - ns, rr, _, _, _ = env.sample(ss, aa) - counter.update(ss, aa, ns, rr) - assert counter.count(ss, aa) == N - if rate_power == pytest.approx(1): - assert np.allclose(counter.measure(ss, aa), 1.0 / N) - elif rate_power == pytest.approx(0.5): - assert np.allclose(counter.measure(ss, aa), np.sqrt(1.0 / N)) - counter.reset() - - -def test_continuous_state_env_3(): - env = NRoom(nrooms=3, 
array_observation=True) - counter = OnlineDiscretizationCounter( - env.observation_space, env.action_space, rate_power=0.5, min_dist=0.0 - ) - - for N in range(10, 20, 3): - assert counter.get_n_visited_states() == 0 - assert counter.get_entropy() == 0.0 - - for ss in range(env.discrete_observation_space.n): - for aa in range(env.action_space.n): - for _ in range(N): - ns, rr, _, _, _ = env.sample(ss, aa) - continuous_ss = env._convert_index_to_float_coord(ss) - counter.update(continuous_ss, aa, None, rr) - assert counter.N_sa[ss, aa] == N - assert counter.count(continuous_ss, aa) == N - assert np.allclose(counter.measure(continuous_ss, aa), np.sqrt(1.0 / N)) - - assert counter.get_n_visited_states() == env.discrete_observation_space.n - assert np.allclose( - counter.get_entropy(), np.log2(env.discrete_observation_space.n) - ) - - counter.reset() diff --git a/rlberry/exploration_tools/torch/__init__.py b/rlberry/exploration_tools/torch/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/exploration_tools/torch/rnd.py b/rlberry/exploration_tools/torch/rnd.py deleted file mode 100644 index ac6971c22..000000000 --- a/rlberry/exploration_tools/torch/rnd.py +++ /dev/null @@ -1,212 +0,0 @@ -from functools import partial - -import torch -import gymnasium.spaces as spaces -from torch.nn import functional as F - -from rlberry.agents.utils.memories import ReplayMemory -from rlberry.exploration_tools.uncertainty_estimator import UncertaintyEstimator -from rlberry.exploration_tools.typing import preprocess_args -from rlberry.agents.torch.utils.models import ConvolutionalNetwork -from rlberry.agents.torch.utils.models import MultiLayerPerceptron -from rlberry.utils.factory import load -from rlberry.utils.torch import choose_device - - -def get_network(shape, embedding_dim): - if len(shape) == 3: - if shape[2] < shape[0] and shape[2] < shape[1]: - W, H, C = shape - transpose_obs = True - elif shape[0] < shape[1] and shape[0] < shape[2]: - C, H, W = shape - transpose_obs = False - else: - raise ValueError("Unknown image convention") - - return ConvolutionalNetwork( - in_channels=C, - in_width=W, - in_height=H, - out_size=embedding_dim, - activation="ELU", - transpose_obs=transpose_obs, - is_policy=False, - ) - elif len(shape) == 2: - H, W = shape - return ConvolutionalNetwork( - in_channels=1, - in_width=W, - in_height=H, - activation="ELU", - out_size=embedding_dim, - ) - - elif len(shape) == 1: - return MultiLayerPerceptron( - in_size=shape[0], - activation="RELU", - layer_sizes=[64, 64], - out_size=embedding_dim, - ) - else: - raise ValueError("Incompatible observation shape: {}".format(shape)) - - -class RandomNetworkDistillation(UncertaintyEstimator): - """ - References - ---------- - Burda Yuri, Harrison Edwards, Amos Storkey, and Oleg Klimov. 2018. - "Exploration by random network distillation." - In International Conference on Learning Representations. 
- """ - - def __init__( - self, - observation_space, - action_space, - learning_rate=0.001, - update_period=100, - embedding_dim=10, - net_fn=None, - net_kwargs=None, - device="cuda:best", - rate_power=0.5, - batch_size=10, - memory_size=10000, - with_action=False, - **kwargs - ): - assert isinstance(observation_space, spaces.Box) - UncertaintyEstimator.__init__(self, observation_space, action_space) - self.learning_rate = learning_rate - self.loss_fn = F.mse_loss - self.update_period = update_period - self.embedding_dim = embedding_dim - out_size = embedding_dim * action_space.n if with_action else embedding_dim - self.net_fn = ( - load(net_fn) - if isinstance(net_fn, str) - else net_fn - or partial( - get_network, shape=observation_space.shape, embedding_dim=out_size - ) - ) - self.net_kwargs = net_kwargs or {} - if "out_size" in self.net_kwargs: - self.net_kwargs["out_size"] = out_size - self.device = choose_device(device) - self.rate_power = rate_power - self.batch_size = batch_size - self.memory = ReplayMemory(capacity=memory_size) - self.with_action = with_action - self.reset() - - def reset(self, **kwargs): - self.random_target_network = self.net_fn(**self.net_kwargs).to(self.device) - self.predictor_network = self.net_fn(**self.net_kwargs).to(self.device) - self.rnd_optimizer = torch.optim.Adam( - self.predictor_network.parameters(), - lr=self.learning_rate, - betas=(0.9, 0.999), - ) - - self.count = 0 - self.loss = torch.tensor(0.0).to(self.device) - - def _get_embeddings(self, state, action=None, batch=False, all_actions=False): - state = state.to(self.device) - if not batch: - state = state.unsqueeze(0) - - random_embedding = self.random_target_network(state) - predicted_embedding = self.predictor_network(state) - - if self.with_action: - random_embedding = random_embedding.view( - (state.shape[0], self.action_space.n, -1) - ) - predicted_embedding = predicted_embedding.view( - (state.shape[0], self.action_space.n, -1) - ) - if not all_actions: - action = action.long().to(self.device) - if not batch: - action = action.unsqueeze(0) - action = ( - action.unsqueeze(1) - .repeat(1, random_embedding.shape[-1]) - .unsqueeze(1) - ) - random_embedding = random_embedding.gather(1, action).squeeze(1) - predicted_embedding = predicted_embedding.gather(1, action).squeeze(1) - return random_embedding, predicted_embedding - - @preprocess_args(expected_type="torch") - def update(self, state, action=None, next_state=None, reward=None, **kwargs): - batch = [(state, action)] - if self.batch_size > 0 and not self.memory.is_empty(): - batch += self.memory.sample(self.batch_size) - self.memory.push((state, action)) - states, actions = zip(*batch) - states = torch.stack(states) - if self.with_action: - actions = torch.stack(actions) - - random_embedding, predicted_embedding = self._get_embeddings( - states, actions, batch=True - ) - - self.loss += self.loss_fn(random_embedding.detach(), predicted_embedding) - - self.count += 1 - if self.count % self.update_period == 0: - self.loss /= self.update_period - self.rnd_optimizer.zero_grad() - self.loss.backward() - self.rnd_optimizer.step() - self.loss = torch.tensor(0.0).to(self.device) - - @preprocess_args(expected_type="torch") - def measure(self, state, action=None, **kwargs): - random_embedding, predicted_embedding = self._get_embeddings( - state, action, batch=False - ) - error = torch.norm( - predicted_embedding.detach() - random_embedding.detach(), p=2, dim=-1 - ) - return error.pow(2 * self.rate_power).item() - - 
@preprocess_args(expected_type="torch") - def measure_batch(self, states, actions, **kwargs): - random_embedding, predicted_embedding = self._get_embeddings( - states, actions, batch=True - ) - error = torch.norm( - predicted_embedding.detach() - random_embedding.detach(), p=2, dim=-1 - ) - return error.pow(2 * self.rate_power) - - @preprocess_args(expected_type="torch") - def measure_batch_all_actions(self, states, **kwargs): - """ - Measure N(s,a) for all a in A. - - Parameters - ---------- - states: a batch of states, of shape [B x ] - - Returns - ------- - N(s,a): an array of shape B x A - """ - assert self.with_action - random_embedding, predicted_embedding = self._get_embeddings( - states, None, batch=True, all_actions=True - ) - error = torch.norm( - predicted_embedding.detach() - random_embedding.detach(), p=2, dim=-1 - ) - return error.pow(2 * self.rate_power) diff --git a/rlberry/exploration_tools/torch/tests/__init__.py b/rlberry/exploration_tools/torch/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/exploration_tools/torch/tests/test_rnd.py b/rlberry/exploration_tools/torch/tests/test_rnd.py deleted file mode 100644 index 5e8d506fa..000000000 --- a/rlberry/exploration_tools/torch/tests/test_rnd.py +++ /dev/null @@ -1,27 +0,0 @@ -from rlberry.exploration_tools.torch.rnd import RandomNetworkDistillation -from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env - - -def test_rnd(): - # Environment - env = get_benchmark_env(level=1) - - # RND - rnd = RandomNetworkDistillation( - env.observation_space, - env.action_space, - learning_rate=0.1, - update_period=100, - embedding_dim=2, - ) - - # Test - observation, info = env.reset() - for ii in range(1000): - action = env.action_space.sample() - next_observation, reward, terminated, truncated, info = env.step(action) - done = terminated or truncated - rnd.update(observation, action, next_observation, reward) - observation = next_observation - # measure uncertainty - _ = rnd.measure(observation, action) diff --git a/rlberry/exploration_tools/typing.py b/rlberry/exploration_tools/typing.py deleted file mode 100644 index b51aabf64..000000000 --- a/rlberry/exploration_tools/typing.py +++ /dev/null @@ -1,85 +0,0 @@ -import numpy as np - -_TORCH_INSTALLED = True -try: - import torch -except ImportError: - _TORCH_INSTALLED = False - - -def _get_type(arg): - if _TORCH_INSTALLED and isinstance(arg, torch.Tensor): - return "torch" - elif isinstance(arg, np.ndarray): - return "numpy" - else: - return type(arg) - - -def process_type(arg, expected_type): - """ - Utility function to preprocess numpy/torch arguments, - according to a expected type. - - For instance, if arg is numpy and expected_type is torch, - converts arg to torch.tensor. - - Parameters - ---------- - expected_type: {'torch', 'numpy'} - Desired type for output. - """ - if arg is None: - return None - - if expected_type == "torch": - assert _TORCH_INSTALLED, "expected_type is 'torch', but torch is not installed!" 
- if isinstance(arg, torch.Tensor): - return arg - elif isinstance(arg, np.ndarray): - return torch.from_numpy(arg) - elif np.issubdtype(type(arg), np.number): - return torch.tensor(arg) - else: - return arg - elif expected_type == "numpy": - if isinstance(arg, np.ndarray): - return arg - elif _TORCH_INSTALLED and isinstance(arg, torch.Tensor): - return arg.detach().cpu().numpy() - else: - return arg - else: - return arg - - -def preprocess_args(expected_type): - """ - Utility decorator for methods to preprocess numpy/torch arguments, - according to an expected type. - - Output type = input type of the first argument. - - For instance, if function args are numpy and expected_type is torch, - converts function args to torch.tensor. - - Parameters - ---------- - expected_type: {'torch', 'numpy'} - Desired type for output. - """ - - def decorator(func): - def inner(self, *args, **kwargs): - processed_args = () - for ii, arg in enumerate(args): - processed_args += (process_type(arg, expected_type),) - output = func(self, *processed_args, **kwargs) - # Process output according to first argument - ouput_expected_type = _get_type(args[0]) - processed_output = process_type(output, ouput_expected_type) - return processed_output - - return inner - - return decorator diff --git a/rlberry/exploration_tools/uncertainty_estimator.py b/rlberry/exploration_tools/uncertainty_estimator.py deleted file mode 100644 index 868b4c90e..000000000 --- a/rlberry/exploration_tools/uncertainty_estimator.py +++ /dev/null @@ -1,34 +0,0 @@ -from abc import ABC, abstractmethod -from rlberry.exploration_tools.typing import _get_type -import numpy as np - - -class UncertaintyEstimator(ABC): - def __init__(self, observation_space, action_space, **kwargs): - super().__init__() - self.observation_space = observation_space - self.action_space = action_space - - def reset(self, **kwargs): - pass - - @abstractmethod - def update(self, state, action, next_state, reward, **kwargs): - pass - - @abstractmethod - def measure(self, state, action, **kwargs): - pass - - def measure_batch(self, states, actions, **kwargs): - batch = [self.measure(s, a, **kwargs) for s, a in zip(states, actions)] - if _get_type(batch[0]) == "torch": - import torch - - return torch.FloatTensor(batch) - return np.array(batch) - - def measure_batch_all_actions(self, states): - return np.array( - [[self.measure(s, a) for a in range(self.action_space.n)] for s in states] - ) diff --git a/rlberry/manager/__init__.py b/rlberry/manager/__init__.py index 7bac68ba5..18ff0bcc2 100644 --- a/rlberry/manager/__init__.py +++ b/rlberry/manager/__init__.py @@ -1,9 +1,8 @@ -from .experiment_manager import ExperimentManager, preset_manager +from .experiment_manager import ExperimentManager +from .experiment_manager import preset_manager from .multiple_managers import MultipleManagers -from .remote_experiment_manager import RemoteExperimentManager from .evaluation import evaluate_agents, plot_writer_data, read_writer_data from .comparison import compare_agents # (Remote)AgentManager alias for the (Remote)ExperimentManager class, for backward compatibility AgentManager = ExperimentManager -RemoteAgentManager = RemoteExperimentManager diff --git a/rlberry/manager/experiment_manager.py b/rlberry/manager/experiment_manager.py index 13e12bda7..bdfc012e0 100644 --- a/rlberry/manager/experiment_manager.py +++ b/rlberry/manager/experiment_manager.py @@ -219,7 +219,9 @@ class ExperimentManager: If 'unique', data is saved to ``output_dir/manager_data/`` If 'timestamp', data is saved to 
``output_dir/manager_data/`` default_writer_kwargs : dict - Optional arguments for :class:`~rlberry.utils.writers.DefaultWriter`. + Optional arguments for :class:`~rlberry.utils.writers.DefaultWriter`. Typically one may + want to change the log style with default_writer_kwargs set to {"style_log":"progressbar"} or + {"style_log":"one_line"} init_kwargs_per_instance : List[dict] (optional) List of length ``n_fit`` containing the params to initialize each of the ``n_fit`` agent instances. It can be useful if different instances diff --git a/rlberry/manager/remote_experiment_manager.py b/rlberry/manager/remote_experiment_manager.py deleted file mode 100644 index 38335e2f2..000000000 --- a/rlberry/manager/remote_experiment_manager.py +++ /dev/null @@ -1,235 +0,0 @@ -import base64 -import dill -import io - -import pandas as pd -import pathlib -import pickle -import zipfile -from typing import Any, Mapping, Optional -from rlberry.network import interface -from rlberry.network.client import BerryClient - - -import rlberry - -logger = rlberry.logger - - -class RemoteExperimentManager: - """ - Class to define a client that handles an ExperimentManager instance in a remote BerryServer. - - Parameters - ---------- - client: BerryClient - Client instance, to communicate with a BerryServer. - **kwargs: - Parameters for ExperimentManager instance. - Some parameters (as agent_class, train_env, eval_env) can be defined using a ResourceRequest. - """ - - def __init__( - self, - client: BerryClient, - **kwargs: Mapping[str, Any], - ): - if client: - self._client = client - - # Create a remote ExperimentManager object and keep reference to the filename - # in the server where the object was saved. - msg = self._client.send( - interface.Message.create( - command=interface.Command.AGENT_MANAGER_CREATE_INSTANCE, - params=kwargs, - data=None, - ) - ) - if msg.command == interface.Command.RAISE_EXCEPTION: - raise Exception(msg.message) - - self._remote_experiment_manager_filename = pathlib.Path( - msg.info["filename"] - ) - - # get useful attributes - self.agent_name = msg.info["agent_name"] - self.output_dir = pathlib.Path(msg.info["output_dir"]) # to save locally - - def set_client(self, client: BerryClient): - self._client = client - - @property - def remote_file(self): - return str(self._remote_experiment_manager_filename) - - def get_writer_data(self): - """ - * Calls get_writer_data() in the remote ExperimentManager and returns the result locally. - * If tensorboard data is available in the remote ExperimentManager, the data is zipped, - received locally and unzipped. - """ - msg = self._client.send( - interface.Message.create( - command=interface.Command.AGENT_MANAGER_GET_WRITER_DATA, - params=dict(filename=self.remote_file), - ) - ) - if msg.command == interface.Command.RAISE_EXCEPTION: - raise Exception(msg.message) - raw_data = msg.data["writer_data"] - writer_data = dict() - for idx in raw_data: - csv_content = raw_data[idx] - writer_data[idx] = pd.read_csv(io.StringIO(csv_content), sep=",") - - # check if tensorboard data was received - # If so, read file and unzip it. 
- tensorboard_bin_data = msg.data["tensorboard_bin_data"] - if tensorboard_bin_data is not None: - tensorboard_bin_data = base64.b64decode( - tensorboard_bin_data.encode("ascii") - ) - zip_file = open(self.output_dir / "tensorboard_data.zip", "wb") - zip_file.write(tensorboard_bin_data) - zip_file.close() - with zipfile.ZipFile( - self.output_dir / "tensorboard_data.zip", "r" - ) as zip_ref: - zip_ref.extractall(self.output_dir) - return writer_data - - def fit(self, budget=None, **kwargs): - msg = self._client.send( - interface.Message.create( - command=interface.Command.AGENT_MANAGER_FIT, - params=dict( - filename=self.remote_file, budget=budget, extra_params=kwargs - ), - data=None, - ) - ) - if msg.command == interface.Command.RAISE_EXCEPTION: - raise Exception(msg.message) - - def eval_agents(self, n_simulations: Optional[int] = None): - msg = self._client.send( - interface.Message.create( - command=interface.Command.AGENT_MANAGER_EVAL, - params=dict(filename=self.remote_file, n_simulations=n_simulations), - data=None, - ) - ) - if msg.command == interface.Command.RAISE_EXCEPTION: - raise Exception(msg.message) - out = msg.data["output"] - return out - - def clear_output_dir(self): - msg = self._client.send( - interface.Message.create( - command=interface.Command.AGENT_MANAGER_CLEAR_OUTPUT_DIR, - params=dict(filename=self.remote_file), - data=None, - ) - ) - if msg.command == interface.Command.RAISE_EXCEPTION: - raise Exception(msg.message) - - def clear_handlers(self): - msg = self._client.send( - interface.Message.create( - command=interface.Command.AGENT_MANAGER_CLEAR_HANDLERS, - params=dict(filename=self.remote_file), - data=None, - ) - ) - if msg.command == interface.Command.RAISE_EXCEPTION: - raise Exception(msg.message) - - def set_writer(self, idx, writer_fn, writer_kwargs=None): - """Note: Use ResourceRequest for writer_fn.""" - params = dict(idx=idx, writer_fn=writer_fn, writer_kwargs=writer_kwargs) - msg = self._client.send( - interface.Message.create( - command=interface.Command.AGENT_MANAGER_SET_WRITER, - params=dict(filename=self.remote_file, kwargs=params), - data=None, - ) - ) - if msg.command == interface.Command.RAISE_EXCEPTION: - raise Exception(msg.message) - - def optimize_hyperparams(self, **kwargs): - msg = self._client.send( - interface.Message.create( - command=interface.Command.AGENT_MANAGER_OPTIMIZE_HYPERPARAMS, - params=dict(filename=self.remote_file, kwargs=kwargs), - data=None, - ) - ) - if msg.command == interface.Command.RAISE_EXCEPTION: - raise Exception(msg.message) - best_params_dict = msg.data - return best_params_dict - - def save(self): - """ - Save RemoteExperimentManager data to self.output_dir. - - Returns - ------- - filename where the ExperimentManager object was saved. 
- """ - # use self.output_dir - output_dir = self.output_dir - - # create dir if it does not exist - output_dir.mkdir(parents=True, exist_ok=True) - - # save - filename = pathlib.Path("remote_manager_obj").with_suffix(".pickle") - filename = output_dir / filename - filename.parent.mkdir(parents=True, exist_ok=True) - try: - with filename.open("wb") as ff: - pickle.dump(self.__dict__, ff) - logger.info( - "Saved RemoteExperimentManager({}) using pickle.".format( - self.agent_name - ) - ) - except Exception: - try: - with filename.open("wb") as ff: - dill.dump(self.__dict__, ff) - logger.info( - "Saved RemoteExperimentManager({}) using dill.".format( - self.agent_name - ) - ) - except Exception as ex: - logger.warning( - "[RemoteExperimentManager] Instance cannot be pickled: " + str(ex) - ) - - return filename - - @classmethod - def load(cls, filename): - filename = pathlib.Path(filename).with_suffix(".pickle") - - obj = cls(None) - try: - with filename.open("rb") as ff: - tmp_dict = pickle.load(ff) - logger.info("Loaded RemoteExperimentManager using pickle.") - except Exception: - with filename.open("rb") as ff: - tmp_dict = dill.load(ff) - logger.info("Loaded RemoteExperimentManager using dill.") - - obj.__dict__.clear() - obj.__dict__.update(tmp_dict) - return obj diff --git a/rlberry/manager/tests/test_comparisons.py b/rlberry/manager/tests/test_comparisons.py index b95020653..a671c7c60 100644 --- a/rlberry/manager/tests/test_comparisons.py +++ b/rlberry/manager/tests/test_comparisons.py @@ -1,5 +1,5 @@ import pytest -from rlberry.envs import GridWorld +from rlberry_research.envs import GridWorld from rlberry.agents import AgentWithSimplePolicy from rlberry.manager import AgentManager from rlberry.manager import compare_agents diff --git a/rlberry/manager/tests/test_experiment_manager.py b/rlberry/manager/tests/test_experiment_manager.py index 63a489623..870d321c1 100644 --- a/rlberry/manager/tests/test_experiment_manager.py +++ b/rlberry/manager/tests/test_experiment_manager.py @@ -2,7 +2,7 @@ import numpy as np import sys import os -from rlberry.envs import GridWorld +from rlberry_research.envs import GridWorld from rlberry.agents import AgentWithSimplePolicy from rlberry.manager import ( ExperimentManager, @@ -379,3 +379,27 @@ def test_compress(): ) stats.fit() evaluate_agents([stats], show=False) + + +@pytest.mark.parametrize("style_log", ["multi_line", "one_line", "progressbar"]) +def test_logs(style_log): + # Define train and evaluation envs + train_env = (GridWorld, {}) + + # Parameters + params = dict(hyperparameter1=-1, hyperparameter2=lambda x: 42) + eval_kwargs = dict(eval_horizon=10) + + # Run ExperimentManager + stats = ExperimentManager( + DummyAgent, + train_env, + fit_budget=15, + eval_kwargs=eval_kwargs, + init_kwargs=params, + default_writer_kwargs={"style_log": style_log, "log_interval": 0}, + n_fit=3, + seed=123, + ) + stats.fit() + evaluate_agents([stats], show=False) diff --git a/rlberry/manager/tests/test_experiment_manager_seeding.py b/rlberry/manager/tests/test_experiment_manager_seeding.py index d0e5a317c..c1c647ba0 100644 --- a/rlberry/manager/tests/test_experiment_manager_seeding.py +++ b/rlberry/manager/tests/test_experiment_manager_seeding.py @@ -1,8 +1,8 @@ from rlberry.envs.tests.test_env_seeding import get_env_trajectory, compare_trajectories from rlberry.envs import gym_make -from rlberry.envs.classic_control import MountainCar +from rlberry_research.envs.classic_control import MountainCar from rlberry.manager import ExperimentManager, MultipleManagers 
-from rlberry.agents.torch import A2CAgent +from rlberry_research.agents.torch import A2CAgent import gymnasium as gym import pytest diff --git a/rlberry/manager/tests/test_hyperparam_optim.py b/rlberry/manager/tests/test_hyperparam_optim.py index 2803adbcd..5ee62d3d0 100644 --- a/rlberry/manager/tests/test_hyperparam_optim.py +++ b/rlberry/manager/tests/test_hyperparam_optim.py @@ -1,6 +1,6 @@ -from rlberry.envs import GridWorld +from rlberry_research.envs import GridWorld from rlberry.agents import AgentWithSimplePolicy -from rlberry.agents.dynprog.value_iteration import ValueIterationAgent +from rlberry_scool.agents.dynprog.value_iteration import ValueIterationAgent from rlberry.manager import ExperimentManager from optuna.samplers import TPESampler import numpy as np @@ -86,7 +86,7 @@ def test_hyperparam_optim_random(parallelization, custom_eval_function, fit_frac DummyAgent, train_env, init_kwargs={}, - fit_budget=1, + fit_budget=50, eval_kwargs={"eval_horizon": 5}, n_fit=3, parallelization=parallelization, @@ -97,6 +97,7 @@ def test_hyperparam_optim_random(parallelization, custom_eval_function, fit_frac stats_agent.optimize_hyperparams( sampler_method="random", n_trials=3, + timeout=0.5, optuna_parallelization=parallelization, custom_eval_function=custom_eval_function, fit_fraction=fit_fraction, diff --git a/rlberry/manager/tests/test_plot.py b/rlberry/manager/tests/test_plot.py index 70533ac60..fe0acab86 100644 --- a/rlberry/manager/tests/test_plot.py +++ b/rlberry/manager/tests/test_plot.py @@ -6,9 +6,9 @@ import sys from rlberry.wrappers import WriterWrapper -from rlberry.envs import GridWorld +from rlberry_research.envs import GridWorld from rlberry.manager import plot_writer_data, ExperimentManager, read_writer_data -from rlberry.agents import UCBVIAgent +from rlberry_scool.agents import UCBVIAgent class VIAgent(UCBVIAgent): diff --git a/rlberry/network/__init__.py b/rlberry/network/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/network/client.py b/rlberry/network/client.py deleted file mode 100644 index 32d07177d..000000000 --- a/rlberry/network/client.py +++ /dev/null @@ -1,53 +0,0 @@ -import pprint -import socket -import json -from typing import List, Union -from rlberry.network import interface -from rlberry.network.utils import serialize_message - - -class BerryClient: - """ - rlberry client - - For now, works only on Linux systems - - Parameters - ---------- - host : - hostname, IP address or empty string. 
- port : int - Integer from 1-65535 - """ - - def __init__( - self, - host="127.0.0.1", - port: int = 65432, - ) -> None: - assert port >= 1 and port <= 65535 - self._host = host - self._port = port - - def send( - self, - *messages: interface.Message, - print_response: bool = False, - ) -> Union[List[interface.Message], interface.Message]: - returned_messages = [] - pp = pprint.PrettyPrinter(indent=4) - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.connect((self._host, self._port)) - for msg in messages: - msg_bytes = serialize_message(msg) - interface.send_data(s, msg_bytes) - received_bytes = interface.receive_data(s) - received_msg_dict = json.loads(received_bytes) - if print_response: - pp.pprint(received_msg_dict) - received_msg = interface.Message.from_dict(received_msg_dict) - returned_messages.append(received_msg) - - if len(messages) == 1: - return returned_messages[0] - return returned_messages diff --git a/rlberry/network/interface.py b/rlberry/network/interface.py deleted file mode 100644 index 929a3f366..000000000 --- a/rlberry/network/interface.py +++ /dev/null @@ -1,103 +0,0 @@ -import struct -from typing import Any, Dict, Mapping, NamedTuple, Optional - - -REQUEST_PREFIX = "ResourceRequest_" - - -class Command: - NONE = "NONE" - RAISE_EXCEPTION = "RAISE_EXCEPTION" - ECHO = "ECHO" - LIST_RESOURCES = "LIST_RESOURCES" - AGENT_MANAGER_CREATE_INSTANCE = "AGENT_MANAGER_CREATE_INSTANCE" - AGENT_MANAGER_FIT = "AGENT_MANAGER_FIT" - AGENT_MANAGER_EVAL = "AGENT_MANAGER_EVAL" - AGENT_MANAGER_CLEAR_OUTPUT_DIR = "AGENT_MANAGER_CLEAR_OUTPUT_DIR" - AGENT_MANAGER_CLEAR_HANDLERS = "AGENT_MANAGER_CLEAR_HANDLERS" - AGENT_MANAGER_SET_WRITER = "AGENT_MANAGER_SET_WRITER" - AGENT_MANAGER_OPTIMIZE_HYPERPARAMS = "AGENT_MANAGER_OPTIMIZE_HYPERPARAMS" - AGENT_MANAGER_GET_WRITER_DATA = "AGENT_MANAGER_GET_WRITER_DATA" - - -class BerryServerInfo: - host: str - port: int - - -class Message(NamedTuple): - message: Optional[str] = "" - command: Optional[Command] = None - params: Optional[Mapping[str, Any]] = None - data: Optional[Mapping[str, Any]] = None - info: Optional[Mapping[str, Any]] = None - - def to_dict(self): - return self._asdict() - - @classmethod - def create( - cls, - message: Optional[str] = "", - command: Optional[Command] = None, - params: Optional[Mapping[str, Any]] = None, - data: Optional[Mapping[str, Any]] = None, - info: Optional[Mapping[str, Any]] = None, - ): - command = command or "" - params = params or dict() - data = data or dict() - info = info or dict() - return cls( - message=message, - command=command, - params=params, - data=data, - info=info, - ) - - @classmethod - def from_dict(cls, dict_message): - return cls(**dict_message) - - -class ResourceItem(Dict): - obj: Any - description: str - - -Resources = Mapping[str, ResourceItem] - - -class ResourceRequest(NamedTuple): - name: str = "" - kwargs: Optional[Mapping[str, Any]] = None - - -def next_power_of_two(x: int): - return 1 << (x - 1).bit_length() - - -def send_data(socket, data): - """ - adapted from: https://stackoverflow.com/a/63532988 - """ - print(f"[rlberry.network] sending {len(data)} bytes...") - socket.sendall(struct.pack(">I", len(data)) + data) - - -def receive_data(socket): - """ - adapted from: https://stackoverflow.com/a/63532988 - """ - data_size_packed = socket.recv(4) - if not data_size_packed: - return data_size_packed - data_size = struct.unpack(">I", data_size_packed)[0] - received_data = b"" - remaining_size = min(next_power_of_two(data_size), 4096) - while remaining_size > 0: - 
received_data += socket.recv(remaining_size) - remaining_size = data_size - len(received_data) - print(f"[rlberry.network] ... received {len(received_data)}/{data_size} bytes.") - return received_data diff --git a/rlberry/network/server.py b/rlberry/network/server.py deleted file mode 100644 index e40bd632d..000000000 --- a/rlberry/network/server.py +++ /dev/null @@ -1,174 +0,0 @@ -import concurrent.futures -import logging -import multiprocessing -import socket -import json -import rlberry.network.server_utils as server_utils -from rlberry.network import interface -from rlberry.network.utils import ( - apply_fn_to_tree, - map_request_to_obj, - serialize_message, -) -from rlberry.envs import gym_make -from typing import Optional - - -import rlberry - -logger = rlberry.logger - - -class ClientHandler: - def __init__(self, client_socket, client_address, resources, timeout): - self._socket = client_socket - self._address = client_address - self._resources = resources - self._logger = logging.getLogger("ClientHandler") - self._timeout = timeout - - def _process_message(self, message: interface.Message): - """Replace resource requests in 'message' by available resources.""" - message = message.to_dict() - message = apply_fn_to_tree( - lambda key, val: map_request_to_obj(key, val, self._resources), - message, - apply_to_nodes=True, - ) - return interface.Message.from_dict(message) - - def _execute_message(self, message: interface.Message): - """Execute command in message and send response.""" - self._socket.settimeout(self._timeout) - try: - # Execute commands - response = server_utils.execute_message(message, self._resources) - # Send response - interface.send_data(self._socket, serialize_message(response)) - except Exception as ex: - response = interface.Message.create( - command=interface.Command.RAISE_EXCEPTION, message=str(ex) - ) - interface.send_data(self._socket, serialize_message(response)) - return 1 - return 0 - - def run(self): - with self._socket: - try: - while True: - print( - f"\n Handling client @ {self._address}" - ) - self._socket.settimeout(self._timeout) - message_bytes = interface.receive_data(self._socket) - if not message_bytes: - break - # process bytes - message = interface.Message.from_dict(json.loads(message_bytes)) - message = self._process_message(message) - print(f" Received message: \n{message}") - # execute message commands and send back a response - self._execute_message(message) - except Exception as ex: - print(f" [ERROR]: {ex}") - self._logger.exception(ex) - finally: - print(f" Finished client @ {self._address}") - - -class BerryServer: - """ - rlberry server - - Parameters - ---------- - host : - hostname, IP address or empty string. - port : int - Integer from 1 to 65535. - backlog : int - Number of unnaccepted connections allowed before refusing new connections. - resources : Resources - List of resources that can be requested by client. - client_socket_timeout : float, default: 120 - Timeout (in seconds) for client socket operations. - terminate_after : int - Number of received client sockets after which to terminate the server. If None, - does not terminate. 
- """ - - def __init__( - self, - host="127.0.0.1", - port: int = 65432, - backlog: int = 5, - resources: Optional[interface.Resources] = None, - client_socket_timeout: float = 120.0, - terminate_after: Optional[int] = None, - ) -> None: - assert port >= 1 and port <= 65535 - self._host = host - self._port = port - self._backlog = backlog - - self._resources = resources - self._client_socket_timeout = client_socket_timeout - self._terminate_after = terminate_after - self._client_socket_counter = 0 - - # Define basic resources - if resources is None: - self._resources = dict( - gym_make=interface.ResourceItem(obj=gym_make, description="gym_make"), - ) - else: - for _, val in resources.items(): - if set(val.keys()) != set(["obj", "description"]): - raise ValueError( - "resources items must be a dictionary with keys ['obj', 'description']." - f" Received: {list(val.keys())}" - ) - - def start(self): - print( - f"\n\nStarting BerryServer @ (host, port) = ({self._host}, {self._port}).\n\n" - ) - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind((self._host, self._port)) - s.listen(self._backlog) - with concurrent.futures.ProcessPoolExecutor( - mp_context=multiprocessing.get_context("spawn") - ) as executor: - futures = [] - while True: - print( - f" BerryServer({self._host}, {self._port}): waiting for connection..." - ) - client_socket, client_address = s.accept() # wait for connection - self._client_socket_counter += 1 - client_handler = ClientHandler( - client_socket, - client_address, - self._resources, - self._client_socket_timeout, - ) - print( - f" BerryServer({self._host}, {self._port}): " - f"new client @ {client_address}" - ) - futures.append(executor.submit(client_handler.run)) - if ( - self._terminate_after - and self._client_socket_counter >= self._terminate_after - ): - print( - " Terminating server (main process): " - "reached max number of client sockets." 
- ) - break - - -if __name__ == "__main__": - server = BerryServer() - server.start() diff --git a/rlberry/network/server_utils.py b/rlberry/network/server_utils.py deleted file mode 100644 index 75922a83f..000000000 --- a/rlberry/network/server_utils.py +++ /dev/null @@ -1,118 +0,0 @@ -import pathlib -from rlberry.network import interface -from rlberry.manager import ExperimentManager -from rlberry import metadata_utils -import rlberry.utils.io -import base64 - - -def execute_message( - message: interface.Message, resources: interface.Resources -) -> interface.Message: - response = interface.Message.create(command=interface.Command.ECHO) - # LIST_RESOURCES - if message.command == interface.Command.LIST_RESOURCES: - info = {} - for rr in resources: - info[rr] = resources[rr]["description"] - response = interface.Message.create(info=info) - # AGENT_MANAGER_CREATE_INSTANCE - elif message.command == interface.Command.AGENT_MANAGER_CREATE_INSTANCE: - params = message.params - base_dir = pathlib.Path(metadata_utils.RLBERRY_DEFAULT_DATA_DIR) - if "output_dir" in params: - params["output_dir"] = base_dir / "server_data" / params["output_dir"] - else: - params["output_dir"] = base_dir / "server_data/" - experiment_manager = ExperimentManager(**params) - filename = str(experiment_manager.save()) - response = interface.Message.create( - info=dict( - filename=filename, - agent_name=experiment_manager.agent_name, - output_dir=str(experiment_manager.output_dir).replace( - "server_data/", "client_data/" - ), - ) - ) - del experiment_manager - # AGENT_MANAGER_FIT - elif message.command == interface.Command.AGENT_MANAGER_FIT: - filename = message.params["filename"] - budget = message.params["budget"] - extra_params = message.params["extra_params"] - experiment_manager = ExperimentManager.load(filename) - experiment_manager.fit(budget, **extra_params) - experiment_manager.save() - response = interface.Message.create(command=interface.Command.ECHO) - del experiment_manager - # AGENT_MANAGER_EVAL - elif message.command == interface.Command.AGENT_MANAGER_EVAL: - filename = message.params["filename"] - experiment_manager = ExperimentManager.load(filename) - eval_output = experiment_manager.eval_agents(message.params["n_simulations"]) - response = interface.Message.create(data=dict(output=eval_output)) - del experiment_manager - # AGENT_MANAGER_CLEAR_OUTPUT_DIR - elif message.command == interface.Command.AGENT_MANAGER_CLEAR_OUTPUT_DIR: - filename = message.params["filename"] - experiment_manager = ExperimentManager.load(filename) - experiment_manager.clear_output_dir() - response = interface.Message.create( - message=f"Cleared output dir: {experiment_manager.output_dir}" - ) - del experiment_manager - # AGENT_MANAGER_CLEAR_HANDLERS - elif message.command == interface.Command.AGENT_MANAGER_CLEAR_HANDLERS: - filename = message.params["filename"] - experiment_manager = ExperimentManager.load(filename) - experiment_manager.clear_handlers() - experiment_manager.save() - response = interface.Message.create(message=f"Cleared handlers: {filename}") - del experiment_manager - # AGENT_MANAGER_SET_WRITER - elif message.command == interface.Command.AGENT_MANAGER_SET_WRITER: - filename = message.params["filename"] - experiment_manager = ExperimentManager.load(filename) - experiment_manager.set_writer(**message.params["kwargs"]) - experiment_manager.save() - del experiment_manager - # AGENT_MANAGER_OPTIMIZE_HYPERPARAMS - elif message.command == interface.Command.AGENT_MANAGER_OPTIMIZE_HYPERPARAMS: - filename = 
message.params["filename"] - experiment_manager = ExperimentManager.load(filename) - best_params_dict = experiment_manager.optimize_hyperparams( - **message.params["kwargs"] - ) - experiment_manager.save() - del experiment_manager - response = interface.Message.create(data=best_params_dict) - # AGENT_MANAGER_GET_WRITER_DATA - elif message.command == interface.Command.AGENT_MANAGER_GET_WRITER_DATA: - # writer scalar data - filename = message.params["filename"] - experiment_manager = ExperimentManager.load(filename) - writer_data = experiment_manager.get_writer_data() - writer_data = writer_data or dict() - for idx in writer_data: - writer_data[idx] = writer_data[idx].to_csv(index=False) - # tensoboard data - tensorboard_bin_data = None - if experiment_manager.tensorboard_dir is not None: - tensorboard_zip_file = rlberry.utils.io.zipdir( - experiment_manager.tensorboard_dir, - experiment_manager.output_dir / "tensorboard_data.zip", - ) - if tensorboard_zip_file is not None: - tensorboard_bin_data = open(tensorboard_zip_file, "rb").read() - tensorboard_bin_data = base64.b64encode(tensorboard_bin_data).decode( - "ascii" - ) - response = interface.Message.create( - data=dict( - writer_data=writer_data, tensorboard_bin_data=tensorboard_bin_data - ) - ) - del experiment_manager - # end - return response diff --git a/rlberry/network/tests/__init__.py b/rlberry/network/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/network/tests/conftest.py b/rlberry/network/tests/conftest.py deleted file mode 100644 index 91ffaff1f..000000000 --- a/rlberry/network/tests/conftest.py +++ /dev/null @@ -1,43 +0,0 @@ -# content of conftest.py -# This file is used to spawn a server to connect to in the tests from test_server.py - -import multiprocessing - -from rlberry.network.interface import ResourceItem -from rlberry.network.server import BerryServer -from rlberry.agents import ValueIterationAgent -from rlberry.agents.torch import REINFORCEAgent -from rlberry.envs import GridWorld, gym_make -from rlberry.utils.writers import DefaultWriter - -import sys - - -def print_err(s): - sys.stderr.write(s) - sys.stderr.flush() - - -def server(port): - # definition of server - resources = dict( - GridWorld=ResourceItem(obj=GridWorld, description="GridWorld constructor"), - gym_make=ResourceItem(obj=gym_make, description="gym_make"), - REINFORCEAgent=ResourceItem(obj=REINFORCEAgent, description="REINFORCEAgent"), - ValueIterationAgent=ResourceItem( - obj=ValueIterationAgent, - description="ValueIterationAgent constructor" + ValueIterationAgent.__doc__, - ), - DefaultWriter=ResourceItem( - obj=DefaultWriter, description="rlberry default writer" - ), - ) - server = BerryServer(resources=resources, port=port, client_socket_timeout=120.0) - server.start() - - -if __name__ == "__main__": - default_port = 4242 - p = multiprocessing.Process(target=server, args=(default_port,)) - p.start() - print_err("Server startup completed!") diff --git a/rlberry/network/tests/test_server.py b/rlberry/network/tests/test_server.py deleted file mode 100644 index 8f3bf3ca1..000000000 --- a/rlberry/network/tests/test_server.py +++ /dev/null @@ -1,91 +0,0 @@ -import sys - -import py -import pytest -from xprocess import ProcessStarter -import numpy as np - -from rlberry.network.client import BerryClient -from rlberry.network import interface -from rlberry.network.interface import Message, ResourceRequest -from rlberry.manager import RemoteExperimentManager -from rlberry.manager.evaluation import evaluate_agents 
- -server_name = "berry" - - -@pytest.fixture(autouse=True) -def start_server(xprocess): - python_executable_full_path = sys.executable - python_server_script_full_path = py.path.local(__file__).dirpath("conftest.py") - - class Starter(ProcessStarter): - pattern = "completed" - args = [python_executable_full_path, python_server_script_full_path] - - xprocess.ensure(server_name, Starter) - yield - xprocess.getinfo(server_name).terminate() - - -def test_client(): - port = 4242 - client = BerryClient(port=port) - # Send params for ExperimentManager - client.send( - Message.create( - command=interface.Command.AGENT_MANAGER_CREATE_INSTANCE, - params=dict( - agent_class=ResourceRequest(name="ValueIterationAgent"), - train_env=ResourceRequest(name="GridWorld", kwargs=dict(nrows=3)), - fit_budget=2, - init_kwargs=dict(gamma=0.95), - eval_kwargs=dict(eval_horizon=2, n_simulations=2), - n_fit=2, - seed=10, - ), - data=None, - ), - Message.create( - command=interface.Command.LIST_RESOURCES, params=dict(), data=dict() - ), - ) - - client.send( - Message.create( - command=interface.Command.NONE, - params=dict(), - data=dict(big_list=list(1.0 * np.arange(2**4))), - ), - print_response=True, - ) - - -def test_remote_manager(): - port = 4242 - client = BerryClient(port=port) - remote_manager = RemoteExperimentManager( - client, - agent_class=ResourceRequest(name="REINFORCEAgent"), - train_env=ResourceRequest(name="gym_make", kwargs=dict(id="CartPole-v1")), - fit_budget=10, - init_kwargs=dict(gamma=0.99), - eval_kwargs=dict(eval_horizon=2, n_simulations=2), - n_fit=2, - agent_name="REINFORCE(remote)", - ) - remote_manager.set_writer( - idx=0, - writer_fn=ResourceRequest(name="DefaultWriter"), - writer_kwargs=dict(name="debug_reinforce_writer"), - ) - - # Optimize hyperparams of remote agent - best_params = remote_manager.optimize_hyperparams(timeout=1) - print(f"best params = {best_params}") - - fname1 = remote_manager.save() - del remote_manager - remote_manager = RemoteExperimentManager.load(fname1) - remote_manager.fit(3) - evaluate_agents([remote_manager], n_simulations=2, show=False) diff --git a/rlberry/network/utils.py b/rlberry/network/utils.py deleted file mode 100644 index 67e2ae1f7..000000000 --- a/rlberry/network/utils.py +++ /dev/null @@ -1,83 +0,0 @@ -import json -from copy import deepcopy -from rlberry.network import interface -from typing import Any, Callable, Mapping, Optional, Tuple, Union - - -Tree = Union[Any, Tuple, Mapping[Any, "Tree"]] - - -def apply_fn_to_tree( - fn: Callable[[Any, Any], Tuple[Any, Any]], - tree: Tree, - is_leaf: Optional[Callable[[Any], Any]] = None, - apply_to_nodes: Optional[bool] = False, -): - """ - new_key, new_val = fn(key, my_dict[key]) - """ - is_leaf = is_leaf or ( - lambda x: not isinstance(x, Mapping) and not isinstance(x, Tuple) - ) - if is_leaf(tree): - return deepcopy(tree) - if isinstance(tree, Mapping): - new_tree = dict() - keys = list(tree.keys()) - for key in keys: - new_tree[key] = tree[key] - if apply_to_nodes or is_leaf(tree[key]): - new_key, new_val = fn(key, tree[key]) - new_tree.pop(key) - new_tree[new_key] = new_val - return { - key: apply_fn_to_tree(fn, val, is_leaf, apply_to_nodes) - for (key, val) in new_tree.items() - } - elif isinstance(tree, Tuple): - return tuple( - [apply_fn_to_tree(fn, val, is_leaf, apply_to_nodes) for val in tree] - ) - else: - raise RuntimeError("Tree is not a Mapping or Tuple.") - - -def _map_resource_request_to_dict(key, val): - if isinstance(val, interface.ResourceRequest): - assert isinstance(key, str) - 
new_key = interface.REQUEST_PREFIX + key - new_val = val._asdict() - return new_key, new_val - return key, val - - -def map_request_to_obj(key, val, resources: interface.Resources): - if key.startswith(interface.REQUEST_PREFIX): - new_key = key[len(interface.REQUEST_PREFIX) :] - resource_name = val["name"] - try: - resource_kwargs = val["kwargs"] - except KeyError: - resource_kwargs = None - if resource_name in resources: - if resource_kwargs: - new_val = (resources[resource_name]["obj"], resource_kwargs) - else: - new_val = resources[resource_name]["obj"] - return new_key, new_val - else: - raise RuntimeError(f"Unavailable requested resource: {resource_name}") - else: - return key, val - - -def serialize_message(message: interface.Message) -> bytes: - message = message.to_dict() - message = apply_fn_to_tree( - _map_resource_request_to_dict, message, apply_to_nodes=True - ) - - def default(obj): - return f"<>" - - return str.encode(json.dumps(message, default=default)) diff --git a/rlberry/rendering/__init__.py b/rlberry/rendering/__init__.py deleted file mode 100644 index 5bdd0e295..000000000 --- a/rlberry/rendering/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .core import Scene, GeometricPrimitive -from .render_interface import RenderInterface -from .render_interface import RenderInterface2D diff --git a/rlberry/rendering/common_shapes.py b/rlberry/rendering/common_shapes.py deleted file mode 100644 index 91f942c14..000000000 --- a/rlberry/rendering/common_shapes.py +++ /dev/null @@ -1,39 +0,0 @@ -import numpy as np -from rlberry.rendering import GeometricPrimitive - - -def bar_shape(p0, p1, width): - shape = GeometricPrimitive("QUADS") - - x0, y0 = p0 - x1, y1 = p1 - - direction = np.array([x1 - x0, y1 - y0]) - norm = np.sqrt((direction * direction).sum()) - direction = direction / norm - - # get vector perpendicular to direction - u_vec = np.zeros(2) - u_vec[0] = -direction[1] - u_vec[1] = direction[0] - - u_vec = u_vec * width / 2 - - shape.add_vertex((x0 + u_vec[0], y0 + u_vec[1])) - shape.add_vertex((x0 - u_vec[0], y0 - u_vec[1])) - shape.add_vertex((x1 - u_vec[0], y1 - u_vec[1])) - shape.add_vertex((x1 + u_vec[0], y1 + u_vec[1])) - return shape - - -def circle_shape(center, radius, n_points=50): - shape = GeometricPrimitive("POLYGON") - - x0, y0 = center - theta = np.linspace(0.0, 2 * np.pi, n_points) - for tt in theta: - xx = radius * np.cos(tt) - yy = radius * np.sin(tt) - shape.add_vertex((x0 + xx, y0 + yy)) - - return shape diff --git a/rlberry/rendering/core.py b/rlberry/rendering/core.py deleted file mode 100644 index 0cc5e92ef..000000000 --- a/rlberry/rendering/core.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -Provide classes for geometric primitives in OpenGL and scenes. -""" - - -class Scene: - """ - Class representing a scene, which is a vector of GeometricPrimitive objects - """ - - def __init__(self): - self.shapes = [] - - def add_shape(self, shape): - self.shapes.append(shape) - - -class GeometricPrimitive: - """ - Class representing an OpenGL geometric primitive. 
- - Primitive type (GL_LINE_LOOP by defaut) - - If using OpenGLRender2D, one of the following: - POINTS - LINES - LINE_STRIP - LINE_LOOP - POLYGON - TRIANGLES - TRIANGLE_STRIP - TRIANGLE_FAN - QUADS - QUAD_STRIP - - If using PyGameRender2D: - POLYGON - - - TODO: Add support to more pygame shapes, - see https://www.pygame.org/docs/ref/draw.html - """ - - def __init__(self, primitive_type="GL_LINE_LOOP"): - # primitive type - self.type = primitive_type - # color in RGB - self.color = (0.25, 0.25, 0.25) - # list of vertices. each vertex is a tuple with coordinates in space - self.vertices = [] - - def add_vertex(self, vertex): - self.vertices.append(vertex) - - def set_color(self, color): - self.color = color diff --git a/rlberry/rendering/opengl_render2d.py b/rlberry/rendering/opengl_render2d.py deleted file mode 100644 index 64ec79646..000000000 --- a/rlberry/rendering/opengl_render2d.py +++ /dev/null @@ -1,252 +0,0 @@ -""" -OpenGL code for 2D rendering, using pygame. -""" - -import numpy as np -from os import environ - -from rlberry.rendering import Scene - -import rlberry - -logger = rlberry.logger -environ["PYGAME_HIDE_SUPPORT_PROMPT"] = "1" - -_IMPORT_SUCESSFUL = True -_IMPORT_ERROR_MSG = "" -try: - import pygame as pg - from pygame.locals import DOUBLEBUF, OPENGL - - from OpenGL.GLU import gluOrtho2D - from OpenGL.GL import glMatrixMode, glLoadIdentity, glClearColor - from OpenGL.GL import glClear, glFlush, glBegin, glEnd - from OpenGL.GL import glColor3f, glVertex2f - from OpenGL.GL import glReadBuffer, glReadPixels - from OpenGL.GL import GL_PROJECTION, GL_COLOR_BUFFER_BIT - from OpenGL.GL import GL_POINTS, GL_LINES, GL_LINE_STRIP, GL_LINE_LOOP - from OpenGL.GL import GL_POLYGON, GL_TRIANGLES, GL_TRIANGLE_STRIP - from OpenGL.GL import GL_TRIANGLE_FAN, GL_QUADS, GL_QUAD_STRIP - from OpenGL.GL import GL_FRONT, GL_RGB, GL_UNSIGNED_BYTE - -except Exception as ex: - _IMPORT_SUCESSFUL = False - _IMPORT_ERROR_MSG = str(ex) - - -class OpenGLRender2D: - """ - Class to render a list of scenes using OpenGL and pygame. 
- """ - - def __init__(self): - # parameters - self.window_width = 800 - self.window_height = 800 # multiples of 16 are preferred - self.background_color = (0.6, 0.75, 1.0) - self.refresh_interval = 50 - self.window_name = "rlberry render" - self.clipping_area = (-1.0, 1.0, -1.0, 1.0) - - # time counter - self.time_count = 0 - - # background scene - self.background = Scene() - # data to be rendered (list of scenes) - self.data = [] - - def set_window_name(self, name): - self.window_name = name - - def set_refresh_interval(self, interval): - self.refresh_interval = interval - - def set_clipping_area(self, area): - """ - The clipping area is tuple with elements (left, right, bottom, top) - Default = (-1.0, 1.0, -1.0, 1.0) - """ - self.clipping_area = area - base_size = max(self.window_width, self.window_height) - width_range = area[1] - area[0] - height_range = area[3] - area[2] - base_range = max(width_range, height_range) - width_range /= base_range - height_range /= base_range - self.window_width = int(base_size * width_range) - self.window_height = int(base_size * height_range) - - # width and height must be divisible by 2 - if self.window_width % 2 == 1: - self.window_width += 1 - if self.window_height % 2 == 1: - self.window_height += 1 - - def set_data(self, data): - self.data = data - - def set_background(self, background): - self.background = background - - def initGL(self): - """ - initialize GL - """ - glMatrixMode(GL_PROJECTION) - glLoadIdentity() - gluOrtho2D( - self.clipping_area[0], - self.clipping_area[1], - self.clipping_area[2], - self.clipping_area[3], - ) - - def display(self): - """ - Callback function, handler for window re-paint - """ - # Set background color (clear background) - glClearColor( - self.background_color[0], - self.background_color[1], - self.background_color[2], - 1.0, - ) - glClear(GL_COLOR_BUFFER_BIT) - - # Display background - for shape in self.background.shapes: - self.draw_geometric2d(shape) - - # Display objects - if len(self.data) > 0: - idx = self.time_count % len(self.data) - for shape in self.data[idx].shapes: - self.draw_geometric2d(shape) - - self.time_count += 1 - glFlush() - - @staticmethod - def draw_geometric2d(shape): - """ - Draw a 2D shape, of type GeometricPrimitive - """ - if shape.type == "POINTS": - glBegin(GL_POINTS) - elif shape.type == "LINES": - glBegin(GL_LINES) - elif shape.type == "LINE_STRIP": - glBegin(GL_LINE_STRIP) - elif shape.type == "LINE_LOOP": - glBegin(GL_LINE_LOOP) - elif shape.type == "POLYGON": - glBegin(GL_POLYGON) - elif shape.type == "TRIANGLES": - glBegin(GL_TRIANGLES) - elif shape.type == "TRIANGLE_STRIP": - glBegin(GL_TRIANGLE_STRIP) - elif shape.type == "TRIANGLE_FAN": - glBegin(GL_TRIANGLE_FAN) - elif shape.type == "QUADS": - glBegin(GL_QUADS) - elif shape.type == "QUAD_STRIP": - glBegin(GL_QUAD_STRIP) - else: - logger.error("Invalid type for geometric primitive!") - raise NameError - - # set color - glColor3f(shape.color[0], shape.color[1], shape.color[2]) - - # create vertices - for vertex in shape.vertices: - glVertex2f(vertex[0], vertex[1]) - glEnd() - - def run_graphics(self, loop=True): - """ - Sequentially displays scenes in self.data - - If loop is True, keep rendering until user closes the window. 
- """ - global _IMPORT_SUCESSFUL - - if _IMPORT_SUCESSFUL: - pg.init() - display = (self.window_width, self.window_height) - pg.display.set_mode(display, DOUBLEBUF | OPENGL) - pg.display.set_caption(self.window_name) - self.initGL() - while True: - for event in pg.event.get(): - if event.type == pg.QUIT: - pg.quit() - return - # - self.display() - # - pg.display.flip() - pg.time.wait(self.refresh_interval) - - # if not loop, stop - if not loop: - pg.quit() - return - else: - logger.error( - f"Not possible to render the environment due to the following error: {_IMPORT_ERROR_MSG}" - ) - return - - def get_gl_image_str(self): - # see https://gist.github.com/Jerdak/7364746 - glReadBuffer(GL_FRONT) - pixels = glReadPixels( - 0, 0, self.window_width, self.window_height, GL_RGB, GL_UNSIGNED_BYTE - ) - return pixels - - def get_video_data(self): - """ - Stores scenes in self.data in a list of numpy arrays that can be used - to save a video. - """ - global _IMPORT_SUCESSFUL - - if _IMPORT_SUCESSFUL: - video_data = [] - - pg.init() - display = (self.window_width, self.window_height) - _ = pg.display.set_mode(display, DOUBLEBUF | OPENGL) - pg.display.set_caption(self.window_name) - self.initGL() - - self.time_count = 0 - while self.time_count <= len(self.data): - # - self.display() - # - pg.display.flip() - - # - # See https://stackoverflow.com/a/42754578/5691288 - # - string_image = self.get_gl_image_str() - temp_surf = pg.image.frombytes( - string_image, (self.window_width, self.window_height), "RGB" - ) - tmp_arr = pg.surfarray.array3d(temp_surf) - imgdata = np.moveaxis(tmp_arr, 0, 1) - imgdata = np.flipud(imgdata) - video_data.append(imgdata) - - pg.quit() - return video_data - else: - logger.error( - f"Not possible to render the environment due to the following error: {_IMPORT_ERROR_MSG}" - ) - return [] diff --git a/rlberry/rendering/pygame_render2d.py b/rlberry/rendering/pygame_render2d.py deleted file mode 100644 index a8d5b3990..000000000 --- a/rlberry/rendering/pygame_render2d.py +++ /dev/null @@ -1,197 +0,0 @@ -""" -Code for 2D rendering, using pygame (without OpenGL) -""" - -import numpy as np -from os import environ - -from rlberry.rendering import Scene - -import rlberry - -logger = rlberry.logger - -environ["PYGAME_HIDE_SUPPORT_PROMPT"] = "1" - -_IMPORT_SUCESSFUL = True -_IMPORT_ERROR_MSG = "" -try: - import pygame as pg - -except Exception as ex: - _IMPORT_SUCESSFUL = False - _IMPORT_ERROR_MSG = str(ex) - - -class PyGameRender2D: - """Class to render a list of scenes using pygame.""" - - def __init__(self): - # parameters - self.window_width = 800 - self.window_height = 800 # multiples of 16 are preferred - self.background_color = (150, 190, 255) - self.refresh_interval = 50 - self.window_name = "rlberry render" - self.clipping_area = (-1.0, 1.0, -1.0, 1.0) - - # time counter - self.time_count = 0 - - # background scene - self.background = Scene() - # data to be rendered (list of scenes) - self.data = [] - - def set_window_name(self, name): - self.window_name = name - - def set_refresh_interval(self, interval): - self.refresh_interval = interval - - def set_clipping_area(self, area): - """ - The clipping area is tuple with elements (left, right, bottom, top) - Default = (-1.0, 1.0, -1.0, 1.0) - """ - self.clipping_area = area - base_size = max(self.window_width, self.window_height) - width_range = area[1] - area[0] - height_range = area[3] - area[2] - base_range = max(width_range, height_range) - width_range /= base_range - height_range /= base_range - self.window_width = 
int(base_size * width_range) - self.window_height = int(base_size * height_range) - - # width and height must be divisible by 2 - if self.window_width % 2 == 1: - self.window_width += 1 - if self.window_height % 2 == 1: - self.window_height += 1 - - def set_data(self, data): - self.data = data - - def set_background(self, background): - self.background = background - - def display(self): - """ - Callback function, handler for window re-paint - """ - # Set background color (clear background) - self.screen.fill(self.background_color) - - # Display background - for shape in self.background.shapes: - self.draw_geometric2d(shape) - - # Display objects - if len(self.data) > 0: - idx = self.time_count % len(self.data) - for shape in self.data[idx].shapes: - self.draw_geometric2d(shape) - - self.time_count += 1 - - def draw_geometric2d(self, shape): - """ - Draw a 2D shape, of type GeometricPrimitive - """ - if shape.type in ["POLYGON"]: - area = self.clipping_area - width_range = area[1] - area[0] - height_range = area[3] - area[2] - - vertices = [] - for vertex in shape.vertices: - xx = vertex[0] * self.window_width / width_range - yy = vertex[1] * self.window_height / height_range - - # put origin at bottom left instead of top left - yy = self.window_height - yy - - pg_vertex = (xx, yy) - vertices.append(pg_vertex) - - color = (255 * shape.color[0], 255 * shape.color[1], 255 * shape.color[2]) - pg.draw.polygon(self.screen, color, vertices) - - else: - raise NotImplementedError( - "Shape type %s not implemented in pygame renderer." % shape.type - ) - - def run_graphics(self, loop=True): - """ - Sequentially displays scenes in self.data - """ - global _IMPORT_SUCESSFUL - - if _IMPORT_SUCESSFUL: - pg.init() - display = (self.window_width, self.window_height) - self.screen = pg.display.set_mode(display) - pg.display.set_caption(self.window_name) - while True: - for event in pg.event.get(): - if event.type == pg.QUIT: - pg.quit() - return - # - self.display() - # - pg.display.flip() - pg.time.wait(self.refresh_interval) - - # if not loop, stop - if not loop: - pg.quit() - return - else: - logger.error( - f"Not possible to render the environment due to the following error: {_IMPORT_ERROR_MSG}" - ) - return - - def get_video_data(self): - """ - Stores scenes in self.data in a list of numpy arrays that can be used - to save a video. - """ - global _IMPORT_SUCESSFUL - - if _IMPORT_SUCESSFUL: - video_data = [] - - pg.init() - display = (self.window_width, self.window_height) - self.screen = pg.display.set_mode(display) - pg.display.set_caption(self.window_name) - - self.time_count = 0 - while self.time_count <= len(self.data): - # - self.display() - # - pg.display.flip() - - # - # See https://stackoverflow.com/a/42754578/5691288 - # - string_image = pg.image.tobytes(self.screen, "RGB") - temp_surf = pg.image.frombytes( - string_image, (self.window_width, self.window_height), "RGB" - ) - tmp_arr = pg.surfarray.array3d(temp_surf) - imgdata = np.moveaxis(tmp_arr, 0, 1) - video_data.append(imgdata) - - pg.quit() - return video_data - else: - logger.error( - f"Not possible to render the environment due to the following error: {_IMPORT_ERROR_MSG}" - ) - return [] diff --git a/rlberry/rendering/render_interface.py b/rlberry/rendering/render_interface.py deleted file mode 100644 index af846cf33..000000000 --- a/rlberry/rendering/render_interface.py +++ /dev/null @@ -1,162 +0,0 @@ -""" -Interface that allows 2D rendering. 
-""" - - -from abc import ABC, abstractmethod - -from rlberry.rendering.opengl_render2d import OpenGLRender2D -from rlberry.rendering.pygame_render2d import PyGameRender2D -from rlberry.rendering.utils import video_write - -import rlberry - -logger = rlberry.logger - - -class RenderInterface(ABC): - """ - Common interface for rendering in rlberry. - """ - - def __init__(self): - self._rendering_enabled = False - - def is_render_enabled(self): - return self._rendering_enabled - - def enable_rendering(self): - self._rendering_enabled = True - - def disable_rendering(self): - self._rendering_enabled = False - - def save_video(self, filename, **kwargs): - """ - Save video file. - """ - pass - - def get_video(self, **kwargs): - """ - Get video data. - """ - pass - - @abstractmethod - def render(self, **kwargs): - """ - Display on screen. - """ - pass - - -class RenderInterface2D(RenderInterface): - """ - Interface for 2D rendering in rlberry. - """ - - def __init__(self): - RenderInterface.__init__(self) - self._rendering_enabled = False - self._rendering_type = "2d" - self._state_history_for_rendering = [] - self._refresh_interval = 50 # in milliseconds - self._clipping_area = (-1.0, 1.0, -1.0, 1.0) # (left,right,bottom,top) - - # rendering type, either 'pygame' or 'opengl' - self.renderer_type = "opengl" - - def get_renderer(self): - if self.renderer_type == "opengl": - return OpenGLRender2D() - elif self.renderer_type == "pygame": - return PyGameRender2D() - else: - raise NotImplementedError("Unknown renderer type.") - - @abstractmethod - def get_scene(self, state): - """ - Return scene (list of shapes) representing a given state - """ - pass - - @abstractmethod - def get_background(self): - """ - Returne a scene (list of shapes) representing the background - """ - pass - - def append_state_for_rendering(self, state): - self._state_history_for_rendering.append(state) - - def set_refresh_interval(self, interval): - self._refresh_interval = interval - - def clear_render_buffer(self): - self._state_history_for_rendering = [] - - def set_clipping_area(self, area): - self._clipping_area = area - - def _get_background_and_scenes(self): - # background - background = self.get_background() - - # data: convert states to scenes - scenes = [] - for state in self._state_history_for_rendering: - scene = self.get_scene(state) - scenes.append(scene) - return background, scenes - - def render(self, loop=True, **kwargs): - """ - Function to render an environment that implements the interface. 
- """ - - if self.is_render_enabled(): - # background and data - background, data = self._get_background_and_scenes() - - if len(data) == 0: - logger.info("No data to render.") - return - - # render - renderer = self.get_renderer() - - renderer.window_name = self.name - renderer.set_refresh_interval(self._refresh_interval) - renderer.set_clipping_area(self._clipping_area) - renderer.set_data(data) - renderer.set_background(background) - renderer.run_graphics(loop) - return 0 - else: - logger.info("Rendering not enabled for the environment.") - return 1 - - def get_video(self, framerate=25, **kwargs): - # background and data - background, data = self._get_background_and_scenes() - - if len(data) == 0: - logger.info("No data to save.") - return - - # get video data from renderer - renderer = self.get_renderer() - renderer.window_name = self.name - renderer.set_refresh_interval(self._refresh_interval) - renderer.set_clipping_area(self._clipping_area) - renderer.set_data(data) - renderer.set_background(background) - - return renderer.get_video_data() - - def save_video(self, filename, framerate=25, **kwargs): - video_data = self.get_video(framerate=framerate, **kwargs) - video_write(filename, video_data, framerate=framerate) diff --git a/rlberry/rendering/tests/__init__.py b/rlberry/rendering/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/rendering/tests/test_rendering_interface.py b/rlberry/rendering/tests/test_rendering_interface.py deleted file mode 100644 index f0c793700..000000000 --- a/rlberry/rendering/tests/test_rendering_interface.py +++ /dev/null @@ -1,125 +0,0 @@ -import os -import pytest -import sys - -from pyvirtualdisplay import Display -from rlberry.envs.classic_control import MountainCar -from rlberry.envs.classic_control import Acrobot -from rlberry.envs.classic_control import Pendulum -from rlberry.envs.finite import Chain -from rlberry.envs.finite import GridWorld -from rlberry.envs.benchmarks.grid_exploration.four_room import FourRoom -from rlberry.envs.benchmarks.grid_exploration.six_room import SixRoom -from rlberry.envs.benchmarks.grid_exploration.apple_gold import AppleGold -from rlberry.envs.benchmarks.ball_exploration import PBall2D, SimplePBallND -from rlberry.envs.benchmarks.generalization.twinrooms import TwinRooms -from rlberry.rendering import RenderInterface -from rlberry.rendering import RenderInterface2D -from rlberry.envs import Wrapper - -import tempfile - -try: - display = Display(visible=0, size=(1400, 900)) - display.start() -except Exception: - pass - -classes = [ - Acrobot, - Pendulum, - MountainCar, - GridWorld, - Chain, - PBall2D, - SimplePBallND, - FourRoom, - SixRoom, - AppleGold, - TwinRooms, -] - - -@pytest.mark.parametrize("ModelClass", classes) -def test_instantiation(ModelClass): - env = ModelClass() - - if isinstance(env, RenderInterface): - env.disable_rendering() - assert not env.is_render_enabled() - env.enable_rendering() - assert env.is_render_enabled() - - -@pytest.mark.xfail(sys.platform != "linux", reason="bug with mac and windows???") -@pytest.mark.parametrize("ModelClass", classes) -def test_render2d_interface(ModelClass): - env = ModelClass() - - if isinstance(env, RenderInterface2D): - env.enable_rendering() - - if env.is_online(): - for _ in range(2): - observation, info = env.reset() - for _ in range(5): - assert env.observation_space.contains(observation) - action = env.action_space.sample() - observation, _, _, _, _ = env.step(action) - env.render(loop=False) - - with 
tempfile.TemporaryDirectory() as tmpdirname: - saving_path = tmpdirname + "/test_video.mp4" - - env.save_video(saving_path) - env.clear_render_buffer() - - -@pytest.mark.xfail(sys.platform != "linux", reason="bug with mac and windows???") -@pytest.mark.parametrize("ModelClass", classes) -def test_render2d_interface_wrapped(ModelClass): - env = Wrapper(ModelClass()) - - if isinstance(env.env, RenderInterface2D): - env.enable_rendering() - if env.is_online(): - for _ in range(2): - observation, info = env.reset() - for _ in range(5): - assert env.observation_space.contains(observation) - action = env.action_space.sample() - observation, _, _, _, _ = env.step(action) - env.render(loop=False) - - with tempfile.TemporaryDirectory() as tmpdirname: - saving_path = tmpdirname + "/test_video.mp4" - env.save_video(saving_path) - env.clear_render_buffer() - try: - os.remove("test_video.mp4") - except Exception: - pass - - -def test_render_appelGold(): - env = AppleGold() - env.render_mode = "human" - env = Wrapper(env) - - if env.is_online(): - for _ in range(2): - observation, info = env.reset() - for _ in range(5): - assert env.observation_space.contains(observation) - action = env.action_space.sample() - observation, _, _, _, _ = env.step(action) - env.render(loop=False) - - with tempfile.TemporaryDirectory() as tmpdirname: - saving_path = tmpdirname + "/test_video.mp4" - env.save_video(saving_path) - env.clear_render_buffer() - try: - os.remove("test_video.mp4") - except Exception: - pass diff --git a/rlberry/rendering/utils.py b/rlberry/rendering/utils.py deleted file mode 100644 index bf09963d3..000000000 --- a/rlberry/rendering/utils.py +++ /dev/null @@ -1,73 +0,0 @@ -import numpy as np - - -_FFMPEG_INSTALLED = True -try: - import ffmpeg -except Exception: - _FFMPEG_INSTALLED = False - -import rlberry - -logger = rlberry.logger - - -def video_write(fn, images, framerate=60, vcodec="libx264"): - """ - Save list of images to a video file. - - Source: - https://github.com/kkroening/ffmpeg-python/issues/246#issuecomment-520200981 - Modified so that framerate is given to .input(), as suggested in the - thread, to avoid - skipping frames. - - Parameters - ---------- - fn : string - filename - images : list or np.array - list of images to save to a video. 
- framerate : int - """ - global _FFMPEG_INSTALLED - - try: - if len(images) == 0: - logger.warning("Calling video_write() with empty images.") - return - - if not _FFMPEG_INSTALLED: - logger.error( - "video_write(): Unable to save video, ffmpeg-python \ - package required (https://github.com/kkroening/ffmpeg-python)" - ) - return - - if not isinstance(images, np.ndarray): - images = np.asarray(images) - _, height, width, channels = images.shape - process = ( - ffmpeg.input( - "pipe:", - format="rawvideo", - pix_fmt="rgb24", - s="{}x{}".format(width, height), - r=framerate, - ) - .output(fn, pix_fmt="yuv420p", vcodec=vcodec) - .overwrite_output() - .run_async(pipe_stdin=True) - ) - for frame in images: - process.stdin.write(frame.astype(np.uint8).tobytes()) - process.stdin.close() - process.wait() - - except Exception as ex: - logger.warning( - "Not possible to save \ -video, due to exception: {}".format( - str(ex) - ) - ) diff --git a/rlberry/tests/test_agent_extra.py b/rlberry/tests/test_agent_extra.py index 61cfcdba6..be55e7d5a 100644 --- a/rlberry/tests/test_agent_extra.py +++ b/rlberry/tests/test_agent_extra.py @@ -1,13 +1,13 @@ import pytest -import rlberry.agents as agents -import rlberry.agents.torch as torch_agents +import rlberry_scool.agents as agents_scool +import rlberry_research.agents.torch as torch_agents from rlberry.utils.check_agent import ( check_rl_agent, check_rlberry_agent, check_vectorized_env_agent, check_hyperparam_optimisation_agent, ) -from rlberry.agents.features import FeatureMap +from rlberry_scool.agents.features import FeatureMap import numpy as np import sys @@ -25,12 +25,12 @@ def map(self, observation, action): # LSVIUCBAgent needs a feature map function to work. -class OneHotLSVI(agents.LSVIUCBAgent): +class OneHotLSVI(agents_scool.LSVIUCBAgent): def __init__(self, env, **kwargs): def feature_map_fn(_env): return OneHotFeatureMap(5, 2) # values for Chain - agents.LSVIUCBAgent.__init__( + agents_scool.LSVIUCBAgent.__init__( self, env, feature_map_fn=feature_map_fn, horizon=10, **kwargs ) diff --git a/rlberry/tests/test_agents_base.py b/rlberry/tests/test_agents_base.py index a9c65ee9f..89378f24d 100644 --- a/rlberry/tests/test_agents_base.py +++ b/rlberry/tests/test_agents_base.py @@ -11,8 +11,9 @@ import numpy as np import sys -import rlberry.agents as agents -from rlberry.agents.features import FeatureMap +import rlberry_research.agents as agents_research +import rlberry_scool.agents as agents_scool +from rlberry_scool.agents.features import FeatureMap from rlberry.utils.check_agent import ( check_rl_agent, @@ -33,32 +34,32 @@ def map(self, observation, action): # LSVIUCBAgent needs a feature map function to work. 
-class OneHotLSVI(agents.LSVIUCBAgent): +class OneHotLSVI(agents_scool.LSVIUCBAgent): def __init__(self, env, **kwargs): def feature_map_fn(_env): return OneHotFeatureMap(5, 2) # values for Chain - agents.LSVIUCBAgent.__init__( + agents_scool.LSVIUCBAgent.__init__( self, env, feature_map_fn=feature_map_fn, horizon=10, **kwargs ) FINITE_MDP_AGENTS = [ - agents.QLAgent, - agents.SARSAAgent, - agents.ValueIterationAgent, - agents.MBQVIAgent, - agents.UCBVIAgent, - agents.OptQLAgent, - agents.PSRLAgent, - agents.RLSVIAgent, + agents_scool.QLAgent, + agents_scool.SARSAAgent, + agents_scool.ValueIterationAgent, + agents_scool.MBQVIAgent, + agents_scool.UCBVIAgent, + agents_research.OptQLAgent, + agents_research.PSRLAgent, + agents_research.RLSVIAgent, OneHotLSVI, ] CONTINUOUS_STATE_AGENTS = [ - agents.RSUCBVIAgent, - agents.RSKernelUCBVIAgent, + agents_research.RSUCBVIAgent, + agents_research.RSKernelUCBVIAgent, ] diff --git a/rlberry/tests/test_envs.py b/rlberry/tests/test_envs.py index 9519de04f..36321d20d 100644 --- a/rlberry/tests/test_envs.py +++ b/rlberry/tests/test_envs.py @@ -1,11 +1,11 @@ from rlberry.utils.check_env import check_env, check_rlberry_env -from rlberry.envs import Acrobot -from rlberry.envs.benchmarks.ball_exploration import PBall2D -from rlberry.envs.benchmarks.generalization.twinrooms import TwinRooms -from rlberry.envs.benchmarks.grid_exploration.apple_gold import AppleGold -from rlberry.envs.benchmarks.grid_exploration.nroom import NRoom -from rlberry.envs.classic_control import MountainCar, SpringCartPole -from rlberry.envs.finite import Chain, GridWorld +from rlberry_research.envs import Acrobot +from rlberry_research.envs.benchmarks.ball_exploration import PBall2D +from rlberry_research.envs.benchmarks.generalization.twinrooms import TwinRooms +from rlberry_research.envs.benchmarks.grid_exploration.apple_gold import AppleGold +from rlberry_research.envs.benchmarks.grid_exploration.nroom import NRoom +from rlberry_research.envs.classic_control import MountainCar, SpringCartPole +from rlberry_research.envs.finite import Chain, GridWorld import pytest ALL_ENVS = [ diff --git a/rlberry/tests/test_rlberry_main_agents_and_env.py b/rlberry/tests/test_rlberry_main_agents_and_env.py new file mode 100644 index 000000000..6a6bb6258 --- /dev/null +++ b/rlberry/tests/test_rlberry_main_agents_and_env.py @@ -0,0 +1,133 @@ +""" +=============================================== +Tests some agent and env from rlberry only (no rlberry-scool or rlberry research) +=============================================== + +""" + +from rlberry.utils.check_env import check_env, check_gym_env +from rlberry.utils.check_agent import check_rl_agent +from rlberry.envs import gym_make, atari_make +from rlberry.agents.stable_baselines import StableBaselinesAgent +from stable_baselines3 import A2C + +from stable_baselines3 import DQN +import pytest + + +import gymnasium as gym +import numpy as np +from typing import Tuple + + +class CustomDummyEnv(gym.Env): + def __init__(self): + obs_dict = dict( + board=gym.spaces.Box(low=0, high=1, shape=(8 * 8,), dtype=bool), + player=gym.spaces.Discrete(8), + ) + self.observation_space = gym.spaces.Dict(obs_dict) + self.action_space = gym.spaces.MultiDiscrete([8, 8]) + self.has_reset_before_step_dummy = False + + def reset(self): + self.has_reset_before_step_dummy = True + return self._obs(), {} + + def _obs(self): + return {"board": np.zeros(shape=(8, 8), dtype=bool).flatten(), "player": 1} + + def step(self, action: Tuple[int, int]): + if not 
self.has_reset_before_step_dummy: + raise AssertionError("Cannot call env.step() before calling reset()") + reward = 0.2 + terminated = False + truncated = False + info = {} + return self._obs(), reward, terminated, truncated, info + + def render(self): + print("hi") + + def reseed(self, seed): + print("reseed") + + +class CustomDummyEnvBox1(CustomDummyEnv): + def __init__(self): + CustomDummyEnv.__init__(self) + self.action_space = gym.spaces.Box(-np.inf, np.inf) + + +class CustomDummyEnvBox2(CustomDummyEnv): + def __init__(self): + CustomDummyEnv.__init__(self) + self.action_space = gym.spaces.Box(5, 5) + + +FROZEN_LAKE_CONSTR = ( + gym_make, + dict(id="FrozenLake-v1", wrap_spaces=True, is_slippery=False), +) +CART_POLE_CONSTR = (gym_make, dict(id="CartPole-v1", wrap_spaces=True)) +PENDULUM_CONSTR = (gym_make, dict(id="Pendulum-v1", wrap_spaces=True)) +ASTEROIDS_CONSTR = (atari_make, dict(id="ALE/Asteroids-v5", wrap_spaces=True)) +CUSTOM_CONSTR = (CustomDummyEnv, {}) + + +TEST_ENV_SUCCES = [ + FROZEN_LAKE_CONSTR, + CART_POLE_CONSTR, + PENDULUM_CONSTR, + ASTEROIDS_CONSTR, + CUSTOM_CONSTR, +] + + +@pytest.mark.parametrize("Env", TEST_ENV_SUCCES) +def test_env(Env): + current_env = Env[0](**Env[1]) + if not isinstance(current_env, CustomDummyEnv): + check_env(current_env) + check_gym_env(current_env) + + +CUSTOM_BOX_CONSTR1 = (CustomDummyEnvBox1, {}) +CUSTOM_BOX_CONSTR2 = (CustomDummyEnvBox2, {}) + +TEST_ENV_FAIL = [ + CUSTOM_CONSTR, + CUSTOM_BOX_CONSTR1, + CUSTOM_BOX_CONSTR2, +] + + +@pytest.mark.parametrize("Env", TEST_ENV_FAIL) +def test_errors_env(Env): + current_env = Env[0](**Env[1]) + had_exception_step_before_reset = False + try: + current_env.step(0) + except Exception as ex: + had_exception_step_before_reset = True + + assert had_exception_step_before_reset + check_gym_env(current_env) + + +A2C_INIT_KWARGS = {"algo_cls": A2C, "policy": "MlpPolicy", "verbose": 1} +DQN_INIT_KWARGS = {"algo_cls": DQN, "policy": "MlpPolicy", "verbose": 1} + +AGENTS_WITH_ENV = [ + (A2C_INIT_KWARGS, PENDULUM_CONSTR), + (DQN_INIT_KWARGS, CART_POLE_CONSTR), +] + + +@pytest.mark.parametrize("agent_kwargs,env", AGENTS_WITH_ENV) +def test_rlberry_agent(agent_kwargs, env): + check_rl_agent( + StableBaselinesAgent, + env=env, + init_kwargs=agent_kwargs, + ) diff --git a/rlberry/utils/__init__.py b/rlberry/utils/__init__.py index f70c962c1..b2a8ff62c 100644 --- a/rlberry/utils/__init__.py +++ b/rlberry/utils/__init__.py @@ -1,4 +1,3 @@ -from .check_bandit_agent import check_bandit_agent from .check_agent import ( check_rl_agent, check_save_load, diff --git a/rlberry/utils/check_agent.py b/rlberry/utils/check_agent.py index f4a02976c..d0fc9e961 100644 --- a/rlberry/utils/check_agent.py +++ b/rlberry/utils/check_agent.py @@ -1,5 +1,5 @@ -from rlberry.envs import Chain, Pendulum -from rlberry.envs.benchmarks.ball_exploration import PBall2D +from rlberry_research.envs import Chain, Pendulum +from rlberry_research.envs.benchmarks.ball_exploration import PBall2D from rlberry.manager import ExperimentManager import numpy as np from rlberry.seeding import set_external_seed @@ -61,7 +61,13 @@ def _fit_experiment_manager(agent, env="continuous_state", init_kwargs=None): train_env = _make_tuple_env(env) try: agent = ExperimentManager( - agent, train_env, fit_budget=5, n_fit=1, seed=SEED, init_kwargs=init_kwargs + agent, + train_env, + agent_name="test_agent", + fit_budget=5, + n_fit=1, + seed=SEED, + init_kwargs=init_kwargs, ) agent.fit() except Exception as exc: diff --git a/rlberry/utils/check_bandit_agent.py 
b/rlberry/utils/check_bandit_agent.py deleted file mode 100644 index 89389b77f..000000000 --- a/rlberry/utils/check_bandit_agent.py +++ /dev/null @@ -1,62 +0,0 @@ -from rlberry.envs.bandits import BernoulliBandit -from rlberry.manager import ExperimentManager - - -def check_bandit_agent(Agent, environment=BernoulliBandit, seed=42): - """ - Function used to check a bandit agent in rlberry on a Gaussian bandit problem. - - Parameters - ---------- - Agent: rlberry agent module - Agent class that we want to test. - - environment: rlberry env module - Environment (i.e bandit instance) on which to test the agent. - - seed : Seed sequence from which to spawn the random number generator. - - - Returns - ------- - result : bool - Whether the agent is a valid/compatible bandit agent. - - Examples - -------- - >>> from rlberry.agents.bandits import IndexAgent - >>> from rlberry.utils import check_bandit_agent - >>> import numpy as np - >>> class UCBAgent(IndexAgent): - >>> name = "UCB" - >>> def __init__(self, env, **kwargs): - >>> def index(r, t): - >>> return np.mean(r) + np.sqrt(np.log(t**2) / (2 * len(r))) - >>> IndexAgent.__init__(self, env, index, **kwargs) - >>> check_bandit_agent(UCBAgent) - True - - """ - env_ctor = environment - env_kwargs = {} - - agent1 = ExperimentManager( - Agent, (env_ctor, env_kwargs), fit_budget=10, n_fit=1, seed=seed - ) - agent2 = ExperimentManager( - Agent, (env_ctor, env_kwargs), fit_budget=10, n_fit=1, seed=seed - ) - - agent1.fit() - agent2.fit() - env = env_ctor(**env_kwargs) - state, info = env.reset() - result = True - for _ in range(5): - # test reproducibility on 5 actions - action1 = agent1.agent_handlers[0].policy(state) - action2 = agent2.agent_handlers[0].policy(state) - if action1 != action2: - result = False - - return result diff --git a/rlberry/utils/io.py b/rlberry/utils/io.py deleted file mode 100644 index cb269f29a..000000000 --- a/rlberry/utils/io.py +++ /dev/null @@ -1,33 +0,0 @@ -import os -import zipfile -import pathlib - - -def zipdir(dir_path, ouput_fname): - """ - Zip a directory. - - Parameters - ---------- - dir_path : Path or str - Directory to be compressed. - output_fname : str - Name of output zip file. - - Returns - ------- - path to zip file, or None if dir_path does not exist. 
- """ - dir_path = pathlib.Path(dir_path) - if not dir_path.exists(): - return None - ouput_fname = pathlib.Path(ouput_fname).with_suffix(".zip") - zipf = zipfile.ZipFile(ouput_fname, "w", zipfile.ZIP_DEFLATED) - for root, _, files in os.walk(dir_path): - for file in files: - zipf.write( - os.path.join(root, file), - os.path.relpath(os.path.join(root, file), os.path.join(dir_path, "..")), - ) - zipf.close() - return ouput_fname diff --git a/rlberry/utils/tests/test_check.py b/rlberry/utils/tests/test_check.py index 070b580d0..c7c926296 100644 --- a/rlberry/utils/tests/test_check.py +++ b/rlberry/utils/tests/test_check.py @@ -1,6 +1,6 @@ import numpy as np import pytest -from rlberry.envs import GridWorld, Chain +from rlberry_research.envs import GridWorld, Chain from rlberry.utils.check_env import check_env from rlberry.utils.check_agent import ( check_rl_agent, @@ -9,7 +9,7 @@ ) from rlberry.spaces import Box, Dict, Discrete import gymnasium as gym -from rlberry.agents import ValueIterationAgent, UCBVIAgent +from rlberry_scool.agents import ValueIterationAgent, UCBVIAgent class ActionDictTestEnv(gym.Env): diff --git a/rlberry/utils/tests/test_writer.py b/rlberry/utils/tests/test_writer.py index 1345649d2..504301800 100644 --- a/rlberry/utils/tests/test_writer.py +++ b/rlberry/utils/tests/test_writer.py @@ -1,5 +1,5 @@ import time -from rlberry.envs import GridWorld +from rlberry_research.envs import GridWorld from rlberry.agents import AgentWithSimplePolicy from rlberry.manager import ExperimentManager @@ -20,8 +20,9 @@ def fit(self, budget, **kwargs): self.total_budget += budget for ii in range(budget): if self.writer is not None: - self.writer.add_scalar("a", 42, ii) + self.writer.add_scalar("a", ii, ii) time.sleep(1) + return None def policy(self, observation): @@ -42,6 +43,11 @@ def test_myoutput(capsys): # or use "capfd" for fd-level ) agent.fit(budget=3) + assert agent.agent_handlers[0].writer.summary_writer == None + assert list(agent.agent_handlers[0].writer.read_tag_value("a")) == [0, 1, 2] + assert agent.agent_handlers[0].writer.read_first_tag_value("a") == 0 + assert agent.agent_handlers[0].writer.read_last_tag_value("a") == 2 + captured = capsys.readouterr() # test that what is written to stderr is longer than 50 char, assert ( diff --git a/rlberry/utils/writers.py b/rlberry/utils/writers.py index 2b3504df2..5bc9ae7bf 100644 --- a/rlberry/utils/writers.py +++ b/rlberry/utils/writers.py @@ -35,7 +35,7 @@ class DefaultWriter: log_interval : int Minimum number of seconds between consecutive logs (with logging module). style_log: str - Possible values are "multi_line" and "one_line". Define the style of the logs. + Possible values are "multi_line", "one_line" and "progressbar". Define the style of the logs. tensorboard_kwargs : Optional[dict] Parameters for tensorboard SummaryWriter. 
If provided, DefaultWriter will behave as tensorboard.SummaryWriter, and will keep utilities to handle @@ -430,6 +430,8 @@ def __init__(self, *args, desc="", **kwargs): def set_description(self, desc=None, refresh=True): screen_width, _ = _screen_shape_wrapper()(sys.stdout) + if screen_width is None: + screen_width = 600 max_len = screen_width if len(desc) > 1: if not self.subbar: diff --git a/rlberry/wrappers/tests/old_env/old_acrobot.py b/rlberry/wrappers/tests/old_env/old_acrobot.py index 8ee5d24f2..35408486d 100644 --- a/rlberry/wrappers/tests/old_env/old_acrobot.py +++ b/rlberry/wrappers/tests/old_env/old_acrobot.py @@ -12,8 +12,8 @@ import numpy as np import rlberry.spaces as spaces from rlberry.envs.interface import Model -from rlberry.rendering import Scene, GeometricPrimitive, RenderInterface2D -from rlberry.rendering.common_shapes import bar_shape, circle_shape +from rlberry_research.rendering import Scene, GeometricPrimitive, RenderInterface2D +from rlberry_research.rendering.common_shapes import bar_shape, circle_shape __copyright__ = "Copyright 2013, RLPy http://acl.mit.edu/RLPy" __credits__ = [ diff --git a/rlberry/wrappers/tests/old_env/old_apple_gold.py b/rlberry/wrappers/tests/old_env/old_apple_gold.py index 9006c990c..31cc45c87 100644 --- a/rlberry/wrappers/tests/old_env/old_apple_gold.py +++ b/rlberry/wrappers/tests/old_env/old_apple_gold.py @@ -1,7 +1,7 @@ import numpy as np import rlberry.spaces as spaces from rlberry.wrappers.tests.old_env.old_gridworld import Old_GridWorld -from rlberry.rendering import Scene, GeometricPrimitive +from rlberry_research.rendering import Scene, GeometricPrimitive import rlberry diff --git a/rlberry/wrappers/tests/old_env/old_gridworld.py b/rlberry/wrappers/tests/old_env/old_gridworld.py index 4de564bac..774a08a74 100644 --- a/rlberry/wrappers/tests/old_env/old_gridworld.py +++ b/rlberry/wrappers/tests/old_env/old_gridworld.py @@ -5,9 +5,9 @@ from matplotlib import cm from rlberry.wrappers.tests.old_env.old_finite_mdp import Old_FiniteMDP -from rlberry.envs.finite import gridworld_utils -from rlberry.rendering import Scene, GeometricPrimitive, RenderInterface2D -from rlberry.rendering.common_shapes import circle_shape +from rlberry_research.envs.finite import gridworld_utils +from rlberry_research.rendering import Scene, GeometricPrimitive, RenderInterface2D +from rlberry_research.rendering.common_shapes import circle_shape import rlberry diff --git a/rlberry/wrappers/tests/old_env/old_mountain_car.py b/rlberry/wrappers/tests/old_env/old_mountain_car.py index dc40b31db..1e163f950 100644 --- a/rlberry/wrappers/tests/old_env/old_mountain_car.py +++ b/rlberry/wrappers/tests/old_env/old_mountain_car.py @@ -16,7 +16,7 @@ import rlberry.spaces as spaces from rlberry.envs.interface import Model -from rlberry.rendering import Scene, GeometricPrimitive, RenderInterface2D +from rlberry_research.rendering import Scene, GeometricPrimitive, RenderInterface2D class Old_MountainCar(RenderInterface2D, Model): diff --git a/rlberry/wrappers/tests/old_env/old_nroom.py b/rlberry/wrappers/tests/old_env/old_nroom.py index 6820ee780..f3b9cd9c5 100644 --- a/rlberry/wrappers/tests/old_env/old_nroom.py +++ b/rlberry/wrappers/tests/old_env/old_nroom.py @@ -2,7 +2,7 @@ import numpy as np import rlberry.spaces as spaces from rlberry.wrappers.tests.old_env.old_gridworld import Old_GridWorld -from rlberry.rendering import Scene, GeometricPrimitive +from rlberry_research.rendering import Scene, GeometricPrimitive import rlberry diff --git 
a/rlberry/wrappers/tests/old_env/old_pball.py b/rlberry/wrappers/tests/old_env/old_pball.py index acc7ee29d..ae183de47 100644 --- a/rlberry/wrappers/tests/old_env/old_pball.py +++ b/rlberry/wrappers/tests/old_env/old_pball.py @@ -3,7 +3,7 @@ import rlberry.spaces as spaces from rlberry.envs.interface import Model -from rlberry.rendering import Scene, GeometricPrimitive, RenderInterface2D +from rlberry_research.rendering import Scene, GeometricPrimitive, RenderInterface2D import rlberry diff --git a/rlberry/wrappers/tests/old_env/old_pendulum.py b/rlberry/wrappers/tests/old_env/old_pendulum.py index e8e93ca01..1ce1151fe 100644 --- a/rlberry/wrappers/tests/old_env/old_pendulum.py +++ b/rlberry/wrappers/tests/old_env/old_pendulum.py @@ -11,8 +11,8 @@ import numpy as np import rlberry.spaces as spaces from rlberry.envs.interface import Model -from rlberry.rendering import Scene, RenderInterface2D -from rlberry.rendering.common_shapes import bar_shape, circle_shape +from rlberry_research.rendering import Scene, RenderInterface2D +from rlberry_research.rendering.common_shapes import bar_shape, circle_shape class Old_Pendulum(RenderInterface2D, Model): diff --git a/rlberry/wrappers/tests/old_env/old_six_room.py b/rlberry/wrappers/tests/old_env/old_six_room.py index a51905d2d..a38368819 100644 --- a/rlberry/wrappers/tests/old_env/old_six_room.py +++ b/rlberry/wrappers/tests/old_env/old_six_room.py @@ -1,7 +1,7 @@ import numpy as np import rlberry.spaces as spaces from rlberry.wrappers.tests.old_env.old_gridworld import Old_GridWorld -from rlberry.rendering import Scene, GeometricPrimitive +from rlberry_research.rendering import Scene, GeometricPrimitive import rlberry diff --git a/rlberry/wrappers/tests/old_env/old_twinrooms.py b/rlberry/wrappers/tests/old_env/old_twinrooms.py index c9ffa09a5..1c6078b9a 100644 --- a/rlberry/wrappers/tests/old_env/old_twinrooms.py +++ b/rlberry/wrappers/tests/old_env/old_twinrooms.py @@ -1,8 +1,8 @@ import numpy as np import rlberry.spaces as spaces from rlberry.envs import Model -from rlberry.rendering import Scene, GeometricPrimitive, RenderInterface2D -from rlberry.rendering.common_shapes import circle_shape +from rlberry_research.rendering import Scene, GeometricPrimitive, RenderInterface2D +from rlberry_research.rendering.common_shapes import circle_shape import rlberry diff --git a/rlberry/wrappers/tests/test_basewrapper.py b/rlberry/wrappers/tests/test_basewrapper.py index f624e46f2..16279473f 100644 --- a/rlberry/wrappers/tests/test_basewrapper.py +++ b/rlberry/wrappers/tests/test_basewrapper.py @@ -1,6 +1,6 @@ from rlberry.envs.interface import Model from rlberry.envs import Wrapper -from rlberry.envs import GridWorld +from rlberry_research.envs import GridWorld import gymnasium as gym diff --git a/rlberry/wrappers/tests/test_common_wrappers.py b/rlberry/wrappers/tests/test_common_wrappers.py index 502d24c2a..eb4b51e8a 100644 --- a/rlberry/wrappers/tests/test_common_wrappers.py +++ b/rlberry/wrappers/tests/test_common_wrappers.py @@ -1,10 +1,11 @@ import numpy as np import pytest from rlberry import spaces -from rlberry.agents import RSUCBVIAgent -from rlberry.envs.classic_control import MountainCar -from rlberry.envs.finite import FiniteMDP, GridWorld -from rlberry.exploration_tools.discrete_counter import DiscreteCounter +from rlberry_research.agents import RSUCBVIAgent +from rlberry_research.envs.classic_control import MountainCar +from rlberry_research.envs.finite import GridWorld +from rlberry.envs.finite_mdp import FiniteMDP +from 
rlberry_research.exploration_tools.discrete_counter import DiscreteCounter from rlberry.seeding import Seeder from rlberry.wrappers.autoreset import AutoResetWrapper from rlberry.wrappers.discrete2onehot import DiscreteToOneHotWrapper diff --git a/rlberry/wrappers/tests/test_wrapper_seeding.py b/rlberry/wrappers/tests/test_wrapper_seeding.py index 936db0d14..25f6c9f37 100644 --- a/rlberry/wrappers/tests/test_wrapper_seeding.py +++ b/rlberry/wrappers/tests/test_wrapper_seeding.py @@ -3,10 +3,10 @@ from rlberry.seeding import Seeder from copy import deepcopy -from rlberry.envs.classic_control import MountainCar, Acrobot -from rlberry.envs.finite import Chain -from rlberry.envs.finite import GridWorld -from rlberry.envs.benchmarks.ball_exploration import PBall2D, SimplePBallND +from rlberry_research.envs.classic_control import MountainCar, Acrobot +from rlberry_research.envs.finite import Chain +from rlberry_research.envs.finite import GridWorld +from rlberry_research.envs.benchmarks.ball_exploration import PBall2D, SimplePBallND from rlberry.envs import Wrapper from rlberry.wrappers import RescaleRewardWrapper diff --git a/rlberry/wrappers/tests/test_writer_utils.py b/rlberry/wrappers/tests/test_writer_utils.py index da8edad70..4b3c5da78 100644 --- a/rlberry/wrappers/tests/test_writer_utils.py +++ b/rlberry/wrappers/tests/test_writer_utils.py @@ -1,9 +1,9 @@ import pytest from rlberry.wrappers import WriterWrapper -from rlberry.envs import GridWorld +from rlberry_research.envs import GridWorld -from rlberry.agents import UCBVIAgent +from rlberry_scool.agents import UCBVIAgent @pytest.mark.parametrize("write_scalar", ["action", "reward", "action_and_reward"]) diff --git a/rlberry/wrappers/vis2d.py b/rlberry/wrappers/vis2d.py index 808b64bb8..e0db99bba 100644 --- a/rlberry/wrappers/vis2d.py +++ b/rlberry/wrappers/vis2d.py @@ -1,7 +1,7 @@ from rlberry.envs import Wrapper -from rlberry.exploration_tools.discrete_counter import DiscreteCounter +from rlberry_research.exploration_tools.discrete_counter import DiscreteCounter from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas -from rlberry.rendering.utils import video_write +from rlberry_research.rendering.utils import video_write import gymnasium.spaces as spaces import matplotlib.pyplot as plt diff --git a/scripts/fetch_contributors.py b/scripts/fetch_contributors.py index 5421a85f0..e89d347f7 100644 --- a/scripts/fetch_contributors.py +++ b/scripts/fetch_contributors.py @@ -15,7 +15,6 @@ MEMBERS = [ - "sauxpa", "TimotheeMathieu", "omardrwch", "xuedong", @@ -23,6 +22,7 @@ "yfletberliac", "mmcenta", "menardprr", + "sauxpa", "riccardodv", "AleShi94", "KohlerHECTOR", @@ -30,6 +30,7 @@ "riiswa", "brahimdriss", "RemyDegenne", + "YannBerthelot", ]
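
For downstream users, the import moves in the hunks above amount to pulling agents from `rlberry_scool` / `rlberry_research` while `ExperimentManager` stays in `rlberry` itself. Below is a minimal usage sketch, assuming the `rlberry-scool` and `rlberry-research` packages are installed; the specific agent/environment pairing is illustrative and not taken from this patch.

```python
# Minimal sketch of the post-refactor import layout (illustrative, not part of the patch):
# tabular agents now live in rlberry-scool, environments such as GridWorld in
# rlberry-research, while ExperimentManager remains in rlberry itself.
from rlberry_scool.agents import ValueIterationAgent
from rlberry_research.envs import GridWorld
from rlberry.manager import ExperimentManager

manager = ExperimentManager(
    ValueIterationAgent,   # agent class under test
    (GridWorld, {}),       # environment constructor and its kwargs
    fit_budget=10,
    n_fit=1,
    seed=42,
)
manager.fit()

# Query a fitted agent instance, mirroring the pattern used in the tests above.
env = GridWorld()
observation, info = env.reset()
action = manager.agent_handlers[0].policy(observation)
```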