From 98a089da7a5d281f8060d9a6c8b36e706d08bb96 Mon Sep 17 00:00:00 2001 From: Waris Radji Date: Tue, 21 Nov 2023 16:18:44 +0100 Subject: [PATCH] Refactorization of the repository (#379) * Move some class to rlberry-scoo and rlberry-research * Update and remove some files in agents and envs * Update the .gitignore * Updates imports 'paths' * add tests to better coverage (env with action space in Box) * add tests to better coverage (observation_space as Dict) * add tests to better coverage (check_gym_env_warnings) * increase writer coverage * add tests to better coverage (check_gym_env_warnings) * add tests to better coverage (writer) * removing old doc * update rlberry-researche -> update poetry.lock * update display on API doc * add YannBerthelot to contributor * update tests on writers --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: JulienT01 Co-authored-by: TimotheeMathieu --- .gitignore | 2 +- azure-pipelines.yml | 16 + codecov.yml | 2 - docs/api.rst | 142 +-- docs/basics/experiment_setup.rst | 4 +- docs/basics/multiprocess.rst | 4 +- docs/basics/rlberry how to.rst | 2 +- docs/contributors.rst | 4 + examples/comparison_agents.py | 4 +- examples/demo_agents/video_plot_a2c.py | 4 +- examples/demo_agents/video_plot_mbqvi.py | 4 +- examples/demo_agents/video_plot_ppo.py | 4 +- examples/demo_agents/video_plot_vi.py | 4 +- examples/demo_bandits/plot_TS_bandit.py | 4 +- .../plot_compare_index_bandits.py | 4 +- examples/demo_bandits/plot_exp3_bandit.py | 4 +- examples/demo_bandits/plot_mirror_bandit.py | 2 +- examples/demo_bandits/plot_ucb_bandit.py | 4 +- examples/demo_env/video_plot_apple_gold.py | 4 +- examples/demo_env/video_plot_chain.py | 2 +- examples/demo_env/video_plot_gridworld.py | 4 +- examples/demo_env/video_plot_pball.py | 2 +- examples/demo_env/video_plot_rooms.py | 4 +- examples/demo_env/video_plot_twinrooms.py | 4 +- examples/demo_experiment/room.yaml | 2 +- examples/demo_experiment/rsucbvi.yaml | 2 +- examples/demo_network/run_server.py | 8 +- examples/plot_agent_manager.py | 2 +- examples/plot_kernels.py | 2 +- examples/plot_writer_wrapper.py | 4 +- poetry.lock | 74 +- pyproject.toml | 2 + rlberry/agents/__init__.py | 13 - rlberry/agents/adaptiveql/__init__.py | 1 - rlberry/agents/adaptiveql/adaptiveql.py | 195 ---- rlberry/agents/adaptiveql/tree.py | 219 ----- rlberry/agents/adaptiveql/utils.py | 61 -- rlberry/agents/bandits/__init__.py | 19 - rlberry/agents/bandits/bandit_base.py | 123 --- rlberry/agents/bandits/index_agents.py | 101 --- rlberry/agents/bandits/indices.py | 421 --------- rlberry/agents/bandits/priors.py | 151 ---- rlberry/agents/bandits/randomized_agents.py | 115 --- rlberry/agents/bandits/tools/__init__.py | 1 - rlberry/agents/bandits/tools/tracker.py | 231 ----- rlberry/agents/bandits/ts_agents.py | 157 ---- rlberry/agents/dynprog/__init__.py | 1 - rlberry/agents/dynprog/utils.py | 272 ------ rlberry/agents/dynprog/value_iteration.py | 82 -- rlberry/agents/experimental/__init__.py | 0 rlberry/agents/experimental/tests/__init__.py | 0 rlberry/agents/experimental/torch/__init__.py | 0 rlberry/agents/features/__init__.py | 1 - rlberry/agents/features/feature_map.py | 29 - rlberry/agents/kernel_based/__init__.py | 2 - rlberry/agents/kernel_based/common.py | 34 - rlberry/agents/kernel_based/kernels.py | 58 -- .../agents/kernel_based/rs_kernel_ucbvi.py | 390 -------- rlberry/agents/kernel_based/rs_ucbvi.py | 332 ------- rlberry/agents/linear/__init__.py | 1 - rlberry/agents/linear/lsvi_ucb.py | 
356 -------- rlberry/agents/mbqvi/__init__.py | 1 - rlberry/agents/mbqvi/mbqvi.py | 152 ---- rlberry/agents/optql/__init__.py | 1 - rlberry/agents/optql/optql.py | 206 ----- rlberry/agents/psrl/__init__.py | 1 - rlberry/agents/psrl/psrl.py | 257 ------ rlberry/agents/rlsvi/__init__.py | 1 - rlberry/agents/rlsvi/rlsvi.py | 280 ------ rlberry/agents/tabular_rl/__init__.py | 2 - rlberry/agents/tabular_rl/qlearning.py | 127 --- rlberry/agents/tabular_rl/sarsa.py | 125 --- rlberry/agents/tests/test_adaptiveql.py | 12 - rlberry/agents/tests/test_bandits.py | 131 --- rlberry/agents/tests/test_dynprog.py | 156 ---- rlberry/agents/tests/test_kernel_based.py | 58 -- rlberry/agents/tests/test_lsvi_ucb.py | 218 ----- rlberry/agents/tests/test_mbqvi.py | 27 - rlberry/agents/tests/test_optql.py | 9 - rlberry/agents/tests/test_psrl.py | 29 - rlberry/agents/tests/test_replay.py | 28 +- rlberry/agents/tests/test_rlsvi.py | 19 - rlberry/agents/tests/test_tabular_rl.py | 33 - rlberry/agents/tests/test_ucbvi.py | 30 - rlberry/agents/torch/__init__.py | 7 - rlberry/agents/torch/a2c/__init__.py | 1 - rlberry/agents/torch/a2c/a2c.py | 338 ------- rlberry/agents/torch/dqn/__init__.py | 2 - rlberry/agents/torch/dqn/dqn.py | 513 ----------- rlberry/agents/torch/dqn/dqn_utils.py | 142 --- rlberry/agents/torch/dqn/mdqn.py | 478 ---------- rlberry/agents/torch/ppo/__init__.py | 1 - rlberry/agents/torch/ppo/ppo.py | 843 ------------------ rlberry/agents/torch/ppo/ppo_utils.py | 193 ---- rlberry/agents/torch/reinforce/__init__.py | 1 - rlberry/agents/torch/reinforce/reinforce.py | 270 ------ rlberry/agents/torch/sac/__init__.py | 1 - rlberry/agents/torch/sac/sac.py | 543 ----------- rlberry/agents/torch/sac/sac_utils.py | 38 - rlberry/agents/torch/tests/__init__.py | 0 rlberry/agents/torch/tests/test_a2c.py | 122 --- rlberry/agents/torch/tests/test_dqn.py | 138 --- rlberry/agents/torch/tests/test_factory.py | 23 - rlberry/agents/torch/tests/test_mdqn.py | 40 - rlberry/agents/torch/tests/test_ppo.py | 201 ----- rlberry/agents/torch/tests/test_reinforce.py | 49 - rlberry/agents/torch/tests/test_sac.py | 68 -- .../agents/torch/tests/test_torch_atari.py | 287 ------ .../agents/torch/tests/test_torch_models.py | 47 - .../agents/torch/tests/test_torch_training.py | 32 - rlberry/agents/torch/utils/__init__.py | 0 rlberry/agents/torch/utils/models.py | 534 ----------- rlberry/agents/torch/utils/training.py | 148 --- rlberry/agents/ucbvi/__init__.py | 1 - rlberry/agents/ucbvi/ucbvi.py | 332 ------- rlberry/agents/ucbvi/utils.py | 83 -- rlberry/agents/utils/memories.py | 59 -- rlberry/colab_utils/__init__.py | 0 rlberry/colab_utils/display_setup.py | 37 - rlberry/envs/__init__.py | 3 +- rlberry/envs/bandits/__init__.py | 3 - rlberry/envs/bandits/bandit_base.py | 115 --- rlberry/envs/bandits/corrupted_bandits.py | 90 -- rlberry/envs/bandits/stochastic_bandits.py | 58 -- rlberry/envs/benchmarks/__init__.py | 0 .../benchmarks/ball_exploration/__init__.py | 1 - .../benchmarks/ball_exploration/ball2d.py | 220 ----- .../envs/benchmarks/ball_exploration/pball.py | 482 ---------- .../benchmarks/generalization/__init__.py | 0 .../benchmarks/generalization/twinrooms.py | 185 ---- .../benchmarks/grid_exploration/__init__.py | 0 .../benchmarks/grid_exploration/apple_gold.py | 180 ---- .../benchmarks/grid_exploration/four_room.py | 130 --- .../envs/benchmarks/grid_exploration/nroom.py | 305 ------- .../benchmarks/grid_exploration/six_room.py | 151 ---- rlberry/envs/bullet3/data/__init__.py | 6 - rlberry/envs/bullet3/data/mjcf/pendulum.xml | 
28 - rlberry/envs/bullet3/data/pendulum.urdf | 51 -- .../envs/bullet3/pybullet_envs/__init__.py | 40 - .../pybullet_envs/gym_pendulum_envs.py | 80 -- .../envs/bullet3/pybullet_envs/robot_bases.py | 123 --- .../bullet3/pybullet_envs/robot_pendula.py | 46 - .../envs/classic_control/SpringCartPole.py | 604 ------------- rlberry/envs/classic_control/__init__.py | 4 - rlberry/envs/classic_control/acrobot.py | 394 -------- rlberry/envs/classic_control/mountain_car.py | 202 ----- rlberry/envs/classic_control/pendulum.py | 132 --- rlberry/envs/finite/__init__.py | 3 - rlberry/envs/finite/chain.py | 132 --- rlberry/envs/finite/gridworld.py | 490 ---------- rlberry/envs/finite/gridworld_utils.py | 70 -- rlberry/envs/{finite => }/finite_mdp.py | 0 rlberry/envs/tests/test_bandits.py | 61 -- rlberry/envs/tests/test_env_seeding.py | 14 +- rlberry/envs/tests/test_gym_make.py | 4 +- rlberry/envs/tests/test_instantiation.py | 252 ------ rlberry/envs/tests/test_spring_env.py | 104 --- rlberry/experiment/tests/room.yaml | 2 +- rlberry/experiment/tests/rsucbvi.yaml | 2 +- .../tests/test_experiment_generator.py | 2 +- rlberry/experiment/yaml_utils.py | 4 +- rlberry/exploration_tools/__init__.py | 0 rlberry/exploration_tools/discrete_counter.py | 100 --- .../online_discretization_counter.py | 189 ---- rlberry/exploration_tools/tests/__init__.py | 0 .../tests/test_discrete_counter.py | 113 --- rlberry/exploration_tools/torch/__init__.py | 0 rlberry/exploration_tools/torch/rnd.py | 212 ----- .../exploration_tools/torch/tests/__init__.py | 0 .../exploration_tools/torch/tests/test_rnd.py | 27 - rlberry/exploration_tools/typing.py | 85 -- .../uncertainty_estimator.py | 34 - rlberry/manager/__init__.py | 5 +- rlberry/manager/experiment_manager.py | 4 +- rlberry/manager/remote_experiment_manager.py | 235 ----- rlberry/manager/tests/test_comparisons.py | 2 +- .../manager/tests/test_experiment_manager.py | 26 +- .../tests/test_experiment_manager_seeding.py | 4 +- .../manager/tests/test_hyperparam_optim.py | 7 +- rlberry/manager/tests/test_plot.py | 4 +- rlberry/network/__init__.py | 0 rlberry/network/client.py | 53 -- rlberry/network/interface.py | 103 --- rlberry/network/server.py | 174 ---- rlberry/network/server_utils.py | 118 --- rlberry/network/tests/__init__.py | 0 rlberry/network/tests/conftest.py | 43 - rlberry/network/tests/test_server.py | 91 -- rlberry/network/utils.py | 83 -- rlberry/rendering/__init__.py | 3 - rlberry/rendering/common_shapes.py | 39 - rlberry/rendering/core.py | 56 -- rlberry/rendering/opengl_render2d.py | 252 ------ rlberry/rendering/pygame_render2d.py | 197 ---- rlberry/rendering/render_interface.py | 162 ---- rlberry/rendering/tests/__init__.py | 0 .../tests/test_rendering_interface.py | 125 --- rlberry/rendering/utils.py | 73 -- rlberry/tests/test_agent_extra.py | 10 +- rlberry/tests/test_agents_base.py | 29 +- rlberry/tests/test_envs.py | 14 +- .../tests/test_rlberry_main_agents_and_env.py | 133 +++ rlberry/utils/__init__.py | 1 - rlberry/utils/check_agent.py | 12 +- rlberry/utils/check_bandit_agent.py | 62 -- rlberry/utils/io.py | 33 - rlberry/utils/tests/test_check.py | 4 +- rlberry/utils/tests/test_writer.py | 10 +- rlberry/utils/writers.py | 4 +- rlberry/wrappers/tests/old_env/old_acrobot.py | 4 +- .../wrappers/tests/old_env/old_apple_gold.py | 2 +- .../wrappers/tests/old_env/old_gridworld.py | 6 +- .../tests/old_env/old_mountain_car.py | 2 +- rlberry/wrappers/tests/old_env/old_nroom.py | 2 +- rlberry/wrappers/tests/old_env/old_pball.py | 2 +- 
.../wrappers/tests/old_env/old_pendulum.py | 4 +- .../wrappers/tests/old_env/old_six_room.py | 2 +- .../wrappers/tests/old_env/old_twinrooms.py | 4 +- rlberry/wrappers/tests/test_basewrapper.py | 2 +- .../wrappers/tests/test_common_wrappers.py | 9 +- .../wrappers/tests/test_wrapper_seeding.py | 8 +- rlberry/wrappers/tests/test_writer_utils.py | 4 +- rlberry/wrappers/vis2d.py | 4 +- scripts/fetch_contributors.py | 3 +- 224 files changed, 417 insertions(+), 18516 deletions(-) delete mode 100644 rlberry/agents/adaptiveql/__init__.py delete mode 100644 rlberry/agents/adaptiveql/adaptiveql.py delete mode 100644 rlberry/agents/adaptiveql/tree.py delete mode 100644 rlberry/agents/adaptiveql/utils.py delete mode 100644 rlberry/agents/bandits/__init__.py delete mode 100644 rlberry/agents/bandits/bandit_base.py delete mode 100644 rlberry/agents/bandits/index_agents.py delete mode 100644 rlberry/agents/bandits/indices.py delete mode 100644 rlberry/agents/bandits/priors.py delete mode 100644 rlberry/agents/bandits/randomized_agents.py delete mode 100644 rlberry/agents/bandits/tools/__init__.py delete mode 100644 rlberry/agents/bandits/tools/tracker.py delete mode 100644 rlberry/agents/bandits/ts_agents.py delete mode 100644 rlberry/agents/dynprog/__init__.py delete mode 100644 rlberry/agents/dynprog/utils.py delete mode 100644 rlberry/agents/dynprog/value_iteration.py delete mode 100644 rlberry/agents/experimental/__init__.py delete mode 100644 rlberry/agents/experimental/tests/__init__.py delete mode 100644 rlberry/agents/experimental/torch/__init__.py delete mode 100644 rlberry/agents/features/__init__.py delete mode 100644 rlberry/agents/features/feature_map.py delete mode 100644 rlberry/agents/kernel_based/__init__.py delete mode 100644 rlberry/agents/kernel_based/common.py delete mode 100644 rlberry/agents/kernel_based/kernels.py delete mode 100644 rlberry/agents/kernel_based/rs_kernel_ucbvi.py delete mode 100644 rlberry/agents/kernel_based/rs_ucbvi.py delete mode 100644 rlberry/agents/linear/__init__.py delete mode 100644 rlberry/agents/linear/lsvi_ucb.py delete mode 100644 rlberry/agents/mbqvi/__init__.py delete mode 100644 rlberry/agents/mbqvi/mbqvi.py delete mode 100644 rlberry/agents/optql/__init__.py delete mode 100644 rlberry/agents/optql/optql.py delete mode 100644 rlberry/agents/psrl/__init__.py delete mode 100644 rlberry/agents/psrl/psrl.py delete mode 100644 rlberry/agents/rlsvi/__init__.py delete mode 100644 rlberry/agents/rlsvi/rlsvi.py delete mode 100644 rlberry/agents/tabular_rl/__init__.py delete mode 100644 rlberry/agents/tabular_rl/qlearning.py delete mode 100644 rlberry/agents/tabular_rl/sarsa.py delete mode 100644 rlberry/agents/tests/test_adaptiveql.py delete mode 100644 rlberry/agents/tests/test_bandits.py delete mode 100644 rlberry/agents/tests/test_dynprog.py delete mode 100644 rlberry/agents/tests/test_kernel_based.py delete mode 100644 rlberry/agents/tests/test_lsvi_ucb.py delete mode 100644 rlberry/agents/tests/test_mbqvi.py delete mode 100644 rlberry/agents/tests/test_optql.py delete mode 100644 rlberry/agents/tests/test_psrl.py delete mode 100644 rlberry/agents/tests/test_rlsvi.py delete mode 100644 rlberry/agents/tests/test_tabular_rl.py delete mode 100644 rlberry/agents/tests/test_ucbvi.py delete mode 100644 rlberry/agents/torch/__init__.py delete mode 100644 rlberry/agents/torch/a2c/__init__.py delete mode 100644 rlberry/agents/torch/a2c/a2c.py delete mode 100644 rlberry/agents/torch/dqn/__init__.py delete mode 100644 rlberry/agents/torch/dqn/dqn.py delete mode 
100644 rlberry/agents/torch/dqn/dqn_utils.py delete mode 100644 rlberry/agents/torch/dqn/mdqn.py delete mode 100644 rlberry/agents/torch/ppo/__init__.py delete mode 100644 rlberry/agents/torch/ppo/ppo.py delete mode 100644 rlberry/agents/torch/ppo/ppo_utils.py delete mode 100644 rlberry/agents/torch/reinforce/__init__.py delete mode 100644 rlberry/agents/torch/reinforce/reinforce.py delete mode 100644 rlberry/agents/torch/sac/__init__.py delete mode 100644 rlberry/agents/torch/sac/sac.py delete mode 100644 rlberry/agents/torch/sac/sac_utils.py delete mode 100644 rlberry/agents/torch/tests/__init__.py delete mode 100644 rlberry/agents/torch/tests/test_a2c.py delete mode 100644 rlberry/agents/torch/tests/test_dqn.py delete mode 100644 rlberry/agents/torch/tests/test_factory.py delete mode 100644 rlberry/agents/torch/tests/test_mdqn.py delete mode 100644 rlberry/agents/torch/tests/test_ppo.py delete mode 100644 rlberry/agents/torch/tests/test_reinforce.py delete mode 100644 rlberry/agents/torch/tests/test_sac.py delete mode 100644 rlberry/agents/torch/tests/test_torch_atari.py delete mode 100644 rlberry/agents/torch/tests/test_torch_models.py delete mode 100644 rlberry/agents/torch/tests/test_torch_training.py delete mode 100644 rlberry/agents/torch/utils/__init__.py delete mode 100644 rlberry/agents/torch/utils/models.py delete mode 100644 rlberry/agents/torch/utils/training.py delete mode 100644 rlberry/agents/ucbvi/__init__.py delete mode 100644 rlberry/agents/ucbvi/ucbvi.py delete mode 100644 rlberry/agents/ucbvi/utils.py delete mode 100644 rlberry/agents/utils/memories.py delete mode 100644 rlberry/colab_utils/__init__.py delete mode 100644 rlberry/colab_utils/display_setup.py delete mode 100644 rlberry/envs/bandits/__init__.py delete mode 100644 rlberry/envs/bandits/bandit_base.py delete mode 100644 rlberry/envs/bandits/corrupted_bandits.py delete mode 100644 rlberry/envs/bandits/stochastic_bandits.py delete mode 100644 rlberry/envs/benchmarks/__init__.py delete mode 100644 rlberry/envs/benchmarks/ball_exploration/__init__.py delete mode 100644 rlberry/envs/benchmarks/ball_exploration/ball2d.py delete mode 100644 rlberry/envs/benchmarks/ball_exploration/pball.py delete mode 100644 rlberry/envs/benchmarks/generalization/__init__.py delete mode 100644 rlberry/envs/benchmarks/generalization/twinrooms.py delete mode 100644 rlberry/envs/benchmarks/grid_exploration/__init__.py delete mode 100644 rlberry/envs/benchmarks/grid_exploration/apple_gold.py delete mode 100644 rlberry/envs/benchmarks/grid_exploration/four_room.py delete mode 100644 rlberry/envs/benchmarks/grid_exploration/nroom.py delete mode 100644 rlberry/envs/benchmarks/grid_exploration/six_room.py delete mode 100644 rlberry/envs/bullet3/data/__init__.py delete mode 100644 rlberry/envs/bullet3/data/mjcf/pendulum.xml delete mode 100644 rlberry/envs/bullet3/data/pendulum.urdf delete mode 100644 rlberry/envs/bullet3/pybullet_envs/__init__.py delete mode 100644 rlberry/envs/bullet3/pybullet_envs/gym_pendulum_envs.py delete mode 100644 rlberry/envs/bullet3/pybullet_envs/robot_bases.py delete mode 100644 rlberry/envs/bullet3/pybullet_envs/robot_pendula.py delete mode 100644 rlberry/envs/classic_control/SpringCartPole.py delete mode 100644 rlberry/envs/classic_control/__init__.py delete mode 100644 rlberry/envs/classic_control/acrobot.py delete mode 100644 rlberry/envs/classic_control/mountain_car.py delete mode 100644 rlberry/envs/classic_control/pendulum.py delete mode 100644 rlberry/envs/finite/__init__.py delete mode 100644 
rlberry/envs/finite/chain.py delete mode 100644 rlberry/envs/finite/gridworld.py delete mode 100644 rlberry/envs/finite/gridworld_utils.py rename rlberry/envs/{finite => }/finite_mdp.py (100%) delete mode 100644 rlberry/envs/tests/test_bandits.py delete mode 100644 rlberry/envs/tests/test_instantiation.py delete mode 100644 rlberry/envs/tests/test_spring_env.py delete mode 100644 rlberry/exploration_tools/__init__.py delete mode 100644 rlberry/exploration_tools/discrete_counter.py delete mode 100644 rlberry/exploration_tools/online_discretization_counter.py delete mode 100644 rlberry/exploration_tools/tests/__init__.py delete mode 100644 rlberry/exploration_tools/tests/test_discrete_counter.py delete mode 100644 rlberry/exploration_tools/torch/__init__.py delete mode 100644 rlberry/exploration_tools/torch/rnd.py delete mode 100644 rlberry/exploration_tools/torch/tests/__init__.py delete mode 100644 rlberry/exploration_tools/torch/tests/test_rnd.py delete mode 100644 rlberry/exploration_tools/typing.py delete mode 100644 rlberry/exploration_tools/uncertainty_estimator.py delete mode 100644 rlberry/manager/remote_experiment_manager.py delete mode 100644 rlberry/network/__init__.py delete mode 100644 rlberry/network/client.py delete mode 100644 rlberry/network/interface.py delete mode 100644 rlberry/network/server.py delete mode 100644 rlberry/network/server_utils.py delete mode 100644 rlberry/network/tests/__init__.py delete mode 100644 rlberry/network/tests/conftest.py delete mode 100644 rlberry/network/tests/test_server.py delete mode 100644 rlberry/network/utils.py delete mode 100644 rlberry/rendering/__init__.py delete mode 100644 rlberry/rendering/common_shapes.py delete mode 100644 rlberry/rendering/core.py delete mode 100644 rlberry/rendering/opengl_render2d.py delete mode 100644 rlberry/rendering/pygame_render2d.py delete mode 100644 rlberry/rendering/render_interface.py delete mode 100644 rlberry/rendering/tests/__init__.py delete mode 100644 rlberry/rendering/tests/test_rendering_interface.py delete mode 100644 rlberry/rendering/utils.py create mode 100644 rlberry/tests/test_rlberry_main_agents_and_env.py delete mode 100644 rlberry/utils/check_bandit_agent.py delete mode 100644 rlberry/utils/io.py diff --git a/.gitignore b/.gitignore index 533f60754..4644ebe3d 100644 --- a/.gitignore +++ b/.gitignore @@ -167,4 +167,4 @@ dmypy.json .pydevproject -profile.prof +*.prof diff --git a/azure-pipelines.yml b/azure-pipelines.yml index d11f38b93..df4118f15 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -153,6 +153,12 @@ jobs: set -xe pip install . displayName: 'Install rlberry' + + - script: | + pip install git+https://github.com/rlberry-py/rlberry-scool.git + pip install git+https://github.com/rlberry-py/rlberry-research.git + displayName: 'Install rlberry-scool and rlberry-research' + #ignore les tests qui viennent des extras : torch, experimental, stablebaselines, optuna - script: | pip install pytest==7.0.1 pytest-azurepipelines pytest-xvfb @@ -186,6 +192,11 @@ jobs: pip install . displayName: 'Install rlberry' + - script: | + pip install git+https://github.com/rlberry-py/rlberry-scool.git + pip install git+https://github.com/rlberry-py/rlberry-research.git + displayName: 'Install rlberry-scool and rlberry-research' + - script: | pip install pytest==7.0.1 pytest-azurepipelines pytest-xvfb pytest rlberry/tests/test_agents_base.py rlberry/tests/test_envs.py @@ -215,6 +226,11 @@ jobs: pip install . 
displayName: 'Install rlberry' + - script: | + pip install git+https://github.com/rlberry-py/rlberry-scool.git + pip install git+https://github.com/rlberry-py/rlberry-research.git + displayName: 'Install rlberry-scool and rlberry-research' + - script: | pip install pytest==7.0.1 pytest-azurepipelines pytest-xvfb pytest rlberry/tests/test_agents_base.py rlberry/tests/test_envs.py diff --git a/codecov.yml b/codecov.yml index 90677b227..809ad8b6c 100644 --- a/codecov.yml +++ b/codecov.yml @@ -25,5 +25,3 @@ ignore: - "./rlberry/wrappers/tests/old_env/*.py" - "./rlberry/rendering/pygame_render2d.py" - "./rlberry/colab_utils/display_setup.py" - - "./rlberry/agents/experimental/jax/**/*.py" - - "./rlberry/network/**/*.py" diff --git a/docs/api.rst b/docs/api.rst index 1969d1658..63c7b5552 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -44,25 +44,6 @@ Base classes agents.Agent agents.AgentWithSimplePolicy -Basic Agents --------------------- - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - agents.QLAgent - agents.SARSAAgent - agents.ValueIterationAgent - agents.MBQVIAgent - agents.UCBVIAgent - agents.RSUCBVIAgent - agents.RSKernelUCBVIAgent - agents.OptQLAgent - agents.LSVIUCBAgent - agents.RLSVIAgent - agents.PSRLAgent - Agent importation tools ----------------------- @@ -74,22 +55,6 @@ Agent importation tools agents.stable_baselines.StableBaselinesAgent -Torch Agents ---------------------------- - - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - agents.torch.SACAgent - agents.torch.A2CAgent - agents.torch.PPOAgent - agents.torch.DQNAgent - agents.torch.MunchausenDQNAgent - agents.torch.REINFORCEAgent - - Environments ============ @@ -116,23 +81,6 @@ Spaces spaces.MultiBinary spaces.Dict -Benchmark Environments ----------------------- - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - envs.Acrobot - envs.benchmarks.ball_exploration.PBall2D - envs.benchmarks.generalization.twinrooms.TwinRooms - envs.benchmarks.grid_exploration.apple_gold.AppleGold - envs.benchmarks.grid_exploration.nroom.NRoom - envs.classic_control.MountainCar - envs.SpringCartPole - envs.finite.Chain - envs.finite.GridWorld - Environment tools ----------------- @@ -171,6 +119,7 @@ Manager Utilitis .. autosummary:: :toctree: generated/ :template: function.rst + manager.preset_manager @@ -208,16 +157,6 @@ Logging Utilities utils.logging.set_level -Typing ------- - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - types.Env - - Environment Wrappers ==================== @@ -230,82 +169,3 @@ Environment Wrappers wrappers.RescaleRewardWrapper wrappers.vis2d.Vis2dWrapper wrappers.WriterWrapper - - -Neural Networks -=============== - - -Torch ------- - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - agents.torch.utils.training.model_factory - utils.torch.choose_device - - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - agents.torch.utils.models.MultiLayerPerceptron - agents.torch.utils.models.ConvolutionalNetwork - agents.torch.utils.models.DuelingNetwork - agents.torch.utils.models.Table - - -Bandits -======= - -Bandit environments -------------------- - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - envs.bandits.AdversarialBandit - envs.bandits.Bandit - envs.bandits.BernoulliBandit - envs.bandits.NormalBandit - envs.bandits.CorruptedNormalBandit - -Bandit algorithms ------------------ -The bandits algorithms use mainly the following tracker tool: - -.. 
autosummary:: - :toctree: generated/ - :template: class.rst - - agents.bandits.tools.BanditTracker - -Some general class of bandit algorithms are provided. - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - agents.bandits.BanditWithSimplePolicy - agents.bandits.IndexAgent - agents.bandits.RandomizedAgent - agents.bandits.TSAgent - -A number of indices are provided to use in bandits algorithms: - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - agents.bandits.makeBoundedIMEDIndex - agents.bandits.makeBoundedMOSSIndex - agents.bandits.makeBoundedNPTSIndex - agents.bandits.makeBoundedUCBIndex - agents.bandits.makeBoundedUCBVIndex - agents.bandits.makeETCIndex - agents.bandits.makeEXP3Index - agents.bandits.makeSubgaussianMOSSIndex - agents.bandits.makeSubgaussianUCBIndex diff --git a/docs/basics/experiment_setup.rst b/docs/basics/experiment_setup.rst index e4d82e96d..a13fa600e 100644 --- a/docs/basics/experiment_setup.rst +++ b/docs/basics/experiment_setup.rst @@ -36,7 +36,7 @@ This can be done very succinctly as in the example below: .. code-block:: yaml - constructor: 'rlberry.envs.benchmarks.grid_exploration.nroom.NRoom' + constructor: 'rlberry_research.envs.benchmarks.grid_exploration.nroom.NRoom' params: reward_free: false array_observation: true @@ -46,7 +46,7 @@ This can be done very succinctly as in the example below: .. code-block:: yaml - agent_class: 'rlberry.agents.kernel_based.rs_ucbvi.RSUCBVIAgent' + agent_class: 'rlberry_research.agents.kernel_based.rs_ucbvi.RSUCBVIAgent' init_kwargs: gamma: 1.0 lp_metric: 2 diff --git a/docs/basics/multiprocess.rst b/docs/basics/multiprocess.rst index de25cae22..ae65ad2a7 100644 --- a/docs/basics/multiprocess.rst +++ b/docs/basics/multiprocess.rst @@ -29,9 +29,9 @@ The advised method of parallelization is spawn (parameter :code:`mp_context="spa .. code:: python - from rlberry.agents.torch import A2CAgent + from rlberry_research.agents.torch import A2CAgent from rlberry.manager import ExperimentManager - from rlberry.envs.benchmarks.ball_exploration import PBall2D + from rlberry_research.envs.benchmarks.ball_exploration import PBall2D n_steps = 1e5 batch_size = 256 diff --git a/docs/basics/rlberry how to.rst b/docs/basics/rlberry how to.rst index f84e79c5f..dfa443df3 100644 --- a/docs/basics/rlberry how to.rst +++ b/docs/basics/rlberry how to.rst @@ -6,7 +6,7 @@ Libraries import numpy as np import pandas as pd from rlberry.agents import ValueIterationAgent, AgentWithSimplePolicy - from rlberry.envs import GridWorld + from rlberry_research.envs import GridWorld from rlberry.manager import ExperimentManager, evaluate_agents diff --git a/docs/contributors.rst b/docs/contributors.rst index 85b1b29ac..ae2a9edd4 100644 --- a/docs/contributors.rst +++ b/docs/contributors.rst @@ -65,4 +65,8 @@

Riccardo Della Vecchia

+
+
+YannBerthelot

+
diff --git a/examples/comparison_agents.py b/examples/comparison_agents.py index 897dbb014..113be1645 100644 --- a/examples/comparison_agents.py +++ b/examples/comparison_agents.py @@ -15,9 +15,9 @@ from rlberry.manager.comparison import compare_agents from rlberry.manager import AgentManager -from rlberry.envs.bandits import BernoulliBandit +from rlberry_research.envs.bandits import BernoulliBandit from rlberry.wrappers import WriterWrapper -from rlberry.agents.bandits import ( +from rlberry_research.agents.bandits import ( IndexAgent, makeBoundedMOSSIndex, makeBoundedNPTSIndex, diff --git a/examples/demo_agents/video_plot_a2c.py b/examples/demo_agents/video_plot_a2c.py index 6e20c537f..80d7a4fe8 100644 --- a/examples/demo_agents/video_plot_a2c.py +++ b/examples/demo_agents/video_plot_a2c.py @@ -11,8 +11,8 @@ """ # sphinx_gallery_thumbnail_path = 'thumbnails/video_plot_a2c.jpg' -from rlberry.agents.torch import A2CAgent -from rlberry.envs.benchmarks.ball_exploration import PBall2D +from rlberry_research.agents.torch import A2CAgent +from rlberry_research.envs.benchmarks.ball_exploration import PBall2D from gymnasium.wrappers import TimeLimit diff --git a/examples/demo_agents/video_plot_mbqvi.py b/examples/demo_agents/video_plot_mbqvi.py index 906aec11e..d98ddcf77 100644 --- a/examples/demo_agents/video_plot_mbqvi.py +++ b/examples/demo_agents/video_plot_mbqvi.py @@ -10,8 +10,8 @@ """ # sphinx_gallery_thumbnail_path = 'thumbnails/video_plot_mbqvi.jpg' -from rlberry.agents.mbqvi import MBQVIAgent -from rlberry.envs.finite import GridWorld +from rlberry_scool.agents.mbqvi import MBQVIAgent +from rlberry_research.envs.finite import GridWorld params = {} params["n_samples"] = 100 # samples per state-action pair diff --git a/examples/demo_agents/video_plot_ppo.py b/examples/demo_agents/video_plot_ppo.py index 47e4c6629..8834c7960 100644 --- a/examples/demo_agents/video_plot_ppo.py +++ b/examples/demo_agents/video_plot_ppo.py @@ -11,8 +11,8 @@ """ # sphinx_gallery_thumbnail_path = 'thumbnails/video_plot_a2c.jpg' -from rlberry.agents.torch import PPOAgent -from rlberry.envs.benchmarks.ball_exploration import PBall2D +from rlberry_research.agents.torch import PPOAgent +from rlberry_research.envs.benchmarks.ball_exploration import PBall2D env = PBall2D() diff --git a/examples/demo_agents/video_plot_vi.py b/examples/demo_agents/video_plot_vi.py index ce84a0dbc..65f4e4b8f 100644 --- a/examples/demo_agents/video_plot_vi.py +++ b/examples/demo_agents/video_plot_vi.py @@ -11,8 +11,8 @@ """ # sphinx_gallery_thumbnail_path = 'thumbnails/video_plot_vi.jpg' -from rlberry.agents.dynprog import ValueIterationAgent -from rlberry.envs.finite import Chain +from rlberry_research.agents.dynprog import ValueIterationAgent +from rlberry_research.envs.finite import Chain env = Chain() agent = ValueIterationAgent(env, gamma=0.95) diff --git a/examples/demo_bandits/plot_TS_bandit.py b/examples/demo_bandits/plot_TS_bandit.py index 599033dbc..41de68770 100644 --- a/examples/demo_bandits/plot_TS_bandit.py +++ b/examples/demo_bandits/plot_TS_bandit.py @@ -11,8 +11,8 @@ """ import numpy as np -from rlberry.envs.bandits import BernoulliBandit, NormalBandit -from rlberry.agents.bandits import ( +from rlberry_research.envs.bandits import BernoulliBandit, NormalBandit +from rlberry_research.agents.bandits import ( IndexAgent, TSAgent, makeBoundedUCBIndex, diff --git a/examples/demo_bandits/plot_compare_index_bandits.py b/examples/demo_bandits/plot_compare_index_bandits.py index f089c5ac3..25e520aa3 100644 --- 
a/examples/demo_bandits/plot_compare_index_bandits.py +++ b/examples/demo_bandits/plot_compare_index_bandits.py @@ -8,10 +8,10 @@ """ import numpy as np import matplotlib.pyplot as plt -from rlberry.envs.bandits import BernoulliBandit +from rlberry_research.envs.bandits import BernoulliBandit from rlberry.manager import ExperimentManager, plot_writer_data from rlberry.wrappers import WriterWrapper -from rlberry.agents.bandits import ( +from rlberry_research.agents.bandits import ( IndexAgent, RandomizedAgent, makeBoundedIMEDIndex, diff --git a/examples/demo_bandits/plot_exp3_bandit.py b/examples/demo_bandits/plot_exp3_bandit.py index f4716a219..7452f85b3 100644 --- a/examples/demo_bandits/plot_exp3_bandit.py +++ b/examples/demo_bandits/plot_exp3_bandit.py @@ -8,8 +8,8 @@ """ import numpy as np -from rlberry.envs.bandits import AdversarialBandit -from rlberry.agents.bandits import ( +from rlberry_research.envs.bandits import AdversarialBandit +from rlberry_research.agents.bandits import ( RandomizedAgent, TSAgent, makeEXP3Index, diff --git a/examples/demo_bandits/plot_mirror_bandit.py b/examples/demo_bandits/plot_mirror_bandit.py index 4e9b9757d..a89602943 100644 --- a/examples/demo_bandits/plot_mirror_bandit.py +++ b/examples/demo_bandits/plot_mirror_bandit.py @@ -16,7 +16,7 @@ from rlberry.manager import ExperimentManager, read_writer_data from rlberry.envs.interface import Model -from rlberry.agents.bandits import BanditWithSimplePolicy +from rlberry_research.agents.bandits import BanditWithSimplePolicy from rlberry.wrappers import WriterWrapper import rlberry.spaces as spaces diff --git a/examples/demo_bandits/plot_ucb_bandit.py b/examples/demo_bandits/plot_ucb_bandit.py index 92b9d8ae2..43e4d1e70 100644 --- a/examples/demo_bandits/plot_ucb_bandit.py +++ b/examples/demo_bandits/plot_ucb_bandit.py @@ -7,8 +7,8 @@ """ import numpy as np -from rlberry.envs.bandits import NormalBandit -from rlberry.agents.bandits import IndexAgent, makeSubgaussianUCBIndex +from rlberry_research.envs.bandits import NormalBandit +from rlberry_research.agents.bandits import IndexAgent, makeSubgaussianUCBIndex from rlberry.manager import ExperimentManager, plot_writer_data import matplotlib.pyplot as plt from rlberry.wrappers import WriterWrapper diff --git a/examples/demo_env/video_plot_apple_gold.py b/examples/demo_env/video_plot_apple_gold.py index 74282cca4..9e6eb34c6 100644 --- a/examples/demo_env/video_plot_apple_gold.py +++ b/examples/demo_env/video_plot_apple_gold.py @@ -10,8 +10,8 @@ """ # sphinx_gallery_thumbnail_path = 'thumbnails/video_plot_apple_gold.jpg' -from rlberry.envs.benchmarks.grid_exploration.apple_gold import AppleGold -from rlberry.agents.dynprog import ValueIterationAgent +from rlberry_research.envs.benchmarks.grid_exploration.apple_gold import AppleGold +from rlberry_research.agents.dynprog import ValueIterationAgent env = AppleGold(reward_free=False, array_observation=False) diff --git a/examples/demo_env/video_plot_chain.py b/examples/demo_env/video_plot_chain.py index 6437d3988..42c2b3c8b 100644 --- a/examples/demo_env/video_plot_chain.py +++ b/examples/demo_env/video_plot_chain.py @@ -11,7 +11,7 @@ # sphinx_gallery_thumbnail_path = 'thumbnails/video_plot_chain.jpg' -from rlberry.envs.finite import Chain +from rlberry_research.envs.finite import Chain env = Chain(10, 0.1) env.enable_rendering() diff --git a/examples/demo_env/video_plot_gridworld.py b/examples/demo_env/video_plot_gridworld.py index 129e5a7e6..872b46fbb 100644 --- a/examples/demo_env/video_plot_gridworld.py +++ 
b/examples/demo_env/video_plot_gridworld.py @@ -12,8 +12,8 @@ """ # sphinx_gallery_thumbnail_path = 'thumbnails/video_plot_gridworld.jpg' -from rlberry.agents.dynprog import ValueIterationAgent -from rlberry.envs.finite import GridWorld +from rlberry_research.agents.dynprog import ValueIterationAgent +from rlberry_research.envs.finite import GridWorld env = GridWorld(7, 10, walls=((2, 2), (3, 3))) diff --git a/examples/demo_env/video_plot_pball.py b/examples/demo_env/video_plot_pball.py index af6c7c637..e9765fa5f 100644 --- a/examples/demo_env/video_plot_pball.py +++ b/examples/demo_env/video_plot_pball.py @@ -11,7 +11,7 @@ # sphinx_gallery_thumbnail_path = 'thumbnails/video_plot_pball.jpg' import numpy as np -from rlberry.envs.benchmarks.ball_exploration import PBall2D +from rlberry_research.envs.benchmarks.ball_exploration import PBall2D p = 5 A = np.array([[1.0, 0.1], [-0.1, 1.0]]) diff --git a/examples/demo_env/video_plot_rooms.py b/examples/demo_env/video_plot_rooms.py index 9cee6bf6f..5119c8957 100644 --- a/examples/demo_env/video_plot_rooms.py +++ b/examples/demo_env/video_plot_rooms.py @@ -10,8 +10,8 @@ """ # sphinx_gallery_thumbnail_path = 'thumbnails/video_plot_rooms.jpg' -from rlberry.envs.benchmarks.grid_exploration.nroom import NRoom -from rlberry.agents.dynprog import ValueIterationAgent +from rlberry_research.envs.benchmarks.grid_exploration.nroom import NRoom +from rlberry_scool.agents.dynprog import ValueIterationAgent env = NRoom( nrooms=9, diff --git a/examples/demo_env/video_plot_twinrooms.py b/examples/demo_env/video_plot_twinrooms.py index 22c36683a..f8ae6ab11 100644 --- a/examples/demo_env/video_plot_twinrooms.py +++ b/examples/demo_env/video_plot_twinrooms.py @@ -10,8 +10,8 @@ """ # sphinx_gallery_thumbnail_path = 'thumbnails/video_plot_twinrooms.jpg' -from rlberry.envs.benchmarks.generalization.twinrooms import TwinRooms -from rlberry.agents.mbqvi import MBQVIAgent +from rlberry_research.envs.benchmarks.generalization.twinrooms import TwinRooms +from rlberry_scool.agents.mbqvi import MBQVIAgent from rlberry.wrappers.discretize_state import DiscretizeStateWrapper from rlberry.seeding import Seeder diff --git a/examples/demo_experiment/room.yaml b/examples/demo_experiment/room.yaml index 3223015c2..977d239ed 100644 --- a/examples/demo_experiment/room.yaml +++ b/examples/demo_experiment/room.yaml @@ -3,7 +3,7 @@ # Demo: room.yaml # ===================== # """ -constructor: 'rlberry.envs.benchmarks.grid_exploration.nroom.NRoom' +constructor: 'rlberry_research.envs.benchmarks.grid_exploration.nroom.NRoom' params: reward_free: false array_observation: true diff --git a/examples/demo_experiment/rsucbvi.yaml b/examples/demo_experiment/rsucbvi.yaml index 25cfadebc..e4a47b69e 100644 --- a/examples/demo_experiment/rsucbvi.yaml +++ b/examples/demo_experiment/rsucbvi.yaml @@ -3,7 +3,7 @@ # Demo: rsucbvi.yaml # ===================== # """ -agent_class: 'rlberry.agents.kernel_based.rs_ucbvi.RSUCBVIAgent' +agent_class: 'rlberry_research.agents.kernel_based.rs_ucbvi.RSUCBVIAgent' init_kwargs: gamma: 1.0 lp_metric: 2 diff --git a/examples/demo_network/run_server.py b/examples/demo_network/run_server.py index c1b6a15b5..a8b9b04c7 100644 --- a/examples/demo_network/run_server.py +++ b/examples/demo_network/run_server.py @@ -3,11 +3,11 @@ Demo: run_server ===================== """ -from rlberry.network.interface import ResourceItem -from rlberry.network.server import BerryServer +from rlberry_research.network.interface import ResourceItem +from rlberry_research.network.server import 
BerryServer from rlberry.agents import ValueIterationAgent -from rlberry.agents.torch import REINFORCEAgent, A2CAgent -from rlberry.envs import GridWorld, gym_make +from rlberry_research.agents.torch import REINFORCEAgent, A2CAgent +from rlberry_research.envs import GridWorld, gym_make from rlberry.utils.writers import DefaultWriter if __name__ == "__main__": diff --git a/examples/plot_agent_manager.py b/examples/plot_agent_manager.py index 338ee417d..076ccc5fd 100644 --- a/examples/plot_agent_manager.py +++ b/examples/plot_agent_manager.py @@ -17,7 +17,7 @@ Finally, we compare with a baseline provided by a random policy using the Agent Manager class which trains, evaluates and gathers statistics about the two agents. """ -from rlberry.envs import GridWorld +from rlberry_research.envs import GridWorld # Create a grid world environment and an agent with a value iteration policy env_ctor = GridWorld diff --git a/examples/plot_kernels.py b/examples/plot_kernels.py index 84b2b2cfc..6557426e2 100644 --- a/examples/plot_kernels.py +++ b/examples/plot_kernels.py @@ -8,7 +8,7 @@ import matplotlib.pyplot as plt import numpy as np -from rlberry.agents.kernel_based.kernels import kernel_func +from rlberry_research.agents.kernel_based.kernels import kernel_func kernel_types = [ "uniform", diff --git a/examples/plot_writer_wrapper.py b/examples/plot_writer_wrapper.py index 069d4de00..63bcad943 100644 --- a/examples/plot_writer_wrapper.py +++ b/examples/plot_writer_wrapper.py @@ -23,9 +23,9 @@ import numpy as np from rlberry.wrappers import WriterWrapper -from rlberry.envs import GridWorld +from rlberry_research.envs import GridWorld from rlberry.manager import plot_writer_data, ExperimentManager -from rlberry.agents import UCBVIAgent +from rlberry_scool.agents import UCBVIAgent import matplotlib.pyplot as plt # We wrape the default writer of the agent in a WriterWrapper diff --git a/poetry.lock b/poetry.lock index 38d4e07b7..9f1db6b27 100644 --- a/poetry.lock +++ b/poetry.lock @@ -26,7 +26,7 @@ files = [ name = "ale-py" version = "0.8.1" description = "The Arcade Learning Environment (ALE) - a platform for AI research." 
-optional = true +optional = false python-versions = ">=3.7" files = [ {file = "ale_py-0.8.1-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:b2aa2f69a4169742800615970efe6914fa856e33eaf7fa9133c0e06a617a80e2"}, @@ -82,7 +82,7 @@ tz = ["python-dateutil"] name = "autorom" version = "0.4.2" description = "Automated installation of Atari ROMs for Gym/ALE-Py" -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "AutoROM-0.4.2-py3-none-any.whl", hash = "sha256:719c9d363ef08391fdb7003d70df235b68f36de628d289a946c4a59a3adefa13"}, @@ -102,7 +102,7 @@ accept-rom-license = ["AutoROM.accept-rom-license"] name = "autorom-accept-rom-license" version = "0.6.1" description = "Automated installation of Atari ROMs for Gym/ALE-Py" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "AutoROM.accept-rom-license-0.6.1.tar.gz", hash = "sha256:0c905a708d634a076f686802f672817d3585259ce3be0bde8713a4fb59e3159e"}, @@ -1343,6 +1343,16 @@ files = [ {file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win32.whl", hash = "sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f67c7038d560d92149c060157d623c542173016c4babc0c1913cca0564b9939"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9aad3c1755095ce347e26488214ef77e0485a3c34a50c5a5e2471dff60b9dd9c"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:14ff806850827afd6b07a5f32bd917fb7f45b046ba40c57abdb636674a8b559c"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8f9293864fe09b8149f0cc42ce56e3f0e54de883a9de90cd427f191c346eb2e1"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-win32.whl", hash = "sha256:715d3562f79d540f251b99ebd6d8baa547118974341db04f5ad06d5ea3eb8007"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707"}, @@ -2365,6 +2375,7 @@ files = [ {file = 
"PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, + {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -2372,8 +2383,15 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, + {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, + {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, + {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, + {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -2390,6 +2408,7 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = 
"PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, + {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -2397,6 +2416,7 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, + {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ -2441,6 +2461,46 @@ requests = ">=2.0.0" [package.extras] rsa = ["oauthlib[signedtoken] (>=3.0.0)"] +[[package]] +name = "rlberry-research" +version = "0.6.1" +description = "Algorithms and envs for research with rlberry" +optional = false +python-versions = "*" +files = [] +develop = false + +[package.dependencies] +rlberry = {git = "https://github.com/rlberry-py/rlberry"} + +[package.extras] +deploy = ["sphinx", "sphinx_rtd_theme"] +torch-agents = ["torch (>=1.6.0)"] + +[package.source] +type = "git" +url = "https://github.com/rlberry-py/rlberry-research.git" +reference = "HEAD" +resolved_reference = "973358e77d4e931361b4bb955e295b1537f5e7e9" + +[[package]] +name = "rlberry-scool" +version = "0.5.0.post29.dev0+2b871b8" +description = "Teaching Reinforcement Learning made easy" +optional = false +python-versions = "*" +files = [] +develop = false + +[package.dependencies] +rlberry = "*" + +[package.source] +type = "git" +url = "https://github.com/rlberry-py/rlberry-scool.git" +reference = "HEAD" +resolved_reference = "b534a999289909c6c1b589658a71d22490452de7" + [[package]] name = "rsa" version = "4.9" @@ -2538,7 +2598,7 @@ testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jar name = "shimmy" version = "0.2.1" description = "API for converting popular non-gymnasium environments to a gymnasium compatible environment." 
-optional = true +optional = false python-versions = ">=3.7" files = [ {file = "Shimmy-0.2.1-py3-none-any.whl", hash = "sha256:2d7d21c4ca679a64bb452e6a4232c6b0f5dba7589f5420454ddc1f0634334334"}, @@ -3177,10 +3237,10 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [extras] doc = ["matplotlib", "myst-parser", "numpydoc", "sphinx", "sphinx-gallery", "sphinx-math-dollar", "sphinxcontrib-video"] -extras = ["ffmpeg-python", "optuna", "pyopengl", "pyvirtualdisplay"] -torch = ["Gymnasium", "ale-py", "gymnasium", "opencv-python", "stable-baselines3", "tensorboard", "torch"] +extras = ["ffmpeg-python", "numba", "optuna", "pyopengl", "pyvirtualdisplay"] +torch = ["ale-py", "opencv-python", "stable-baselines3", "tensorboard", "torch"] [metadata] lock-version = "2.0" python-versions = "^3.9, <3.13" -content-hash = "bb56a4ca54235fed2aa5c567a5921f6d27bbf6b8d3da8a2feb5c6656c735c875" +content-hash = "5edef0e50e0c75c099db79ac5e569d004998a988495ef3bc14f3fd9c6b426faa" diff --git a/pyproject.toml b/pyproject.toml index e70cfc756..660aa4270 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,6 +62,8 @@ pytest-xprocess = "^0.23.0" codecov = "^2.1.13" black = "23.9.1" pre-commit = "^3.5.0" +rlberry-research = {git = "https://github.com/rlberry-py/rlberry-research.git"} +rlberry-scool = {git = "https://github.com/rlberry-py/rlberry-scool.git"} [build-system] requires = ["poetry-core"] diff --git a/rlberry/agents/__init__.py b/rlberry/agents/__init__.py index 60fd5b8a4..49c0ff952 100644 --- a/rlberry/agents/__init__.py +++ b/rlberry/agents/__init__.py @@ -2,16 +2,3 @@ from .agent import Agent from .agent import AgentWithSimplePolicy from .agent import AgentTorch - -# Basic agents (in alphabetical order) -# basic = does not require torch, jax, etc... -from .adaptiveql import AdaptiveQLAgent -from .dynprog import ValueIterationAgent -from .kernel_based import RSUCBVIAgent, RSKernelUCBVIAgent -from .linear import LSVIUCBAgent -from .mbqvi import MBQVIAgent -from .optql import OptQLAgent -from .psrl import PSRLAgent -from .rlsvi import RLSVIAgent -from .ucbvi import UCBVIAgent -from .tabular_rl import QLAgent, SARSAAgent diff --git a/rlberry/agents/adaptiveql/__init__.py b/rlberry/agents/adaptiveql/__init__.py deleted file mode 100644 index b0498beee..000000000 --- a/rlberry/agents/adaptiveql/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .adaptiveql import AdaptiveQLAgent diff --git a/rlberry/agents/adaptiveql/adaptiveql.py b/rlberry/agents/adaptiveql/adaptiveql.py deleted file mode 100644 index 667ed54e0..000000000 --- a/rlberry/agents/adaptiveql/adaptiveql.py +++ /dev/null @@ -1,195 +0,0 @@ -import gymnasium.spaces as spaces -import numpy as np -from rlberry.agents import AgentWithSimplePolicy -from rlberry.agents.adaptiveql.tree import MDPTreePartition - -import rlberry - -logger = rlberry.logger - - -class AdaptiveQLAgent(AgentWithSimplePolicy): - """ - Adaptive Q-Learning algorithm [1]_ implemented for enviroments - with continuous (Box) states and **discrete actions**. - - .. todo:: Handle continuous actios too. - - Parameters - ---------- - env : gym.Env - Environment with continuous states and discrete actions. - gamma : double, default: 1.0 - Discount factor in [0, 1]. - horizon : int, default: 50 - Horizon of the objective function. - bonus_scale_factor : double, default: 1.0 - Constant by which to multiply the exploration bonus, controls - the level of exploration. - bonus_type : string, default: "simplified_bernstein" - Type of exploration bonus. 
Currently, only "simplified_bernstein" - is implemented. - - Attributes - ---------- - gamma : double, default: 1.0 - Discount factor in [0, 1]. - horizon : int, default: 50 - Horizon of the objective function. - bonus_scale_factor : double, default: 1.0 - Constant by which to multiply the exploration bonus, controls - the level of exploration. - bonus_type : string, default: "simplified_bernstein" - Type of exploration bonus. Currently, only "simplified_bernstein" - is implemented. - v_max : ndarray - Array of the maximum state value as a function of the Horizon. - Qtree : MDPTreePartition - Tree structure to represent the MDP model of transition. - episode : int - Number of episodes done during training of the adaptiveql agent. - - References - ---------- - .. [1] Sinclair, Sean R., Siddhartha Banerjee, and Christina Lee Yu. - "Adaptive Discretization for Episodic Reinforcement Learning in Metric Spaces." - Proceedings of the ACM on Measurement and Analysis of Computing Systems 3.3 (2019): 1-44. - - Notes - ------ - Uses the metric induced by the l-infinity norm. - """ - - name = "AdaptiveQLearning" - - def __init__( - self, - env, - gamma=1.0, - horizon=50, - bonus_scale_factor=1.0, - bonus_type="simplified_bernstein", - **kwargs - ): - AgentWithSimplePolicy.__init__(self, env, **kwargs) - - assert isinstance(self.env.observation_space, spaces.Box) - assert isinstance(self.env.action_space, spaces.Discrete) - - self.gamma = gamma - self.horizon = horizon - self.bonus_scale_factor = bonus_scale_factor - self.bonus_type = bonus_type - - # maximum value - r_range = self.env.reward_range[1] - self.env.reward_range[0] - if r_range == np.inf or r_range == 0.0: - logger.warning( - "{}: Reward range is zero or infinity. ".format(self.name) - + "Setting it to 1." 
- ) - r_range = 1.0 - - self.v_max = np.zeros(self.horizon) - self.v_max[-1] = r_range - for hh in reversed(range(self.horizon - 1)): - self.v_max[hh] = r_range + self.gamma * self.v_max[hh + 1] - - self.reset() - - def reset(self): - self.Qtree = MDPTreePartition( - self.env.observation_space, self.env.action_space, self.horizon - ) - - # info - self.episode = 0 - - def policy(self, observation): - action, _ = self.Qtree.get_argmax_and_node(observation, 0) - return action - - def _get_action_and_node(self, observation, hh): - action, node = self.Qtree.get_argmax_and_node(observation, hh) - return action, node - - def _update(self, node, state, action, next_state, reward, hh): - # split node if necessary - node_to_check = self.Qtree.update_counts(state, action, hh) - if node_to_check.n_visits >= (self.Qtree.dmax / node_to_check.radius) ** 2.0: - node_to_check.split() - assert id(node_to_check) == id(node) - - tt = node.n_visits # number of visits to the selected state-action node - - # value at next_state - value_next_state = 0 - if hh < self.horizon - 1: - value_next_state = min( - self.v_max[hh + 1], - self.Qtree.get_argmax_and_node(next_state, hh + 1)[1].qvalue, - ) - - # learning rate - alpha = (self.horizon + 1.0) / (self.horizon + tt) - - bonus = self._compute_bonus(tt, hh) - target = reward + bonus + self.gamma * value_next_state - - # update Q - node.qvalue = (1 - alpha) * node.qvalue + alpha * target - - def _compute_bonus(self, n, hh): - if self.bonus_type == "simplified_bernstein": - bonus = self.bonus_scale_factor * np.sqrt(1.0 / n) + self.v_max[hh] / n - bonus = min(bonus, self.v_max[hh]) - return bonus - else: - raise ValueError( - "Error: bonus type {} not implemented".format(self.bonus_type) - ) - - def _run_episode(self): - # interact for H steps - episode_rewards = 0 - observation, info = self.env.reset() - for hh in range(self.horizon): - action, node = self._get_action_and_node(observation, hh) - next_observation, reward, terminated, truncated, info = self.env.step( - action - ) - done = terminated or truncated - episode_rewards += reward - - self._update(node, observation, action, next_observation, reward, hh) - - observation = next_observation - if done: - break - - # update info - self.episode += 1 - - # writer - if self.writer is not None: - self.writer.add_scalar("episode_rewards", episode_rewards, self.episode) - - # return sum of rewards collected in the episode - return episode_rewards - - def fit(self, budget: int, **kwargs): - """ - Train the agent using the provided environment. - - Parameters - ---------- - budget: int - number of episodes. Each episode runs for self.horizon unless it - enconters a terminal state in which case it stops early. - """ - del kwargs - n_episodes_to_run = budget - count = 0 - while count < n_episodes_to_run: - self._run_episode() - count += 1 diff --git a/rlberry/agents/adaptiveql/tree.py b/rlberry/agents/adaptiveql/tree.py deleted file mode 100644 index 4aaeb7948..000000000 --- a/rlberry/agents/adaptiveql/tree.py +++ /dev/null @@ -1,219 +0,0 @@ -import gymnasium.spaces as spaces -import numpy as np -import matplotlib.pyplot as plt -from rlberry.agents.adaptiveql.utils import bounds_contains, split_bounds - - -class TreeNode: - """ - Node representing an l-infinity ball in R^d, that points - to sub-balls (node children). - Stores a value, a number of visits, and (possibly) rewards and transition probability - to a list of other nodes. 
- - This class is used to represent (and store data about) - a tuple (state, action, stage) = (x, a, h). - - Parameters - ---------- - bounds : numpy.ndarray - Bounds of each dimension [ [x0, y0], [x1, y1], ..., [xd, yd] ], - representing the cartesian product in R^d: - [x0, y0] X [x1, y1] X ... X [xd, yd] - depth: int - Node depth, root is at depth 0. - qvalue : double, default: 0 - Initial node Q value - n_visits : int, default = 0 - Number of visits to the node. - - """ - - def __init__(self, bounds, depth, qvalue=0.0, n_visits=0): - self.dim = len(bounds) - - self.radius = (bounds[:, 1] - bounds[:, 0]).max() / 2.0 - assert self.radius > 0.0 - - self.bounds = bounds - self.depth = depth - self.qvalue = qvalue - self.n_visits = n_visits - self.children = [] - - # - # For AdaMB - # - - # Value V, initialized as Q - self.vvalue = qvalue - # Reward estimate - self.reward_est = 0.0 - # Dictionary node_id -> transition_prob - # node_id = id(node), where id() is a built-in python function - self.transition_probs = {} - # Dictionary node_id -> node - self.transition_nodes = {} - - def is_leaf(self): - return len(self.children) == 0 - - def contains(self, x): - """Check if `x` is contained in the node/ball.""" - return bounds_contains(self.bounds, x) - - def split(self): - """Spawn children nodes by splitting the ball.""" - child_bounds = split_bounds(self.bounds) - for bounds in child_bounds: - self.children.append( - TreeNode(bounds, self.depth + 1, self.qvalue, self.n_visits) - ) - - -class TreePartition: - """ - Tree-based partition of an l-infinity ball in R^d. - - Each node is of type TreeNode. - - Parameters - ---------- - space: gym.spaces.Box - Domain of the function. - initial_value: double - Value to initialize the root node. - """ - - def __init__(self, space, initial_value=0.0): - assert isinstance(space, spaces.Box) - assert space.is_bounded() - - bounds = np.vstack((space.low, space.high)).T - self.root = TreeNode(bounds, depth=0, qvalue=initial_value) - self.dim = bounds.shape[0] - self.dmax = self.root.radius - - def traverse(self, x, update=False): - """ - Returns leaf node containing x. - - If `update=true`, increments number of visits of each - node in the path. - - Parameters - ---------- - x : numpy.ndarray - Array of shape (d,) - """ - node = self.root - - # traverse the tree until leaf - while True: - if update: - node.n_visits += 1 - if node.is_leaf(): - break - for cc in node.children: - if cc.contains(x): - node = cc - break - - # return value at leaf - return node - - def plot( - self, - fignum="tree plot", - colormap_name="cool", - max_value=10, - node=None, - root=True, - ): - """ - Visualize the function (2d domain only). - Shows the hierarchical partition. - """ - if root: - assert ( - self.dim == 2 - ), "TreePartition plot only available for 2-dimensional spaces." - node = self.root - plt.figure(fignum) - - # draw region corresponding to the leaf - if node.is_leaf(): - x0, x1 = node.bounds[0, :] - y0, y1 = node.bounds[1, :] - - colormap_fn = plt.get_cmap(colormap_name) - color = colormap_fn(node.qvalue / max_value) - rectangle = plt.Rectangle( - (x0, y0), x1 - x0, y1 - y0, ec="black", color=color - ) - plt.gca().add_patch(rectangle) - plt.axis("scaled") - - else: - for cc in node.children: - self.plot( - max_value=max_value, - colormap_name=colormap_name, - node=cc, - root=False, - ) - - -class MDPTreePartition: - """ - Set of H x A TreePartition instances. - - Used to store/manipulate a Q function, a reward function and a transition model. 
- """ - - def __init__(self, observation_space, action_space, horizon): - self.horizon = horizon - self.n_actions = action_space.n - self.trees = [] - for hh in range(horizon): - self.trees.append({}) - for aa in range(self.n_actions): - self.trees[hh][aa] = TreePartition( - observation_space, initial_value=horizon - hh - ) - - self.dmax = self.trees[0][0].dmax - - def get_argmax_and_node(self, x, hh): - """ - Returns a* = argmax_a Q_h(x, a) and the node corresponding to (x, a*). - """ - # trees for each action at hh - trees_hh = self.trees[hh] - - best_action = 0 - best_node = trees_hh[0].traverse(x, update=False) - best_val = best_node.qvalue - for aa in range(1, self.n_actions): - node = trees_hh[aa].traverse(x, update=False) - val = node.qvalue - if val > best_val: - best_val = val - best_action = aa - best_node = node - - return best_action, best_node - - def update_counts(self, x, aa, hh): - """ - Increment counters associated to (x, aa, hh) and returns the node. - """ - tree = self.trees[hh][aa] - node = tree.traverse(x, update=True) - return node - - def plot(self, a, h): - """ - Visualize Q_h(x, a) - """ - self.trees[h][a].plot(max_value=self.horizon - h) diff --git a/rlberry/agents/adaptiveql/utils.py b/rlberry/agents/adaptiveql/utils.py deleted file mode 100644 index f04f96c3c..000000000 --- a/rlberry/agents/adaptiveql/utils.py +++ /dev/null @@ -1,61 +0,0 @@ -from rlberry.utils.jit_setup import numba_jit - - -@numba_jit -def bounds_contains(bounds, x): - """ - Returns True if `x` is contained in the bounds, and False otherwise. - - Parameters - ---------- - bounds : numpy.ndarray - Array of shape (d, 2). - Bounds of each dimension [ [x0, y0], [x1, y1], ..., [xd, yd] ], - representing the following cartesian product in R^d: - [x0, y0] X [x1, y1] X ... X [xd, yd]. - x : numpy.ndarray - Array of shape (d,) - """ - dim = x.shape[0] - for dd in range(dim): - if x[dd] < bounds[dd, 0] or x[dd] > bounds[dd, 1]: - return False - return True - - -def split_bounds(bounds, dim=0): - """ - Split an array representing an l-infinity ball in R^d in R^d - into a list of 2^d arrays representing the ball split. - - Parameters - ---------- - bounds : numpy.ndarray - Array of shape (d, 2). - Bounds of each dimension [ [x0, y0], [x1, y1], ..., [xd, yd] ], - representing the cartesian product in R^d: - [x0, y0] X [x1, y1] X ... X [xd, yd]. - - dim : int, default: 0 - Dimension from which to start splitting. - - Returns - ------- - List of arrays of shape (d, 2) containing the bounds to be split. 
- """ - if dim == bounds.shape[0]: - return [bounds] - left = bounds[dim, 0] - right = bounds[dim, 1] - middle = (left + right) / 2.0 - - left_interval = bounds.copy() - right_interval = bounds.copy() - - left_interval[dim, 0] = left - left_interval[dim, 1] = middle - - right_interval[dim, 0] = middle - right_interval[dim, 1] = right - - return split_bounds(left_interval, dim + 1) + split_bounds(right_interval, dim + 1) diff --git a/rlberry/agents/bandits/__init__.py b/rlberry/agents/bandits/__init__.py deleted file mode 100644 index b35c171cf..000000000 --- a/rlberry/agents/bandits/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -from .bandit_base import BanditWithSimplePolicy -from .index_agents import IndexAgent -from .indices import ( - makeBoundedIMEDIndex, - makeBoundedMOSSIndex, - makeBoundedNPTSIndex, - makeBoundedUCBIndex, - makeBoundedUCBVIndex, - makeETCIndex, - makeEXP3Index, - makeSubgaussianMOSSIndex, - makeSubgaussianUCBIndex, -) -from .priors import ( - makeBetaPrior, - makeGaussianPrior, -) -from .randomized_agents import RandomizedAgent -from .ts_agents import TSAgent diff --git a/rlberry/agents/bandits/bandit_base.py b/rlberry/agents/bandits/bandit_base.py deleted file mode 100644 index 2558b30a2..000000000 --- a/rlberry/agents/bandits/bandit_base.py +++ /dev/null @@ -1,123 +0,0 @@ -import numpy as np -from rlberry.agents import AgentWithSimplePolicy -from .tools import BanditTracker -import pickle - -from pathlib import Path - -import rlberry - -logger = rlberry.logger - - -class BanditWithSimplePolicy(AgentWithSimplePolicy): - """ - Base class for bandits algorithms. - - The fit function must result in self.optimal_action being set for the save - and load functions to work. - - Parameters - ----------- - env: rlberry bandit environment - See :class:`~rlberry.envs.bandits.Bandit`. - - tracker_params: dict - Parameters for the tracker object, typically to decide what to store. - - """ - - name = "" - - def __init__(self, env, tracker_params={}, **kwargs): - AgentWithSimplePolicy.__init__(self, env, **kwargs) - self.n_arms = self.env.action_space.n - self.arms = np.arange(self.n_arms) - self.tracker = BanditTracker(self, tracker_params) - - @property - def total_time(self): - return self.tracker.t - - def fit(self, budget=None, **kwargs): - """ - Example fit function. Should be overwritten by your own implementation. - - Parameters - ---------- - budget: int - Total number of iterations, also called horizon. - """ - horizon = budget - rewards = np.zeros(horizon) - - for ep in range(horizon): - # choose the optimal action - # for demo purpose, we will always choose action 0 - action = 0 - _, reward, _, _, _ = self.env.step(action) - self.tracker.update(action, reward) - rewards[ep] = reward - - self.optimal_action = 0 - info = {"episode_reward": np.sum(rewards)} - return info - - def policy(self, observation): - return self.optimal_action - - def save(self, filename): - """ - Save agent object. - - Parameters - ---------- - filename: Path or str - File in which to save the Agent. - - Returns - ------- - If save() is successful, a Path object corresponding to the filename is returned. - Otherwise, None is returned. - Important: the returned filename might differ from the input filename: For instance, - the method can append the correct suffix to the name before saving. 
- - """ - - dico = { - "_writer": self.writer, - "seeder": self.seeder, - "_execution_metadata": self._execution_metadata, - "_unique_id": self._unique_id, - "_output_dir": self._output_dir, - "optimal_action": self.optimal_action, - } - - # save - filename = Path(filename).with_suffix(".pickle") - filename.parent.mkdir(parents=True, exist_ok=True) - with filename.open("wb") as ff: - pickle.dump(dico, ff) - - return filename - - @classmethod - def load(cls, filename, **kwargs): - """Load agent object. - - If overridden, save() method must also be overriden. - - Parameters - ---------- - **kwargs: dict - Arguments to required by the __init__ method of the Agent subclass. - """ - filename = Path(filename).with_suffix(".pickle") - - obj = cls(**kwargs) - with filename.open("rb") as ff: - tmp_dict = pickle.load(ff) - - obj.__dict__.update(tmp_dict) - - return obj diff --git a/rlberry/agents/bandits/index_agents.py b/rlberry/agents/bandits/index_agents.py deleted file mode 100644 index c7335aef7..000000000 --- a/rlberry/agents/bandits/index_agents.py +++ /dev/null @@ -1,101 +0,0 @@ -import numpy as np -from rlberry.agents.bandits import BanditWithSimplePolicy - - -import rlberry - -logger = rlberry.logger - -# TODO : fix bug when doing several fit, the fit do not resume. Should define -# self.rewards and self.action and resume training. - - -class IndexAgent(BanditWithSimplePolicy): - """ - Agent for bandit environment using Index-based policy like UCB. - - Parameters - ----------- - env : rlberry bandit environment - See :class:`~rlberry.envs.bandits.Bandit`. - - index_function : callable or None, default = None - Compute the index for an arm using the past rewards on this arm and - the current time t. If None, use UCB bound for Bernoulli. - - **kwargs: arguments - Arguments to be passed to :class:`~rlberry.agents.bandit.BanditWithSimplePolicy`. - In particular, one may want to pass the following parameters: - tracker_params: dict - Parameters for the tracker object, typically to decide what to store. - in particular may contain a function "update", used to define additional statistics - that have to be saved in the tracker. See :class:~rlberry.agents.bandit.BanditTracker`. - - Examples - -------- - >>> from rlberry.agents.bandits import IndexAgent - >>> import numpy as np - >>> class UCBAgent(IndexAgent): - >>> name = "UCB" - >>> def __init__(self, env, **kwargs): - >>> def index(tr): - >>> return [ - >>> tr.mu_hat(arm) - >>> + np.sqrt( - >>> np.log(tr.t ** 2) - >>> / (2 * tr.n_pulls(arm)) - >>> ) - >>> for arm in tr.arms - >>> ] - >>> IndexAgent.__init__(self, env, index, **kwargs) - - """ - - name = "IndexAgent" - - def __init__(self, env, index_function=None, **kwargs): - BanditWithSimplePolicy.__init__(self, env, **kwargs) - if index_function is None: - - def index_function(tr): - return [ - tr.mu_hat(arm) + np.sqrt(np.log(tr.t**2) / (2 * tr.n_pulls(arm))) - for arm in tr.arms - ] - - self.index_function = index_function - - def fit(self, budget=None, **kwargs): - """ - Train the bandit using the provided environment. - - Parameters - ---------- - budget: int - Total number of iterations, also called horizon. 
- """ - horizon = budget - total_reward = 0.0 - indices = np.inf * np.ones(self.n_arms) - - for ep in range(horizon): - # Warmup: play every arm one before starting computing indices - if ep < self.n_arms: - action = ep - else: - # Compute index for each arm and play the highest one - indices = self.index_function(self.tracker) - action = np.argmax(indices) - - _, reward, _, _, _ = self.env.step(action) - - # Feed the played action and the resulting reward to the tracker - self.tracker.update(action, reward) - - total_reward += reward - - # Best action in hinsight is the one with highest index - self.optimal_action = np.argmax(indices) - - info = {"episode_reward": total_reward} - return info diff --git a/rlberry/agents/bandits/indices.py b/rlberry/agents/bandits/indices.py deleted file mode 100644 index ebea3ac3f..000000000 --- a/rlberry/agents/bandits/indices.py +++ /dev/null @@ -1,421 +0,0 @@ -import numpy as np -from typing import Callable - - -def makeETCIndex(A: int = 2, m: int = 1): - """ - Explore-Then-Commit index, see Chapter 6 in [1]. - - Parameters - ---------- - A: int - Number of arms. - - m : int, default: 1 - Number of exploration pulls per arm. - - Return - ------ - Callable - ETC index. - - Dict - Extra parameters for the BanditTracker object. - By default the tracker stores the number of pulls and the - empirical average reward for each arm. If you want it to store - all rewards for instance, return {'store_rewards': True}. - - References - ---------- - .. [1] Lattimore, Tor, and Csaba Szepesvári. Bandit algorithms. - Cambridge University Press, 2020. - """ - - def index(tr): - return [-tr.n_pulls(arm) if tr.t < m * A else tr.mu_hat(arm) for arm in tr.arms] - - return index, {} - - -def makeSubgaussianUCBIndex( - sigma: float = 1.0, - delta: Callable = lambda t: 1 / (1 + (t + 1) * np.log(t + 1) ** 2), -): - """ - UCB index for sub-Gaussian distributions, see Chapters 7 & 8 in [1]. - - Parameters - ---------- - sigma : float, default: 1.0 - Sub-Gaussian parameter. - - delta: Callable, - Confidence level. Default is tuned to have asymptotically optimal - regret, see Chapter 8 in [1]. - - Return - ------ - Callable - UCB index for sigma-sub-Gaussian distributions. - - Dict - Extra parameters for the BanditTracker object. - By default the tracker stores the number of pulls and the - empirical average reward for each arm. If you want it to store - all rewards for instance, return {'store_rewards': True}. - - References - ---------- - .. [1] Lattimore, Tor, and Csaba Szepesvári. Bandit algorithms. - Cambridge University Press, 2020. - """ - - def index(tr): - return [ - tr.mu_hat(arm) - + sigma * np.sqrt(2 * np.log(1 / delta(tr.t)) / tr.n_pulls(arm)) - for arm in tr.arms - ] - - return index, {} - - -def makeBoundedUCBIndex( - lower_bound: float = 0.0, - upper_bound: float = 1.0, - delta: Callable = lambda t: 1 / (1 + (t + 1) * np.log(t + 1) ** 2), -): - """ - UCB index for bounded distributions, see Chapters 7 & 8 in [1]. - By Hoeffding's lemma, such distributions are sigma-sub-Gaussian with - sigma = (upper_bound - lower_bound) / 2. - - Parameters - ---------- - lower_bound: float, default: 0.0 - Lower bound on the rewards. - - upper_bound: float, default: 1.0 - Upper bound on the rewards. - - delta: Callable, - Confidence level. Default is tuned to have asymptotically optimal - regret, see Chapter 8 in [1]. - - Return - ------ - Callable - UCB index for bounded distributions. - - Dict - Extra parameters for the BanditTracker object. 
- By default the tracker stores the number of pulls and the - empirical average reward for each arm. If you want it to store - all rewards for instance, return {'store_rewards': True}. - - References - ---------- - .. [1] Lattimore, Tor, and Csaba Szepesvári. Bandit algorithms. - Cambridge University Press, 2020. - """ - return makeSubgaussianUCBIndex((upper_bound - lower_bound) / 2, delta) - - -def makeSubgaussianMOSSIndex(T: int = 1, A: int = 2, sigma: float = 1.0): - """ - MOSS index for sub-Gaussian distributions, see Chapters 9 in [1]. - - Parameters - ---------- - T: int - Time horizon. - - A: int - Number of arms. - - sigma : float, default: 1.0 - Sub-Gaussian parameter. - - Return - ------ - Callable - MOSS index for sigma-sub-Gaussian distributions. - - Dict - Extra parameters for the BanditTracker object. - By default the tracker stores the number of pulls and the - empirical average reward for each arm. If you want it to store - all rewards for instance, return {'store_rewards': True}. - - References - ---------- - .. [1] Lattimore, Tor, and Csaba Szepesvári. Bandit algorithms. - Cambridge University Press, 2020. - """ - - def index(tr): - return [ - tr.mu_hat(arm) - + sigma - * np.sqrt( - 4 / tr.n_pulls(arm) * np.maximum(0, np.log(T / (A * tr.n_pulls(arm)))) - ) - for arm in tr.arms - ] - - return index, {} - - -def makeBoundedMOSSIndex( - T: float = 1, A: float = 2, lower_bound: float = 0.0, upper_bound: float = 1.0 -): - """ - MOSS index for bounded distributions, see Chapters 9 in [1]. - By Hoeffding's lemma, such distributions are sigma-sub-Gaussian with - sigma = (upper_bound - lower_bound) / 2. - - Parameters - ---------- - T: int - Time horizon. - - A: int - Number of arms. - - lower_bound: float, default: 0.0 - Lower bound on the rewards. - - upper_bound: float, default: 1.0 - Upper bound on the rewards. - - Return - ------ - Callable - MOSS index for bounded distributions. - - Dict - Extra parameters for the BanditTracker object. - By default the tracker stores the number of pulls and the - empirical average reward for each arm. If you want it to store - all rewards for instance, return {'store_rewards': True}. - - References - ---------- - .. [1] Lattimore, Tor, and Csaba Szepesvári. Bandit algorithms. - Cambridge University Press, 2020. - """ - return makeSubgaussianMOSSIndex(T, A, (upper_bound - lower_bound) / 2) - - -def makeEXP3Index(): - """ - EXP3 index for distributions in [0, 1], see Chapters 11 in [1] and [2]. - - Return - ------ - Callable - EXP3 index for [0, 1] distributions. - - Dict - Extra parameters for the BanditTracker object. - By default the tracker stores the number of pulls and the - empirical average reward for each arm. If you want it to store - all rewards for instance, return {'store_rewards': True}. - - References - ---------- - .. [1] Lattimore, Tor, and Csaba Szepesvári. Bandit algorithms. - Cambridge University Press, 2020. - - .. [2] Seldin, Yevgeny, et al. Evaluation and analysis of the - performance of the EXP3 algorithm in stochastic environments. - European Workshop on Reinforcement Learning. PMLR, 2013. 
- """ - - def prob(tr): - w = np.zeros(tr.n_arms) - for arm in tr.arms: - eta = np.minimum( - np.sqrt(np.log(tr.n_arms) / (tr.n_arms * (tr.t + 1))), - 1 / tr.n_arms, - ) - w[arm] = np.exp(eta * tr.iw_total_reward(arm)) - w /= w.sum() - return (1 - tr.n_arms * eta) * w + eta * np.ones(tr.n_arms) - - return prob, {"do_iwr": True} - - -def makeBoundedIMEDIndex(upper_bound: float = 1.0): - """ - IMED index for semi-bounded distributions, see [1]. - - Parameters - ---------- - upper_bound: float, default: 1.0 - Upper bound on the rewards. - - Return - ------ - Callable - IMED index for sigma-sub-Gaussian distributions. - - Dict - Extra parameters for the BanditTracker object. - By default the tracker stores the number of pulls and the - empirical average reward for each arm. If you want it to store - all rewards for instance, return {'store_rewards': True}. - - References - ---------- - .. [1] Honda, Junya, and Akimichi Takemura. Non-asymptotic analysis of - a new bandit algorithm for semi-bounded rewards. - J. Mach. Learn. Res. 16 (2015): 3721-3756. - """ - from scipy.optimize import minimize_scalar - - def index(tr): - mu_hat_star = np.max([tr.mu_hat(arm) for arm in tr.arms]) - indices = np.zeros(tr.n_arms) - for arm in tr.arms: - X = np.array(tr.rewards(arm)) - - def dual(u): - return -np.mean(np.log(1 - (X - mu_hat_star) * u)) - - eps = 1e-12 - ret = minimize_scalar( - dual, - method="bounded", - bounds=(eps, 1.0 / (upper_bound - mu_hat_star + eps)), - ) - if ret.success: - kinf = -ret.fun - else: - # if not successful, just make this arm ineligible this turn - kinf = np.inf - - indices[arm] = -kinf * len(X) - np.log(len(X)) - return indices - - return index, {"store_rewards": True} - - -def makeBoundedNPTSIndex(upper_bound: float = 1.0): - """ - NPTS index for bounded distributions, see [1]. - - Parameters - ---------- - upper_bound: float, default: 1.0 - Upper bound on the rewards. - - - Return - ------ - Callable - NPTS index for bounded distributions. - - Dict - Extra parameters for the BanditTracker object. - By default the tracker stores the number of pulls and the - empirical average reward for each arm. If you want it to store - all rewards for instance, return {'store_rewards': True}. - - References - ---------- - .. [1] Riou, Charles, and Junya Honda. Bandit algorithms based on - thompson sampling for bounded reward distributions. - Algorithmic Learning Theory. PMLR, 2020. - - """ - - def index(tr): - indices = np.zeros(tr.n_arms) - for arm in tr.arms: - X = np.array(tr.rewards(arm)) - w = tr.rng.dirichlet(np.ones(len(X) + 1)) - indices[arm] = w[:-1] @ X + upper_bound * w[-1] - return indices - - return index, {"store_rewards": True} - - -def makeBoundedUCBVIndex( - upper_bound: float = 1.0, - c: float = 0.34, - zeta: float = 1.0, - delta: Callable = lambda t: 1 / t, -): - """ - UCBV index for bounded distributions, see [1]. In particular, the index - recommended on p10 is implemented. - The empirical variance is computed sequentially using Welford's algorithm. - Parameters - ---------- - upper_bound: float, default: 1.0 - Upper bound on the rewards. - - c: float, default: 0.34 - Parameter in UCBV algorithm. See Equation (18) in [1] - - zeta: float, default: 1.0 - Parameter in UCBV algorithm. See Equation (18) in [1] - - delta: Callable, - Confidence level. See [1]. - - Return - ------ - Callable - UCBV index for bounded distributions. - - Dict - Extra parameters for the BanditTracker object. 
- By default the tracker stores the number of pulls and the - empirical average reward for each arm. If you want it to store - all rewards for instance, return {'store_rewards': True}. - - References - ---------- - .. [1] Audibert, J. Y., Munos, R., & Szepesvári, C. (2009). - Exploration–exploitation tradeoff using variance estimates - in multi-armed bandits. Theoretical Computer Science, 410(19), 1876-1902. - - """ - - def update_fun(tr, arm): - """ - Sequentially add variance estimate to tracker - """ - if tr.n_pulls(arm) == 1: - tr.add_scalars(arm, {"v_hat": 0}) - else: - # compute variance sequentially using Welford's algorithm. - reward = tr.reward(arm) - old_muhat = (tr.total_reward(arm) - reward) / ( - tr.n_pulls(arm) - 1 - ) # compute mu at time n-1 - new_muhat = tr.mu_hat(arm) - old_vhat = tr.read_last_tag_value("v_hat", arm) - new_vhat = ( - old_vhat - + ((reward - old_muhat) * (reward - new_muhat) - old_vhat) / tr.t - ) - tr.add_scalars(arm, {"v_hat": new_vhat}) - - def index(tr): - return [ - tr.mu_hat(arm) - + np.sqrt( - 2 - * zeta - * tr.read_last_tag_value("v_hat", arm) - * np.log(1 / delta(tr.t)) - / tr.n_pulls(arm) - ) - + 3 * c * zeta * upper_bound * np.log(1 / delta(tr.t)) / tr.n_pulls(arm) - for arm in tr.arms - ] - - return index, {"update": update_fun} diff --git a/rlberry/agents/bandits/priors.py b/rlberry/agents/bandits/priors.py deleted file mode 100644 index 060fb5036..000000000 --- a/rlberry/agents/bandits/priors.py +++ /dev/null @@ -1,151 +0,0 @@ -import numpy as np - - -def makeBetaPrior(): - """ - Beta prior for Bernoulli bandits, see Chapter 3 in [1]. - - Parameters - ---------- - None - - Return - ------ - Dict - Callable - Beta sampler. - - Callable - Function that computes the parameters of the prior distribution - from the bandit tracker. - - Callable - Function that computes the optimal action from the prior distributions. - - Dict - Extra parameters for the BanditTracker object. - By default the tracker stores the number of pulls and the - empirical average reward for each arm. If you want it to store - all rewards for instance, return {'store_rewards': True}. - - References - ---------- - .. [1] Russo, Daniel J., et al. "A tutorial on Thompson Sampling." - Foundations and Trends in Machine Learning 11.1 (2018): 1-96. - """ - - def prior_params(tr): - """ - The mean of a Bernoulli arm B(p) has prior distribution Beta(a, b), - where a is the number of success + 1, b the number of failures + 1. - """ - return [ - [ - tr.total_reward(arm) + 1, - tr.n_pulls(arm) - tr.total_reward(arm) + 1, - ] - for arm in tr.arms - ] - - def prior_sampler(tr): - """ - Beta prior. - """ - params = prior_params(tr) - return [tr.rng.beta(params[arm][0], params[arm][1]) for arm in tr.arms] - - def optimal_action(tr): - """ - The mean of a Bernoulli arm B(p) has prior distribution Beta(a, b), - where a is the number of success + 1, b the number of failures + 1. - The expectation of p is a / (a + b), therefore the optimal arm w.r.t - the Beta prior is the one with highest a / (a + b). - """ - params = prior_params(tr) - return np.argmax( - [params[arm][0] / (params[arm][0] + params[arm][1]) for arm in tr.arms] - ) - - prior_info = { - "params": prior_params, - "sampler": prior_sampler, - "optimal_action": optimal_action, - } - - return prior_info, {} - - -def makeGaussianPrior(sigma: float = 1.0): - """ - Gaussian prior for Gaussian bandits with known variance, see [1]. - - Parameters - ---------- - sigma : float, default: 1.0 - Gaussian standard deviation. 
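# The UCBV helper above estimates each arm's variance sequentially with Welford's
# algorithm. Below is a standalone sketch of the classical recurrence; note that the
# removed helper normalizes by the tracker's global time t rather than the per-arm
# pull count, so its numbers can differ from this textbook form.
import numpy as np

rewards = [0.1, 0.9, 0.4, 0.7, 0.3]

n, mean, var = 0, 0.0, 0.0
for x in rewards:
    n += 1
    old_mean = mean
    mean += (x - old_mean) / n                       # running mean
    var += ((x - old_mean) * (x - mean) - var) / n   # running (population) variance
print(var, np.var(rewards))  # both give the population variance, up to floating point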
- - Return - ------ - Dict - Callable - Gaussian sampler. - - Callable - Function that computes the parameters of the prior distribution - from the bandit tracker. - - Callable - Function that computes the optimal action from the prior distributions. - - Dict - Extra parameters for the BanditTracker object. - By default the tracker stores the number of pulls and the - empirical average reward for each arm. If you want it to store - all rewards for instance, return {'store_rewards': True}. - - References - ---------- - .. [1] Korda, Nathaniel, Emilie Kaufmann, and Remi Munos. - "Thompson sampling for 1-dimensional exponential family bandits." - Advances in Neural Information Processing Systems 26 (2013). - """ - - def prior_params(tr): - """ - The mean of a Gaussian arm N(mu, sigma^2) has prior distribution - N(mu_hat, s^2), where mu_hat is the empirical average reward and - s^2 = sigma^2 / n, n being the number of pulls for this arm. - """ - return [ - [ - tr.mu_hat(arm), - sigma / np.sqrt(tr.n_pulls(arm)), - ] - for arm in tr.arms - ] - - def prior_sampler(tr): - """ - Normal prior. - """ - params = prior_params(tr) - return [tr.rng.normal(params[arm][0], params[arm][1]) for arm in tr.arms] - - def optimal_action(tr): - """ - The mean of a Gaussian arm N(mu, sigma^2) has prior distribution - N(mu_hat, s^2), where mu_hat is the empirical average reward and - s^2 = sigma^2 / n, n being the number of pulls for this arm. - The expectation of mu is mu_hat, therefore the optimal arm w.r.t - the Gaussian prior is the one with highest mu_hat. - """ - params = prior_params(tr) - return np.argmax([params[arm][0] for arm in tr.arms]) - - prior_info = { - "params": prior_params, - "sampler": prior_sampler, - "optimal_action": optimal_action, - } - - return prior_info, {} diff --git a/rlberry/agents/bandits/randomized_agents.py b/rlberry/agents/bandits/randomized_agents.py deleted file mode 100644 index 76a82c97f..000000000 --- a/rlberry/agents/bandits/randomized_agents.py +++ /dev/null @@ -1,115 +0,0 @@ -import numpy as np -from rlberry.agents.bandits import BanditWithSimplePolicy - - -import rlberry - -logger = rlberry.logger - - -class RandomizedAgent(BanditWithSimplePolicy): - """ - Agent for bandit environment using randomized policy like EXP3. - - Parameters - ----------- - env : rlberry bandit environment - See :class:`~rlberry.envs.bandits.Bandit`. - - index_function : callable or None, default = None - Compute the index for an arm using the past rewards and sampling - probability on this arm and the current time t. - If None, use loss-based importance weighted estimator. - - prob_function : callable or None, default = None - Compute the sampling probability for an arm using its index. - If None, EXP3 softmax probabilities. - References: Seldin, Yevgeny, et al. "Evaluation and analysis of the - performance of the EXP3 algorithm in stochastic environments.". - European Workshop on Reinforcement Learning. PMLR, 2013. - - **kwargs: arguments - Arguments to be passed to BanditWithSimplePolicy. In particular, - one may want to pass the following parameter: - tracker_params: dict - Parameters for the tracker object, typically to decide what to store. 
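# One Thompson-sampling step with the Gaussian prior defined above: the mean of each
# arm has prior N(mu_hat, sigma^2 / n_pulls), one sample is drawn per arm and the
# highest sample is played. The statistics below are toy values.
import numpy as np

rng = np.random.default_rng(1)
sigma = 1.0
n_pulls = np.array([50, 20, 5])
mu_hat = np.array([0.30, 0.25, 0.10])

samples = rng.normal(mu_hat, sigma / np.sqrt(n_pulls))
action = int(np.argmax(samples))
print(samples, action)   # arms with few pulls get wider, potentially optimistic draws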
- - Examples - -------- - >>> from rlberry.agents.bandits import IndexAgent - >>> import numpy as np - >>> class EXP3Agent(RandomizedAgent): - >>> name = "EXP3" - >>> def __init__(self, env, **kwargs): - >>> def prob_function(tr): - >>> w = np.zeros(tr.n_arms) - >>> for arm in tr.arms: - >>> eta = np.minimum( - >>> np.sqrt( - >>> np.log(tr.n_arms) / (tr.n_arms * (tr.t + 1)) - >>> ), - >>> 1 / tr.n_arms, - >>> ) - >>> w[arm] = np.exp(eta * tr.iw_total_reward(arm)) - >>> w /= w.sum() - >>> return (1 - tr.n_arms * eta) * w + eta * np.ones(tr.n_arms) - >>> - >>> RandomizedAgent.__init__(self, env, index, prob, **kwargs) - - """ - - name = "RandomizedAgent" - - def __init__(self, env, prob_function=None, **kwargs): - BanditWithSimplePolicy.__init__(self, env, **kwargs) - - if prob_function is None: - - def prob_function(tr): - w = np.zeros(tr.n_arms) - for arm in tr.arms: - eta = np.minimum( - np.sqrt(np.log(tr.n_arms) / (tr.n_arms * (tr.t + 1))), - 1 / tr.n_arms, - ) - w[arm] = np.exp(eta * tr.iw_total_reward(arm)) - w /= w.sum() - return (1 - tr.n_arms * eta) * w + eta * np.ones(tr.n_arms) - - self.prob_function = prob_function - - def fit(self, budget=None, **kwargs): - """ - Train the bandit using the provided environment. - - Parameters - ---------- - budget: int - Total number of iterations, also called horizon. - """ - horizon = budget - total_reward = 0.0 - - for ep in range(horizon): - # Warmup: play every arm one before starting computing indices - if ep < self.n_arms: - action = ep - probs = [float(k == action) for k in self.arms] - else: - # Compute sampling probability for each arm - # and play one at random - probs = self.prob_function(self.tracker) - action = self.rng.choice(self.arms, p=probs) - - _, reward, _, _, _ = self.env.step(action) - - # Feed the played action and the resulting reward and sampling - # probability to the tracker. - self.tracker.update(action, reward, {"p": probs[action]}) - - total_reward += reward - - # Best action in hinsight is the one with highest sampling probability - self.optimal_action = np.argmax(probs[:]) - info = {"episode_reward": total_reward} - return info diff --git a/rlberry/agents/bandits/tools/__init__.py b/rlberry/agents/bandits/tools/__init__.py deleted file mode 100644 index 7237ffcb5..000000000 --- a/rlberry/agents/bandits/tools/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .tracker import BanditTracker diff --git a/rlberry/agents/bandits/tools/tracker.py b/rlberry/agents/bandits/tools/tracker.py deleted file mode 100644 index 0aa6a17b0..000000000 --- a/rlberry/agents/bandits/tools/tracker.py +++ /dev/null @@ -1,231 +0,0 @@ -from rlberry import metadata_utils -from rlberry.utils.writers import DefaultWriter - -import rlberry - -logger = rlberry.logger - - -class BanditTracker(DefaultWriter): - """ - Container class for rewards and various statistics (means...) collected - during the run of a bandit algorithm. - - BanditTracker is a companion class for - :class:`~rlberry.agents.bandits.BanditWithSimplePolicy` (and other agents - based on it), where a default tracker is automatically constructed, and can - then be used e.g as an entry for an index function. - - It inherits the logic of DefaultWriter to write/store/read - various data of interest for the execution of a bandit agent. - - Data are stored in the data attribute and indexed by a specific tag. - Except for the tag "t" (corresponding to the running total number of time - steps played by the agent), all tags are arm-specific (n_pulls, - total_reward...). 
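# Numeric illustration of the default EXP3 probabilities used by RandomizedAgent above:
# exponential weights on the importance-weighted total rewards, mixed with a uniform
# exploration term eta. The iw_total_reward values are made up.
import numpy as np

t = 200
n_arms = 3
iw_total_reward = np.array([120.0, 90.0, 60.0])

eta = min(np.sqrt(np.log(n_arms) / (n_arms * (t + 1))), 1 / n_arms)
w = np.exp(eta * iw_total_reward)
w /= w.sum()
probs = (1 - n_arms * eta) * w + eta * np.ones(n_arms)
print(probs, probs.sum())   # a valid distribution that favours the first arm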
Each tag entry is stored as a deque with fixed maximum - length (FIFO). By default, this maximum length is set to 1, i.e each new - update to the tag erases the previously stored entry. The maximum length - can be changed on a tag-by-tag basis with the dict maxlen_by_tag. - - Data can be interacted with by using the following DefaultWriter accessors: - * Read: - * read_last_tag_value(tag, arm)): returns the last entry of the - deque corresponding to arm-specific tag. - * read_tag_value(tag, arm)): returns the full deque corresponding - to the arm-specific tag. - * Write: - * add_scalar(tag, value): add a single scalar value to the deque - corresponding to the tag. - * add_scalars(arm, {tags: values}): add multiple arm-specific - tagged values to each corresponding deque. - - For ease of use, wrapper methods are provided to access common tag such as - t, n_pulls, total_reward... without explicitly calling the - read_last_tag_value/read_tag_value methods. - - Parameters - ---------- - agent: rlberry bandit agent - See :class:`~rlberry.agents.bandits`. - - params: dict - Other parameters to condition what to store and compute. - In particuler if params contains store_rewards=True, the - rewards will be saved for each arm at each step and if - store_actions=True, the actions are saved. - It can also contain a function named "update" that will - be called at the end of the update phase. def update(tr, arm): ... - - - Examples - -------- - >>> def index(tr): - ''' Compute UCB index for rewards in [0,1]''' - return [ - tr.mu_hat(arm) + np.sqrt( - 0.5 * np.log(1 / delta(tr.t))) / tr.n_pulls(arm) - ) - for arm in tr.arms - ] - - """ - - name = "BanditTracker" - - def __init__(self, agent, params={}): - self.n_arms = agent.n_arms - self.arms = agent.arms - self.rng = agent.rng - - # Store rewards for each arm or not - self.store_rewards = params.get("store_rewards", False) - # Store the actions for each arm or not - self.store_actions = params.get("store_actions", False) - # Additional update function - self.additional_update = params.get("update", None) - - # Add importance weighted rewards or not - self.do_iwr = params.get("do_iwr", False) - - # By default, store a single attribute (the most recent) - maxlen = 1 - # To store all rewards, override the maxlen for the corresponding tags - maxlen_by_tag = dict() - if self.store_rewards: - for arm in self.arms: - maxlen_by_tag[str(arm) + "_reward"] = None - if self.store_actions: - maxlen_by_tag["action"] = None - - _tracker_kwargs = dict( - name="BanditTracker", - execution_metadata=metadata_utils.ExecutionMetadata(), - maxlen=maxlen, - maxlen_by_tag=maxlen_by_tag, - ) - DefaultWriter.__init__(self, print_log=False, **_tracker_kwargs) - - self.reset_tracker() - - def reset_tracker(self): - self.add_scalar("t", 0) - - tag_scalar_dict = dict() - for arm in self.arms: - tag_scalar_dict["n_pulls"] = 0 - tag_scalar_dict["total_reward"] = 0.0 - if self.do_iwr: - tag_scalar_dict["iw_total_reward"] = 0.0 - self.add_scalars(arm, tag_scalar_dict) - - @property - def t(self): - """ - Current running time of the bandit algorithm played by the associated - bandit agent. - """ - return self.read_last_tag_value("t") - - def n_pulls(self, arm): - """ - Current number of pulls by the associated bandit agent to a given arm. - """ - return self.read_last_tag_value("n_pulls", arm) - - def rewards(self, arm): - """ - All rewards collected so far by the associated bandit agent for a given - arm and currently stored. 
If maxlen_by_tag[str(arm) + "_reward"] is None - or maxlen is None, all the reward history is stored at anytime. - """ - return self.read_tag_value("reward", arm) - - def reward(self, arm): - """ - Last collected reward for a given arm. - """ - return self.read_last_tag_value("reward", arm) - - def actions(self, arm): - """ - All actions collected so far by the associated bandit agent for a given - arm and currently stored. If maxlen_by_tag["action"] is None - or maxlen is None, all the action history is stored at anytime. - """ - return self.read_tag_value("action") - - def action(self, arm): - """ - Last collected action for a given arm. - """ - return self.read_last_tag_value("action") - - def total_reward(self, arm): - """ - Current total reward collected so far by the associated bandit agent - for a given arm. - """ - return self.read_last_tag_value("total_reward", arm) - - def mu_hat(self, arm): - """ - Current empirical mean reward for a given arm estimated by the - associated bandit agent. - """ - return self.read_last_tag_value("mu_hat", arm) - - def iw_total_reward(self, arm): - """ - Empirical Importance weighted total reward collected so far by the - associated bandit agent for a given arm. Used by randomized algorithms. - The IW total reward is the sum of rewards for a given arm inversely - weighted by the arm sampling probabilities at each pull. - In this implementation, we update the loss-based estimator, i.e for - a reward r in [0, 1], we weight 1 - r instead of r - (see Note 9, Chapter 11 of [1]). - - .. [1] Lattimore, Tor, and Csaba Szepesvári. Bandit algorithms. - Cambridge University Press, 2020. - """ - return self.read_last_tag_value("iw_total_reward", arm) - - def update(self, arm, reward, params={}): - """ - After the associated bandit agent played a given arm and collected a - given reward, update the stored data. - By default, only standard statistics are calculated and stored (number - of pulls, current reward, total reward and current empirical mean - reward). Special parameters can be passed in params, e.g the sampling - probability for randomized algorithms (to update the importance - weighted total reward). - """ - # Update current running time - self.add_scalar("t", self.t + 1) - - # Total number of pulls for current arm - n_pulls_arm = self.n_pulls(arm) + 1 - # Sum of rewards for current arm - total_reward_arm = self.total_reward(arm) + reward - - tag_scalar_dict = { - "n_pulls": n_pulls_arm, - "reward": reward, - "total_reward": total_reward_arm, - "mu_hat": total_reward_arm / n_pulls_arm, - } - - # Importance weighted total rewards for randomized algorithns - if self.do_iwr: - p = params.get("p", 1.0) - iw_total_reward_arm = self.iw_total_reward(arm) - tag_scalar_dict["iw_total_reward"] = ( - iw_total_reward_arm + 1 - (1 - reward) / p - ) - - # Write all tracked statistics - self.add_scalars(arm, tag_scalar_dict) - self.add_scalar("action", arm) - - # Do the additional update - if self.additional_update is not None: - self.additional_update(self, arm) diff --git a/rlberry/agents/bandits/ts_agents.py b/rlberry/agents/bandits/ts_agents.py deleted file mode 100644 index 528fae0c0..000000000 --- a/rlberry/agents/bandits/ts_agents.py +++ /dev/null @@ -1,157 +0,0 @@ -import numpy as np -from rlberry.agents.bandits import BanditWithSimplePolicy - - -import rlberry - -logger = rlberry.logger - - -class TSAgent(BanditWithSimplePolicy): - """ - Agent for bandit environment using Thompson sampling. 
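# Sketch of the loss-based importance-weighted update performed by BanditTracker.update
# above: for a reward r in [0, 1] played with probability p, the arm's estimator grows
# by 1 - (1 - r) / p. A bare dict stands in for the tracker storage; names are illustrative.
stats = {"n_pulls": 0, "total_reward": 0.0, "iw_total_reward": 0.0}


def update_arm(stats, reward, p=1.0):
    stats["n_pulls"] += 1
    stats["total_reward"] += reward
    stats["mu_hat"] = stats["total_reward"] / stats["n_pulls"]
    stats["iw_total_reward"] += 1 - (1 - reward) / p   # loss-based IW estimator
    return stats


print(update_arm(stats, reward=0.7, p=0.25))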
- - Parameters - ----------- - env : rlberry bandit environment - See :class:`~rlberry.envs.bandits.Bandit`. - - prior : str in {"gaussian", "beta"} - Family of priors used in Thompson sampling algorithm. - - prior_params : arary of size (2,n_actions) or None, default = None - Only used if prior = "gaussian", means and std of the gaussian prior distributions. - If None, use an array of all 0 and an array of all 1. - - - Examples - -------- - >>> from rlberry.agents.bandits import TSAgent - >>> import numpy as np - >>> class BernoulliTSAgent(TSAgent): - >>> name = "TS" - >>> def __init__(self, env, **kwargs): - >>> def prior_params(tr): - >>> return [ - >>> [ - >>> tr.total_reward(arm) + 1, - >>> tr.n_pulls(arm) - tr.total_reward(arm) + 1, - >>> ] - >>> for arm in tr.arms - >>> ] - >>> - >>> def prior_sampler(tr): - >>> params = prior_params(tr) - >>> return [tr.rng.beta(params[arm][0], params[arm][1]) for arm in tr.arms] - >>> - >>> def optimal_action(tr): - >>> params = prior_params(tr) - >>> return np.argmax( - >>> [ - >>> params[arm][0] / (params[arm][0] + params[arm][1]) - >>> for arm in tr.arms - >>> ] - >>> ) - >>> - >>> prior = { - >>> "params": prior_params, - >>> "sampler": prior_sampler, - >>> "optimal_action": optimal_action, - >>> } - >>> - >>> TSAgent.__init__(self, env, prior, **kwargs) - - """ - - name = "TSAgent" - - def __init__(self, env, prior_info=None, **kwargs): - BanditWithSimplePolicy.__init__(self, env, **kwargs) - if prior_info is None: - # Beta-Bernoulli prior by default - def prior_params(tr): - """ - The mean of a Bernoulli arm B(p) has prior distribution Beta(a, b), - where a is the number of success + 1, b the number of failures + 1. - """ - return [ - [ - tr.total_reward(arm) + 1, - tr.n_pulls(arm) - tr.total_reward(arm) + 1, - ] - for arm in tr.arms - ] - - def prior_sampler(tr): - """ - Beta prior. - """ - params = prior_params(tr) - return [tr.rng.beta(params[arm][0], params[arm][1]) for arm in tr.arms] - - def optimal_action(tr): - """ - The mean of a Bernoulli arm B(p) has prior distribution Beta(a, b), - where a is the number of success + 1, b the number of failures + 1. - The expectation of p is a / (a + b), therefore the optimal arm w.r.t - the Beta prior is the one with highest a / (a + b). - """ - params = prior_params(tr) - return np.argmax( - [ - params[arm][0] / (params[arm][0] + params[arm][1]) - for arm in tr.arms - ] - ) - - self.prior_info = { - "params": prior_params, - "sampler": prior_sampler, - "optimal_action": optimal_action, - } - else: - self.prior_info = prior_info - - @property - def prior_sampler(self): - return self.prior_info.get("sampler") - - @property - def get_optimal_action(self): - return self.prior_info.get("optimal_action") - - def fit(self, budget=None, **kwargs): - """ - Train the bandit using the provided environment. - - Parameters - ---------- - budget: int - Total number of iterations, also called horizon. 
- """ - horizon = budget - - total_reward = 0.0 - - for ep in range(horizon): - # Warmup: play every arm one before starting computing indices - if ep < self.n_arms: - action = ep - else: - # Sample from mean parameters from prior distributions - sample_mu = self.prior_sampler(self.tracker) - # Play the best sampled mean - action = np.argmax(sample_mu) - - _, reward, _, _, _ = self.env.step(action) - - # Feed the played action and the resulting reward to the tracker - self.tracker.update(action, reward) - - total_reward += reward - - # Best action in hinsight is the one with highest index - self.optimal_action = self.get_optimal_action(self.tracker) - - info = {"episode_reward": total_reward} - return info diff --git a/rlberry/agents/dynprog/__init__.py b/rlberry/agents/dynprog/__init__.py deleted file mode 100644 index 8af8271d3..000000000 --- a/rlberry/agents/dynprog/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .value_iteration import ValueIterationAgent diff --git a/rlberry/agents/dynprog/utils.py b/rlberry/agents/dynprog/utils.py deleted file mode 100644 index 0d01c93b4..000000000 --- a/rlberry/agents/dynprog/utils.py +++ /dev/null @@ -1,272 +0,0 @@ -import numpy as np -from rlberry.utils.jit_setup import numba_jit - - -@numba_jit -def backward_induction(R, P, horizon, gamma=1.0, vmax=np.inf): - """Backward induction to compute Q and V functions in the finite horizon - setting. - - Parameters - ---------- - R : numpy.ndarray - array of shape (S, A) contaning the rewards, where S is the number - of states and A is the number of actions - P : numpy.ndarray - array of shape (S, A, S) such that P[s,a,ns] is the probability of - arriving at ns by taking action a in state s. - horizon : int - problem horizon - gamma : double, default: 1.0 - discount factor - vmax : double, default: np.inf - maximum possible value in V - - Returns - -------- - tuple (Q, V) containing the Q and V functions, of shapes (horizon, S, A) - and (horizon, S), respectively. - """ - S, A = R.shape - V = np.zeros((horizon, S)) - Q = np.zeros((horizon, S, A)) - for hh in range(horizon - 1, -1, -1): - for ss in range(S): - max_q = -np.inf - for aa in range(A): - q_aa = R[ss, aa] - if hh < horizon - 1: - # not using .dot instead of loop to avoid scipy dependency - # (numba seems to require scipy for linear - # algebra operations in numpy) - for ns in range(S): - q_aa += gamma * P[ss, aa, ns] * V[hh + 1, ns] - if q_aa > max_q: - max_q = q_aa - Q[hh, ss, aa] = q_aa - V[hh, ss] = max_q - if V[hh, ss] > vmax: - V[hh, ss] = vmax - return Q, V - - -@numba_jit -def backward_induction_reward_sd(Q, V, R, P, gamma=1.0, vmax=np.inf): - """ - Backward induction to compute Q and V functions in - the finite horizon setting. - - Assumes R is stage-dependent, but P is stage-independent. - - Takes as input the arrays where to store Q and V. - - Parameters - ---------- - Q: numpy.ndarray - array of shape (horizon, S, A) where to store the Q function - V: numpy.ndarray - array of shape (horizon, S) where to store the V function - R : numpy.ndarray - array of shape (horizon, S, A) contaning the rewards, where S is the number - of states and A is the number of actions - P : numpy.ndarray - array of shape (S, A, S) such that P[s,a,ns] is the probability of - arriving at ns by taking action a in state s. 
- horizon : int - problem horizon - gamma : double - discount factor, default = 1.0 - vmax : double - maximum possible value in V - default = np.inf - """ - H, S, A = R.shape - horizon = H - for hh in range(horizon - 1, -1, -1): - for ss in range(S): - max_q = -np.inf - for aa in range(A): - q_aa = R[hh, ss, aa] - if hh < horizon - 1: - # not using .dot instead of loop to avoid scipy dependency - # (numba seems to require scipy for linear algebra - # operations in numpy) - for ns in range(S): - q_aa += gamma * P[ss, aa, ns] * V[hh + 1, ns] - if q_aa > max_q: - max_q = q_aa - Q[hh, ss, aa] = q_aa - V[hh, ss] = max_q - if V[hh, ss] > vmax: - V[hh, ss] = vmax - - -@numba_jit -def backward_induction_in_place(Q, V, R, P, horizon, gamma=1.0, vmax=np.inf): - """ - Backward induction to compute Q and V functions in - the finite horizon setting. - Takes as input the arrays where to store Q and V. - - Parameters - ---------- - Q: numpy.ndarray - array of shape (horizon, S, A) where to store the Q function - V: numpy.ndarray - array of shape (horizon, S) where to store the V function - R : numpy.ndarray - array of shape (S, A) contaning the rewards, where S is the number - of states and A is the number of actions - P : numpy.ndarray - array of shape (S, A, S) such that P[s,a,ns] is the probability of - arriving at ns by taking action a in state s. - horizon : int - problem horizon - gamma : double - discount factor, default = 1.0 - vmax : double - maximum possible value in V - default = np.inf - """ - S, A = R.shape - for hh in range(horizon - 1, -1, -1): - for ss in range(S): - max_q = -np.inf - for aa in range(A): - q_aa = R[ss, aa] - if hh < horizon - 1: - # not using .dot instead of loop to avoid scipy dependency - # (numba seems to require scipy for linear algebra - # operations in numpy) - for ns in range(S): - q_aa += gamma * P[ss, aa, ns] * V[hh + 1, ns] - if q_aa > max_q: - max_q = q_aa - Q[hh, ss, aa] = q_aa - V[hh, ss] = max_q - if V[hh, ss] > vmax: - V[hh, ss] = vmax - - -@numba_jit -def backward_induction_sd(Q, V, R, P, gamma=1.0, vmax=np.inf): - """ - In-place implementation of backward induction to compute Q and V functions - in the finite horizon setting. - - Assumes R and P are stage-dependent. - - Parameters - ---------- - Q: numpy.ndarray - array of shape (H, S, A) where to store the Q function - V: numpy.ndarray - array of shape (H, S) where to store the V function - R : numpy.ndarray - array of shape (H, S, A) contaning the rewards, where S is the number - of states and A is the number of actions - P : numpy.ndarray - array of shape (H, S, A, S) such that P[h, s, a, ns] is the probability of - arriving at ns by taking action a in state s at stage h. - gamma : double, default: 1.0 - discount factor - vmax : double, default: np.inf - maximum possible value in V - - """ - H, S, A = R.shape - for hh in range(H - 1, -1, -1): - for ss in range(S): - max_q = -np.inf - for aa in range(A): - q_aa = R[hh, ss, aa] - if hh < H - 1: - # not using .dot instead of loop to avoid scipy dependency - # (numba seems to require scipy for linear - # algebra operations in numpy) - for ns in range(S): - q_aa += gamma * P[hh, ss, aa, ns] * V[hh + 1, ns] - if q_aa > max_q: - max_q = q_aa - Q[hh, ss, aa] = q_aa - V[hh, ss] = max_q - # clip V - if V[hh, ss] > vmax: - V[hh, ss] = vmax - - -@numba_jit -def value_iteration(R, P, gamma, epsilon=1e-6): - """ - Value iteration for discounted problems. 
- - Parameters - ---------- - R : numpy.ndarray - array of shape (S, A) contaning the rewards, where S is the number - of states and A is the number of actions - P : numpy.ndarray - array of shape (S, A, S) such that P[s,a,ns] is the probability of - arriving at ns by taking action a in state s. - gamma : double - discount factor - epsilon : double - precision - - Returns - -------- - tuple (Q, V, n_it) containing the epsilon-optimal Q and V functions, - of shapes (S, A) and (S,), respectively, and n_it, the number of iterations - """ - S, A = R.shape - Q = np.zeros((S, A)) - Q_aux = np.full((S, A), np.inf) - n_it = 0 - while np.abs(Q - Q_aux).max() > epsilon: - Q_aux = Q - Q = bellman_operator(Q, R, P, gamma) - n_it += 1 - V = np.zeros(S) - # numba does not support np.max(Q, axis=1) - for ss in range(S): - V[ss] = Q[ss, :].max() - return Q, V, n_it - - -@numba_jit -def bellman_operator(Q, R, P, gamma): - """ - Bellman optimality operator for Q functions - - Parameters - ---------- - Q : numpy.ndarray - array of shape (S, A) containing the Q function to which apply - the operator - R : numpy.ndarray - array of shape (S, A) contaning the rewards, where S is the number - of states and A is the number of actions - P : numpy.ndarray - array of shape (S, A, S) such that P[s,a,ns] is the probability of - arriving at ns by taking action a in state s. - gamma : double - discount factor - - Returns - -------- - TQ, array of shape (S, A) containing the result of the Bellman operator - applied to the input Q - """ - S, A = Q.shape - TQ = np.zeros((S, A)) - V = np.zeros(S) - # numba does not support np.max(Q, axis=1) - for ss in range(S): - V[ss] = Q[ss, :].max() - # - for ss in range(S): - for aa in range(A): - TQ[ss, aa] = R[ss, aa] - for ns in range(S): - TQ[ss, aa] += gamma * P[ss, aa, ns] * V[ns] - return TQ diff --git a/rlberry/agents/dynprog/value_iteration.py b/rlberry/agents/dynprog/value_iteration.py deleted file mode 100644 index fd9a4ec3a..000000000 --- a/rlberry/agents/dynprog/value_iteration.py +++ /dev/null @@ -1,82 +0,0 @@ -from rlberry.agents.agent import AgentWithSimplePolicy -from rlberry.agents.dynprog.utils import backward_induction, value_iteration -from rlberry.envs.finite.finite_mdp import FiniteMDP - - -class ValueIterationAgent(AgentWithSimplePolicy): - """ - Value iteration for enviroments of type FiniteMDP - (rlberry.envs.finite.finite_mdp.FiniteMDP) - - Important: the discount gamma is also used if the problem is - finite horizon, but, in this case, gamma can be set to 1.0. - - Parameters - ----------- - env : rlberry.envs.finite.finite_mdp.FiniteMDP - Environment used to fit the agent. - gamma : double - Discount factor in [0, 1] - horizon : int - Horizon, if the problem is finite-horizon. if None, the discounted - problem is solved - default = None - epsilon : double - Precision of value iteration, only used in discounted problems - (when horizon is None). - - """ - - name = "ValueIteration" - - def __init__(self, env, gamma=0.95, horizon=None, epsilon=1e-6, **kwargs): - AgentWithSimplePolicy.__init__(self, env, **kwargs) - - # initialize base class - assert isinstance( - self.env, FiniteMDP - ), "Value iteration requires a FiniteMDP model." - # - - self.gamma = gamma # attribute gamma - - self.horizon = horizon - self.epsilon = epsilon - - # value functions - self.Q = None - self.V = None - - def fit(self, budget=None, **kwargs): - """ - Run value iteration. - - Parameters - ---------- - budget: None - Not used. Only defined for compatibility purpose with rlberry. 
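# Compact vectorized version of the discounted value iteration implemented above
# (the removed code applies the same Bellman optimality operator with explicit loops
# for numba). Same array shapes: R is (S, A), P is (S, A, S); the MDP is a toy example.
import numpy as np

S, A, gamma, epsilon = 2, 2, 0.9, 1e-6
R = np.array([[1.0, 0.0], [0.0, 0.5]])
P = np.zeros((S, A, S))
P[:, 0, 0] = 1.0
P[:, 1, 1] = 1.0

Q = np.zeros((S, A))
n_it = 0
while True:
    V = Q.max(axis=1)
    TQ = R + gamma * P @ V               # Bellman optimality operator
    if np.abs(TQ - Q).max() <= epsilon:
        break
    Q = TQ
    n_it += 1
print(Q.max(axis=1), n_it)               # epsilon-optimal values and iteration count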
- Changing `budget` value has no effect. - """ - del kwargs - info = {} - if self.horizon is None: - assert self.gamma < 1.0, "The discounted setting requires gamma < 1.0" - self.Q, self.V, n_it = value_iteration( - self.env.R, self.env.P, self.gamma, self.epsilon - ) - info["n_iterations"] = n_it - info["precision"] = self.epsilon - else: - self.Q, self.V = backward_induction( - self.env.R, self.env.P, self.horizon, self.gamma - ) - info["n_iterations"] = self.horizon - info["precision"] = 0.0 - return info - - def policy(self, observation): - state = observation - if self.horizon is None: - return self.Q[state, :].argmax() - else: - return self.Q[0, state, :].argmax() diff --git a/rlberry/agents/experimental/__init__.py b/rlberry/agents/experimental/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/agents/experimental/tests/__init__.py b/rlberry/agents/experimental/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/agents/experimental/torch/__init__.py b/rlberry/agents/experimental/torch/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/agents/features/__init__.py b/rlberry/agents/features/__init__.py deleted file mode 100644 index 1e473c0dc..000000000 --- a/rlberry/agents/features/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .feature_map import FeatureMap diff --git a/rlberry/agents/features/feature_map.py b/rlberry/agents/features/feature_map.py deleted file mode 100644 index d847556aa..000000000 --- a/rlberry/agents/features/feature_map.py +++ /dev/null @@ -1,29 +0,0 @@ -from abc import ABC, abstractmethod - - -class FeatureMap(ABC): - """ - Class representing a feature map, from (observation, action) pairs - to numpy arrays. - - Attributes - ---------- - shape : tuple - Shape of feature array. - - Methods - -------- - map() - Maps a (observation, action) pair to a numpy array. - """ - - def __init__(self): - ABC.__init__(self) - self.shape = () - - @abstractmethod - def map(self, observation, action): - """ - Maps a (observation, action) pair to a numpy array. 
- """ - pass diff --git a/rlberry/agents/kernel_based/__init__.py b/rlberry/agents/kernel_based/__init__.py deleted file mode 100644 index 275e51c86..000000000 --- a/rlberry/agents/kernel_based/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .rs_ucbvi import RSUCBVIAgent -from .rs_kernel_ucbvi import RSKernelUCBVIAgent diff --git a/rlberry/agents/kernel_based/common.py b/rlberry/agents/kernel_based/common.py deleted file mode 100644 index 33757f66a..000000000 --- a/rlberry/agents/kernel_based/common.py +++ /dev/null @@ -1,34 +0,0 @@ -import numpy as np -from rlberry.utils.jit_setup import numba_jit -from rlberry.utils.metrics import metric_lp - - -@numba_jit -def map_to_representative( - state, - lp_metric, - representative_states, - n_representatives, - min_dist, - scaling, - accept_new_repr, -): - """Map state to representative state.""" - dist_to_closest = np.inf - argmin = -1 - for ii in range(n_representatives): - dist = metric_lp(state, representative_states[ii, :], lp_metric, scaling) - if dist < dist_to_closest: - dist_to_closest = dist - argmin = ii - - max_representatives = representative_states.shape[0] - if ( - (dist_to_closest > min_dist) - and (n_representatives < max_representatives) - and accept_new_repr - ): - new_index = n_representatives - representative_states[new_index, :] = state - return new_index - return argmin diff --git a/rlberry/agents/kernel_based/kernels.py b/rlberry/agents/kernel_based/kernels.py deleted file mode 100644 index 88954432c..000000000 --- a/rlberry/agents/kernel_based/kernels.py +++ /dev/null @@ -1,58 +0,0 @@ -import numpy as np -from rlberry.utils.jit_setup import numba_jit - - -@numba_jit -def kernel_func(z, kernel_type): - """ - Returns a kernel function to the real value z. - - Kernel types: - - "uniform" : 1.0*(abs(z) <= 1) - "triangular" : max(0, 1 - abs(z)) - "gaussian" : exp(-z^2/2) - "epanechnikov" : max(0, 1-z^2) - "quartic" : (1-z^2)^2 *(abs(z) <= 1) - "triweight" : (1-z^2)^3 *(abs(z) <= 1) - "tricube" : (1-abs(z)^3)^3 *(abs(z) <= 1) - "cosine" : cos( z * (pi/2) ) *(abs(z) <= 1) - "exp-n" : exp(-abs(z)^n/2), for n integer - - Parameters - ---------- - z : double - kernel_type : string - """ - if kernel_type == "uniform": - return 1.0 * (np.abs(z) <= 1) - elif kernel_type == "triangular": - return (1.0 - np.abs(z)) * (np.abs(z) <= 1) - elif kernel_type == "gaussian": - return np.exp(-np.power(z, 2.0) / 2.0) - elif kernel_type == "epanechnikov": - return (1.0 - np.power(z, 2.0)) * (np.abs(z) <= 1) - elif kernel_type == "quartic": - return np.power((1.0 - np.power(z, 2.0)), 2.0) * (np.abs(z) <= 1) - elif kernel_type == "triweight": - return np.power((1.0 - np.power(z, 2.0)), 3.0) * (np.abs(z) <= 1) - elif kernel_type == "tricube": - return np.power((1.0 - np.power(np.abs(z), 3.0)), 3.0) * (np.abs(z) <= 1) - elif kernel_type == "cosine": - return np.cos(z * np.pi / 2) * (np.abs(z) <= 1) - elif "exp-" in kernel_type: - exponent = _str_to_int(kernel_type.split("-")[1]) - return np.exp(-np.power(np.abs(z), exponent) / 2.0) - else: - raise NotImplementedError("Unknown kernel type.") - - -@numba_jit -def _str_to_int(s): - """ - Source: https://github.com/numba/numba/issues/5650#issuecomment-623511109 - """ - final_index, result = len(s) - 1, 0 - for i, v in enumerate(s): - result += (ord(v) - 48) * (10 ** (final_index - i)) - return result diff --git a/rlberry/agents/kernel_based/rs_kernel_ucbvi.py b/rlberry/agents/kernel_based/rs_kernel_ucbvi.py deleted file mode 100644 index f27449577..000000000 --- 
a/rlberry/agents/kernel_based/rs_kernel_ucbvi.py +++ /dev/null @@ -1,390 +0,0 @@ -import numpy as np -from rlberry.utils.jit_setup import numba_jit - -import gymnasium.spaces as spaces -from rlberry.agents import AgentWithSimplePolicy -from rlberry.agents.dynprog.utils import backward_induction -from rlberry.agents.dynprog.utils import backward_induction_in_place -from rlberry.utils.metrics import metric_lp -from rlberry.agents.kernel_based.kernels import kernel_func -from rlberry.agents.kernel_based.common import map_to_representative - -import rlberry - -logger = rlberry.logger - - -@numba_jit -def update_model( - repr_state, - action, - repr_next_state, - reward, - n_representatives, - repr_states, - lp_metric, - scaling, - bandwidth, - bonus_scale_factor, - beta, - v_max, - bonus_type, - kernel_type, - N_sa, - B_sa, - P_hat, - R_hat, -): - """ - Model update function, lots of arguments so we can use JIT :) - """ - # aux var for transition update - dirac_next_s = np.zeros(n_representatives) - dirac_next_s[repr_next_state] = 1.0 - - for u_repr_state in range(n_representatives): - # compute weight - dist = metric_lp( - repr_states[repr_state, :], repr_states[u_repr_state, :], lp_metric, scaling - ) - weight = kernel_func(dist / bandwidth, kernel_type=kernel_type) - - # aux variables - prev_N_sa = beta + N_sa[u_repr_state, action] # regularization beta - current_N_sa = prev_N_sa + weight - - # update weights - N_sa[u_repr_state, action] += weight - - # update transitions - P_hat[u_repr_state, action, :n_representatives] = ( - dirac_next_s * weight / current_N_sa - + (prev_N_sa / current_N_sa) - * P_hat[u_repr_state, action, :n_representatives] - ) - - # update rewards - R_hat[u_repr_state, action] = ( - weight * reward / current_N_sa - + (prev_N_sa / current_N_sa) * R_hat[u_repr_state, action] - ) - - # update bonus - B_sa[u_repr_state, action] = compute_bonus( - N_sa[u_repr_state, action], beta, bonus_scale_factor, v_max, bonus_type - ) - - -@numba_jit -def compute_bonus(sum_weights, beta, bonus_scale_factor, v_max, bonus_type): - n = beta + sum_weights - if bonus_type == "simplified_bernstein": - return bonus_scale_factor * np.sqrt(1.0 / n) + (1 + beta) * (v_max) / n - else: - raise NotImplementedError("Error: unknown bonus type.") - - -class RSKernelUCBVIAgent(AgentWithSimplePolicy): - """ - Implements KernelUCBVI [1] with representative states [2, 3]. - - Value iteration with exploration bonuses for continuous-state environments, - using a online discretization strategy + kernel smoothing: - - Build (online) a set of representative states - - Using smoothing kernels, estimate transtions an rewards on the - finite set of representative states and actions. - - Criterion: finite-horizon with discount factor gamma. - If the discount is not 1, only the Q function at h=0 is used. - - The recommended policy after all the episodes is computed without - exploration bonuses. - - - Parameters - ---------- - env : Model - Online model with continuous (Box) state space and discrete actions - gamma : double - Discount factor in [0, 1]. If gamma is 1.0, the problem is set to - be finite-horizon. - horizon : int - Horizon of the objective function. If None and gamma<1, set to - 1/(1-gamma). - lp_metric: int - The metric on the state space is the one induced by the p-norm, - where p = lp_metric. Default = 2, for the Euclidean metric. - kernel_type : string - See rlberry.agents.kernel_based.kernels.kernel_func for - possible kernel types. 
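# Illustrative sketch only -- not the removed rlberry implementation.
# Standalone version of the kernel-weighted model update performed by
# `update_model` above, for a single observed transition, one action, and two
# 1D representative states. The Epanechnikov kernel and the constants
# (bandwidth, beta) are illustrative choices.
import numpy as np


def epanechnikov(z):
    return max(0.0, 1.0 - z**2)


def kernel_weighted_update(s, a, s_next, reward, repr_states,
                           N_sa, P_hat, R_hat, bandwidth=0.5, beta=0.01):
    M = repr_states.shape[0]
    dirac_next = np.zeros(M)
    dirac_next[s_next] = 1.0
    for u in range(M):
        dist = np.abs(repr_states[u] - repr_states[s])   # 1D state, L1 metric
        w = epanechnikov(dist / bandwidth)
        prev_n = beta + N_sa[u, a]                        # regularized count
        cur_n = prev_n + w
        N_sa[u, a] += w
        P_hat[u, a, :] = (w * dirac_next + prev_n * P_hat[u, a, :]) / cur_n
        R_hat[u, a] = (w * reward + prev_n * R_hat[u, a]) / cur_n


# toy run: two representative states at 0.0 and 0.3, one action
repr_states = np.array([0.0, 0.3])
N_sa = np.zeros((2, 1))
P_hat = np.zeros((2, 1, 2))
R_hat = np.zeros((2, 1))
kernel_weighted_update(0, 0, 1, reward=1.0, repr_states=repr_states,
                       N_sa=N_sa, P_hat=P_hat, R_hat=R_hat)
# representative states close to the visited one receive a larger share of
# the transition and reward update.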
- scaling: numpy.ndarray - Must have the same size as state array, used to scale the states - before computing the metric. - If None, set to: - - (env.observation_space.high - env.observation_space.low) if high - and low are bounded - - np.ones(env.observation_space.shape[0]) if high or low - are unbounded - bandwidth : double - Kernel bandwidth. - min_dist : double - Minimum distance between two representative states - max_repr : int - Maximum number of representative states. - If None, it is set to (sqrt(d)/min_dist)**d, where d - is the dimension of the state space - bonus_scale_factor : double - Constant by which to multiply the exploration bonus, - controls the level of exploration. - beta : double - Regularization constant. - bonus_type : string - Type of exploration bonus. Currently, only "simplified_bernstein" - is implemented. - - - References - ---------- - [1] Domingues et al., 2020 - Regret Bounds for Kernel-Based Reinforcement Learning - https://arxiv.org/abs/2004.05599 - [2] Domingues et al., 2020 - A Kernel-Based Approach to Non-Stationary Reinforcement Learning - in Metric Spaces - https://arxiv.org/abs/2007.05078 - [3] Kveton & Theocharous, 2012 - Kernel-Based Reinforcement Learning on Representative States - https://www.aaai.org/ocs/index.php/AAAI/AAAI12/paper/viewFile/4967/5509 - """ - - name = "RSKernelUCBVI" - - def __init__( - self, - env, - gamma=0.99, - horizon=None, - lp_metric=2, - kernel_type="epanechnikov", - scaling=None, - bandwidth=0.05, - min_dist=0.1, - max_repr=1000, - bonus_scale_factor=1.0, - beta=0.01, - bonus_type="simplified_bernstein", - **kwargs - ): - # init base class - AgentWithSimplePolicy.__init__(self, env, **kwargs) - - self.gamma = gamma - self.horizon = horizon - self.lp_metric = lp_metric - self.kernel_type = kernel_type - self.bandwidth = bandwidth - self.min_dist = min_dist - self.bonus_scale_factor = bonus_scale_factor - self.beta = beta - self.bonus_type = bonus_type - - # check environment - assert isinstance(self.env.observation_space, spaces.Box) - assert isinstance(self.env.action_space, spaces.Discrete) - - # other checks - assert gamma >= 0 and gamma <= 1.0 - if self.horizon is None: - assert gamma < 1.0, "If no horizon is given, gamma must be smaller than 1." - self.horizon = int(np.ceil(1.0 / (1.0 - gamma))) - - # state dimension - self.state_dim = self.env.observation_space.shape[0] - - # compute scaling, if it is None - if scaling is None: - # if high and low are bounded - if (self.env.observation_space.high == np.inf).sum() == 0 and ( - self.env.observation_space.low == -np.inf - ).sum() == 0: - scaling = ( - self.env.observation_space.high - self.env.observation_space.low - ) - # if high or low are unbounded - else: - scaling = np.ones(self.state_dim) - else: - assert scaling.ndim == 1 - assert scaling.shape[0] == self.state_dim - self.scaling = scaling - - # maximum value - r_range = self.env.reward_range[1] - self.env.reward_range[0] - if r_range == np.inf or r_range == 0.0: - logger.warning( - "{}: Reward range is zero or infinity. ".format(self.name) - + "Setting it to 1." 
- ) - r_range = 1.0 - - if self.gamma == 1.0: - self.v_max = r_range * horizon - else: - self.v_max = ( - r_range - * (1.0 - np.power(self.gamma, self.horizon)) - / (1.0 - self.gamma) - ) - - # number of representative states and number of actions - if max_repr is None: - max_repr = int( - np.ceil( - (1.0 * np.sqrt(self.state_dim) / self.min_dist) ** self.state_dim - ) - ) - self.max_repr = max_repr - - # current number of representative states - self.M = None - self.A = self.env.action_space.n - - # declaring variables - self.episode = None # current episode - self.representative_states = None # coordinates of all repr states - self.N_sa = None # sum of weights at (s, a) - self.B_sa = None # bonus at (s, a) - self.R_hat = None # reward estimate - self.P_hat = None # transitions estimate - self.Q = None # Q function - self.V = None # V function - - self.Q_policy = None # Q function for recommended policy - - # initialize - self.reset() - - def reset(self, **kwargs): - self.M = 0 - self.representative_states = np.zeros((self.max_repr, self.state_dim)) - self.N_sa = np.zeros((self.max_repr, self.A)) - self.B_sa = self.v_max * np.ones((self.max_repr, self.A)) - - self.R_hat = np.zeros((self.max_repr, self.A)) - self.P_hat = np.zeros((self.max_repr, self.A, self.max_repr)) - - self.V = np.zeros((self.horizon, self.max_repr)) - self.Q = np.zeros((self.horizon, self.max_repr, self.A)) - self.Q_policy = None - - self.episode = 0 - - def policy(self, observation): - state = observation - assert self.Q_policy is not None - repr_state = self._map_to_repr(state, False) - return self.Q_policy[0, repr_state, :].argmax() - - def fit(self, budget: int, **kwargs): - """ - Train the agent using the provided environment. - - Parameters - ---------- - budget: int - number of episodes. Each episode runs for self.horizon unless it - enconters a terminal state in which case it stops early. 
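# Illustrative sketch only -- not part of the removed rlberry code.
# The value upper bound computed above is the truncated geometric sum
# r_range * (1 + gamma + ... + gamma^(horizon-1)); quick check of the closed form.
import numpy as np

r_range, gamma, horizon = 1.0, 0.95, 50
v_max_closed = r_range * (1.0 - gamma**horizon) / (1.0 - gamma)
v_max_sum = r_range * sum(gamma**h for h in range(horizon))
assert np.isclose(v_max_closed, v_max_sum)
# for gamma == 1.0 the code above uses r_range * horizon instead.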
- """ - del kwargs - for _ in range(budget): - self._run_episode() - - # compute Q function for the recommended policy - self.Q_policy, _ = backward_induction( - self.R_hat[: self.M, :], - self.P_hat[: self.M, :, : self.M], - self.horizon, - self.gamma, - ) - - def _map_to_repr(self, state, accept_new_repr=True): - repr_state = map_to_representative( - state, - self.lp_metric, - self.representative_states, - self.M, - self.min_dist, - self.scaling, - accept_new_repr, - ) - # check if new representative state - if repr_state == self.M: - self.M += 1 - return repr_state - - def _update(self, state, action, next_state, reward): - repr_state = self._map_to_repr(state) - repr_next_state = self._map_to_repr(next_state) - - update_model( - repr_state, - action, - repr_next_state, - reward, - self.M, - self.representative_states, - self.lp_metric, - self.scaling, - self.bandwidth, - self.bonus_scale_factor, - self.beta, - self.v_max, - self.bonus_type, - self.kernel_type, - self.N_sa, - self.B_sa, - self.P_hat, - self.R_hat, - ) - - def _get_action(self, state, hh=0): - assert self.Q is not None - repr_state = self._map_to_repr(state, False) - return self.Q[hh, repr_state, :].argmax() - - def _run_episode(self): - # interact for H steps - episode_rewards = 0 - observation, info = self.env.reset() - for hh in range(self.horizon): - action = self._get_action(observation, hh) - next_observation, reward, terminated, truncated, info = self.env.step( - action - ) - done = terminated or truncated - self._update(observation, action, next_observation, reward) - observation = next_observation - episode_rewards += reward - - if done: - break - - # run backward induction - backward_induction_in_place( - self.Q[:, : self.M, :], - self.V[:, : self.M], - self.R_hat[: self.M, :] + self.B_sa[: self.M, :], - self.P_hat[: self.M, :, : self.M], - self.horizon, - self.gamma, - self.v_max, - ) - - self.episode += 1 - # - if self.writer is not None: - self.writer.add_scalar("episode_rewards", episode_rewards, self.episode) - self.writer.add_scalar("representative states", self.M, self.episode) - - # return sum of rewards collected in the episode - return episode_rewards diff --git a/rlberry/agents/kernel_based/rs_ucbvi.py b/rlberry/agents/kernel_based/rs_ucbvi.py deleted file mode 100644 index cee45ce56..000000000 --- a/rlberry/agents/kernel_based/rs_ucbvi.py +++ /dev/null @@ -1,332 +0,0 @@ -from rlberry.agents.agent import AgentWithSimplePolicy -import numpy as np - -import gymnasium.spaces as spaces -from rlberry.agents.dynprog.utils import backward_induction -from rlberry.agents.dynprog.utils import backward_induction_in_place -from rlberry.agents.kernel_based.common import map_to_representative - -import rlberry - -logger = rlberry.logger - - -class RSUCBVIAgent(AgentWithSimplePolicy): - """ - Value iteration with exploration bonuses for continuous-state environments, - using a online discretization strategy. - - The strategy: - - Build (online) a set of representative states - - Estimate transtions an rewards on the finite set of representative states - and actions. - - Criterion: finite-horizon with discount factor gamma. - If the discount is not 1, only the Q function at h=0 is used. - - The recommended policy after all the episodes is computed without - exploration bonuses. - - Parameters - ---------- - env : Model - Online model with continuous (Box) state space and discrete actions - gamma : double - Discount factor in [0, 1]. If gamma is 1.0, the problem is set to - be finite-horizon. 
- horizon : int - Horizon of the objective function. If None and gamma<1, set to - 1/(1-gamma). - lp_metric: int - The metric on the state space is the one induced by the p-norm, - where p = lp_metric. Default = 2, for the Euclidean metric. - scaling: numpy.ndarray - Must have the same size as state array, used to scale the states - before computing the metric. - If None, set to: - - (env.observation_space.high - env.observation_space.low) if high - and low are bounded - - np.ones(env.observation_space.shape[0]) if high or low are - unbounded - min_dist: double - Minimum distance between two representative states - max_repr: int - Maximum number of representative states. - If None, it is set to (sqrt(d)/min_dist)**d, where d - is the dimension of the state space - bonus_scale_factor : double - Constant by which to multiply the exploration bonus, controls - the level of exploration. - bonus_type : string - Type of exploration bonus. Currently, only "simplified_bernstein" - is implemented. If `reward_free` is true, this parameter is ignored - and the algorithm uses 1/n bonuses. - reward_free : bool - If true, ignores rewards and uses only 1/n bonuses. - - References - ---------- - .. [1] Azar, Mohammad Gheshlaghi, Ian Osband, and Rémi Munos. - "Minimax regret bounds for reinforcement learning." - Proceedings of the 34th ICML, 2017. - - .. [2] Strehl, Alexander L., and Michael L. Littman. - "An analysis of model-based interval estimation for Markov decision - processes." - Journal of Computer and System Sciences 74.8 (2008): 1309-1331. - - .. [3] Kveton, Branislav, and Georgios Theocharous. - "Kernel-Based Reinforcement Learning on Representative States." - AAAI, 2012. - - .. [4] Domingues, O. D., Ménard, P., Pirotta, M., Kaufmann, E., & Valko, M.(2020). - A kernel-based approach to non-stationary reinforcement learning in metric - spaces. - arXiv preprint arXiv:2007.05078. - """ - - name = "RSUCBVI" - - def __init__( - self, - env, - gamma=0.99, - horizon=100, - lp_metric=2, - scaling=None, - min_dist=0.1, - max_repr=1000, - bonus_scale_factor=1.0, - bonus_type="simplified_bernstein", - reward_free=False, - **kwargs - ): - # init base class - AgentWithSimplePolicy.__init__(self, env, **kwargs) - - self.gamma = gamma - self.horizon = horizon - self.lp_metric = lp_metric - self.min_dist = min_dist - self.bonus_scale_factor = bonus_scale_factor - self.bonus_type = bonus_type - self.reward_free = reward_free - - # check environment - assert isinstance(self.env.observation_space, spaces.Box) - assert isinstance(self.env.action_space, spaces.Discrete) - - # other checks - assert gamma >= 0 and gamma <= 1.0 - if self.horizon is None: - assert gamma < 1.0, "If no horizon is given, gamma must be smaller than 1." - self.horizon = int(np.ceil(1.0 / (1.0 - gamma))) - - # state dimension - self.state_dim = self.env.observation_space.shape[0] - - # compute scaling, if it is None - if scaling is None: - # if high and low are bounded - if (self.env.observation_space.high == np.inf).sum() == 0 and ( - self.env.observation_space.low == -np.inf - ).sum() == 0: - scaling = ( - self.env.observation_space.high - self.env.observation_space.low - ) - # if high or low are unbounded - else: - scaling = np.ones(self.state_dim) - else: - assert scaling.ndim == 1 - assert scaling.shape[0] == self.state_dim - self.scaling = scaling - - # maximum value - r_range = self.env.reward_range[1] - self.env.reward_range[0] - if r_range == np.inf or r_range == 0.0: - logger.warning( - "{}: Reward range is zero or infinity. 
".format(self.name) - + "Setting it to 1." - ) - r_range = 1.0 - - if self.gamma == 1.0: - self.v_max = r_range * horizon - else: - self.v_max = ( - r_range - * (1.0 - np.power(self.gamma, self.horizon)) - / (1.0 - self.gamma) - ) - - # number of representative states and number of actions - if max_repr is None: - max_repr = int( - np.ceil( - (1.0 * np.sqrt(self.state_dim) / self.min_dist) ** self.state_dim - ) - ) - self.max_repr = max_repr - - # current number of representative states - self.M = None - self.A = self.env.action_space.n - - # declaring variables - self.episode = None # current episode - self.representative_states = None # coordinates of all repr states - self.N_sa = None # visits to (s, a) - self.N_sas = None # visits to (s, a, s') - self.S_sa = None # sum of rewards at (s, a) - self.B_sa = None # bonus at (s, a) - self.Q = None # Q function - self.V = None # V function - - self.Q_policy = None # Q function for recommended policy - - # initialize - self.reset() - - def reset(self, **kwargs): - self.M = 0 - self.representative_states = np.zeros((self.max_repr, self.state_dim)) - self.N_sa = np.zeros((self.max_repr, self.A)) - self.N_sas = np.zeros((self.max_repr, self.A, self.max_repr)) - self.S_sa = np.zeros((self.max_repr, self.A)) - self.B_sa = self.v_max * np.ones((self.max_repr, self.A)) - - self.R_hat = np.zeros((self.max_repr, self.A)) - self.P_hat = np.zeros((self.max_repr, self.A, self.max_repr)) - - self.V = np.zeros((self.horizon, self.max_repr)) - self.Q = np.zeros((self.horizon, self.max_repr, self.A)) - self.Q_policy = None - - self.episode = 0 - - def policy(self, observation): - state = observation - assert self.Q_policy is not None - repr_state = self._map_to_repr(state, False) - return self.Q_policy[0, repr_state, :].argmax() - - def fit(self, budget: int, **kwargs): - """ - Train the agent using the provided environment. - - Parameters - ---------- - budget: int - number of episodes. Each episode runs for self.horizon unless it - enconters a terminal state in which case it stops early. 
- """ - del kwargs - n_episodes_to_run = budget - count = 0 - while count < n_episodes_to_run: - self._run_episode() - count += 1 - - # compute Q function for the recommended policy - self.Q_policy, _ = backward_induction( - self.R_hat[: self.M, :], - self.P_hat[: self.M, :, : self.M], - self.horizon, - self.gamma, - ) - - def _map_to_repr(self, state, accept_new_repr=True): - repr_state = map_to_representative( - state, - self.lp_metric, - self.representative_states, - self.M, - self.min_dist, - self.scaling, - accept_new_repr, - ) - # check if new representative state - if repr_state == self.M: - self.M += 1 - return repr_state - - def _update(self, state, action, next_state, reward): - repr_state = self._map_to_repr(state) - repr_next_state = self._map_to_repr(next_state) - - self.N_sa[repr_state, action] += 1 - self.N_sas[repr_state, action, repr_next_state] += 1 - self.S_sa[repr_state, action] += reward - - self.R_hat[repr_state, action] = ( - self.S_sa[repr_state, action] / self.N_sa[repr_state, action] - ) - self.P_hat[repr_state, action, :] = ( - self.N_sas[repr_state, action, :] / self.N_sa[repr_state, action] - ) - self.B_sa[repr_state, action] = self._compute_bonus( - self.N_sa[repr_state, action] - ) - - def _compute_bonus(self, n): - # reward-free - if self.reward_free: - bonus = 1.0 / n - return bonus - - # not reward-free - if self.bonus_type == "simplified_bernstein": - bonus = self.bonus_scale_factor * np.sqrt(1.0 / n) + self.v_max / n - bonus = min(bonus, self.v_max) - return bonus - else: - raise NotImplementedError( - "Error: bonus type {} not implemented".format(self.bonus_type) - ) - - def _get_action(self, state, hh=0): - assert self.Q is not None - repr_state = self._map_to_repr(state, False) - return self.Q[hh, repr_state, :].argmax() - - def _run_episode(self): - # interact for H steps - episode_rewards = 0 - observation, info = self.env.reset() - for hh in range(self.horizon): - action = self._get_action(observation, hh) - next_observation, reward, terminated, truncated, info = self.env.step( - action - ) - done = terminated or truncated - episode_rewards += reward # used for logging only - - if self.reward_free: - reward = 0.0 # set to zero before update if reward_free - - self._update(observation, action, next_observation, reward) - - observation = next_observation - if done: - break - - # run backward induction - backward_induction_in_place( - self.Q[:, : self.M, :], - self.V[:, : self.M], - self.R_hat[: self.M, :] + self.B_sa[: self.M, :], - self.P_hat[: self.M, :, : self.M], - self.horizon, - self.gamma, - self.v_max, - ) - - self.episode += 1 - # - if self.writer is not None: - self.writer.add_scalar("episode_rewards", episode_rewards, self.episode) - self.writer.add_scalar("representative states", self.M, self.episode) - - # return sum of rewards collected in the episode - return episode_rewards diff --git a/rlberry/agents/linear/__init__.py b/rlberry/agents/linear/__init__.py deleted file mode 100644 index 3db7865c3..000000000 --- a/rlberry/agents/linear/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .lsvi_ucb import LSVIUCBAgent diff --git a/rlberry/agents/linear/lsvi_ucb.py b/rlberry/agents/linear/lsvi_ucb.py deleted file mode 100644 index e777d05d9..000000000 --- a/rlberry/agents/linear/lsvi_ucb.py +++ /dev/null @@ -1,356 +0,0 @@ -import numpy as np -from rlberry.agents import AgentWithSimplePolicy -from gymnasium.spaces import Discrete -from rlberry.utils.jit_setup import numba_jit - -import rlberry - -logger = rlberry.logger - - -@numba_jit -def 
run_lsvi_jit( - dim, - horizon, - bonus_factor, - lambda_mat_inv, - reward_hist, - gamma, - feat_hist, - n_actions, - feat_ns_all_actions, - v_max, - total_time_steps, -): - """ - Jit version of Least-Squares Value Iteration. - - Parameters - ---------- - dim : int - Dimension of the features - horiton : int - - bonus_factor : int - - lambda_mat_inv : numpy array (dim, dim) - Inverse of the design matrix - - reward_hist : numpy array (time,) - - gamma : double - - feat_hist : numpy array (time, dim) - - n_actions : int - - feat_ns_all_actions : numpy array (time, n_actions, dim) - History of next state features for all actions - - vmax : double - Maximum value of the value function - - total_time_steps : int - Current step count - """ - # run value iteration - q_w = np.zeros((horizon + 1, dim)) - for hh in range(horizon - 1, -1, -1): - T = total_time_steps - b = np.zeros(dim) - for tt in range(T): - # compute q function at next state, q_ns - q_ns = np.zeros(n_actions) - for aa in range(n_actions): - # - feat_ns_aa = feat_ns_all_actions[tt, aa, :] - inverse_counts = feat_ns_aa.dot(lambda_mat_inv.T.dot(feat_ns_aa)) - bonus = bonus_factor * np.sqrt( - inverse_counts - ) + v_max * inverse_counts * (bonus_factor > 0.0) - # - q_ns[aa] = feat_ns_aa.dot(q_w[hh + 1, :]) + bonus - q_ns[aa] = min(q_ns[aa], v_max) - - # compute regretion targets - target = reward_hist[tt] + gamma * q_ns.max() - feat = feat_hist[tt, :] - b = b + target * feat - - # solve M x = b, where x = q_w, and M = self.lambda_mat - q_w[hh, :] = lambda_mat_inv.T @ b - return q_w - - -class LSVIUCBAgent(AgentWithSimplePolicy): - """ - A version of Least-Squares Value Iteration with UCB (LSVI-UCB), - proposed by Jin et al. (2020). - - If bonus_scale_factor is 0.0, performs random exploration. - - Notes - ----- - The computation of exploration bonuses was adapted to match the "simplified Bernstein" - bonuses that works well empirically for UCBVI in the tabular case. - - The transition probabilities are assumed to be *independent* of the timestep h. - - Parameters - ---------- - env : Model - Online model of an environment. - horizon : int - Maximum length of each episode. - feature_map_fn : function(env, kwargs) - Function that returns a feature map instance - (rlberry.agents.features.FeatureMap class). - feature_map_kwargs: - kwargs for feature_map_fn - gamma : double - Discount factor. - bonus_scale_factor : double - Constant by which to multiply the exploration bonus. - reg_factor : double - Linear regression regularization factor. - - References - ---------- - Jin, C., Yang, Z., Wang, Z., & Jordan, M. I. (2020, July). - Provably efficient reinforcement learning with linear - function approximation. In Conference on Learning Theory (pp. 2137-2143). - """ - - name = "LSVI-UCB" - - def __init__( - self, - env, - horizon, - feature_map_fn, - feature_map_kwargs=None, - gamma=0.99, - bonus_scale_factor=1.0, - reg_factor=0.1, - **kwargs - ): - AgentWithSimplePolicy.__init__(self, env, **kwargs) - - self.n_episodes = None - self.horizon = horizon - self.gamma = gamma - self.bonus_scale_factor = bonus_scale_factor - self.reg_factor = reg_factor - feature_map_kwargs = feature_map_kwargs or {} - self.feature_map = feature_map_fn(self.env, **feature_map_kwargs) - - # - if self.bonus_scale_factor == 0.0: - self.name = "LSVI-Random-Expl" - - # maximum value - r_range = self.env.reward_range[1] - self.env.reward_range[0] - if r_range == np.inf: - logger.warning( - "{}: Reward range is infinity. ".format(self.name) + "Clipping it to 1." 
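# Illustrative sketch only -- not the removed jitted implementation.
# The regression step inside `run_lsvi_jit` above is a ridge regression: with
# design matrix Lambda = reg * I + sum_t phi_t phi_t^T and targets
# y_t = r_t + gamma * max_a Q(s_{t+1}, a) (plus a bonus), the Q-weights are
# w = Lambda^{-1} sum_t y_t phi_t. Random toy data below.
import numpy as np

rng = np.random.default_rng(0)
dim, T, reg = 4, 20, 0.1
feats = rng.normal(size=(T, dim))   # phi(s_t, a_t)
targets = rng.normal(size=T)        # stand-ins for r_t + gamma * max_a Q(s_{t+1}, a)

lambda_mat = reg * np.eye(dim) + feats.T @ feats
b = feats.T @ targets               # sum_t y_t * phi_t
w = np.linalg.solve(lambda_mat, b)  # what the agent computes as lambda_mat_inv @ b

# a new pair is then scored as phi(s, a) @ w, plus the optimism bonus
# bonus_factor * sqrt(phi @ lambda_mat_inv @ phi) in the removed agent.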
- ) - r_range = 1.0 - - if self.gamma == 1.0: - self.v_max = r_range * horizon - else: - self.v_max = ( - r_range - * (1.0 - np.power(self.gamma, self.horizon)) - / (1.0 - self.gamma) - ) - - # - assert isinstance( - self.env.action_space, Discrete - ), "LSVI-UCB requires discrete actions." - - # - assert len(self.feature_map.shape) == 1 - self.dim = self.feature_map.shape[0] - - # attributes initialized in reset() - self.episode = None - self.lambda_mat = None # lambda matrix - self.lambda_mat_inv = None # inverse of lambda matrix - self.w_vec = None # vector representation of Q - self.w_policy = None # representation of Q for final policy - self.reward_hist = None # reward history - self.state_hist = None # state history - self.action_hist = None # action history - self.nstate_hist = None # next state history - - self.feat_hist = None # feature history - self.feat_ns_all_actions = None # next state features for all actions - # - - # aux variables (init in reset() too) - self._rewards = None - - def reset(self): - self.episode = 0 - self.total_time_steps = 0 - self.lambda_mat = self.reg_factor * np.eye(self.dim) - self.lambda_mat_inv = (1.0 / self.reg_factor) * np.eye(self.dim) - self.w_vec = np.zeros((self.horizon + 1, self.dim)) - self.reward_hist = np.zeros(self.n_episodes * self.horizon) - self.state_hist = [] - self.action_hist = [] - self.nstate_hist = [] - # episode rewards - self._rewards = np.zeros(self.n_episodes) - # - self.feat_hist = np.zeros((self.n_episodes * self.horizon, self.dim)) - self.feat_ns_all_actions = np.zeros( - (self.n_episodes * self.horizon, self.env.action_space.n, self.dim) - ) - # - self.w_policy = None - - def fit(self, budget, **kwargs): - """ - Train the agent using the provided environment. - - Parameters - ---------- - budget: int - number of episodes. Each episode runs for self.horizon unless it - enconters a terminal state in which case it stops early. - Warning: Calling fit() more than once will reset the algorithm - (to realocate memory according to the number of episodes) - """ - del kwargs - - # Allocate memory according to budget. - # TODO: avoid the need to reset() the algorithm if fit() is called again. - if self.n_episodes is not None: - logger.warning( - "[LSVI-UCB]: Calling fit() more than once will reset the algorithm" - + " (to realocate memory according to the number of episodes)." 
- ) - self.n_episodes = budget - self.reset() - - for ep in range(self.n_episodes): - self.run_episode() - if self.bonus_scale_factor > 0.0 or ep == self.n_episodes - 1: - # update Q function representation - self.w_vec = self._run_lsvi(self.bonus_scale_factor) - - self.w_policy = self._run_lsvi(bonus_factor=0.0)[0, :] - - def policy(self, observation): - q_w = self.w_policy - assert q_w is not None - # - q_vec = self._compute_q_vec(q_w, observation, 0.0) - return q_vec.argmax() - - def _optimistic_policy(self, observation, hh): - q_w = self.w_vec[hh, :] - q_vec = self._compute_q_vec(q_w, observation, self.bonus_scale_factor) - return q_vec.argmax() - - def run_episode(self): - observation, info = self.env.reset() - episode_rewards = 0 - for hh in range(self.horizon): - if self.bonus_scale_factor == 0.0: - action = self.env.action_space.sample() - else: - action = self._optimistic_policy(observation, hh) - - next_observation, reward, terminated, truncated, info = self.env.step( - action - ) - done = terminated or truncated - - feat = self.feature_map.map(observation, action) - outer_prod = np.outer(feat, feat) - inv = self.lambda_mat_inv - - # - self.lambda_mat += np.outer(feat, feat) - # update inverse - self.lambda_mat_inv -= (inv @ outer_prod @ inv) / (1 + feat @ inv.T @ feat) - - # update history - self.reward_hist[self.total_time_steps] = reward - self.state_hist.append(observation) - self.action_hist.append(action) - self.nstate_hist.append(next_observation) - - # - tt = self.total_time_steps - self.feat_hist[tt, :] = self.feature_map.map(observation, action) - for aa in range(self.env.action_space.n): - self.feat_ns_all_actions[tt, aa, :] = self.feature_map.map( - next_observation, aa - ) - - # increments - self.total_time_steps += 1 - episode_rewards += reward - - # - observation = next_observation - if done: - break - - # store data - self._rewards[self.episode] = episode_rewards - - # update ep - self.episode += 1 - - # - if self.writer is not None: - self.writer.add_scalar("episode_rewards", episode_rewards, self.episode) - - return episode_rewards - - def _compute_q(self, q_w, state, action, bonus_factor): - """q_w is the vector representation of the Q function.""" - feat = self.feature_map.map(state, action) - inverse_counts = feat @ (self.lambda_mat_inv.T @ feat) - bonus = bonus_factor * np.sqrt(inverse_counts) + self.v_max * inverse_counts * ( - bonus_factor > 0.0 - ) - q = feat.dot(q_w) + bonus - return q - - def _compute_q_vec(self, q_w, state, bonus_factor): - A = self.env.action_space.n - q_vec = np.zeros(A) - for aa in range(A): - # q_vec[aa] = self._compute_q(q_w, state, aa, bonus_factor) - feat = self.feature_map.map(state, aa) - inverse_counts = feat @ (self.lambda_mat_inv.T @ feat) - bonus = bonus_factor * np.sqrt( - inverse_counts - ) + self.v_max * inverse_counts * (bonus_factor > 0.0) - q_vec[aa] = feat.dot(q_w) + bonus - # q_vec[aa] = min(q_vec[aa], self.v_max) # !!!!!!!!! 
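# Illustrative sketch only -- not part of the removed rlberry code.
# The incremental inverse update in `run_episode` above is the Sherman-Morrison
# formula for a rank-one update:
#   (A + f f^T)^{-1} = A^{-1} - (A^{-1} f f^T A^{-1}) / (1 + f^T A^{-1} f).
# Quick numerical check on random symmetric positive definite data.
import numpy as np

rng = np.random.default_rng(0)
dim, reg = 4, 0.1
B = rng.normal(size=(dim, dim))
A = B @ B.T + reg * np.eye(dim)            # symmetric positive definite
A_inv = np.linalg.inv(A)
f = rng.normal(size=dim)

outer = np.outer(f, f)
A_inv_updated = A_inv - (A_inv @ outer @ A_inv) / (1.0 + f @ A_inv @ f)
assert np.allclose(A_inv_updated, np.linalg.inv(A + outer))
# this keeps the per-step cost at O(dim^2) instead of a full O(dim^3) inverse.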
- return q_vec - - def _run_lsvi(self, bonus_factor): - # run value iteration - q_w = run_lsvi_jit( - self.dim, - self.horizon, - bonus_factor, - self.lambda_mat_inv, - self.reward_hist, - self.gamma, - self.feat_hist, - self.env.action_space.n, - self.feat_ns_all_actions, - self.v_max, - self.total_time_steps, - ) - return q_w diff --git a/rlberry/agents/mbqvi/__init__.py b/rlberry/agents/mbqvi/__init__.py deleted file mode 100644 index 4856b69b5..000000000 --- a/rlberry/agents/mbqvi/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .mbqvi import MBQVIAgent diff --git a/rlberry/agents/mbqvi/mbqvi.py b/rlberry/agents/mbqvi/mbqvi.py deleted file mode 100644 index 83031a168..000000000 --- a/rlberry/agents/mbqvi/mbqvi.py +++ /dev/null @@ -1,152 +0,0 @@ -import numpy as np - - -from rlberry.agents import AgentWithSimplePolicy -from rlberry.agents.dynprog.utils import backward_induction, value_iteration -from gymnasium.spaces import Discrete - -import rlberry - -logger = rlberry.logger - - -class MBQVIAgent(AgentWithSimplePolicy): - """ - Model-Basel Q-Value iteration (MBQVI). - - Builds an empirical MDP and runs value iteration on it. - Corresponds to the "indirect" algorithm studied by Kearns and Singh (1999). - - Parameters - ----------- - env : Model - generative model with finite state-action space - n_samples : int - number of samples *per state-action pair* used to estimate - the empirical MDP. - gamma : double - discount factor in [0, 1] - horizon : int - horizon, if the problem is finite-horizon. if None, the discounted - problem is solved. default = None - epsilon : double - precision of value iteration, only used in discounted problems - (when horizon is None). - - - References - ---------- - Kearns, Michael J., and Satinder P. Singh. - "Finite-sample convergence rates for Q-learning and indirect algorithms." - Advances in neural information processing systems. 1999. - """ - - name = "MBQVI" - - def __init__( - self, env, n_samples=10, gamma=0.99, horizon=None, epsilon=1e-6, **kwargs - ): - AgentWithSimplePolicy.__init__(self, env, **kwargs) - - # initialize base class - assert self.env.is_generative(), "MBQVI requires a generative model." - assert isinstance( - self.env.observation_space, Discrete - ), "MBQVI requires a finite state space." - assert isinstance( - self.env.action_space, Discrete - ), "MBQVI requires a finite action space." - - # - self.n_samples = n_samples - self.gamma = gamma - self.horizon = horizon - self.epsilon = epsilon - - # empirical MDP, created in fit() - self.R_hat = None - self.P_hat = None - - # value functions - self.V = None - self.Q = None - - def _update(self, state, action, next_state, reward): - """Update model statistics.""" - self.N_sa[state, action] += 1 - self.N_sas[state, action, next_state] += 1 - self.S_sa[state, action] += reward - - def fit(self, budget=None, **kwargs): - """ - Build empirical MDP and run value iteration. - - Parameters - ---------- - budget: None - Not used. Only defined for compatibility purpose with rlberry. - Changing `budget` value has no effect. - """ - del kwargs - S = self.env.observation_space.n - A = self.env.action_space.n - self.N_sa = np.zeros((S, A)) - self.N_sas = np.zeros((S, A, S)) - self.S_sa = np.zeros((S, A)) - - # collect data - total_samples = S * A * self.n_samples - count = 0 - logger.debug( - f"[{self.name}] collecting {self.n_samples} samples per (s,a)" - f", total = {total_samples} samples." 
- ) - for ss in range(S): - for aa in range(A): - for _ in range(self.n_samples): - next_state, reward, _, _, _ = self.env.sample(ss, aa) - self._update(ss, aa, next_state, reward) - - count += 1 - if count % 10000 == 0: - completed = 100 * count / total_samples - logger.debug( - "[{}] ... {}/{} ({:0.0f}%)".format( - self.name, count, total_samples, completed - ) - ) - - # build model and run VI - logger.debug(f"{self.name} building model and running backward induction...") - - N_sa = np.maximum(self.N_sa, 1) - self.R_hat = self.S_sa / N_sa - self.P_hat = np.zeros((S, A, S)) - for ss in range(S): - self.P_hat[:, :, ss] = self.N_sas[:, :, ss] / N_sa - - info = {} - info["n_samples"] = self.n_samples - info["total_samples"] = total_samples - if self.horizon is None: - assert self.gamma < 1.0, "The discounted setting requires gamma < 1.0" - self.Q, self.V, n_it = value_iteration( - self.R_hat, self.P_hat, self.gamma, self.epsilon - ) - info["n_iterations"] = n_it - info["precision"] = self.epsilon - else: - self.Q, self.V = backward_induction( - self.R_hat, self.P_hat, self.horizon, self.gamma - ) - info["n_iterations"] = self.horizon - info["precision"] = 0.0 - return info - - def policy(self, observation): - state = observation - assert self.env.observation_space.contains(state) - if self.horizon is None: - return self.Q[state, :].argmax() - else: - return self.Q[0, state, :].argmax() diff --git a/rlberry/agents/optql/__init__.py b/rlberry/agents/optql/__init__.py deleted file mode 100644 index 5c538141b..000000000 --- a/rlberry/agents/optql/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .optql import OptQLAgent diff --git a/rlberry/agents/optql/optql.py b/rlberry/agents/optql/optql.py deleted file mode 100644 index 951d1d834..000000000 --- a/rlberry/agents/optql/optql.py +++ /dev/null @@ -1,206 +0,0 @@ -import numpy as np - -import gymnasium.spaces as spaces -from rlberry.agents import AgentWithSimplePolicy -from rlberry.exploration_tools.discrete_counter import DiscreteCounter - -import rlberry - -logger = rlberry.logger - - -class OptQLAgent(AgentWithSimplePolicy): - """ - Optimistic Q-Learning [1]_ with custom exploration bonuses. - - Parameters - ---------- - env : gym.Env - Environment with discrete states and actions. - gamma : double, default: 1.0 - Discount factor in [0, 1]. - horizon : int - Horizon of the objective function. - bonus_scale_factor : double, default: 1.0 - Constant by which to multiply the exploration bonus, controls - the level of exploration. - bonus_type : {"simplified_bernstein"} - Type of exploration bonus. Currently, only "simplified_bernstein" - is implemented. - add_bonus_after_update : bool, default: False - If True, add bonus to the Q function after performing the update, - instead of adding it to the update target. - - References - ---------- - .. [1] Jin et al., 2018 - Is Q-Learning Provably Efficient? 
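# Illustrative sketch only -- not the removed rlberry implementation.
# Standalone version of the empirical-MDP construction in MBQVI's `fit()`
# above: draw n_samples transitions per (s, a) from a generative model, then
# normalize the counts. `toy_sample` is a made-up 2-state generative model,
# not a rlberry API.
import numpy as np

rng = np.random.default_rng(0)
S, A, n_samples = 2, 2, 100


def toy_sample(s, a):
    """Hypothetical generative model: returns (next_state, reward)."""
    next_state = rng.integers(S)
    reward = float(next_state == 1) * (a + 1) / A
    return next_state, reward


N_sa = np.zeros((S, A))
N_sas = np.zeros((S, A, S))
S_sa = np.zeros((S, A))
for s in range(S):
    for a in range(A):
        for _ in range(n_samples):
            s_next, r = toy_sample(s, a)
            N_sa[s, a] += 1
            N_sas[s, a, s_next] += 1
            S_sa[s, a] += r

N_safe = np.maximum(N_sa, 1)              # avoid division by zero
R_hat = S_sa / N_safe
P_hat = N_sas / N_safe[:, :, None]
# value iteration (discounted) or backward induction (finite horizon) is then
# run on (R_hat, P_hat) to obtain the policy.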
- https://arxiv.org/abs/1807.03765 - """ - - name = "OptQL" - - def __init__( - self, - env, - gamma=1.0, - horizon=100, - bonus_scale_factor=1.0, - bonus_type="simplified_bernstein", - add_bonus_after_update=False, - **kwargs - ): - # init base class - AgentWithSimplePolicy.__init__(self, env, **kwargs) - - self.gamma = gamma - self.horizon = horizon - self.bonus_scale_factor = bonus_scale_factor - self.bonus_type = bonus_type - self.add_bonus_after_update = add_bonus_after_update - - # check environment - assert isinstance(self.env.observation_space, spaces.Discrete) - assert isinstance(self.env.action_space, spaces.Discrete) - - # maximum value - r_range = self.env.reward_range[1] - self.env.reward_range[0] - if r_range == np.inf or r_range == 0.0: - logger.warning( - "{}: Reward range is zero or infinity. ".format(self.name) - + "Setting it to 1." - ) - r_range = 1.0 - - self.v_max = np.zeros(self.horizon) - self.v_max[-1] = r_range - for hh in reversed(range(self.horizon - 1)): - self.v_max[hh] = r_range + self.gamma * self.v_max[hh + 1] - - # initialize - self.reset() - - def reset(self, **kwargs): - H = self.horizon - S = self.env.observation_space.n - A = self.env.action_space.n - - # (s, a) visit counter - self.N_sa = np.zeros((H, S, A)) - - # Value functions - self.V = np.ones((H + 1, S)) - self.V[H, :] = 0 - self.Q = np.ones((H, S, A)) - self.Q_bar = np.ones((H, S, A)) - for hh in range(self.horizon): - self.V[hh, :] *= self.horizon - hh - self.Q[hh, :, :] *= self.horizon - hh - self.Q_bar[hh, :, :] *= self.horizon - hh - - if self.add_bonus_after_update: - self.Q *= 0.0 - - # ep counter - self.episode = 0 - - # useful object to compute total number of visited states & entropy of visited states - self.counter = DiscreteCounter( - self.env.observation_space, self.env.action_space - ) - - def policy(self, observation): - """Recommended policy.""" - state = observation - return self.Q_bar[0, state, :].argmax() - - def _get_action(self, state, hh=0): - """Sampling policy.""" - return self.Q_bar[hh, state, :].argmax() - - def _compute_bonus(self, n, hh): - if self.bonus_type == "simplified_bernstein": - bonus = self.bonus_scale_factor * np.sqrt(1.0 / n) + self.v_max[hh] / n - bonus = min(bonus, self.v_max[hh]) - return bonus - else: - raise ValueError( - "Error: bonus type {} not implemented".format(self.bonus_type) - ) - - def _update(self, state, action, next_state, reward, hh): - self.N_sa[hh, state, action] += 1 - nn = self.N_sa[hh, state, action] - - # learning rate - alpha = (self.horizon + 1.0) / (self.horizon + nn) - bonus = self._compute_bonus(nn, hh) - - # bonus in the update - if not self.add_bonus_after_update: - target = reward + bonus + self.gamma * self.V[hh + 1, next_state] - self.Q[hh, state, action] = (1 - alpha) * self.Q[ - hh, state, action - ] + alpha * target - self.V[hh, state] = min(self.v_max[hh], self.Q[hh, state, :].max()) - self.Q_bar[hh, state, action] = self.Q[hh, state, action] - # bonus outside the update - else: - target = reward + self.gamma * self.V[hh + 1, next_state] # bonus not here - self.Q[hh, state, action] = (1 - alpha) * self.Q[ - hh, state, action - ] + alpha * target - self.Q_bar[hh, state, action] = ( - self.Q[hh, state, action] + bonus - ) # bonus here - self.V[hh, state] = min(self.v_max[hh], self.Q_bar[hh, state, :].max()) - - def _run_episode(self): - # interact for H steps - episode_rewards = 0 - observation, info = self.env.reset() - for hh in range(self.horizon): - action = self._get_action(observation, hh) - next_observation, 
reward, terminated, truncated, info = self.env.step( - action - ) - done = terminated or truncated - episode_rewards += reward # used for logging only - - self.counter.update(observation, action) - - self._update(observation, action, next_observation, reward, hh) - - observation = next_observation - if done: - break - - # update info - self.episode += 1 - - # writer - if self.writer is not None: - self.writer.add_scalar("episode_rewards", episode_rewards, self.episode) - self.writer.add_scalar( - "n_visited_states", self.counter.get_n_visited_states(), self.episode - ) - - # return sum of rewards collected in the episode - return episode_rewards - - def fit(self, budget: int, **kwargs): - """ - Train the agent using the provided environment. - - Parameters - ---------- - budget: int - number of episodes. Each episode runs for self.horizon unless it - enconters a terminal state in which case it stops early. - """ - del kwargs - n_episodes_to_run = budget - count = 0 - while count < n_episodes_to_run: - self._run_episode() - count += 1 diff --git a/rlberry/agents/psrl/__init__.py b/rlberry/agents/psrl/__init__.py deleted file mode 100644 index 417e37106..000000000 --- a/rlberry/agents/psrl/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .psrl import PSRLAgent diff --git a/rlberry/agents/psrl/psrl.py b/rlberry/agents/psrl/psrl.py deleted file mode 100644 index dac8440b9..000000000 --- a/rlberry/agents/psrl/psrl.py +++ /dev/null @@ -1,257 +0,0 @@ -import numpy as np - -import gymnasium.spaces as spaces -from rlberry.agents import AgentWithSimplePolicy -from rlberry.exploration_tools.discrete_counter import DiscreteCounter -from rlberry.agents.dynprog.utils import ( - backward_induction_in_place, - backward_induction_sd, -) - -import rlberry - -logger = rlberry.logger - - -class PSRLAgent(AgentWithSimplePolicy): - """ - PSRL algorithm from [1] with beta prior for the "Bernoullized" rewards - (instead of Gaussian-gamma prior). - - Notes - ----- - The recommended policy after all the episodes is computed without - exploration bonuses. - - Parameters - ---------- - env : gym.Env - Environment with discrete states and actions. - gamma : double, default: 1.0 - Discount factor in [0, 1]. If gamma is 1.0, the problem is set to - be finite-horizon. - horizon : int - Horizon of the objective function. If None and gamma<1, set to - 1/(1-gamma). - scale_prior_reward : double, delfault: 1.0 - scale of the Beta (uniform) prior, - i.e prior is Beta(scale_prior_reward*(1,1)) - scale_prior_transition : double, default: 1/number of state - scale of the (uniform) Dirichlet prior, - i.e prior is Dirichlet(scale_prior_transition*(1,...,1)) - bernoullized_reward: bool, default: True - If true the rewards are Bernoullized - reward_free : bool, default: False - If true, ignores rewards and uses only 1/n bonuses. - stage_dependent : bool, default: False - If true, assume that transitions and rewards can change with the stage h. - - References - ---------- - .. 
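# Illustrative sketch only -- not the removed rlberry implementation.
# Standalone version of one optimistic Q-learning update as in OptQL's
# `_update` above (bonus inside the target, i.e. add_bonus_after_update=False):
#   alpha_n = (H + 1) / (H + n)
#   Q_h(s, a) <- (1 - alpha_n) Q_h(s, a) + alpha_n (r + bonus_h(n) + gamma V_{h+1}(s'))
# Toy numbers only.
import numpy as np

H, gamma, bonus_scale = 10, 1.0, 1.0
v_max = np.arange(H, 0, -1).astype(float)     # v_max[h] = H - h for unit rewards


def bonus(n, h):
    return min(bonus_scale * np.sqrt(1.0 / n) + v_max[h] / n, v_max[h])


def optimistic_update(q_sa, v_next, r, n, h):
    alpha = (H + 1.0) / (H + n)
    target = r + bonus(n, h) + gamma * v_next
    return (1.0 - alpha) * q_sa + alpha * target


q = H - 3.0                                    # optimistic initialization at h = 3
q = optimistic_update(q, v_next=5.0, r=1.0, n=1, h=3)
# with n = 1 the step size is (H + 1) / (H + 1) = 1, so the first target
# fully overwrites the optimistic initialization.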
[1] Osband et al., 2013 - (More) Efficient Reinforcement Learning via Posterior Sampling - https://arxiv.org/abs/1306.0940 - - """ - - name = "PSRL" - - def __init__( - self, - env, - gamma=1.0, - horizon=100, - scale_prior_reward=1, - scale_prior_transition=None, - bernoullized_reward=True, - reward_free=False, - stage_dependent=False, - **kwargs - ): - # init base class - AgentWithSimplePolicy.__init__(self, env, **kwargs) - - self.gamma = gamma - self.horizon = horizon - self.scale_prior_reward = scale_prior_reward - self.scale_prior_transition = scale_prior_transition - if scale_prior_transition is None: - self.scale_prior_transition = 1.0 / self.env.observation_space.n - self.bernoullized_reward = bernoullized_reward - self.reward_free = reward_free - self.stage_dependent = stage_dependent - - # check environment - assert isinstance(self.env.observation_space, spaces.Discrete) - assert isinstance(self.env.action_space, spaces.Discrete) - - # other checks - assert gamma >= 0 and gamma <= 1.0 - if self.horizon is None: - assert gamma < 1.0, "If no horizon is given, gamma must be smaller than 1." - self.horizon = int(np.ceil(1.0 / (1.0 - gamma))) - - # maximum value - r_range = self.env.reward_range[1] - self.env.reward_range[0] - if r_range == np.inf or r_range == 0.0: - logger.warning( - "{}: Reward range is zero or infinity. ".format(self.name) - + "Setting it to 1." - ) - r_range = 1.0 - - self.v_max = np.zeros(self.horizon) - self.v_max[-1] = r_range - for hh in reversed(range(self.horizon - 1)): - self.v_max[hh] = r_range + self.gamma * self.v_max[hh + 1] - - # initialize - self.reset() - - def reset(self, **kwargs): - H = self.horizon - S = self.env.observation_space.n - A = self.env.action_space.n - - if self.stage_dependent: - shape_hsa = (H, S, A) - shape_hsas = (H, S, A, S) - else: - shape_hsa = (S, A) - shape_hsas = (S, A, S) - - # Prior transitions - self.N_sas = self.scale_prior_transition * np.ones(shape_hsas) - - # Prior rewards - self.M_sa = self.scale_prior_reward * np.ones(shape_hsa + (2,)) - - # Value functions - self.V = np.zeros((H, S)) - self.Q = np.zeros((H, S, A)) - # for rec. 
policy - self.V_policy = np.zeros((H, S)) - self.Q_policy = np.zeros((H, S, A)) - - # ep counter - self.episode = 0 - - # useful object to compute total number of visited states & entropy of visited states - self.counter = DiscreteCounter( - self.env.observation_space, self.env.action_space - ) - - def policy(self, observation): - state = observation - assert self.Q_policy is not None - return self.Q_policy[0, state, :].argmax() - - def _get_action(self, state, hh=0): - """Sampling policy.""" - assert self.Q is not None - return self.Q[hh, state, :].argmax() - - def _update(self, state, action, next_state, reward, hh): - bern_reward = reward - if self.bernoullized_reward: - bern_reward = self.rng.binomial(1, reward) - # update posterior - if self.stage_dependent: - self.N_sas[hh, state, action, next_state] += 1 - self.M_sa[hh, state, action, 0] += bern_reward - self.M_sa[hh, state, action, 1] += 1 - bern_reward - - else: - self.N_sas[state, action, next_state] += 1 - self.M_sa[state, action, 0] += bern_reward - self.M_sa[state, action, 1] += 1 - bern_reward - - def _run_episode(self): - # sample reward and transitions from posterior - self.R_sample = self.rng.beta(self.M_sa[..., 0], self.M_sa[..., 1]) - self.P_sample = self.rng.gamma(self.N_sas) - self.P_sample = self.P_sample / self.P_sample.sum(-1, keepdims=True) - # run backward induction - if self.stage_dependent: - backward_induction_sd( - self.Q, self.V, self.R_sample, self.P_sample, self.gamma, self.v_max[0] - ) - else: - backward_induction_in_place( - self.Q, - self.V, - self.R_sample, - self.P_sample, - self.horizon, - self.gamma, - self.v_max[0], - ) - # interact for H steps - episode_rewards = 0 - observation, info = self.env.reset() - for hh in range(self.horizon): - action = self._get_action(observation, hh) - next_observation, reward, terminated, truncated, info = self.env.step( - action - ) - done = terminated or truncated - episode_rewards += reward # used for logging only - - self.counter.update(observation, action) - - if self.reward_free: - reward = 0.0 # set to zero before update if reward_free - - self._update(observation, action, next_observation, reward, hh) - - observation = next_observation - if done: - break - - # update info - self.episode += 1 - - # writer - if self.writer is not None: - self.writer.add_scalar("episode_rewards", episode_rewards, self.episode) - self.writer.add_scalar( - "n_visited_states", self.counter.get_n_visited_states(), self.episode - ) - - # return sum of rewards collected in the episode - return episode_rewards - - def fit(self, budget: int, **kwargs): - """ - Train the agent using the provided environment. - - Parameters - ---------- - budget: int - number of episodes. Each episode runs for self.horizon unless it - enconters a terminal state in which case it stops early. 
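# Illustrative sketch only -- not the removed rlberry implementation.
# Standalone version of the posterior sampling step in PSRL's `_run_episode`
# above: Bernoullized rewards get a Beta posterior, and transitions get a
# Dirichlet posterior, sampled here via normalized Gamma draws as the removed
# code does. Toy pseudo-counts only.
import numpy as np

rng = np.random.default_rng(0)
S, A = 3, 2

M_sa = np.ones((S, A, 2))          # Beta(successes, failures) pseudo-counts
M_sa[0, 1] = [5.0, 2.0]            # (s=0, a=1) mostly returned reward 1
N_sas = np.ones((S, A, S))         # Dirichlet pseudo-counts
N_sas[0, 1] = [1.0, 8.0, 1.0]      # (s=0, a=1) mostly moved to state 1

R_sample = rng.beta(M_sa[..., 0], M_sa[..., 1])        # sampled mean rewards
P_sample = rng.gamma(N_sas)                             # Gamma(shape=k, scale=1)
P_sample = P_sample / P_sample.sum(-1, keepdims=True)   # = Dirichlet(N_sas)

# backward induction on (R_sample, P_sample) gives the Thompson-style policy
# for the next episode; the recommended policy uses posterior means instead.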
- """ - del kwargs - n_episodes_to_run = budget - count = 0 - while count < n_episodes_to_run: - self._run_episode() - count += 1 - - # compute Q function for the recommended policy - R_hat = self.M_sa[..., 0] / (self.M_sa[..., 0] + self.M_sa[..., 1]) - P_hat = self.N_sas / self.N_sas.sum(-1, keepdims=True) - if self.stage_dependent: - backward_induction_sd( - self.Q_policy, self.V_policy, R_hat, P_hat, self.gamma, self.v_max[0] - ) - else: - backward_induction_in_place( - self.Q_policy, - self.V_policy, - R_hat, - P_hat, - self.horizon, - self.gamma, - self.v_max[0], - ) diff --git a/rlberry/agents/rlsvi/__init__.py b/rlberry/agents/rlsvi/__init__.py deleted file mode 100644 index 11c5adc67..000000000 --- a/rlberry/agents/rlsvi/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .rlsvi import RLSVIAgent diff --git a/rlberry/agents/rlsvi/rlsvi.py b/rlberry/agents/rlsvi/rlsvi.py deleted file mode 100644 index 6e3c2c120..000000000 --- a/rlberry/agents/rlsvi/rlsvi.py +++ /dev/null @@ -1,280 +0,0 @@ -import numpy as np - -import gymnasium.spaces as spaces -from rlberry.agents import AgentWithSimplePolicy -from rlberry.exploration_tools.discrete_counter import DiscreteCounter -from rlberry.agents.dynprog.utils import ( - backward_induction_in_place, - backward_induction_reward_sd, - backward_induction_sd, -) - -import rlberry - -logger = rlberry.logger - - -class RLSVIAgent(AgentWithSimplePolicy): - """ - RLSVI algorithm from [1,2] with Gaussian noise. - - Notes - ----- - The recommended policy after all the episodes is computed with the empirical - MDP. - The std of the noise is of the form: - scale/sqrt(n)+ V_max/n - as for simplified Bernstein bonuses. - - Parameters - ---------- - env : gym.Env - Environment with discrete states and actions. - gamma : double, default: 1.0 - Discount factor in [0, 1]. If gamma is 1.0, the problem is set to - be finite-horizon. - horizon : int - Horizon of the objective function. If None and gamma<1, set to - 1/(1-gamma). - scale_std_noise : double, delfault: 1.0 - scale the std of the noise. At step h the std is - scale_std_noise/sqrt(n)+(H-h+1)/n - reward_free : bool, default: False - If true, ignores rewards. - stage_dependent : bool, default: False - If true, assume that transitions and rewards can change with the stage h. - - References - ---------- - .. [1] Osband et al., 2014 - Generalization and Exploration via Randomized Value Functions - https://arxiv.org/abs/1402.0635 - - .. [2] Russo, 2019 - Worst-Case Regret Bounds for Exploration via Randomized Value Functions - https://arxiv.org/abs/1906.02870 - - """ - - name = "RLSVI" - - def __init__( - self, - env, - gamma=1.0, - horizon=100, - scale_std_noise=1.0, - reward_free=False, - stage_dependent=False, - **kwargs - ): - # init base class - AgentWithSimplePolicy.__init__(self, env, **kwargs) - - self.gamma = gamma - self.horizon = horizon - self.scale_std_noise = scale_std_noise - self.reward_free = reward_free - self.stage_dependent = stage_dependent - - # check environment - assert isinstance(self.env.observation_space, spaces.Discrete) - assert isinstance(self.env.action_space, spaces.Discrete) - - # other checks - assert gamma >= 0 and gamma <= 1.0 - if self.horizon is None: - assert gamma < 1.0, "If no horizon is given, gamma must be smaller than 1." - self.horizon = int(np.ceil(1.0 / (1.0 - gamma))) - - # maximum value - r_range = self.env.reward_range[1] - self.env.reward_range[0] - if r_range == np.inf or r_range == 0.0: - logger.warning( - "{}: Reward range is zero or infinity. 
".format(self.name) - + "Setting it to 1." - ) - r_range = 1.0 - - self.v_max = np.zeros(self.horizon) - self.v_max[-1] = r_range - for hh in reversed(range(self.horizon - 1)): - self.v_max[hh] = r_range + self.gamma * self.v_max[hh + 1] - - # initialize - self.reset() - - def reset(self, **kwargs): - H = self.horizon - S = self.env.observation_space.n - A = self.env.action_space.n - - if self.stage_dependent: - shape_hsa = (H, S, A) - shape_hsas = (H, S, A, S) - else: - shape_hsa = (S, A) - shape_hsas = (S, A, S) - - # stds prior - self.std1_sa = self.scale_std_noise * np.ones((H, S, A)) - self.std2_sa = np.ones((H, S, A)) - # visit counter - self.N_sa = np.ones(shape_hsa) - - # MDP estimator - self.R_hat = np.zeros(shape_hsa) - self.P_hat = np.ones(shape_hsas) * 1.0 / S - - # Value functions - self.V = np.zeros((H, S)) - self.Q = np.zeros((H, S, A)) - # for rec. policy - self.V_policy = np.zeros((H, S)) - self.Q_policy = np.zeros((H, S, A)) - - # Init V and variances - for hh in range(self.horizon): - self.std2_sa[hh, :, :] *= self.v_max[hh] - - # ep counter - self.episode = 0 - - # useful object to compute total number of visited states & entropy of visited states - self.counter = DiscreteCounter( - self.env.observation_space, self.env.action_space - ) - - def policy(self, observation): - state = observation - assert self.Q_policy is not None - return self.Q_policy[0, state, :].argmax() - - def _get_action(self, state, hh=0): - """Sampling policy.""" - assert self.Q is not None - return self.Q[hh, state, :].argmax() - - def _update(self, state, action, next_state, reward, hh): - if self.stage_dependent: - self.N_sa[hh, state, action] += 1 - - nn = self.N_sa[hh, state, action] - prev_r = self.R_hat[hh, state, action] - prev_p = self.P_hat[hh, state, action, :] - - self.R_hat[hh, state, action] = ( - 1.0 - 1.0 / nn - ) * prev_r + reward * 1.0 / nn - - self.P_hat[hh, state, action, :] = (1.0 - 1.0 / nn) * prev_p - self.P_hat[hh, state, action, next_state] += 1.0 / nn - - else: - self.N_sa[state, action] += 1 - - nn = self.N_sa[state, action] - prev_r = self.R_hat[state, action] - prev_p = self.P_hat[state, action, :] - - self.R_hat[state, action] = (1.0 - 1.0 / nn) * prev_r + reward * 1.0 / nn - - self.P_hat[state, action, :] = (1.0 - 1.0 / nn) * prev_p - self.P_hat[state, action, next_state] += 1.0 / nn - - def _run_episode(self): - # interact for H steps - episode_rewards = 0 - # stds scale/sqrt(n)+(H-h+1)/n - std_sa = self.std1_sa / np.sqrt(self.N_sa) + self.std2_sa / self.N_sa - noise_sa = self.rng.normal(self.R_hat, std_sa) - # run backward noisy induction - if self.stage_dependent: - backward_induction_sd( - self.Q, - self.V, - self.R_hat + noise_sa, - self.P_hat, - self.gamma, - self.v_max[0], - ) - else: - backward_induction_reward_sd( - self.Q, - self.V, - self.R_hat + noise_sa, - self.P_hat, - self.gamma, - self.v_max[0], - ) - - observation, info = self.env.reset() - for hh in range(self.horizon): - action = self._get_action(observation, hh) - next_observation, reward, terminated, truncated, info = self.env.step( - action - ) - done = terminated or truncated - episode_rewards += reward # used for logging only - - self.counter.update(observation, action) - - if self.reward_free: - reward = 0.0 # set to zero before update if reward_free - - self._update(observation, action, next_observation, reward, hh) - - observation = next_observation - if done: - break - - # update info - self.episode += 1 - - # writer - if self.writer is not None: - self.writer.add_scalar("episode_rewards", 
episode_rewards, self.episode) - self.writer.add_scalar( - "n_visited_states", self.counter.get_n_visited_states(), self.episode - ) - - # return sum of rewards collected in the episode - return episode_rewards - - def fit(self, budget: int, **kwargs): - """ - Train the agent using the provided environment. - - Parameters - ---------- - budget: int - number of episodes. Each episode runs for self.horizon unless it - enconters a terminal state in which case it stops early. - """ - del kwargs - n_episodes_to_run = budget - count = 0 - while count < n_episodes_to_run: - self._run_episode() - count += 1 - - # compute Q function for the recommended policy - if self.stage_dependent: - backward_induction_sd( - self.Q_policy, - self.V_policy, - self.R_hat, - self.P_hat, - self.gamma, - self.v_max[0], - ) - else: - backward_induction_in_place( - self.Q_policy, - self.V_policy, - self.R_hat, - self.P_hat, - self.horizon, - self.gamma, - self.v_max[0], - ) diff --git a/rlberry/agents/tabular_rl/__init__.py b/rlberry/agents/tabular_rl/__init__.py deleted file mode 100644 index 5eefb55d5..000000000 --- a/rlberry/agents/tabular_rl/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .qlearning import QLAgent -from .sarsa import SARSAAgent diff --git a/rlberry/agents/tabular_rl/qlearning.py b/rlberry/agents/tabular_rl/qlearning.py deleted file mode 100644 index 024147acb..000000000 --- a/rlberry/agents/tabular_rl/qlearning.py +++ /dev/null @@ -1,127 +0,0 @@ -from typing import Optional, Literal -import numpy as np -from gymnasium import spaces -from scipy.special import softmax - -from rlberry import types -from rlberry.agents import AgentWithSimplePolicy - - -class QLAgent(AgentWithSimplePolicy): - """Q-Learning Agent. - - Parameters - ---------- - env: :class:`~rlberry.types.Env` - Environment with discrete states and actions. - gamma: float, default = 0.99 - Discount factor. - alpha: float, default = 0.1 - Learning rate. - exploration_type: {"epsilon", "boltzmann"}, default: None - If "epsilon": Epsilon-Greedy exploration. - If "boltzmann": Boltzmann exploration. - If None: No exploration. - exploration_rate: float, default: None - epsilon parameter for Epsilon-Greedy exploration or tau parameter for Boltzmann exploration. - - Attributes - ---------- - Q : ndarray - 2D array that stores the estimation ofexpected rewards for state-action pairs. 
- - Examples - -------- - >>> from rlberry.envs import GridWorld - >>> - >>> env = GridWorld(walls=(), nrows=5, ncols=5) - >>> agent = QLAgent() - >>> agent.fit(budget=1000) - >>> agent.policy(env.observation_space.sample()) - >>> agent.reset() - """ - - name = "QL" - - def __init__( - self, - env: types.Env, - gamma: float = 0.99, - alpha: float = 0.1, - exploration_type: Optional[Literal["epsilon", "boltzmann"]] = None, - exploration_rate: Optional[float] = None, - **kwargs - ): - # init base class - AgentWithSimplePolicy.__init__(self, env, **kwargs) - - self.gamma = gamma - self.alpha = alpha - self.exploration_type = exploration_type - self.exploration_rate = exploration_rate - # check environment - assert isinstance(self.env.observation_space, spaces.Discrete) - assert isinstance(self.env.action_space, spaces.Discrete) - - # check exploration type - if self.exploration_type is not None: - assert ( - exploration_type == "epsilon" or "boltzmann" - ) and exploration_rate is not None - - self.Q = np.zeros((self.env.observation_space.n, self.env.action_space.n)) - - def reset(self, **kwargs): - self.Q.fill(0) - - def policy(self, observation): - return self.Q[observation].argmax() - - def get_action(self, observation): - if ( - self.exploration_type == "epsilon" - and np.random.random() <= self.exploration_rate - ): - return np.random.choice(self.env.action_space.n) - elif self.exploration_type == "boltzmann": - return np.random.choice( - self.env.action_space.n, - p=softmax(self.exploration_rate * self.Q[observation]), - ) - else: - return self.Q[observation].argmax() - - def fit(self, budget: int, **kwargs): - """ - Train the agent using the provided environment. - Parameters - ---------- - budget: int - number of Q updates. - """ - del kwargs - observation, info = self.env.reset() - episode_rewards = 0 - for i in range(budget): - action = self.get_action(observation) - next_observation, reward, terminated, truncated, info = self.env.step( - action - ) - done = terminated or truncated - episode_rewards += reward - if self.writer is not None: - self.writer.add_scalar("episode_rewards", episode_rewards, i) - if done: - self.Q[observation, action] = reward - else: - self.Q[observation, action] = self.Q[ - observation, action - ] + self.alpha * ( - reward - + self.gamma * np.amax(self.Q[next_observation]) - - self.Q[observation, action] - ) - observation = next_observation - if done: - observation, info = self.env.reset() - episode_rewards = 0 diff --git a/rlberry/agents/tabular_rl/sarsa.py b/rlberry/agents/tabular_rl/sarsa.py deleted file mode 100644 index 3f097d4d4..000000000 --- a/rlberry/agents/tabular_rl/sarsa.py +++ /dev/null @@ -1,125 +0,0 @@ -from typing import Optional, Literal -import numpy as np -from gymnasium import spaces -from scipy.special import softmax - -from rlberry import types -from rlberry.agents import AgentWithSimplePolicy - - -class SARSAAgent(AgentWithSimplePolicy): - """SARSA Agent. - - Parameters - ---------- - env: :class:`~rlberry.types.Env` - Environment with discrete states and actions. - gamma: float, default = 0.99 - Discount factor. - alpha: float, default = 0.1 - Learning rate. - exploration_type: {"epsilon", "boltzmann"}, default: None - If "epsilon": Epsilon-Greedy exploration. - If "boltzmann": Boltzmann exploration. - If None: No exploration. - exploration_rate: float, default: None - epsilon parameter for Epsilon-Greedy exploration or tau parameter for Boltzmann exploration. 
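The fit loop of the Q-learning agent removed above applies the standard one-step, off-policy update. A minimal sketch for a single transition (obs, action, reward, next_obs, done), assuming a Q table `Q`, learning rate `alpha` and discount `gamma` as in that constructor (all placeholder names)::

    import numpy as np

    # terminal transitions store the reward directly, as in the removed fit loop
    if done:
        Q[obs, action] = reward
    else:
        # off-policy target: bootstrap with the greedy value at the next state
        Q[obs, action] += alpha * (
            reward + gamma * np.amax(Q[next_obs]) - Q[obs, action]
        )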
- - Attributes - ---------- - Q : ndarray - 2D array that stores the estimation ofexpected rewards for state-action pairs. - Examples - -------- - >>> from rlberry.envs import GridWorld - >>> - >>> env = GridWorld(walls=(), nrows=5, ncols=5) - >>> agent = SARSAAgent() - >>> agent.fit(budget=1000) - >>> agent.policy(env.observation_space.sample()) - >>> agent.reset() - """ - - def __init__( - self, - env: types.Env, - gamma: float = 0.99, - alpha: float = 0.1, - exploration_type: Optional[Literal["epsilon", "boltzmann"]] = None, - exploration_rate: Optional[float] = None, - **kwargs - ): - # init base class - AgentWithSimplePolicy.__init__(self, env, **kwargs) - - self.gamma = gamma - self.alpha = alpha - self.exploration_type = exploration_type - self.exploration_rate = exploration_rate - # check environment - assert isinstance(self.env.observation_space, spaces.Discrete) - assert isinstance(self.env.action_space, spaces.Discrete) - - # check exploration type - if self.exploration_type is not None: - assert ( - exploration_type == "epsilon" or "boltzmann" - ) and exploration_rate is not None - - self.Q = np.zeros((self.env.observation_space.n, self.env.action_space.n)) - - def reset(self, **kwargs): - self.Q.fill(0) - - def policy(self, observation): - return self.Q[observation].argmax() - - def get_action(self, observation): - if ( - self.exploration_type == "epsilon" - and np.random.random() <= self.exploration_rate - ): - return np.random.choice(self.env.action_space.n) - elif self.exploration_type == "boltzmann": - return np.random.choice( - self.env.action_space.n, - p=softmax(self.exploration_rate * self.Q[observation]), - ) - else: - return self.Q[observation].argmax() - - def fit(self, budget: int, **kwargs): - """ - Train the agent using the provided environment. - Parameters - ---------- - budget: int - number of Q updates. 
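SARSA differs from the Q-learning agent above only in its target: being on-policy, it bootstraps with the value of the action actually sampled for the next state instead of the greedy one. A sketch of the update performed in the loop below, with the same placeholder names as before and `get_action` standing for the exploration rule sketched earlier::

    # on-policy SARSA target: use the sampled next action, not the argmax
    next_action = get_action(next_obs)
    Q[obs, action] += alpha * (
        reward + gamma * Q[next_obs, next_action] - Q[obs, action]
    )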
- """ - del kwargs - observation, info = self.env.reset() - episode_rewards = 0 - for i in range(budget): - action = self.get_action(observation) - next_observation, reward, terminated, truncated, info = self.env.step( - action - ) - done = terminated or truncated - episode_rewards += reward - if self.writer is not None: - self.writer.add_scalar("episode_rewards", episode_rewards, i) - if done: - self.Q[observation, action] = reward - else: - next_action = self.get_action(next_observation) - self.Q[observation, action] = self.Q[ - observation, action - ] + self.alpha * ( - reward - + self.gamma * self.Q[next_observation, next_action] - - self.Q[observation, action] - ) - observation = next_observation - if done: - observation, info = self.env.reset() - episode_rewards = 0 diff --git a/rlberry/agents/tests/test_adaptiveql.py b/rlberry/agents/tests/test_adaptiveql.py deleted file mode 100644 index 4079dcbf9..000000000 --- a/rlberry/agents/tests/test_adaptiveql.py +++ /dev/null @@ -1,12 +0,0 @@ -from rlberry.agents import AdaptiveQLAgent -from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env -import matplotlib.pyplot as plt - - -def test_adaptive_ql(): - env = get_benchmark_env(level=2) - agent = AdaptiveQLAgent(env, horizon=30) - agent.fit(budget=50) - agent.policy(env.observation_space.sample()) - agent.Qtree.plot(0, 20) - plt.clf() diff --git a/rlberry/agents/tests/test_bandits.py b/rlberry/agents/tests/test_bandits.py deleted file mode 100644 index 441e5a3f0..000000000 --- a/rlberry/agents/tests/test_bandits.py +++ /dev/null @@ -1,131 +0,0 @@ -from rlberry.envs.bandits import NormalBandit, BernoulliBandit -from rlberry.agents.bandits import ( - IndexAgent, - RandomizedAgent, - TSAgent, - BanditWithSimplePolicy, - makeBetaPrior, - makeBoundedIMEDIndex, - makeBoundedMOSSIndex, - makeBoundedNPTSIndex, - makeBoundedUCBIndex, - makeETCIndex, - makeGaussianPrior, - makeEXP3Index, - makeSubgaussianMOSSIndex, - makeSubgaussianUCBIndex, - makeBoundedUCBVIndex, -) -from rlberry.utils import check_bandit_agent - - -TEST_SEED = 42 - - -def test_base_bandit(): - assert check_bandit_agent(BanditWithSimplePolicy, NormalBandit, seed=TEST_SEED) - - -bounded_indices = { - "IMED": makeBoundedIMEDIndex, - "MOSS": makeBoundedMOSSIndex, - "NPTS": makeBoundedNPTSIndex, - "UCB": makeBoundedUCBIndex, - "UCBV": makeBoundedUCBVIndex, -} -subgaussian_indices = { - "UCB": makeSubgaussianUCBIndex, - "MOSS": makeSubgaussianMOSSIndex, -} -misc_indices = { - "ETC": makeETCIndex, -} - - -def test_bounded_indices(): - for agent_name, makeIndex in bounded_indices.items(): - - class Agent(IndexAgent): - name = agent_name - - def __init__(self, env, **kwargs): - index, tracker_params = makeIndex() - IndexAgent.__init__( - self, env, index, tracker_params=tracker_params, **kwargs - ) - - assert check_bandit_agent( - Agent, BernoulliBandit, seed=TEST_SEED - ), "Agent not reproducible" - - -def test_subgaussian_indices(): - for agent_name, makeIndex in subgaussian_indices.items(): - - class Agent(IndexAgent): - name = agent_name - - def __init__(self, env, **kwargs): - index, tracker_params = makeIndex() - IndexAgent.__init__( - self, env, index, tracker_params=tracker_params, **kwargs - ) - - assert check_bandit_agent( - Agent, NormalBandit, seed=TEST_SEED - ), "Agent not reproducible" - - -def test_misc_indices(): - for agent_name, makeIndex in misc_indices.items(): - - class Agent(IndexAgent): - name = agent_name - - def __init__(self, env, **kwargs): - index, tracker_params = makeIndex() - 
IndexAgent.__init__( - self, env, index, tracker_params=tracker_params, **kwargs - ) - - assert check_bandit_agent( - Agent, BernoulliBandit, seed=TEST_SEED - ), "Agent not reproducible" - - -def test_randomized_bandits(): - class EXP3Agent(RandomizedAgent): - name = "EXP3" - - def __init__(self, env, **kwargs): - prob, tracker_params = makeEXP3Index() - RandomizedAgent.__init__( - self, env, prob, tracker_params=tracker_params, **kwargs - ) - - assert check_bandit_agent( - EXP3Agent, BernoulliBandit, seed=TEST_SEED - ), "Agent not reproducible" - - -priors = { - "Beta": (makeBetaPrior, BernoulliBandit), - "Gaussian": (makeGaussianPrior, NormalBandit), -} - - -def test_TS(): - for agent_name, (makePrior, Bandit) in priors.items(): - - class Agent(TSAgent): - name = agent_name - - def __init__(self, env, **kwargs): - prior_info, tracker_params = makePrior() - TSAgent.__init__( - self, env, prior_info, tracker_params=tracker_params, **kwargs - ) - - assert check_bandit_agent( - Agent, Bandit, seed=TEST_SEED - ), "Agent not reproducible" diff --git a/rlberry/agents/tests/test_dynprog.py b/rlberry/agents/tests/test_dynprog.py deleted file mode 100644 index 6d96b8f49..000000000 --- a/rlberry/agents/tests/test_dynprog.py +++ /dev/null @@ -1,156 +0,0 @@ -import numpy as np -import pytest - -import rlberry.seeding as seeding -from rlberry.agents.dynprog import ValueIterationAgent -from rlberry.agents.dynprog.utils import backward_induction -from rlberry.agents.dynprog.utils import backward_induction_in_place -from rlberry.agents.dynprog.utils import backward_induction_sd -from rlberry.agents.dynprog.utils import backward_induction_reward_sd -from rlberry.agents.dynprog.utils import bellman_operator -from rlberry.agents.dynprog.utils import value_iteration -from rlberry.envs.finite import FiniteMDP - -_rng = seeding.Seeder(123).rng - - -def get_random_mdp(S, A): - R = _rng.uniform(0.0, 1.0, (S, A)) - P = _rng.uniform(0.0, 1.0, (S, A, S)) - for ss in range(S): - for aa in range(A): - P[ss, aa, :] /= P[ss, aa, :].sum() - return R, P - - -@pytest.mark.parametrize( - "gamma, S, A", - [ - (0.001, 2, 1), - (0.25, 2, 1), - (0.5, 2, 1), - (0.75, 2, 1), - (0.999, 2, 1), - (0.001, 4, 2), - (0.25, 4, 2), - (0.5, 4, 2), - (0.75, 4, 2), - (0.999, 4, 2), - (0.001, 20, 4), - (0.25, 20, 4), - (0.5, 20, 4), - (0.75, 20, 4), - (0.999, 20, 4), - ], -) -def test_bellman_operator_monotonicity_and_contraction(gamma, S, A): - rng = seeding.Seeder(123).rng - vmax = 1.0 / (1.0 - gamma) - for _ in range(10): - # generate random MDP - R, P = get_random_mdp(S, A) - - # generate random Q functions - Q0 = rng.uniform(-vmax, vmax, (S, A)) - Q1 = rng.uniform(-vmax, vmax, (S, A)) - # apply Bellman operator - TQ0 = bellman_operator(Q0, R, P, gamma) - TQ1 = bellman_operator(Q1, R, P, gamma) - - # test contraction - norm_tq = np.abs(TQ1 - TQ0).max() - norm_q = np.abs(Q1 - Q0).max() - assert norm_tq <= gamma * norm_q - - # test monotonicity - Q2 = rng.uniform(-vmax / 2, vmax / 2, (S, A)) - Q3 = Q2 + rng.uniform(0.0, vmax / 2, (S, A)) - TQ2 = bellman_operator(Q2, R, P, gamma) - TQ3 = bellman_operator(Q3, R, P, gamma) - assert np.greater(TQ2, TQ3).sum() == 0 - - -@pytest.mark.parametrize( - "gamma, S, A", - [(0.01, 10, 4), (0.25, 10, 4), (0.5, 10, 4), (0.75, 10, 4), (0.99, 10, 4)], -) -def test_value_iteration(gamma, S, A): - for epsilon in np.logspace(-1, -6, num=5): - for sim in range(5): - # generate random MDP - R, P = get_random_mdp(S, A) - - # run value iteration - Q, V, n_it = value_iteration(R, P, gamma, epsilon) - # check 
precision - TQ = bellman_operator(Q, R, P, gamma) - assert np.abs(TQ - Q).max() <= epsilon - - -@pytest.mark.parametrize("horizon, S, A", [(10, 5, 4), (20, 10, 4)]) -def test_backward_induction(horizon, S, A): - for sim in range(5): - # generate random MDP - R, P = get_random_mdp(S, A) - - # run backward induction - Q, V = backward_induction(R, P, horizon) - - assert Q.max() <= horizon - assert V.max() <= horizon - - # run backward with clipping V to 1.0 - Q, V = backward_induction(R, P, horizon, vmax=1.0) - assert V.max() <= 1.0 - - # run bacward induction in place - Q2 = np.zeros((horizon, S, A)) - V2 = np.zeros((horizon, S)) - backward_induction_in_place(Q2, V2, R, P, horizon, vmax=1.0) - assert np.array_equal(Q, Q2) - assert np.array_equal(V, V2) - - -@pytest.mark.parametrize("horizon, S, A", [(10, 5, 4), (20, 10, 4)]) -def test_backward_induction_sd(horizon, S, A): - """ - Test stage-dependent MDPs - """ - for sim in range(5): - # generate random MDP - Rstat, Pstat = get_random_mdp(S, A) - R = np.zeros((horizon, S, A)) - P = np.zeros((horizon, S, A, S)) - for ii in range(horizon): - R[ii, :, :] = Rstat - P[ii, :, :, :] = Pstat - - # run backward induction in stationary MDP - Qstat, Vstat = backward_induction(Rstat, Pstat, horizon) - - # run backward induction in stage-dependent MDP - Q = np.zeros((horizon, S, A)) - V = np.zeros((horizon, S)) - backward_induction_sd(Q, V, R, P) - - # run backward induction with stage-dependent rewards - Q2 = np.zeros((horizon, S, A)) - V2 = np.zeros((horizon, S)) - backward_induction_reward_sd(Q2, V2, R, Pstat) - - assert np.array_equal(Q, Qstat) - assert np.array_equal(V, Vstat) - assert np.array_equal(Q2, Qstat) - assert np.array_equal(V2, Vstat) - - -@pytest.mark.parametrize("horizon, gamma, S, A", [(None, 0.5, 10, 4), (10, 1.0, 10, 4)]) -def test_value_iteration_agent(horizon, gamma, S, A): - for sim in range(5): - # generate random MDP - R, P = get_random_mdp(S, A) - # create env and agent - env = FiniteMDP(R, P) - agent = ValueIterationAgent(env, gamma=gamma, horizon=horizon) - # run - agent.fit() diff --git a/rlberry/agents/tests/test_kernel_based.py b/rlberry/agents/tests/test_kernel_based.py deleted file mode 100644 index 65abac706..000000000 --- a/rlberry/agents/tests/test_kernel_based.py +++ /dev/null @@ -1,58 +0,0 @@ -import pytest -from rlberry.agents.kernel_based import RSKernelUCBVIAgent -from rlberry.agents.kernel_based import RSUCBVIAgent -from rlberry.agents.kernel_based.kernels import _str_to_int -from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env - - -@pytest.mark.parametrize( - "kernel_type", - [ - "uniform", - "triangular", - "gaussian", - "epanechnikov", - "quartic", - "triweight", - "tricube", - "cosine", - "exp-2", - ], -) -def test_rs_kernel_ucbvi(kernel_type): - for horizon in [None, 30]: - env = get_benchmark_env(level=1) - agent = RSKernelUCBVIAgent( - env, - gamma=0.95, - horizon=horizon, - bonus_scale_factor=0.01, - min_dist=0.2, - bandwidth=0.05, - beta=1.0, - kernel_type=kernel_type, - ) - agent.fit(budget=5) - agent.policy(env.observation_space.sample()) - - -def test_str_to_int(): - for ii in range(100): - assert _str_to_int(str(ii)) == ii - - -def test_rs_ucbvi(): - env = get_benchmark_env(level=1) - agent = RSUCBVIAgent(env, gamma=0.99, horizon=30, bonus_scale_factor=0.1) - agent.fit(budget=5) - agent.policy(env.observation_space.sample()) - - -def test_rs_ucbvi_reward_free(): - env = get_benchmark_env(level=1) - agent = RSUCBVIAgent( - env, gamma=0.99, horizon=30, bonus_scale_factor=0.1, 
reward_free=True - ) - agent.fit(budget=5) - agent.policy(env.observation_space.sample()) - assert agent.R_hat.sum() == 0.0 diff --git a/rlberry/agents/tests/test_lsvi_ucb.py b/rlberry/agents/tests/test_lsvi_ucb.py deleted file mode 100644 index 03299b747..000000000 --- a/rlberry/agents/tests/test_lsvi_ucb.py +++ /dev/null @@ -1,218 +0,0 @@ -import numpy as np -import pytest -from rlberry.agents.features import FeatureMap -from rlberry.agents.linear.lsvi_ucb import LSVIUCBAgent -from rlberry.agents.dynprog import ValueIterationAgent -from rlberry.envs.finite import GridWorld - - -class OneHotFeatureMap(FeatureMap): - def __init__(self, S, A): - self.S = S - self.A = A - self.shape = (S * A,) - - def map(self, observation, action): - feat = np.zeros((self.S, self.A)) - feat[observation, action] = 1.0 - return feat.flatten() - - -class RandomFeatMap(FeatureMap): - def __init__(self, S, A): - self.feat_mat = np.random.randn(S, A, 10) - self.shape = (10,) - - def map(self, observation, action): - feat = self.feat_mat[observation, action, :] - return feat.copy() - - -@pytest.mark.parametrize("FeatMapClass", [OneHotFeatureMap, RandomFeatMap]) -def test_lsvi_ucb_matrix_inversion(FeatMapClass): - env = GridWorld(nrows=3, ncols=3, walls=()) - env.reseed(123) - - def feature_map_fn(_env): - return FeatMapClass(_env.observation_space.n, _env.action_space.n) - - reg_factor = 0.1 - agent = LSVIUCBAgent( - env, feature_map_fn=feature_map_fn, horizon=10, reg_factor=reg_factor - ) - agent.reseed(123) - agent.fit(budget=50) - assert np.allclose(np.linalg.inv(agent.lambda_mat), agent.lambda_mat_inv) - assert agent.episode == 50 - agent.policy(env.observation_space.sample()) - - # Check counts - if FeatMapClass != OneHotFeatureMap: - return - - S = env.observation_space.n - A = env.action_space.n - N_sa = np.zeros((S, A)) - for state, action in zip(agent.state_hist, agent.action_hist): - N_sa[state, action] += 1.0 - - assert np.allclose( - agent.lambda_mat_inv.diagonal(), 1.0 / (N_sa.flatten() + reg_factor) - ) - - for ss in range(S): - for aa in range(A): - feat = agent.feature_map.map(ss, aa) - assert np.allclose( - feat @ (agent.lambda_mat_inv.T @ feat), - 1.0 / (N_sa[ss, aa] + reg_factor), - ) - - -def test_lsvi_without_bonus(): - def lsvi_debug_gather_data(agent): - """ - Function to gather data sampling uniformly - states and actions - """ - N = agent.n_episodes * agent.horizon - count = 0 - while count < N: - state = agent.env.observation_space.sample() - action = agent.env.action_space.sample() - next_state, reward, terminated, truncated, info = agent.env.sample( - state, action - ) - done = terminated or truncated - # - # - feat = agent.feature_map.map(state, action) - outer_prod = np.outer(feat, feat) - inv = agent.lambda_mat_inv - - # - agent.lambda_mat += np.outer(feat, feat) - # update inverse - agent.lambda_mat_inv -= (inv @ outer_prod @ inv) / (1 + feat @ inv.T @ feat) - - # update history - agent.reward_hist[count] = reward - agent.state_hist.append(state) - agent.action_hist.append(action) - agent.nstate_hist.append(next_state) - - # - tt = agent.total_time_steps - agent.feat_hist[tt, :] = agent.feature_map.map(state, action) - for aa in range(agent.env.action_space.n): - agent.feat_ns_all_actions[tt, aa, :] = agent.feature_map.map( - next_state, aa - ) - - # increments - agent.total_time_steps += 1 - count += 1 - - env = GridWorld(nrows=2, ncols=2, walls=(), success_probability=0.95) - env.reseed(123) - - def feature_map_fn(_env): - return OneHotFeatureMap(_env.observation_space.n, 
_env.action_space.n) - - agent = LSVIUCBAgent( - env, feature_map_fn=feature_map_fn, horizon=20, gamma=0.99, reg_factor=1e-5 - ) - agent.reseed(123) - agent.n_episodes = 100 - agent.reset() - - lsvi_debug_gather_data(agent) - # estimated Q - S = env.observation_space.n - Q_est = agent._run_lsvi(bonus_factor=0.0)[0, :].reshape((S, -1)) - - # near optimal Q - agent_opt = ValueIterationAgent(env, gamma=0.99, horizon=20) - agent_opt.fit() - Q = agent_opt.Q[0, :, :] - - print(Q) - print("---") - print(Q_est) - - print("-------") - print(np.abs(Q - Q_est)) - # Check error - assert Q_est == pytest.approx(Q, rel=0.01) - - -def test_lsvi_random_exploration(): - env = GridWorld(nrows=2, ncols=2, walls=(), success_probability=0.95) - env.reseed(123) - - def feature_map_fn(_env): - return OneHotFeatureMap(_env.observation_space.n, _env.action_space.n) - - agent = LSVIUCBAgent( - env, - feature_map_fn=feature_map_fn, - horizon=20, - gamma=0.99, - reg_factor=1e-5, - bonus_scale_factor=0.0, - ) - agent.reseed(123) - agent.fit(budget=250) - - # estimated Q - S = env.observation_space.n - Q_est = agent._run_lsvi(bonus_factor=0.0)[0, :].reshape((S, -1)) - - # near optimal Q - agent_opt = ValueIterationAgent(env, gamma=0.99, horizon=20) - agent_opt.fit() - Q = agent_opt.Q[0, :, :] - - print(Q) - print("---") - print(Q_est) - - print("-------") - print(np.abs(Q - Q_est)) - # Check error - assert np.abs(Q - Q_est).mean() < 0.1 - - -def test_lsvi_optimism(): - env = GridWorld(nrows=2, ncols=2, walls=()) - - def feature_map_fn(_env): - return OneHotFeatureMap(_env.observation_space.n, _env.action_space.n) - - agent = LSVIUCBAgent( - env, - gamma=0.99, - feature_map_fn=feature_map_fn, - horizon=3, - bonus_scale_factor=3, - reg_factor=0.000001, - ) - agent.fit(budget=250) - - # near optimal Q - agent_opt = ValueIterationAgent(env, gamma=0.99, horizon=3) - agent_opt.fit() - Q = agent_opt.Q[0, :, :] - - # optimistic Q - S = env.observation_space.n - A = env.action_space.n - Q_optimistic = np.zeros((S, A)) - for ss in range(S): - Q_optimistic[ss, :] = agent._compute_q_vec( - agent.w_vec[0, :], ss, agent.bonus_scale_factor - ) - - print(Q) - print(Q_optimistic) - assert (Q_optimistic - Q).min() >= -1e-5 diff --git a/rlberry/agents/tests/test_mbqvi.py b/rlberry/agents/tests/test_mbqvi.py deleted file mode 100644 index cafdb5566..000000000 --- a/rlberry/agents/tests/test_mbqvi.py +++ /dev/null @@ -1,27 +0,0 @@ -import numpy as np -import pytest - -from rlberry.seeding import Seeder -from rlberry.agents.mbqvi import MBQVIAgent -from rlberry.envs.finite import FiniteMDP - - -@pytest.mark.parametrize("S, A", [(5, 2), (10, 4)]) -def test_mbqvi(S, A): - rng = Seeder(123).rng - - for sim in range(5): - # generate random MDP with deterministic transitions - R = rng.uniform(0.0, 1.0, (S, A)) - P = np.zeros((S, A, S)) - for ss in range(S): - for aa in range(A): - ns = rng.integers(0, S) - P[ss, aa, ns] = 1 - - # run MBQVI and check exactness of estimators - env = FiniteMDP(R, P) - agent = MBQVIAgent(env, n_samples=1) - agent.fit() - assert np.abs(R - agent.R_hat).max() < 1e-16 - assert np.abs(P - agent.P_hat).max() < 1e-16 diff --git a/rlberry/agents/tests/test_optql.py b/rlberry/agents/tests/test_optql.py deleted file mode 100644 index 35adf21d6..000000000 --- a/rlberry/agents/tests/test_optql.py +++ /dev/null @@ -1,9 +0,0 @@ -from rlberry.agents.optql import OptQLAgent -from rlberry.envs.finite import GridWorld - - -def test_optql(): - env = GridWorld(walls=(), nrows=5, ncols=5) - agent = OptQLAgent(env, horizon=11, 
gamma=0.99, bonus_scale_factor=0.1) - agent.fit(budget=50) - agent.policy(env.observation_space.sample()) diff --git a/rlberry/agents/tests/test_psrl.py b/rlberry/agents/tests/test_psrl.py deleted file mode 100644 index 325777f6d..000000000 --- a/rlberry/agents/tests/test_psrl.py +++ /dev/null @@ -1,29 +0,0 @@ -import pytest -from rlberry.agents.psrl import PSRLAgent -from rlberry.envs.finite import GridWorld - - -@pytest.mark.parametrize( - "gamma, stage_dependent, bernoullized_reward", - [ - (1.0, True, True), - (1.0, True, False), - (1.0, False, True), - (1.0, False, False), - (0.9, True, True), - (0.9, True, False), - (0.9, False, True), - (0.9, False, False), - ], -) -def test_ucbvi(gamma, stage_dependent, bernoullized_reward): - env = GridWorld(walls=(), nrows=5, ncols=5) - agent = PSRLAgent( - env, - horizon=11, - bernoullized_reward=bernoullized_reward, - stage_dependent=stage_dependent, - gamma=gamma, - ) - agent.fit(budget=50) - agent.policy(env.observation_space.sample()) diff --git a/rlberry/agents/tests/test_replay.py b/rlberry/agents/tests/test_replay.py index bad1a297e..167bf80c5 100644 --- a/rlberry/agents/tests/test_replay.py +++ b/rlberry/agents/tests/test_replay.py @@ -1,7 +1,7 @@ import pytest import numpy as np from rlberry.agents.utils import replay -from rlberry.envs.finite import GridWorld +from rlberry_research.envs.finite import GridWorld from gymnasium.wrappers import TimeLimit @@ -56,27 +56,31 @@ def test_replay_size(): @pytest.mark.parametrize("sampling_mode", ["uniform", "prioritized"]) -def test_replay_sampling(sampling_mode): +@pytest.mark.parametrize("max_replay_size", [128, 500]) +def test_replay_sampling(sampling_mode, max_replay_size): batch_size = 128 chunk_size = 256 # get replay buffer - buffer, _ = _get_filled_replay(max_replay_size=500) + buffer, _ = _get_filled_replay(max_replay_size=max_replay_size) # Sample batches, check shape and dtype for _ in range(10): batch = buffer.sample( batch_size=batch_size, chunk_size=chunk_size, sampling_mode=sampling_mode ) - for tag in buffer.tags: - assert batch.data[tag].shape[:2] == (batch_size, chunk_size) - assert batch.data[tag].dtype == buffer.dtypes[tag] - assert np.array_equal( - np.array(buffer.data[tag], dtype=buffer.dtypes[tag])[ - batch.info["indices"] - ], - batch.data[tag], - ) + if chunk_size > max_replay_size: + assert batch is None + else: + for tag in buffer.tags: + assert batch.data[tag].shape[:2] == (batch_size, chunk_size) + assert batch.data[tag].dtype == buffer.dtypes[tag] + assert np.array_equal( + np.array(buffer.data[tag], dtype=buffer.dtypes[tag])[ + batch.info["indices"] + ], + batch.data[tag], + ) def test_replay_priority_update(): diff --git a/rlberry/agents/tests/test_rlsvi.py b/rlberry/agents/tests/test_rlsvi.py deleted file mode 100644 index 0907d8d33..000000000 --- a/rlberry/agents/tests/test_rlsvi.py +++ /dev/null @@ -1,19 +0,0 @@ -import pytest -from rlberry.agents.rlsvi import RLSVIAgent -from rlberry.envs.finite import GridWorld - - -@pytest.mark.parametrize( - "gamma, stage_dependent", - [ - (1.0, True), - (1.0, False), - (0.9, True), - (0.9, False), - ], -) -def test_rlsvi(gamma, stage_dependent): - env = GridWorld(walls=(), nrows=5, ncols=5) - agent = RLSVIAgent(env, horizon=11, stage_dependent=stage_dependent, gamma=gamma) - agent.fit(budget=50) - agent.policy(env.observation_space.sample()) diff --git a/rlberry/agents/tests/test_tabular_rl.py b/rlberry/agents/tests/test_tabular_rl.py deleted file mode 100644 index ab7f618a3..000000000 --- 
a/rlberry/agents/tests/test_tabular_rl.py +++ /dev/null @@ -1,33 +0,0 @@ -import pytest -from rlberry.agents import QLAgent, SARSAAgent -from rlberry.envs import GridWorld - - -@pytest.mark.parametrize( - "exploration_type, exploration_rate", - [("epsilon", 0.5), ("boltzmann", 0.5), (None, None)], -) -def test_ql(exploration_type, exploration_rate): - env = GridWorld(walls=(), nrows=5, ncols=5) - agent = QLAgent( - env, exploration_type=exploration_type, exploration_rate=exploration_rate - ) - agent.fit(budget=50) - agent.policy(env.observation_space.sample()) - agent.reset() - assert not agent.Q.any() - - -@pytest.mark.parametrize( - "exploration_type, exploration_rate", - [("epsilon", 0.5), ("boltzmann", 0.5), (None, None)], -) -def test_sarsa(exploration_type, exploration_rate): - env = GridWorld(walls=(), nrows=5, ncols=5) - agent = SARSAAgent( - env, exploration_type=exploration_type, exploration_rate=exploration_rate - ) - agent.fit(budget=50) - agent.policy(env.observation_space.sample()) - agent.reset() - assert not agent.Q.any() diff --git a/rlberry/agents/tests/test_ucbvi.py b/rlberry/agents/tests/test_ucbvi.py deleted file mode 100644 index 641fe0c02..000000000 --- a/rlberry/agents/tests/test_ucbvi.py +++ /dev/null @@ -1,30 +0,0 @@ -import pytest -from rlberry.agents.ucbvi import UCBVIAgent -from rlberry.envs.finite import GridWorld - - -@pytest.mark.parametrize( - "gamma, stage_dependent, real_time_dp", - [ - (1.0, True, True), - (1.0, True, False), - (1.0, False, True), - (1.0, False, False), - (0.9, True, True), - (0.9, True, False), - (0.9, False, True), - (0.9, False, False), - ], -) -def test_ucbvi(gamma, stage_dependent, real_time_dp): - env = GridWorld(walls=(), nrows=5, ncols=5) - agent = UCBVIAgent( - env, - horizon=11, - stage_dependent=stage_dependent, - gamma=gamma, - real_time_dp=real_time_dp, - bonus_scale_factor=0.1, - ) - agent.fit(budget=50) - agent.policy(env.observation_space.sample()) diff --git a/rlberry/agents/torch/__init__.py b/rlberry/agents/torch/__init__.py deleted file mode 100644 index 896403dc6..000000000 --- a/rlberry/agents/torch/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Torch agents (in alphabetical order) -from .a2c import A2CAgent -from .dqn import DQNAgent -from .dqn import MunchausenDQNAgent -from .ppo import PPOAgent -from .reinforce import REINFORCEAgent -from .sac import SACAgent diff --git a/rlberry/agents/torch/a2c/__init__.py b/rlberry/agents/torch/a2c/__init__.py deleted file mode 100644 index 4581caf68..000000000 --- a/rlberry/agents/torch/a2c/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .a2c import A2CAgent diff --git a/rlberry/agents/torch/a2c/a2c.py b/rlberry/agents/torch/a2c/a2c.py deleted file mode 100644 index 9907677e1..000000000 --- a/rlberry/agents/torch/a2c/a2c.py +++ /dev/null @@ -1,338 +0,0 @@ -import torch -import torch.nn as nn - -import gymnasium.spaces as spaces -import numpy as np -from rlberry.agents import AgentWithSimplePolicy, AgentTorch -from rlberry.agents.utils.replay import ReplayBuffer -from rlberry.agents.torch.utils.training import optimizer_factory -from rlberry.agents.torch.utils.models import default_policy_net_fn -from rlberry.agents.torch.utils.models import default_value_net_fn -from rlberry.utils.torch import choose_device -from rlberry.utils.factory import load -from typing import Optional - -import rlberry - -logger = rlberry.logger - - -class A2CAgent(AgentTorch, AgentWithSimplePolicy): - """ - Advantage Actor Critic Agent. 
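A minimal usage sketch for the class removed in this file, using the import path that existed before this refactor; the environment id, the hyperparameters and the budget are arbitrary placeholders, and any Gymnasium environment with a Box observation space and discrete actions works the same way::

    import gymnasium as gym
    from rlberry.agents.torch import A2CAgent  # pre-refactor import path

    env = gym.make("CartPole-v1")
    agent = A2CAgent(env, batch_size=256, gamma=0.99)
    agent.fit(budget=10_000)              # budget = number of environment transitions
    observation, info = env.reset()
    action = agent.policy(observation)    # action sampled from the trained policy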
- - A2C, or Advantage Actor Critic, is a synchronous version of the A3C policy - gradient method. As an alternative to the asynchronous implementation of - A3C, A2C is a synchronous, deterministic implementation that waits for each - actor to finish its segment of experience before updating, averaging over - all of the actors. This more effectively uses GPUs due to larger batch sizes. - - Parameters - ---------- - env : Model - Online model with continuous (Box) state space and discrete actions - batch_size : int - Number of timesteps to wait before updating the policy. - gamma : double - Discount factor in [0, 1]. - entr_coef : double - Entropy coefficient. - learning_rate : double - Learning rate. - optimizer_type: str - Type of optimizer. 'ADAM' by defaut. - policy_net_fn : function(env, **kwargs) - Function that returns an instance of a policy network (pytorch). - If None, a default net is used. - value_net_fn : function(env, **kwargs) - Function that returns an instance of a value network (pytorch). - If None, a default net is used. - policy_net_kwargs : dict - kwargs for policy_net_fn - value_net_kwargs : dict - kwargs for value_net_fn - device : str - Device to put the tensors on - eval_interval : int, default = None - Interval (in number of transitions) between agent evaluations in fit(). - If None, never evaluate. - - References - ---------- - Mnih, V., Badia, A.P., Mirza, M., Graves, A., Lillicrap, T., Harley, T., - Silver, D. & Kavukcuoglu, K. (2016). - "Asynchronous methods for deep reinforcement learning." - In International Conference on Machine Learning (pp. 1928-1937). - """ - - name = "A2C" - - def __init__( - self, - env, - batch_size=256, - gamma=0.99, - entr_coef=0.01, - learning_rate=0.01, - optimizer_type="ADAM", - policy_net_fn=None, - value_net_fn=None, - policy_net_kwargs=None, - value_net_kwargs=None, - device="cuda:best", - eval_interval: Optional[int] = None, - **kwargs - ): - AgentWithSimplePolicy.__init__(self, env, **kwargs) - - self.batch_size = batch_size - self.gamma = gamma - self.entr_coef = entr_coef - self.learning_rate = learning_rate - self.device = choose_device(device) - self.eval_interval = eval_interval - - self.policy_net_kwargs = policy_net_kwargs or {} - self.value_net_kwargs = value_net_kwargs or {} - - if isinstance(policy_net_fn, str): - self.policy_net_fn = load(policy_net_fn) - elif policy_net_fn is None: - self.policy_net_fn = default_policy_net_fn - else: - self.policy_net_fn = policy_net_fn - - if isinstance(value_net_fn, str): - self.value_net_fn = load(value_net_fn) - elif value_net_fn is None: - self.value_net_fn = default_value_net_fn - else: - self.value_net_fn = value_net_fn - - self.optimizer_kwargs = {"optimizer_type": optimizer_type, "lr": learning_rate} - self.optimizer_type = optimizer_type - - # check environment - assert isinstance(self.env.observation_space, spaces.Box) - - # get horizon - if hasattr(self.env, "_max_episode_steps"): - max_episode_steps = self.env._max_episode_steps - else: - max_episode_steps = np.inf - self._max_episode_steps = max_episode_steps - - self._policy = None # categorical policy function - - # initialize - self.reset() - - def reset(self): - self._policy = self.policy_net_fn(self.env, **self.policy_net_kwargs).to( - self.device - ) - self._policy_optimizer = optimizer_factory( - self._policy.parameters(), **self.optimizer_kwargs - ) - - self.value_net = self.value_net_fn(self.env, **self.value_net_kwargs).to( - self.device - ) - - self.value_optimizer = optimizer_factory( - 
self.value_net.parameters(), **self.optimizer_kwargs - ) - - self._policy_old = self.policy_net_fn(self.env, **self.policy_net_kwargs).to( - self.device - ) - self._policy_old.load_state_dict(self._policy.state_dict()) - - self.mse_loss = nn.MSELoss() - - self.memory = ReplayBuffer(max_replay_size=self.batch_size, rng=self.rng) - self.memory.setup_entry("states", dtype=np.float32) - if self._policy.ctns_actions: - self.memory.setup_entry("actions", dtype=np.float32) - else: - self.memory.setup_entry("actions", dtype=int) - self.memory.setup_entry("rewards", dtype=np.float32) - self.memory.setup_entry("dones", dtype=bool) - - self.total_timesteps = 0 - self.total_episodes = 0 - - def policy(self, observation): - state = observation - assert self._policy is not None - state = torch.from_numpy(state).float().to(self.device) - action_dist = self._policy_old(state) - if self._policy.ctns_actions: - action = action_dist.sample().numpy() - else: - action = action_dist.sample().item() - return action - - def fit(self, budget: int, **kwargs): - """ - Train the agent using the provided environment. - - Parameters - ---------- - budget: int - Number of timesteps to train the agent for. - One step = one transition in the environment. - """ - del kwargs - timesteps_counter = 0 - episode_rewards = 0.0 - episode_timesteps = 0 - observation, info = self.env.reset() - while timesteps_counter < budget: - action = self._select_action(observation) - next_observation, reward, terminated, truncated, info = self.env.step( - action - ) - done = terminated or truncated - # if self._policy.ctns_actions: - # action = torch.from_numpy(action).float().to(self.device) - # store data - episode_rewards += reward - self.memory.append( - { - "states": observation, - "actions": action, - "rewards": reward, - "dones": done, - } - ) - - # counters and next obs - self.total_timesteps += 1 - timesteps_counter += 1 - episode_timesteps += 1 - observation = next_observation - - # update - if self.total_timesteps % self.batch_size == 0: - self._update() - - # eval - total_timesteps = self.total_timesteps - if ( - self.eval_interval is not None - and total_timesteps % self.eval_interval == 0 - ): - eval_rewards = self.eval( - eval_horizon=self._max_episode_steps, gamma=1.0 - ) - if self.writer: - memory_size = len(self.memory) - self.writer.add_scalar( - "eval_rewards", eval_rewards, total_timesteps - ) - self.writer.add_scalar("memory_size", memory_size, total_timesteps) - - # check if episode ended - if done: - self.total_episodes += 1 - self.memory.end_episode() - if self.writer: - self.writer.add_scalar( - "episode_rewards", episode_rewards, total_timesteps - ) - self.writer.add_scalar( - "total_episodes", self.total_episodes, total_timesteps - ) - episode_rewards = 0.0 - episode_timesteps = 0 - observation, info = self.env.reset() - - def _select_action(self, state): - state = torch.from_numpy(state).float().to(self.device) - action_dist = self._policy_old(state) - action = action_dist.sample() - if self._policy.ctns_actions: - action = action.numpy() - else: - action = action.item() - return action - - def _update(self): - # monte carlo estimate of rewards - rewards = [] - discounted_reward = 0 - - memory_data = self.memory.data - memory_states = memory_data["states"] - memory_actions = memory_data["actions"] - memory_rewards = memory_data["rewards"] - memory_dones = memory_data["dones"] - - for reward, is_terminal in zip( - reversed(memory_rewards), reversed(memory_dones) - ): - if is_terminal: - discounted_reward = 0 - 
discounted_reward = reward + (self.gamma * discounted_reward) - rewards.insert(0, discounted_reward) - - # convert to tensor - rewards = torch.FloatTensor(rewards).to(self.device) - memory_states_tensors = [ - torch.tensor(states).to(self.device).float() for states in memory_states - ] - memory_actions_tensors = [ - torch.tensor(actions).to(self.device) for actions in memory_actions - ] - - # convert list to tensor - old_states = torch.stack(memory_states_tensors).to(self.device).detach() - old_actions = torch.stack(memory_actions_tensors).to(self.device).detach() - - # evaluate old actions and values - action_dist = self._policy(old_states) - logprobs = action_dist.log_prob(old_actions) - state_values = torch.squeeze(self.value_net(old_states)) - dist_entropy = action_dist.entropy() - - # normalize the advantages - advantages = rewards - state_values.detach() - advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) - # find pg loss - pg_loss = -logprobs * advantages - loss = ( - pg_loss - + 0.5 * self.mse_loss(state_values, rewards) - - self.entr_coef * dist_entropy - ) - - # take gradient step - self._policy_optimizer.zero_grad() - self.value_optimizer.zero_grad() - - loss.mean().backward() - - self._policy_optimizer.step() - self.value_optimizer.step() - - # copy new weights into old policy - self._policy_old.load_state_dict(self._policy.state_dict()) - - # - # For hyperparameter optimization - # - @classmethod - def sample_parameters(cls, trial): - batch_size = trial.suggest_categorical("batch_size", [1, 4, 8, 16, 32]) - gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.99]) - learning_rate = trial.suggest_float("learning_rate", 1e-5, 1, log=True) - - entr_coef = trial.suggest_float("entr_coef", 1e-8, 0.1, log=True) - - return { - "batch_size": batch_size, - "gamma": gamma, - "learning_rate": learning_rate, - "entr_coef": entr_coef, - } diff --git a/rlberry/agents/torch/dqn/__init__.py b/rlberry/agents/torch/dqn/__init__.py deleted file mode 100644 index 7f799acbe..000000000 --- a/rlberry/agents/torch/dqn/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .dqn import DQNAgent -from .mdqn import MunchausenDQNAgent diff --git a/rlberry/agents/torch/dqn/dqn.py b/rlberry/agents/torch/dqn/dqn.py deleted file mode 100644 index 84219c8c8..000000000 --- a/rlberry/agents/torch/dqn/dqn.py +++ /dev/null @@ -1,513 +0,0 @@ -import inspect -from typing import Callable, Optional, Union - -from gymnasium import spaces -import numpy as np -import torch - -from rlberry import types -from rlberry.agents import AgentWithSimplePolicy, AgentTorch -from rlberry.agents.torch.utils.training import ( - loss_function_factory, - model_factory, - optimizer_factory, - size_model_config, -) -from rlberry.agents.torch.dqn.dqn_utils import polynomial_schedule, lambda_returns -from rlberry.agents.utils import replay -from rlberry.utils.torch import choose_device -from rlberry.utils.factory import load - - -import rlberry - -logger = rlberry.logger - - -def default_q_net_fn(env, **kwargs): - """ - Returns a default Q value network. - """ - del kwargs - model_config = { - "type": "MultiLayerPerceptron", - "layer_sizes": (64, 64), - "reshape": False, - } - model_config = size_model_config(env, **model_config) - return model_factory(**model_config) - - -class DQNAgent(AgentTorch, AgentWithSimplePolicy): - """DQN Agent based on PyTorch. - - Notes - ----- - Uses Q(lambda) for computing targets by default. To recover - the standard DQN, set :code:`lambda_ = 0.0` and :code:`chunk_size = 1`. 
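A sketch of the two configurations described in this note, using only constructor arguments documented below; the environment id is an arbitrary placeholder and the import path is the pre-refactor one::

    import gymnasium as gym
    from rlberry.agents.torch import DQNAgent  # pre-refactor import path

    env = gym.make("CartPole-v1")
    # default configuration: Q(lambda) targets over chunks of 8 transitions
    agent = DQNAgent(env, lambda_=0.5, chunk_size=8)
    # standard one-step DQN targets, as the note above explains
    vanilla = DQNAgent(env, lambda_=0.0, chunk_size=1)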
- - Parameters - ---------- - env: :class:`~rlberry.types.Env` - Environment, can be a tuple (constructor, kwargs) - gamma: float, default = 0.99 - Discount factor. - batch_size: int, default=32 - Batch size. - chunk_size: int, default=8 - Length of sub-trajectories sampled from the replay buffer. - lambda_: float, default=0.5 - Q(lambda) parameter. - target_update_parameter : int or float - If int: interval (in number total number of online updates) between updates of the target network. - If float: soft update coefficient - device: str - Torch device, see :func:`~rlberry.utils.torch.choose_device` - learning_rate : float, default = 1e-3 - Optimizer learning rate. - loss_function: {"l1", "l2", "smooth_l1"}, default: "l2" - Loss function used to compute Bellman error. - epsilon_init: float, default = 1.0 - Initial epsilon value for epsilon-greedy exploration. - epsilon_final: float, default = 0.1 - Final epsilon value for epsilon-greedy exploration. - epsilon_decay_interval : int - After :code:`epsilon_decay` timesteps, epsilon approaches :code:`epsilon_final`. - optimizer_type : {"ADAM", "RMS_PROP"} - Optimization algorithm. - q_net_constructor : Callable, str or None - Function/constructor that returns a torch module for the Q-network: - :code:`qnet = q_net_constructor(env, **kwargs)`. - - Module (Q-network) requirements: - - * Input shape = (batch_dim, chunk_size, obs_dims) - - * Ouput shape = (batch_dim, chunk_size, number_of_actions) - - Example: use `rlberry.agents.torch.utils.training.model_factory_from_env`, - and `q_net_kwargs` - parameter to modify the neural network:: - - model_configs = { - "type": "MultiLayerPerceptron", - "layer_sizes": (5, 5), - "reshape": False, - } - - agent = DQNAgent(env, - q_net_constructor=model_factory_from_env, - q_net_kwargs=model_configs - ) - If str then it should correspond to the full path to the constructor function, - e.g.:: - agent = DQNAgent(env, - q_net_constructor='rlberry.agents.torch.utils.training.model_factory_from_env', - q_net_kwargs=model_configs - ) - - If None then it is set to MultiLayerPerceptron with 2 hidden layers - of size 64 - - q_net_kwargs : optional, dict - Parameters for q_net_constructor. - use_double_dqn : bool, default = False - If True, use Double DQN. - use_prioritized_replay : bool, default = False - If True, use Prioritized Experience Replay. - train_interval: int - Update the model every :code:`train_interval` steps. - If -1, train only at the end of the episodes. - gradient_steps: int - How many gradient steps to do at each update. - If -1, take the number of timesteps since last update. - max_replay_size : int - Maximum number of transitions in the replay buffer. - learning_starts : int - How many steps of the model to collect transitions for before learning starts - eval_interval : int, default = None - Interval (in number of transitions) between agent evaluations in fit(). - If None, never evaluate. - - Attributes - ---------- - gamma : float, default: 0.99 - Discount factor used to discount future rewards in the Bellman equation. - batch_size : int, default: 32 - Batch size used during the training process. - chunk_size : int, default: 8 - Length of sub-trajectories sampled from the replay buffer. - lambda_ : float, default: 0.5 - Q(lambda) parameter used in Q(lambda) algorithm for computing targets. - target_update_parameter : int or float - The parameter that controls the update frequency of the target network. - If int: interval (in number of total online updates) between updates of the target network. 
- If float: soft update coefficient, which controls the rate at which the target network approaches - the online network. - device : str - Torch device on which the agent's neural networks are placed. Use "cuda:best" to choose the best - available GPU device. - learning_rate : float, default: 1e-3 - Learning rate used by the optimizer during neural network training. - epsilon_init : float, default: 1.0 - Initial epsilon value for epsilon-greedy exploration. Epsilon-greedy policy is used to balance - exploration and exploitation during training. - epsilon_final : float, default: 0.1 - Final epsilon value for epsilon-greedy exploration. Epsilon will approach this value as the agent - gains more experience. - epsilon_decay_interval : int - The number of timesteps after which the epsilon value will approach `epsilon_final`. - loss_function : {"l1", "l2", "smooth_l1"}, default: "l2" - The loss function used to compute the Bellman error during training. The available options are - Mean Absolute Error ("l1"), Mean Squared Error ("l2"), and Smooth L1 Loss ("smooth_l1"). - optimizer_type : {"ADAM", "RMS_PROP"} - The optimization algorithm used during neural network training. Choose between ADAM and RMS_PROP. - q_net_constructor : Callable, str or None - Function/constructor that returns a torch module for the Q-network. - Example: use `rlberry.agents.torch.utils.training.model_factory_from_env` and `q_net_kwargs` - parameter to modify the neural network. - q_net_kwargs : optional, dict - Parameters for `q_net_constructor`. - use_double_dqn : bool, default: False - If True, use Double DQN algorithm, which helps to reduce overestimation bias in Q-value estimates. - use_prioritized_replay : bool, default: False - If True, use Prioritized Experience Replay, which prioritizes transitions in the replay buffer - based on their TD-errors, to improve the learning process. - train_interval : int - The agent updates the model every `train_interval` steps. If -1, the agent only trains at the end - of each episode. - gradient_steps : int - The number of gradient steps to perform at each model update. If -1, the number of timesteps since - the last update will be used. - max_replay_size : int - The maximum number of transitions allowed in the replay buffer. - learning_starts : int - The number of steps of the model to collect transitions for before learning starts. - eval_interval : int, default: None - The interval (in number of transitions) between agent evaluations in the `fit()` method. If None, - the agent won't evaluate during training. 
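The float branch of `target_update_parameter` described above is plain Polyak averaging, matching the target-update code further down; a sketch assuming two placeholder `torch.nn.Module` instances `online_net` and `target_net` and a coefficient `tau` in (0, 1)::

    import torch.nn as nn

    online_net, target_net = nn.Linear(4, 2), nn.Linear(4, 2)   # placeholder networks
    tau = 0.005

    # move each target parameter a small step towards the online parameter
    for param, target_param in zip(online_net.parameters(), target_net.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)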
- """ - - name = "DQN" - - def __init__( - self, - env: types.Env, - gamma: float = 0.99, - batch_size: int = 32, - chunk_size: int = 8, - lambda_: float = 0.5, - target_update_parameter: Union[int, float] = 0.005, - device: str = "cuda:best", - learning_rate: float = 1e-3, - epsilon_init: float = 1.0, - epsilon_final: float = 0.1, - epsilon_decay_interval: int = 20_000, - loss_function: str = "l2", - optimizer_type: str = "ADAM", - q_net_constructor: Optional[Callable[..., torch.nn.Module]] = None, - q_net_kwargs: Optional[dict] = None, - use_double_dqn: bool = False, - use_prioritized_replay: bool = False, - train_interval: int = 10, - gradient_steps: int = -1, - max_replay_size: int = 200_000, - learning_starts: int = 5_000, - eval_interval: Optional[int] = None, - **kwargs, - ): - # For all parameters, define self.param = param - _, _, _, values = inspect.getargvalues(inspect.currentframe()) - - values.pop("self") - for arg, val in values.items(): - setattr(self, arg, val) - - AgentWithSimplePolicy.__init__(self, env, **kwargs) - env = self.env - assert isinstance(env.observation_space, spaces.Box) - assert isinstance(env.action_space, spaces.Discrete) - - # DQN parameters - - # Online and target Q networks, torch device - self._device = choose_device(device) - if isinstance(q_net_constructor, str): - q_net_ctor = load(q_net_constructor) - elif q_net_constructor is None: - q_net_ctor = default_q_net_fn - else: - q_net_ctor = q_net_constructor - q_net_kwargs = q_net_kwargs or dict() - self._qnet_online = q_net_ctor(env, **q_net_kwargs).to(self._device) - self._qnet_target = q_net_ctor(env, **q_net_kwargs).to(self._device) - - # Optimizer and loss - optimizer_kwargs = {"optimizer_type": optimizer_type, "lr": learning_rate} - self._optimizer = optimizer_factory( - self._qnet_online.parameters(), **optimizer_kwargs - ) - self._loss_function = loss_function_factory(loss_function, reduction="none") - - # Training params - self._train_interval = train_interval - self._gradient_steps = gradient_steps - self._learning_starts = learning_starts - self._learning_starts = learning_starts - self._eval_interval = eval_interval - - # Setup replay buffer - if hasattr(self.env, "_max_episode_steps"): - max_episode_steps = self.env._max_episode_steps - else: - max_episode_steps = np.inf - self._max_episode_steps = max_episode_steps - - self._replay_buffer = replay.ReplayBuffer( - max_replay_size=max_replay_size, - rng=self.rng, - max_episode_steps=self._max_episode_steps, - enable_prioritized=use_prioritized_replay, - ) - self._replay_buffer.setup_entry("observations", np.float32) - self._replay_buffer.setup_entry("next_observations", np.float32) - self._replay_buffer.setup_entry("actions", np.int32) - self._replay_buffer.setup_entry("rewards", np.float32) - self._replay_buffer.setup_entry("dones", bool) - - # Counters - self._total_timesteps = 0 - self._total_episodes = 0 - self._total_updates = 0 - self._timesteps_since_last_update = 0 - - # epsilon scheduling - self._epsilon_schedule = polynomial_schedule( - self.epsilon_init, - self.epsilon_final, - power=1.0, - transition_steps=self.epsilon_decay_interval, - transition_begin=0, - ) - - @property - def total_timesteps(self): - return self._total_timesteps - - def _must_update(self, is_end_of_episode): - """Returns true if the model must be updated in the current timestep, - and the number of gradient steps to take""" - total_timesteps = self._total_timesteps - n_gradient_steps = self._gradient_steps - - if total_timesteps < self._learning_starts: - 
return False, -1 - - if n_gradient_steps == -1: - n_gradient_steps = self._timesteps_since_last_update - - run_update = False - if self._train_interval == -1: - run_update = is_end_of_episode - else: - run_update = total_timesteps % self._train_interval == 0 - return run_update, n_gradient_steps - - def _update(self, n_gradient_steps): - """Update networks.""" - if self.use_prioritized_replay: - sampling_mode = "prioritized" - else: - sampling_mode = "uniform" - - for _ in range(n_gradient_steps): - # Sample a batch - sampled = self._replay_buffer.sample( - self.batch_size, self.chunk_size, sampling_mode=sampling_mode - ) - if not sampled: - return - - # Update counters - self._timesteps_since_last_update = 0 - self._total_updates += 1 - - batch = sampled.data - batch_info = sampled.info - assert batch["rewards"].shape == (self.batch_size, self.chunk_size) - - # Compute targets - batch_observations = torch.FloatTensor(batch["observations"]).to( - self._device - ) - batch_next_observations = torch.FloatTensor(batch["next_observations"]).to( - self._device - ) - batch_actions = torch.LongTensor(batch["actions"]).to(self._device) - - target_q_values_tp1 = self._qnet_target(batch_next_observations).detach() - # Check if double DQN - if self.use_double_dqn: - online_q_values_tp1 = self._qnet_online( - batch_next_observations - ).detach() - a_argmax = online_q_values_tp1.argmax(dim=-1).detach() - else: - a_argmax = target_q_values_tp1.argmax(dim=-1).detach() - - v_tp1 = ( - torch.gather(target_q_values_tp1, dim=-1, index=a_argmax[:, :, None])[ - :, :, 0 - ] - .cpu() - .numpy() - ) - - batch_lambda_returns = lambda_returns( - batch["rewards"], - self.gamma * (1.0 - np.array(batch["dones"], dtype=np.float32)), - v_tp1, - np.array(self.lambda_, dtype=np.float32), - ) - targets = torch.tensor(batch_lambda_returns).to(self._device) - - # Compute loss - batch_q_values = self._qnet_online(batch_observations) - batch_values = torch.gather( - batch_q_values, dim=-1, index=batch_actions[:, :, None] - )[ - :, :, 0 - ] # shape (batch, chunk) - - assert batch_values.shape == targets.shape - per_element_loss = self._loss_function(batch_values, targets) - per_batch_element_loss = per_element_loss.mean(dim=1) - weights = torch.FloatTensor(batch_info["weights"]).to(self._device) - loss = torch.sum(per_batch_element_loss * weights) / torch.sum(weights) - - self._optimizer.zero_grad() - loss.backward() - self._optimizer.step() - - if self.writer: - self.writer.add_scalar( - "losses/q_loss", loss.item(), self._total_updates - ) - - # update priorities - if self.use_prioritized_replay: - new_priorities = per_element_loss.abs().detach().cpu().numpy() + 1e-6 - self._replay_buffer.update_priorities( - batch_info["indices"], new_priorities - ) - - # target update - if self.target_update_parameter > 1: - if self._total_updates % self.target_update_parameter == 0: - self._qnet_target.load_state_dict(self._qnet_online.state_dict()) - else: - tau = self.target_update_parameter - for param, target_param in zip( - self._qnet_online.parameters(), self._qnet_target.parameters() - ): - target_param.data.copy_( - tau * param.data + (1 - tau) * target_param.data - ) - - def fit(self, budget: int, **kwargs): - """ - Train the agent using the provided environment. - - Parameters - ---------- - budget: int - Number of timesteps to train the agent for. - One step = one transition in the environment. 
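In the `use_double_dqn` branch of `_update` above, the greedy action is selected by the online network but evaluated by the target network; a small tensor-level sketch with arbitrary placeholder Q-values::

    import torch

    batch, chunk, n_actions = 32, 8, 4
    q_online_tp1 = torch.randn(batch, chunk, n_actions)   # online net at s_{t+1}
    q_target_tp1 = torch.randn(batch, chunk, n_actions)   # target net at s_{t+1}

    # pick the argmax with the online network...
    a_argmax = q_online_tp1.argmax(dim=-1)
    # ...and read its value from the target network
    v_tp1 = torch.gather(q_target_tp1, dim=-1, index=a_argmax[:, :, None])[:, :, 0]
    assert v_tp1.shape == (batch, chunk)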
- """ - del kwargs - timesteps_counter = 0 - episode_rewards = 0.0 - episode_timesteps = 0 - observation, info = self.env.reset() - while timesteps_counter < budget: - if self.total_timesteps < self._learning_starts: - action = self.env.action_space.sample() - else: - self._timesteps_since_last_update += 1 - action = self._policy(observation, evaluation=False) - next_observation, reward, terminated, truncated, info = self.env.step( - action - ) - done = terminated or truncated - - # store data - episode_rewards += reward - self._replay_buffer.append( - { - "observations": observation, - "actions": action, - "rewards": reward, - "dones": done, - "next_observations": next_observation, - } - ) - - # counters and next obs - self._total_timesteps += 1 - timesteps_counter += 1 - episode_timesteps += 1 - observation = next_observation - - # update - run_update, n_gradient_steps = self._must_update(done) - if run_update: - self._update(n_gradient_steps) - - # eval - total_timesteps = self._total_timesteps - if ( - self._eval_interval is not None - and total_timesteps % self._eval_interval == 0 - ): - eval_rewards = self.eval( - eval_horizon=self._max_episode_steps, gamma=1.0 - ) - if self.writer: - buffer_size = len(self._replay_buffer) - self.writer.add_scalar( - "eval_rewards", eval_rewards, total_timesteps - ) - self.writer.add_scalar("buffer_size", buffer_size, total_timesteps) - - # check if episode ended - if done: - self._total_episodes += 1 - self._replay_buffer.end_episode() - if self.writer: - self.writer.add_scalar( - "episode_rewards", episode_rewards, total_timesteps - ) - self.writer.add_scalar( - "total_episodes", self._total_episodes, total_timesteps - ) - episode_rewards = 0.0 - episode_timesteps = 0 - observation, info = self.env.reset() - - def _policy(self, observation, evaluation=False): - epsilon = self._epsilon_schedule(self.total_timesteps) - if (not evaluation) and self.rng.uniform() < epsilon: - if self.writer: - self.writer.add_scalar("epsilon", epsilon, self.total_timesteps) - return self.env.action_space.sample() - else: - with torch.no_grad(): - observation = ( - torch.FloatTensor(observation).to(self._device).unsqueeze(0) - ) - qvals_tensor = self._qnet_online(observation)[0] - action = qvals_tensor.argmax().item() - return action - - def policy(self, observation): - return self._policy(observation, evaluation=True) diff --git a/rlberry/agents/torch/dqn/dqn_utils.py b/rlberry/agents/torch/dqn/dqn_utils.py deleted file mode 100644 index 2d100b218..000000000 --- a/rlberry/agents/torch/dqn/dqn_utils.py +++ /dev/null @@ -1,142 +0,0 @@ -import numpy as np -import torch -import torch.nn.functional as F - - -from rlberry.utils.jit_setup import numba_jit - - -import rlberry - -logger = rlberry.logger - - -def stable_scaled_log_softmax(x, tau, dim=-1): - """Scaled log_softmax operation. - - Parameters - ---------- - x: tensor of floats, - inputs of the softmax (logits). - tau: float, - softmax temperature. - dim: int, - axis to perform the softmax operation. - Returns: - tau * log softmax(x/tau, dim=dim) - """ - max_x = torch.max(x, dim=dim, keepdim=True).values - y = x - max_x - return tau * F.log_softmax(y / tau, dim=dim) - - -def stable_softmax(x, tau, dim=-1): - """Stable softmax operation. - - Parameters - ---------- - x: tensor of floats, - inputs of the softmax (logits). - tau: float, - softmax temperature. - dim: int, - axis to perform the softmax operation. 
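Both helpers here shift the logits by their row-wise maximum before applying the (log-)softmax; since the shift is constant along the softmax axis it leaves the result unchanged and only keeps the exponentiated values in a safe numeric range. A quick check of that invariance with arbitrary logits and temperature::

    import torch
    import torch.nn.functional as F

    x, tau = torch.randn(2, 5), 0.03
    shifted = x - torch.max(x, dim=-1, keepdim=True).values
    assert torch.allclose(F.softmax(x / tau, dim=-1),
                          F.softmax(shifted / tau, dim=-1), atol=1e-4)
    assert torch.allclose(tau * F.log_softmax(x / tau, dim=-1),
                          tau * F.log_softmax(shifted / tau, dim=-1), atol=1e-4)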
- Returns: - softmax(x / tau, dim=dim) - """ - max_x = torch.max(x, dim=dim, keepdim=True).values - y = x - max_x - return F.softmax(y / tau, dim=dim) - - -def polynomial_schedule( - init_value: float, - end_value: float, - power: float, - transition_steps: int, - transition_begin: int = 0, -): - """Constructs a schedule with polynomial transition from init to end value. - - Notes - ----- - Function taken from: https://github.com/deepmind/optax/blob/master/optax/_src/schedule.py, - which is licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - - Modifications with respect to source: - - * Remove chex typing from the arguments. - * `import rlberry; logger=rlberry.logger` instead of :code:`logging.info()`. - * Changed documentation style. - - Parameters - ---------- - init_value: float - Initial value for the scalar to be annealed. - end_value: float - End value of the scalar to be annealed. - power: float - The power of the polynomial used to transition from init to end. - transition_steps: float - Number of steps over which annealing takes place, - the scalar starts changing at `transition_begin` steps and completes - the transition by `transition_begin + transition_steps` steps. - If `transition_steps <= 0`, then the entire annealing process is disabled - and the value is held fixed at `init_value`. - transition_begin: float - Must be positive. After how many steps to start annealing - (before this many steps the scalar value is held fixed at `init_value`). - - Returns - ------- - schedule: Callable[[int], float] - A function that maps step counts to values. - """ - if transition_steps <= 0: - logger.info( - "A polynomial schedule was set with a non-positive `transition_steps` " - "value; this results in a constant schedule with value `init_value`." - ) - return lambda count: init_value - - if transition_begin < 0: - logger.info( - "An exponential schedule was set with a negative `transition_begin` " - "value; this will result in `transition_begin` falling back to `0`." - ) - transition_begin = 0 - - def schedule(count): - count = np.clip(count - transition_begin, 0, transition_steps) - frac = 1 - count / transition_steps - return (init_value - end_value) * (frac**power) + end_value - - return schedule - - -@numba_jit -def lambda_returns(r_t, discount_t, v_tp1, lambda_): - """ - Computer lambda returns - - Parameters - ---------- - r_t: array - Array of shape (batch_dim, time_dim) containing the rewards. - discount_t: array - Array of shape (batch_dim, time_dim) containing the discounts (0.0 if terminal state). - v_tp1: array - Array of shape (batch_dim, time_dim) containing the values at timestep t+1 - lambda_ : float in [0, 1] - Lambda-returns parameter. 
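A usage sketch for the schedule returned by `polynomial_schedule`, restated without the logging and edge-case handling, with the epsilon-greedy defaults used by the Munchausen DQN agent further down (decay from 1.0 to 0.1 over 20,000 steps):

import numpy as np

def polynomial_schedule(init_value, end_value, power, transition_steps, transition_begin=0):
    # Trimmed-down restatement of the schedule above (logging and the
    # non-positive transition_steps edge case are omitted).
    def schedule(count):
        count = np.clip(count - transition_begin, 0, transition_steps)
        frac = 1.0 - count / transition_steps
        return (init_value - end_value) * (frac**power) + end_value
    return schedule

# Linear decay (power=1.0) over the first 20_000 timesteps, then held at 0.1.
epsilon = polynomial_schedule(1.0, 0.1, power=1.0, transition_steps=20_000)
print(epsilon(0), epsilon(10_000), epsilon(50_000))  # 1.0 0.55 0.1
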
- """ - returns = np.zeros_like(r_t) - aux = v_tp1[:, -1] - time_dim = v_tp1.shape[1] - for tt in range(time_dim): - i = time_dim - tt - 1 - aux = r_t[:, i] + discount_t[:, i] * ( - (1 - lambda_) * v_tp1[:, i] + lambda_ * aux - ) - returns[:, i] = aux - return returns diff --git a/rlberry/agents/torch/dqn/mdqn.py b/rlberry/agents/torch/dqn/mdqn.py deleted file mode 100644 index 746e01d24..000000000 --- a/rlberry/agents/torch/dqn/mdqn.py +++ /dev/null @@ -1,478 +0,0 @@ -import inspect - -import numpy as np -import torch -from gymnasium import spaces -from rlberry import types -from rlberry.agents import AgentWithSimplePolicy, AgentTorch -from rlberry.agents.torch.utils.training import ( - loss_function_factory, - model_factory, - optimizer_factory, - size_model_config, -) -from rlberry.agents.torch.dqn.dqn_utils import ( - lambda_returns, - polynomial_schedule, - stable_scaled_log_softmax, - stable_softmax, -) -from rlberry.agents.utils import replay -from rlberry.utils.torch import choose_device -from rlberry.utils.factory import load -from typing import Callable, Optional, Union - - -import rlberry - -logger = rlberry.logger - - -def default_q_net_fn(env, **kwargs): - """ - Returns a default Q value network. - """ - del kwargs - model_config = { - "type": "MultiLayerPerceptron", - "layer_sizes": (64, 64), - "reshape": False, - } - model_config = size_model_config(env, **model_config) - return model_factory(**model_config) - - -class MunchausenDQNAgent(AgentTorch, AgentWithSimplePolicy): - """Munchausen DQN Agent based on PyTorch. - - Notes - ----- - Uses Munchausen trick for DQN for computing targets by default. - Compared to DQN, the scaled log-policy was added to the immediate - reward. Slightly modifying DQN in that way provides an agent that - is competitive with distributional methods on Atari games, without - making use of distributional RL, n-step returns or prioritized replay. - See more: https://arxiv.org/pdf/2007.14430.pdf - - Parameters - ---------- - env: :class:`~rlberry.types.Env` - Environment, can be a tuple (constructor, kwargs) - gamma: float, default = 0.99 - Discount factor. - batch_size: int, default=32 - Batch size. - chunk_size: int, default=8 - Length of sub-trajectories sampled from the replay buffer. - lambda_: float, default=0.5 - Q(lambda) parameter. - tau: float, default=0.03 - softmax temperature for the policy - alpha: float, default=0.9 - Munchausen coefficient - target_update_parameter : int or float - If int: interval (in number total number of online updates) between updates of the target network. - If float: soft update coefficient - device: str - Torch device, see :func:`~rlberry.utils.torch.choose_device` - learning_rate : float, default = 1e-3 - Optimizer learning rate. - clip_value_min: float, default = -1, - minimum value for munchausen term - loss_function: {"l1", "l2", "smooth_l1"}, default: "l2" - Loss function used to compute Bellman error. - epsilon_init: float, default = 1.0 - Initial epsilon value for epsilon-greedy exploration. - epsilon_final: float, default = 0.1 - Final epsilon value for epsilon-greedy exploration. - epsilon_decay_interval : int - After :code:`epsilon_decay` timesteps, epsilon approaches :code:`epsilon_final`. - optimizer_type : {"ADAM", "RMS_PROP"} - Optimization algorithm. - q_net_constructor : Callable, str or None - Function/constructor that returns a torch module for the Q-network: - :code:`qnet = q_net_constructor(env, **kwargs)`. 
- - Module (Q-network) requirements: - - * Input shape = (batch_dim, chunk_size, obs_dims) - - * Ouput shape = (batch_dim, chunk_size, number_of_actions) - - Example: use `rlberry.agents.torch.utils.training.model_factory_from_env`, - and `q_net_kwargs` - parameter to modify the neural network:: - - model_configs = { - "type": "MultiLayerPerceptron", - "layer_sizes": (5, 5), - "reshape": False, - } - - agent = MunchausenDQNAgent(env, - q_net_constructor=model_factory_from_env, - q_net_kwargs=model_configs - ) - If str then it should correspond to the full path to the constructor function, - e.g.:: - agent = MunchausenDQNAgent(env, - q_net_constructor='rlberry.agents.torch.utils.training.model_factory_from_env', - q_net_kwargs=model_configs - ) - - If None then it is set to MultiLayerPerceptron with 2 hidden layers - of size 64 - - q_net_kwargs : optional, dict - Parameters for q_net_constructor. - use_prioritized_replay : bool, default = False - If True, use Prioritized Experience Replay. - train_interval: int - Update the model every :code:`train_interval` steps. - If -1, train only at the end of the episodes. - gradient_steps: int - How many gradient steps to do at each update. - If -1, take the number of timesteps since last update. - max_replay_size : int - Maximum number of transitions in the replay buffer. - learning_starts : int - How many steps of the model to collect transitions for before learning starts - eval_interval : int, default = None - Interval (in number of transitions) between agent evaluations in fit(). - If None, never evaluate. - """ - - name = "Munchausen DQN" - - def __init__( - self, - env: types.Env, - gamma: float = 0.99, - batch_size: int = 32, - chunk_size: int = 8, - lambda_: float = 0.5, - tau: float = 0.03, - alpha: float = 0.9, - target_update_parameter: Union[int, float] = 0.005, - # tardet_update_freq: int = 8000, - device: str = "cuda:best", - learning_rate: float = 5e-5, - clip_value_min: float = -1.0, - epsilon_init: float = 1.0, - epsilon_final: float = 0.1, - epsilon_decay_interval: int = 20_000, - loss_function: str = "l2", - optimizer_type: str = "ADAM", - q_net_constructor: Optional[Callable[..., torch.nn.Module]] = None, - q_net_kwargs: Optional[dict] = None, - use_prioritized_replay: bool = False, - train_interval: int = 4, - gradient_steps: int = -1, - max_replay_size: int = 1_000_000, - learning_starts: int = 5_000, - eval_interval: Optional[int] = None, - **kwargs, - ): - # For all parameters, define self.param = param - _, _, _, values = inspect.getargvalues(inspect.currentframe()) - values.pop("self") - for arg, val in values.items(): - setattr(self, arg, val) - - AgentWithSimplePolicy.__init__(self, env, **kwargs) - env = self.env - assert isinstance(env.observation_space, spaces.Box) - assert isinstance(env.action_space, spaces.Discrete) - - # M-DQN parameters - - # Online and target Q networks, torch device - self._device = choose_device(device) - if isinstance(q_net_constructor, str): - q_net_ctor = load(q_net_constructor) - elif q_net_constructor is None: - q_net_ctor = default_q_net_fn - else: - q_net_ctor = q_net_constructor - q_net_kwargs = q_net_kwargs or dict() - self._qnet_online = q_net_ctor(env, **q_net_kwargs).to(self._device) - self._qnet_target = q_net_ctor(env, **q_net_kwargs).to(self._device) - - # Optimizer and loss - optimizer_kwargs = {"optimizer_type": optimizer_type, "lr": learning_rate} - self._optimizer = optimizer_factory( - self._qnet_online.parameters(), **optimizer_kwargs - ) - self._loss_function = 
loss_function_factory(loss_function, reduction="none") - - # Training params - self._train_interval = train_interval - self._gradient_steps = gradient_steps - self._learning_starts = learning_starts - self._learning_starts = learning_starts - self._eval_interval = eval_interval - - # Setup replay buffer - if hasattr(self.env, "_max_episode_steps"): - max_episode_steps = self.env._max_episode_steps - else: - max_episode_steps = np.inf - self._max_episode_steps = max_episode_steps - - self._replay_buffer = replay.ReplayBuffer( - max_replay_size=max_replay_size, - rng=self.rng, - max_episode_steps=self._max_episode_steps, - enable_prioritized=use_prioritized_replay, - ) - self._replay_buffer.setup_entry("observations", np.float32) - self._replay_buffer.setup_entry("next_observations", np.float32) - self._replay_buffer.setup_entry("actions", np.int32) - self._replay_buffer.setup_entry("rewards", np.float32) - self._replay_buffer.setup_entry("dones", bool) - - # Counters - self._total_timesteps = 0 - self._total_episodes = 0 - self._total_updates = 0 - self._timesteps_since_last_update = 0 - - # epsilon scheduling - self._epsilon_schedule = polynomial_schedule( - self.epsilon_init, - self.epsilon_final, - power=1.0, - transition_steps=self.epsilon_decay_interval, - transition_begin=0, - ) - - @property - def total_timesteps(self): - return self._total_timesteps - - def _must_update(self, is_end_of_episode): - """Returns true if the model must be updated in the current timestep, - and the number of gradient steps to take""" - total_timesteps = self._total_timesteps - n_gradient_steps = self._gradient_steps - - if total_timesteps < self._learning_starts: - return False, -1 - - if n_gradient_steps == -1: - n_gradient_steps = self._timesteps_since_last_update - - run_update = False - if self._train_interval == -1: - run_update = is_end_of_episode - else: - run_update = total_timesteps % self._train_interval == 0 - return run_update, n_gradient_steps - - def _update(self, n_gradient_steps): - """Update networks.""" - if self.use_prioritized_replay: - sampling_mode = "prioritized" - else: - sampling_mode = "uniform" - - for _ in range(n_gradient_steps): - # Sample a batch - sampled = self._replay_buffer.sample( - self.batch_size, self.chunk_size, sampling_mode=sampling_mode - ) - if not sampled: - return - - # Update counters - self._timesteps_since_last_update = 0 - self._total_updates += 1 - - batch = sampled.data - batch_info = sampled.info - assert batch["rewards"].shape == (self.batch_size, self.chunk_size) - - # Get batched tensors - batch_observations = torch.FloatTensor(batch["observations"]).to( - self._device - ) - batch_rewards = torch.FloatTensor(batch["rewards"]).to(self._device) - batch_next_observations = torch.FloatTensor(batch["next_observations"]).to( - self._device - ) - batch_actions = torch.LongTensor(batch["actions"]).to(self._device) - batch_dones = torch.LongTensor(batch["dones"]).to(self._device) - - # Get target Q estimates - target_q_values_tp1 = self._qnet_target(batch_next_observations).detach() - target_q_values = self._qnet_target(batch_observations).detach() - - # Compute softmax policies for the current and next step - log_pi = stable_scaled_log_softmax(target_q_values, self.tau, -1) - log_pi_tp1 = stable_scaled_log_softmax(target_q_values_tp1, self.tau, -1) - pi_tp1 = stable_softmax(target_q_values_tp1, self.tau, -1) - - # Compute the "next step" part of the target - target_v_tp1 = ( - torch.sum((target_q_values_tp1 - log_pi_tp1) * pi_tp1, -1).cpu().numpy() - ) - - # 
Compute the Munchausen term - munchausen_term = torch.gather( - log_pi, dim=-1, index=batch_actions[:, :, None] - )[:, :, 0] - clipped_munchausen_term = torch.clip( - munchausen_term, self.clip_value_min, 0 - ) - final_munchausen_term = self.alpha * clipped_munchausen_term - - # Compute the final target - batch_lambda_returns = lambda_returns( - (batch_rewards + final_munchausen_term).cpu().numpy(), - self.gamma * (1.0 - np.array(batch["dones"], dtype=np.float32)), - target_v_tp1, - np.array(self.lambda_, dtype=np.float32), - ) - targets = torch.tensor(batch_lambda_returns, device=self._device) - - # Compute loss - batch_q_values = self._qnet_online(batch_observations) - batch_values = torch.gather( - batch_q_values, dim=-1, index=batch_actions[:, :, None] - )[ - :, :, 0 - ] # shape (batch, chunk) - - assert batch_values.shape == targets.shape - per_element_loss = self._loss_function(batch_values, targets) - per_batch_element_loss = per_element_loss.mean(dim=1) - weights = torch.FloatTensor(batch_info["weights"]).to(self._device) - loss = torch.sum(per_batch_element_loss * weights) / torch.sum(weights) - - self._optimizer.zero_grad() - loss.backward() - self._optimizer.step() - - if self.writer: - self.writer.add_scalar( - "losses/q_loss", loss.item(), self._total_updates - ) - - # update priorities - if self.use_prioritized_replay: - new_priorities = per_element_loss.abs().detach().cpu().numpy() + 1e-6 - self._replay_buffer.update_priorities( - batch_info["indices"], new_priorities - ) - - # target update - - if self.target_update_parameter > 1: - if self._total_updates % self.target_update_parameter == 0: - self._qnet_target.load_state_dict(self._qnet_online.state_dict()) - else: - tau = self.target_update_parameter - for param, target_param in zip( - self._qnet_online.parameters(), self._qnet_target.parameters() - ): - target_param.data.copy_( - tau * param.data + (1 - tau) * target_param.data - ) - - def fit(self, budget: int, **kwargs): - """ - Train the agent using the provided environment. - - Parameters - ---------- - budget: int - Number of timesteps to train the agent for. - One step = one transition in the environment. 
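The block above implements the Munchausen trick: the scaled log-policy of the taken action, clipped from below, is added to the immediate reward before the lambda-returns target is computed. A single-state sketch of that bonus, with the agent's default hyperparameters; the agent applies it batch-wise with `torch.gather`, and the numbers here are purely illustrative:

import torch
import torch.nn.functional as F

def munchausen_bonus(q_values, action, tau=0.03, alpha=0.9, clip_value_min=-1.0):
    # Scaled log-policy of the taken action under the target network:
    # tau * log softmax(q / tau). The term is always <= 0 and is clipped
    # from below so near-deterministic policies do not yield huge
    # negative bonuses.
    log_pi = tau * F.log_softmax(q_values / tau, dim=-1)
    return alpha * log_pi[action].clamp(min=clip_value_min, max=0.0)

q = torch.tensor([1.2, 0.8, 1.0])  # target-network Q-values for one state
reward, action = 0.5, 0
print(reward + munchausen_bonus(q, action))  # slightly below 0.5
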
- """ - del kwargs - timesteps_counter = 0 - episode_rewards = 0.0 - episode_timesteps = 0 - observation, info = self.env.reset() - while timesteps_counter < budget: - if self.total_timesteps < self._learning_starts: - action = self.env.action_space.sample() - else: - self._timesteps_since_last_update += 1 - action = self._policy(observation, evaluation=False) - next_observation, reward, terminated, truncated, info = self.env.step( - action - ) - done = terminated or truncated - - # store data - episode_rewards += reward - self._replay_buffer.append( - { - "observations": observation, - "actions": action, - "rewards": reward, - "dones": done, - "next_observations": next_observation, - } - ) - - # counters and next obs - self._total_timesteps += 1 - timesteps_counter += 1 - episode_timesteps += 1 - observation = next_observation - - # update - run_update, n_gradient_steps = self._must_update(done) - if run_update: - self._update(n_gradient_steps) - - # eval - total_timesteps = self._total_timesteps - if ( - self._eval_interval is not None - and total_timesteps % self._eval_interval == 0 - ): - eval_rewards = self.eval( - eval_horizon=self._max_episode_steps, gamma=1.0 - ) - if self.writer: - buffer_size = len(self._replay_buffer) - self.writer.add_scalar( - "eval_rewards", eval_rewards, total_timesteps - ) - self.writer.add_scalar("buffer_size", buffer_size, total_timesteps) - - # check if episode ended - if done: - self._total_episodes += 1 - self._replay_buffer.end_episode() - if self.writer: - self.writer.add_scalar( - "episode_rewards", episode_rewards, total_timesteps - ) - self.writer.add_scalar( - "total_episodes", self._total_episodes, total_timesteps - ) - episode_rewards = 0.0 - episode_timesteps = 0 - observation, info = self.env.reset() - - def _policy(self, observation, evaluation=False): - epsilon = self._epsilon_schedule(self.total_timesteps) - if (not evaluation) and self.rng.uniform() < epsilon: - if self.writer: - self.writer.add_scalar("epsilon", epsilon, self.total_timesteps) - return self.env.action_space.sample() - else: - with torch.no_grad(): - observation = ( - torch.FloatTensor(observation).to(self._device).unsqueeze(0) - ) - qvals_tensor = self._qnet_online(observation)[0] - action = qvals_tensor.argmax().item() - return action - - def policy(self, observation): - return self._policy(observation, evaluation=True) diff --git a/rlberry/agents/torch/ppo/__init__.py b/rlberry/agents/torch/ppo/__init__.py deleted file mode 100644 index b3f371adb..000000000 --- a/rlberry/agents/torch/ppo/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .ppo import PPOAgent diff --git a/rlberry/agents/torch/ppo/ppo.py b/rlberry/agents/torch/ppo/ppo.py deleted file mode 100644 index fdd27442b..000000000 --- a/rlberry/agents/torch/ppo/ppo.py +++ /dev/null @@ -1,843 +0,0 @@ -import numpy as np -import torch -import torch.nn as nn - -import gymnasium.spaces as spaces -import rlberry -from rlberry.agents import AgentWithSimplePolicy -from rlberry.agents import AgentTorch -from rlberry.envs.utils import process_env -from rlberry.agents.torch.utils.training import optimizer_factory -from rlberry.agents.torch.utils.models import default_policy_net_fn -from rlberry.agents.torch.utils.models import default_value_net_fn -from rlberry.utils.torch import choose_device -from rlberry.utils.factory import load -from rlberry.agents.torch.ppo.ppo_utils import ( - process_ppo_env, - lambda_returns, - RolloutBuffer, -) - -import dill -import pickle -import bz2 -import _pickle as cPickle -from pathlib import 
Path - - -logger = rlberry.logger - - -# Notes about VecEnvs: -# - reset() returns a numpy array of shape (n_envs, state_dim) -# - step() returns a tuple of arrays (states, rewards, dones, infos) -# - states: np.array (n_envs, state_dim) dtype varies -# - rewards: np.array (n_envs,) np.float64 -# - dones: np.array (n_envs,) bool -# - infos: list (n_envs,) dict -# - close() closes all environments - - -class PPOAgent(AgentTorch, AgentWithSimplePolicy): - """ - Proximal Policy Optimization Agent. - - Policy gradient methods for reinforcement learning, which alternate between - sampling data through interaction with the environment, and optimizing a - “surrogate” objective function using stochastic gradient ascent. - - Parameters - ---------- - env : rlberry Env - Environment with continuous (Box) observation space. - n_envs: int - Number of environments to be used. - n_steps : int - Number of transitions to collect in each environment per update. - batch_size : int - Size of mini batches during each PPO update epoch. It is recommended - that n_envs * n_steps is divisible by batch_size. - gamma : float - Discount factor in [0, 1]. - k_epochs : int - Number of PPO epochs per update. - clip_eps : float - PPO clipping range (epsilon). - target_kl: float - Target KL divergence. If KL divergence between the current policy and - the new policy is greater than target_kl, the update is stopped early. - Set to None to disable early stopping. - normalize_avantages : bool - Whether or not to normalize advantages. - gae_lambda : float - Lambda parameter for TD(lambda) and Generalized Advantage Estimation. - entr_coef : float - Entropy coefficient. - vf_coef : float - Value function loss coefficient. - value_loss: str - Type of value loss. 'mse' corresponds to mean squared error, - 'clipped' corresponds to the original PPO loss, and 'avec' - corresponds to the AVEC loss (Flet-Berliac et al. 2021). - max_grad_norm : float - Maximum norm of the gradient of both actor and critic networks. - learning_rate : float - Learning rate. - lr_schedule: str - Learning rate schedule. 'constant' corresponds to a constant learning - rate, and 'linear' corresponds to a linearly decreasing learning rate, - starting at learning_rate and ending at 0. WARNING: the schedule is - reset at each call to fit(). - optimizer_type: str - Type of optimizer. 'ADAM' by defaut. - policy_net_fn : function(env, **kwargs) - Function that returns an instance of a policy network (pytorch). - If None, a default net is used. - policy_net_kwargs : dict - kwargs for policy_net_fn - value_net_fn : function(env, **kwargs) - Function that returns an instance of a value network (pytorch). - If None, a default net is used. - value_net_kwargs : dict - kwargs for value_net_fn - eval_env : rlberry Env - Environment used for evaluation. If None, env is used. - n_eval_episodes : int - Number of episodes to be used for evaluation. - eval_horizon : int - Maximum number of steps per episode during evaluation. - eval_freq : int - Number of updates between evaluations. If None, no evaluation is - performed. - device: str - Device on which to put the tensors. 'cuda:best' by default. - - Attributes - ---------- - __value_losses__ : list - List of supported value loss types. ["clipped", "mse", "avec"] - __lr_schedule___ : list - List of supported learning rate schedule types. ["constant", "linear"] - copy_env : bool - If True, copy the environment to create multiple environments for parallel interaction. - n_envs : int - Number of environments used by the agent. 
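To make the divisibility recommendation concrete, here is the bookkeeping for one update with the constructor defaults shown further down (n_envs=1, n_steps=512, batch_size=64, k_epochs=10); early stopping on `target_kl` can end an update with fewer gradient steps:

n_envs, n_steps, batch_size, k_epochs = 1, 512, 64, 10
rollout_size = n_envs * n_steps                     # 512 transitions per update
minibatches_per_epoch = rollout_size // batch_size  # 8 minibatches
gradient_steps = minibatches_per_epoch * k_epochs   # at most 80 gradient steps
print(rollout_size, minibatches_per_epoch, gradient_steps)
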
- n_steps : int - Number of transitions to collect in each environment per update. - batch_size : int - Size of mini batches during each PPO update epoch. - gamma : float - Discount factor used to discount future rewards. - k_epochs : int - Number of PPO epochs per update. - clip_eps : float - PPO clipping range (epsilon). - target_kl: float - Target KL divergence for early stopping. If None, early stopping is disabled. - normalize_advantages : bool - Whether or not to normalize advantages. - gae_lambda : float - Lambda parameter for TD(lambda) and Generalized Advantage Estimation. - entr_coef : float - Entropy coefficient. Controls the contribution of entropy regularization to the policy's objective. - vf_coef : float - Value function loss coefficient. Controls the contribution of the value function loss to the total loss. - value_loss: str - Type of value loss used. Can be "mse", "clipped", or "avec". - max_grad_norm : float - Maximum norm of the gradient of both actor and critic networks. Used for gradient clipping. - learning_rate : float - Learning rate used by the optimizer during neural network training. - lr_schedule : str - Learning rate schedule used during training. Can be "constant" or "linear". - optimizer_type : str - Type of optimizer used during neural network training. - n_eval_episodes : int - Number of episodes used for evaluation. - eval_horizon : int - Maximum number of steps per episode during evaluation. - eval_freq : int - Number of updates between evaluations. If None, no evaluation is performed. - policy_net_fn : function(env, **kwargs) - Function that returns an instance of a policy network (PyTorch). - policy_net_kwargs : dict - Keyword arguments for `policy_net_fn`. - value_net_fn : function(env, **kwargs) - Function that returns an instance of a value network (PyTorch). - value_net_kwargs : dict - Keyword arguments for `value_net_fn`. - eval_env : rlberry.Env - The environment used for evaluation. If None, the same environment as env is used. - state_dim : int - Dimensionality of the continuous state space of the environment. - policy_net : torch.nn.Module - The policy network used by the agent. - value_net : torch.nn.Module - The value network used by the agent. - device : str - Torch device on which the agent's neural networks are placed. - optimizer_kwargs : dict - Keyword arguments for the optimizer used during neural network training. - - References - ---------- - Schulman, J., Wolski, F., Dhariwal, P., Radford, A. & Klimov, O. (2017). - "Proximal Policy Optimization Algorithms." - arXiv preprint arXiv:1707.06347. - - Schulman, J., Levine, S., Abbeel, P., Jordan, M., & Moritz, P. (2015). - "Trust region policy optimization." - In International Conference on Machine Learning (pp. 1889-1897). - - Flet-Berliac, Y., Ouhamma, R., Maillard, O.-A., Preux, P. (2021) - "Learning Value Functions in Deep Policy Gradients using Residual Variance." - In 9th International Conference on Learning Representations (ICLR). 
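The policy update described above optimizes the clipped surrogate objective. A self-contained sketch of that loss on a toy minibatch (the names are illustrative, not part of the agent's API):

import torch

def clipped_surrogate_loss(new_logprob, old_logprob, advantage, clip_eps=0.2):
    # Take the pessimistic (elementwise maximum) of the unclipped and clipped
    # objectives, so the update gains nothing from pushing the probability
    # ratio outside [1 - clip_eps, 1 + clip_eps].
    ratio = torch.exp(new_logprob - old_logprob)
    unclipped = -advantage * ratio
    clipped = -advantage * torch.clamp(ratio, 1 - clip_eps, 1 + clip_eps)
    return torch.mean(torch.max(unclipped, clipped))

# Toy minibatch: the third sample's ratio (about 1.65) lies outside the clip
# range, so the clipped branch is selected and its gradient is cut off.
new_lp = torch.tensor([-0.9, -1.2, -0.5], requires_grad=True)
old_lp = torch.tensor([-1.0, -1.1, -1.0])
adv = torch.tensor([1.0, -0.5, 2.0])
loss = clipped_surrogate_loss(new_lp, old_lp, adv)
loss.backward()
print(loss.item(), new_lp.grad)
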
- """ - - name = "PPO" - __value_losses__ = ["clipped", "mse", "avec"] - __lr_schedule___ = ["constant", "linear"] - - def __init__( - self, - env, - copy_env=True, - n_envs=1, - n_steps=512, - batch_size=64, - gamma=0.99, - k_epochs=10, - clip_eps=0.2, - target_kl=0.05, - normalize_advantages=True, - gae_lambda=0.95, - entr_coef=0.01, - vf_coef=0.5, - value_loss="mse", - max_grad_norm=0.5, - learning_rate=3e-4, - lr_schedule="constant", - optimizer_type="ADAM", - policy_net_fn=None, - policy_net_kwargs=None, - value_net_fn=None, - value_net_kwargs=None, - eval_env=None, - n_eval_episodes=10, - eval_horizon=int(1e5), - eval_freq=None, - device="cuda:best", - **kwargs - ): - kwargs.pop("eval_env", None) - AgentWithSimplePolicy.__init__( - self, None, **kwargs - ) # PPO handles the env internally - - # create environment - self.copy_env = copy_env - self.n_envs = n_envs - self.env = process_ppo_env(env, self.seeder, num_envs=n_envs, copy_env=copy_env) - eval_env = eval_env or env - self.eval_env = process_env(eval_env, self.seeder, copy_env=copy_env) - - # hyperparameters - value_loss, lr_schedule = value_loss.lower(), lr_schedule.lower() - assert value_loss in self.__value_losses__, "value_loss must be in {}".format( - self.__value_losses__ - ) - assert lr_schedule in self.__lr_schedule___, "lr_schedule must be in {}".format( - self.__lr_schedule___ - ) - - self.n_steps = n_steps - self.batch_size = batch_size - self.gamma = gamma - self.k_epochs = k_epochs - self.clip_eps = clip_eps - self.target_kl = target_kl - self.normalize_advantages = normalize_advantages - self.gae_lambda = gae_lambda - self.entr_coef = entr_coef - self.vf_coef = vf_coef - self.value_loss = value_loss - self.max_grad_norm = max_grad_norm - self.learning_rate = learning_rate - self.lr_schedule = lr_schedule - self.optimizer_type = optimizer_type - self.n_eval_episodes = n_eval_episodes - self.eval_horizon = eval_horizon - self.eval_freq = eval_freq - self.kwargs = kwargs - - self.state_dim = self.env.observation_space.shape[0] - - # policy network - self.policy_net_kwargs = policy_net_kwargs or {} - if isinstance(policy_net_fn, str): - self.policy_net_fn = load(policy_net_fn) - elif policy_net_fn is None: - self.policy_net_fn = default_policy_net_fn - else: - self.policy_net_fn = policy_net_fn - - # value network - self.value_net_kwargs = value_net_kwargs or {} - if isinstance(value_net_fn, str): - self.value_net_fn = load(value_net_fn) - elif value_net_fn is None: - self.value_net_fn = default_value_net_fn - else: - self.value_net_fn = value_net_fn - - self.device = choose_device(device) - - self.optimizer_kwargs = { - "optimizer_type": optimizer_type, - "lr": learning_rate, - "eps": 1e-5, - } - - # check environment - # TODO: should we restrict this to Box? - # what about the action space? - assert isinstance(self.env.observation_space, spaces.Box) - - # initialize - self.policy_net = self.value_net = None - self.reset() - - @classmethod - def from_config(cls, **kwargs): - kwargs["policy_net_fn"] = eval(kwargs["policy_net_fn"]) - kwargs["value_net_fn"] = eval(kwargs["value_net_fn"]) - return cls(**kwargs) - - def reset(self, **kwargs): - """ - Reset the agent. 
- """ - self.total_timesteps = 0 - self.total_episodes = 0 - - # Initialize rollout buffer - self.memory = RolloutBuffer(self.rng, self.n_steps) - self.memory.setup_entry("observations", dtype=np.float32) - self.memory.setup_entry("actions", dtype=self.env.single_action_space.dtype) - self.memory.setup_entry("rewards", dtype=np.float32) - self.memory.setup_entry("dones", dtype=bool) - self.memory.setup_entry("logprobs", dtype=np.float32) - self.memory.setup_entry("infos", dtype=dict) - - # Initialize neural networks and optimizers - # TODO: using a single env to configure the networks is a hack that - # should be fixed when model factories are revised - env = self.env.envs[0] - self.policy_net = self.policy_net_fn(env, **self.policy_net_kwargs).to( - self.device - ) - self.value_net = self.value_net_fn(env, **self.value_net_kwargs).to(self.device) - self.optimizer = optimizer_factory( - list(self.policy_net.parameters()) + list(self.value_net.parameters()), - **self.optimizer_kwargs - ) - - def policy(self, observation): - assert self.policy_net is not None - obs = torch.from_numpy(observation).float().to(self.device) - action = self.policy_net(obs).sample() - return action.cpu().numpy() - - def fit(self, budget: int, lr_scheduler=None, **kwargs): - """ - Train the agent using the provided environment. - - Parameters - ---------- - budget: int - Total number of steps to be performed in the environment. Parameters - are updated every n_steps interactions with the environment. - lr_scheduler: callable - A function that takes the current step and returns the current learning - rate. If None, a default scheduler is used. - """ - del kwargs - - if lr_scheduler is None: - lr_scheduler = self._get_lr_scheduler(budget) - - if len(self.memory) == 0: - timesteps_counter = 0 - else: # it's not the first "fit" on this agent, so there is a previous buffer to continue - timesteps_counter = len(self.memory) * self.n_envs - - episode_returns = np.zeros(self.n_envs, dtype=np.float32) - episode_lengths = np.zeros(self.n_envs, dtype=np.int32) - - next_obs, infos = self.env.reset() - next_obs = torch.Tensor(next_obs).to( - self.device - ) # should always be a torch tensor - next_done = np.zeros(self.n_envs, dtype=bool) # initialize done to False - while timesteps_counter < budget: - obs = next_obs - done = next_done - - # select action and take step - with torch.no_grad(): - action, logprobs = self._select_action(obs) - next_obs, reward, next_terminated, next_truncated, info = self.env.step( - action - ) - next_done = np.logical_or(next_terminated, next_truncated) - next_obs = torch.Tensor(next_obs).to(self.device) - - # end of episode logging - for i in range(self.n_envs): - if next_done[i]: - self.total_episodes += 1 - if self.writer and "episode" in info["final_info"][i]: - if "episode" in info["final_info"][i]: - r, l = ( - info["final_info"][i]["episode"]["r"], - info["final_info"][i]["episode"]["l"], - ) - else: - r, l = episode_returns[i], episode_lengths[i] - self.writer.add_scalar( - "episode_returns", r, self.total_timesteps - ) - self.writer.add_scalar( - "episode_lengths", l, self.total_timesteps - ) - self.writer.add_scalar( - "total_episodes", self.total_episodes, self.total_timesteps - ) - episode_returns[i], episode_lengths[i] = 0.0, 0 - - # append data to memory and update variables - self.memory.append( - { - "observations": obs.cpu().numpy(), - "actions": action, - "rewards": reward, - "dones": done, - "infos": info, - "logprobs": logprobs, - } - ) - self.total_timesteps += self.n_envs - 
timesteps_counter += self.n_envs - episode_returns += reward - episode_lengths += 1 - - # evaluation - if ( - self.writer - and self.eval_freq is not None - and self.total_timesteps % self.eval_freq == 0 - ): - evaluation = self.eval( - eval_horizon=self.eval_horizon, - n_simulations=self.n_eval_episodes, - gamma=1.0, - ) - self.writer.add_scalar("evaluation", evaluation, self.total_timesteps) - - # update with collected experience - if timesteps_counter % (self.n_envs * self.n_steps) == 0: - if self.lr_schedule != "constant": - lr = lr_scheduler(self.total_timesteps) - self.optimizer.param_groups[0]["lr"] = lr - self._update(next_obs=next_obs, next_done=next_done) - - def _get_lr_scheduler(self, budget): - """ - Returns a learning rate schedule for the policy and value networks. - """ - if self.lr_schedule == "constant": - return lambda t: self.learning_rate - elif self.lr_schedule == "linear": - return lambda t: self.learning_rate * (1 - t / float(budget)) - - def _select_action(self, obs): - """ - Select an action given the current state using the policy network. - Also returns the log probability of the selected action. - - Parameters - ---------- - obs: torch.Tensor - Observation tensor of shape (batch_size, obs_dim) - - Returns - ------- - A tuple (action, log_prob). - """ - action_dist = self.policy_net(obs) - action = action_dist.sample() - action_logprob = action_dist.log_prob(action) - return action.cpu().numpy(), action_logprob.cpu().numpy() - - def _update(self, next_obs=None, next_done=None): - """ - Performs a PPO update based on the data in `self.memory`. - - Parameters - ---------- - next_obs: torch.Tensor or None - Next observation tensor of shape (n_envs, obs_dim). Used to - bootstrap the value function. If None, the value function is - bootstrapped with zeros. - next_done: np.ndarray or None - Array of shape (n_envs,) indicating whether the next observation - is terminal. If None, this function assumes that they are not - terminal. - - Notes - ----- - This function assumes that the data in `self.memory` is complete, - and it will clear the memory during the update. - """ - assert ( - int(next_obs is None) + int(next_done is None) - ) % 2 == 0, "'next_obs' and 'next_done' should be both None or not None at the same time." - - # get batch data - batch = self.memory.get() - self.memory.clear() - - # get shapes - n_steps, n_envs, *obs_shape = batch["observations"].shape - _, _, *action_shape = batch["actions"].shape - - # create tensors from batch data - def _to_tensor(x): - return torch.from_numpy(x).to(self.device).detach() - - b_obs = _to_tensor(batch["observations"]) - - # create buffers - b_values = torch.zeros( - (n_steps, n_envs), dtype=torch.float32, device=self.device - ) - b_advantages = torch.zeros_like(b_values) - b_returns = torch.zeros_like(b_values) - - # compute values - # note: some implementations compute the value when collecting the data - # and use those stale values for the update. This can be better - # in architectures with a shared encoder, because you avoid two - # forward passes through the encoder. However, we choose to compute - # the values here, because it is easier to implement and it has no - # impact on performance in most cases. 
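As the comment above notes, `_update` computes values on the fly and then turns rewards into lambda-returns; the advantages are those returns minus the value estimates, the same quantity as GAE(gamma, lambda). A time-major numpy sketch of that computation, mirroring the `lambda_returns` helper from `ppo_utils`:

import numpy as np

def lambda_returns(r_t, terminal_tp1, v_tp1, gamma, lambda_):
    # Time-major (T, n_envs) restatement of the ppo_utils helper: bootstrap
    # from v_tp1 and mix n-step returns with weight lambda_.
    returns = np.zeros_like(r_t)
    aux = v_tp1[-1]
    for i in reversed(range(r_t.shape[0])):
        aux = r_t[i] + gamma * (1.0 - terminal_tp1[i]) * ((1 - lambda_) * v_tp1[i] + lambda_ * aux)
        returns[i] = aux
    return returns

T, n_envs = 4, 2
rng = np.random.default_rng(0)
rewards = rng.normal(size=(T, n_envs)).astype(np.float32)
next_dones = np.zeros((T, n_envs), dtype=np.float32)           # no terminals here
values = rng.normal(size=(T, n_envs)).astype(np.float32)       # V(s_t)
next_values = rng.normal(size=(T, n_envs)).astype(np.float32)  # V(s_{t+1})

returns = lambda_returns(rewards, next_dones, next_values, gamma=0.99, lambda_=0.95)
advantages = returns - values  # GAE(gamma, lambda) advantages used in the policy loss
print(returns.shape, advantages.shape)  # (4, 2) (4, 2)
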
- with torch.no_grad(): - b_values = self.value_net(b_obs).squeeze(-1) - if next_obs is not None: - b_next_value = self.value_net(next_obs).squeeze(-1) - - # compute returns and advantages - # using numpy and numba for speedup - rewards = np.copy(batch["rewards"]) - - next_dones = np.zeros_like(batch["dones"]) - next_dones[:-1] = batch["dones"][1:] - if next_obs is not None: - next_dones[-1] = next_done - - values = b_values.cpu().numpy() - next_values = np.zeros_like(values) - next_values[:-1] = values[1:] - if next_obs is not None: - next_values[-1] = b_next_value.cpu().numpy() - - returns = lambda_returns( - rewards, next_dones, next_values, self.gamma, self.gae_lambda - ) - advantages = returns - values - - # convert to tensor - b_actions = _to_tensor(batch["actions"]) - b_logprobs = _to_tensor(batch["logprobs"]) - b_returns = _to_tensor(returns) - b_advantages = _to_tensor(advantages) - - # flatten the batch - b_obs = b_obs.view(n_steps * n_envs, *obs_shape) - b_actions = b_actions.view(n_steps * n_envs, *action_shape) - b_logprobs = b_logprobs.view(n_steps * n_envs, *action_shape) - b_values = b_values.view(n_steps * n_envs) - b_returns = b_returns.view(n_steps * n_envs) - b_advantages = b_advantages.view(n_steps * n_envs) - - # run minibatch updates - clipped = [] # whether the policy loss was clipped - b_indices = np.arange(n_steps * n_envs) - for epoch in range(self.k_epochs): - self.rng.shuffle(b_indices) - for start in range(0, n_steps * n_envs, self.batch_size): - end = min(start + self.batch_size, n_steps * n_envs) - mb_indices = b_indices[start:end] - - mb_obs = b_obs[mb_indices] - mb_actions = b_actions[mb_indices] - mb_old_logprobs = b_logprobs[mb_indices] - mb_returns = b_returns[mb_indices] - mb_advantages = b_advantages[mb_indices] - - # normalize advantages - if self.normalize_advantages: - mb_advantages = (mb_advantages - mb_advantages.mean()) / ( - mb_advantages.std() + 1e-8 - ) - - # forward pass to values and logprobs - action_dist = self.policy_net(mb_obs) - mb_values = self.value_net(mb_obs).squeeze(-1) - - mb_logprobs = action_dist.log_prob(mb_actions) - mb_entropy = action_dist.entropy() - if len(mb_logprobs.shape) > 1: - # in continuous action spaces, the distribution returns one - # value per action dim, so we sum over them - mb_logprobs = torch.sum(mb_logprobs, dim=-1) - mb_old_logprobs = torch.sum(mb_old_logprobs, dim=-1) - mb_entropy = torch.sum(mb_entropy, dim=-1) - mb_logratio = mb_logprobs - mb_old_logprobs - mb_ratio = torch.exp(mb_logratio) - - # compute approximated kl divergence and whether the policy loss - # was clipped - with torch.no_grad(): - approx_kl = torch.mean((mb_ratio - 1) - mb_logratio) - clipped.append( - (torch.abs(mb_ratio - 1.0) > self.clip_eps) - .float() - .mean() - .item() - ) - - # policy loss - pg_loss1 = -mb_advantages * mb_ratio - pg_loss2 = -mb_advantages * torch.clamp( - mb_ratio, 1 - self.clip_eps, 1 + self.clip_eps - ) - pg_loss = torch.mean(torch.max(pg_loss1, pg_loss2)) - - # value loss - if self.value_loss == "mse": - v_loss = 0.5 * torch.mean((mb_values - mb_returns) ** 2) - elif self.value_loss == "avec": - v_loss = torch.var(mb_returns - mb_values) - elif self.value_loss == "clipped": - mb_old_values = b_values[ - mb_indices - ] # these are stale after the first minibatch - mb_clipped_values = mb_old_values + torch.clamp( - mb_values - mb_old_values, -self.clip_eps, self.clip_eps - ) - - v_loss_unclipped = (mb_values - mb_returns) ** 2 - v_loss_clipped = (mb_clipped_values - mb_returns) ** 2 - v_loss = 0.5 * 
torch.mean( - torch.max(v_loss_unclipped, v_loss_clipped) - ) - - # entropy loss - entropy_loss = torch.mean(mb_entropy) - - # total loss - loss = pg_loss + self.vf_coef * v_loss - self.entr_coef * entropy_loss - - # optimize - self.optimizer.zero_grad() - loss.backward() - if self.max_grad_norm is not None: - nn.utils.clip_grad_norm_( - list(self.policy_net.parameters()) - + list(self.value_net.parameters()), - self.max_grad_norm, - ) - self.optimizer.step() - - if self.target_kl and approx_kl > self.target_kl: - break - - # compute explained variance - y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy() - var_y = np.var(y_true) - explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y - - # log metrics - # note: this approach only logs the last batch of the last - # epoch, which is not ideal. However, it is the way it is - # done in most implementations of PPO. - if self.writer: - self.writer.add_scalar( - "fit/policy_loss", - pg_loss.item(), - self.total_timesteps, - ) - self.writer.add_scalar( - "fit/value_loss", - v_loss.item(), - self.total_timesteps, - ) - self.writer.add_scalar( - "fit/entropy_loss", - entropy_loss.item(), - self.total_episodes, - ) - self.writer.add_scalar( - "fit/approx_kl", - approx_kl.item(), - self.total_episodes, - ) - self.writer.add_scalar( - "fit/clipfrac", - np.mean(clipped), - self.total_episodes, - ) - self.writer.add_scalar( - "fit/explained_variance", - explained_var, - self.total_episodes, - ) - self.writer.add_scalar( - "fit/learning_rate", - self.optimizer.param_groups[0]["lr"], - ) - - # - # For hyperparameter optimization - # - @classmethod - def sample_parameters(cls, trial): - batch_size = trial.suggest_categorical("batch_size", [1, 4, 8, 16, 32]) - gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.99]) - learning_rate = trial.suggest_float("learning_rate", 1e-5, 1, log=True) - entr_coef = trial.suggest_float("entr_coef", 1e-8, 0.1, log=True) - - clip_eps = trial.suggest_categorical("clip_eps", [0.1, 0.2, 0.3]) - - k_epochs = trial.suggest_categorical("k_epochs", [1, 5, 10, 20]) - - return { - "batch_size": batch_size, - "gamma": gamma, - "learning_rate": learning_rate, - "entr_coef": entr_coef, - "clip_eps": clip_eps, - "k_epochs": k_epochs, - } - - ##### Overwrite some inherited functions - - def save(self, filename): - """ - Overwrite the 'save' and 'load' functions to not store the env if it's a "vectorized env" (can't be managed with pickle) - - ----- documentation from original save ----- - - Save agent object. By default, the agent is pickled. - - If overridden, the load() method must also be overriden. - - Before saving, consider setting writer to None if it can't be pickled (tensorboard writers - keep references to files and cannot be pickled). - - Note: dill[1]_ is used when pickle fails - (see https://stackoverflow.com/a/25353243, for instance). - Pickle is tried first, since it is faster. - - Parameters - ---------- - filename: Path or str - File in which to save the Agent. - - Returns - ------- - pathlib.Path - If save() is successful, a Path object corresponding to the filename is returned. - Otherwise, None is returned. - .. warning:: The returned filename might differ from the input filename: For instance, - the method can append the correct suffix to the name before saving. - - References - ---------- - .. 
[1] https://github.com/uqfoundation/dill - """ - # remove writer if not pickleable - if not dill.pickles(self.writer): - self.set_writer(None) - # save - filename = Path(filename).with_suffix(".pickle") - filename.parent.mkdir(parents=True, exist_ok=True) - - dict_to_save = dict(self.__dict__) - del dict_to_save["env"] - del dict_to_save["eval_env"] - - try: - if not self.compress_pickle: - with filename.open("wb") as ff: - pickle.dump(dict_to_save, ff) - else: - with bz2.BZ2File(filename, "wb") as ff: - cPickle.dump(dict_to_save, ff) - except Exception: - try: - if not self.compress_pickle: - with filename.open("wb") as ff: - dill.dump(dict_to_save, ff) - else: - with bz2.BZ2File(filename, "wb") as ff: - dill.dump(dict_to_save, ff) - except Exception as ex: - logger.warning("Agent instance cannot be pickled: " + str(ex)) - return None - - return filename - - @classmethod - def load(cls, filename, **kwargs): - """ - Overwrite the 'save' and 'load' functions to not store the env if it's a "vectorized env" (can't be managed with pickle) - - ----- documentation from original load ----- - Load agent object. - If overridden, save() method must also be overriden. - - Parameters - ---------- - **kwargs: dict - Arguments to required by the __init__ method of the Agent subclass. - """ - filename = Path(filename).with_suffix(".pickle") - obj = cls(**kwargs) - - try: - if not obj.compress_pickle: - with filename.open("rb") as ff: - tmp_dict = pickle.load(ff) - else: - with bz2.BZ2File(filename, "rb") as ff: - tmp_dict = cPickle.load(ff) - except Exception: - if not obj.compress_pickle: - with filename.open("rb") as ff: - tmp_dict = dill.load(ff) - else: - with bz2.BZ2File(filename, "rb") as ff: - tmp_dict = dill.load(ff) - - temp_env = obj.__dict__["env"] - temp_eval_env = obj.__dict__["eval_env"] - - obj.__dict__.clear() - obj.__dict__.update(tmp_dict) - - obj.__dict__["env"] = temp_env - obj.__dict__["eval_env"] = temp_eval_env - - return obj diff --git a/rlberry/agents/torch/ppo/ppo_utils.py b/rlberry/agents/torch/ppo/ppo_utils.py deleted file mode 100644 index ec7f6df2f..000000000 --- a/rlberry/agents/torch/ppo/ppo_utils.py +++ /dev/null @@ -1,193 +0,0 @@ -import copy -import logging - -import gymnasium as gym -import numpy as np - -from rlberry.envs.utils import process_env -from rlberry.utils.jit_setup import numba_jit - - -logger = logging.getLogger(__name__) - - -def process_ppo_env(env, seeder, num_envs=1, asynchronous=False, copy_env=True): - """ - Process environment for PPO. It's the only agent that supports vectorized - environments. - - Parameters - ---------- - env : gym.Env - Environment to be processed. - seeder : rlberry.Seeder - Seeder object. - num_envs : int - Number of environments to be used. - asynchronous : bool - If True, the environments are run asynchronously. - - Returns - ------- - vec_env : gymnasium.vector.VectorEnv - Vectorized environment. - """ - vec_env_cls = ( - gym.vector.AsyncVectorEnv if asynchronous else gym.vector.SyncVectorEnv - ) - return vec_env_cls( - [lambda: process_env(env, seeder, copy_env=copy_env) for _ in range(num_envs)] - ) - - -@numba_jit -def lambda_returns(r_t, terminal_tp1, v_tp1, gamma, lambda_): - """ - Compute lambda returns. - - Parameters - ---------- - r_t: array - Array of shape (time_dim, batch_dim) containing the rewards. - terminal_tp1: array - Array of shape (time_dim, batch_dim) containing the discounts (0.0 if terminal state). 
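`process_ppo_env` builds one seeded copy of the environment per worker and hands the resulting thunks to gymnasium's vector API. The core pattern, with `CartPole-v1` as an illustrative environment:

import gymnasium as gym

num_envs = 4
vec_env = gym.vector.SyncVectorEnv(
    [lambda: gym.make("CartPole-v1") for _ in range(num_envs)]
)

obs, infos = vec_env.reset(seed=0)       # obs has shape (num_envs, obs_dim)
actions = vec_env.action_space.sample()  # one action per sub-environment
obs, rewards, terminated, truncated, infos = vec_env.step(actions)
print(obs.shape, rewards.shape)          # (4, 4) (4,)
vec_env.close()
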
- v_tp1: array - Array of shape (time_dim, batch_dim) containing the values at timestep t+1 - lambda_ : float in [0, 1] - Lambda-returns parameter. - """ - T = v_tp1.shape[0] - returns = np.zeros_like(r_t) - aux = v_tp1[-1].astype(np.float32) - for tt in range(T): - i = T - tt - 1 - returns[i] = r_t[i] + gamma * (1 - terminal_tp1[i]) * ( - (1 - lambda_) * v_tp1[i] + lambda_ * aux - ) - aux = returns[i] - return returns - - -class RolloutBuffer: - """ - Rollout buffer that allows sampling data with shape (batch_size, - num_trajectories, ...). - Parameters - ---------- - rng: numpy.random.Generator - Numpy random number generator. - See https://numpy.org/doc/stable/reference/random/generator.html - max_episode_steps: int, optional - Maximum length of an episode - """ - - def __init__(self, rng, num_rollout_steps): - self._rng = rng - self._num_rollout_steps = num_rollout_steps - self._curr_step = 0 - self._tags = [] - self._data = dict() - self._dtypes = dict() - - @property - def data(self): - """Dict containing all stored data.""" - return self._data - - @property - def tags(self): - """Tags identifying the entries in the replay buffer.""" - return self._tags - - @property - def dtypes(self): - """Dict containing the data types for each tag.""" - return self._dtypes - - @property - def num_rollout_steps(self): - """Number of steps to take in each environment per policy rollout.""" - return self._num_rollout_steps - - @property - def num_envs(self): - return self._num_envs - - def __len__(self): - return self._curr_step - - def full(self): - """Returns True if the buffer is full.""" - return len(self) == self.num_rollout_steps - - def clear(self): - """Clear data in replay.""" - self._curr_step = 0 - for tag in self._data: - self._data[tag] = None - - def setup_entry(self, tag, dtype): - """Configure replay buffer to store data. - Parameters - ---------- - tag : str - Tag that identifies the entry (e.g "observation", "reward") - dtype : obj - Data type of the entry (e.g. `np.float32`). Type is not - checked in :meth:`append`, but it is used to construct the numpy - arrays returned by the :meth:`sample`method. - """ - assert len(self) == 0, "Cannot setup entry on non-empty buffer." - if tag in self._data: - raise ValueError(f"Entry {tag} already added to replay buffer.") - self._tags.append(tag) - self._dtypes[tag] = dtype - self._data[tag] = None - - def append(self, data): - """ - Stores data from an environment step in the buffer. - - Parameters - ---------- - data : dict - Dictionary containing scalar values, whose keys must be in self.tags. - """ - assert set(data.keys()) == set(self.tags), "Data keys must be in self.tags" - assert len(self) < self.num_rollout_steps, "Buffer is full." - for tag in self.tags: - # - if self._data[tag] is None: - if isinstance(data[tag], np.ndarray): - # if data[tag].dtype != self._dtypes[tag]: - # logger.warning( - # f"Data type for tag {tag} is {data[tag].dtype}, " - # f"but it was configured as {self._dtypes[tag]}.") - shape = data[tag].shape - self._data[tag] = np.zeros( - (self.num_rollout_steps, *shape), dtype=self._dtypes[tag] - ) - elif isinstance(data[tag], float) or isinstance(data[tag], int): - self._data[tag] = np.zeros( - self.num_rollout_steps, dtype=self._dtypes[tag] - ) - else: - self._data[tag] = [None] * self.num_rollout_steps - self._data[tag][self._curr_step] = data[tag] - self._curr_step += 1 - - def get(self): - """ - Returns the collected data. 
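A usage sketch for the rollout-buffer interface above, assuming the `RolloutBuffer` class defined above is in scope: entries are declared with `setup_entry`, filled one step at a time with `append`, and read back as `(T, ...)` arrays with `get`.

import numpy as np

rng = np.random.default_rng(0)
buffer = RolloutBuffer(rng, 3)  # 3 rollout steps per update
buffer.setup_entry("observations", np.float32)
buffer.setup_entry("rewards", np.float32)

for step in range(3):
    buffer.append(
        {
            "observations": np.array([0.1 * step, -0.2 * step], dtype=np.float32),
            "rewards": float(step),
        }
    )

assert buffer.full()
batch = buffer.get()                # dict of arrays of shape (T, ...)
print(batch["observations"].shape)  # (3, 2)
print(batch["rewards"])             # [0. 1. 2.]
buffer.clear()                      # ready for the next rollout
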
If the appended data for a given tag is a - numpy array, the returned data will be a numpy array of shape: - - (T, *S), where T is the number of rollout steps, and S is the shape of - the data that was appended. - - Otherwise, the returned data will be a list of length T. - - Returns - ------- - Returns a dict with the collected data. - """ - return copy.deepcopy(self._data) diff --git a/rlberry/agents/torch/reinforce/__init__.py b/rlberry/agents/torch/reinforce/__init__.py deleted file mode 100644 index a0efecff6..000000000 --- a/rlberry/agents/torch/reinforce/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .reinforce import REINFORCEAgent diff --git a/rlberry/agents/torch/reinforce/reinforce.py b/rlberry/agents/torch/reinforce/reinforce.py deleted file mode 100644 index f9f0c2e2f..000000000 --- a/rlberry/agents/torch/reinforce/reinforce.py +++ /dev/null @@ -1,270 +0,0 @@ -import torch -import inspect -import numpy as np - -import gymnasium.spaces as spaces -from rlberry.agents import AgentWithSimplePolicy, AgentTorch -from rlberry.agents.utils.memories import Memory -from rlberry.agents.torch.utils.training import optimizer_factory -from rlberry.agents.torch.utils.models import default_policy_net_fn -from rlberry.utils.torch import choose_device - -import rlberry - -logger = rlberry.logger - - -class REINFORCEAgent(AgentTorch, AgentWithSimplePolicy): - """ - REINFORCE with entropy regularization. - - Parameters - ---------- - env : Model - Online model with continuous (Box) state space and discrete actions - batch_size : int, default: 8 - Number of episodes used for the update of the policy netowrk. - horizon : int, default: 256 - Episode length: one transition per episode steps. So total number of transitions used for one policy update is batch_size * horizon. - gamma : double - Discount factor in [0, 1]. - entr_coef : double - Entropy coefficient. - learning_rate : double - Learning rate. - normalize: bool - If True normalize rewards - optimizer_type: str - Type of optimizer. 'ADAM' by defaut. - policy_net_fn : function(env, **kwargs) - Function that returns an instance of a policy network (pytorch). - If None, a default net is used. - policy_net_kwargs : dict - kwargs for policy_net_fn - use_bonus_if_available : bool, default = False - If true, check if environment info has entry 'exploration_bonus' - and add it to the reward. See also UncertaintyEstimatorWrapper. - device: str - Device to put the tensors on - - Attributes - ---------- - device : str - Torch device on which the agent's neural networks are placed. - batch_size : int, default: 8 - Number of episodes used for the update of the policy netowrk. - horizon : int, default: 256 - Episode length: one transition per episode steps. - gamma : float, default: 0.99 - Discount factor used to discount future rewards in the Bellman equation. - state_dim : int - Dimensionality of the continuous state space of the environment. - action_dim : int - Number of discrete actions available in the environment. - policy_net_fn : function(env, **kwargs) - Function that returns an instance of a policy network (PyTorch). - policy_net_kwargs : dict - Keyword arguments for `policy_net_fn`. - optimizer_kwargs : dict - Keyword arguments for the optimizer used during neural network training. - policy_net : torch.nn.Module - The policy network used by the agent. - policy_optimizer : torch.optim.Optimizer - The optimizer used for training the policy network. - memory : Memory - The memory buffer used to store the agent's experiences. 
- episode : int - A counter that keeps track of the number of episodes. - - References - ---------- - Williams, Ronald J., - "Simple statistical gradient-following algorithms for connectionist - reinforcement learning." - ReinforcementLearning.Springer,Boston,MA,1992.5-3 - """ - - name = "REINFORCE" - - def __init__( - self, - env, - batch_size=8, - horizon=256, - gamma=0.99, - entr_coef=0.01, - learning_rate=0.0001, - normalize=True, - optimizer_type="ADAM", - policy_net_fn=None, - policy_net_kwargs=None, - use_bonus_if_available=False, - device="cuda:best", - **kwargs - ): - # For all parameters, define self.param = param - _, _, _, values = inspect.getargvalues(inspect.currentframe()) - values.pop("self") - for arg, val in values.items(): - setattr(self, arg, val) - - AgentWithSimplePolicy.__init__(self, env, **kwargs) - - self.device = choose_device(device) - - self.state_dim = self.env.observation_space.shape[0] - self.action_dim = self.env.action_space.n - - self.policy_net_kwargs = policy_net_kwargs or {} - - # - self.policy_net_fn = policy_net_fn or default_policy_net_fn - - self.optimizer_kwargs = {"optimizer_type": optimizer_type, "lr": learning_rate} - - # check environment - assert isinstance(self.env.observation_space, spaces.Box) - assert isinstance(self.env.action_space, spaces.Discrete) - - self.policy_net = None # policy network - - # initialize - self.reset() - - def reset(self, **kwargs): - self.policy_net = self.policy_net_fn(self.env, **self.policy_net_kwargs).to( - self.device - ) - - self.policy_optimizer = optimizer_factory( - self.policy_net.parameters(), **self.optimizer_kwargs - ) - - self.memory = Memory() - - self.episode = 0 - - def policy(self, observation): - state = observation - assert self.policy_net is not None - state = torch.from_numpy(state).float().to(self.device) - action_dist = self.policy_net(state) - action = action_dist.sample().item() - return action - - def fit(self, budget: int, **kwargs): - """ - Train the agent using the provided environment. - - Parameters - ---------- - budget: int - number of episodes. Each episode runs for self.horizon unless it - enconters a terminal state in which case it stops early. 
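The `_update` method below turns the stored rewards into Monte-Carlo returns by walking the batch backwards and resetting the running return at episode boundaries. A standalone restatement of that computation:

import numpy as np

def discounted_returns(rewards, is_terminals, gamma):
    # Walk the stored batch backwards, resetting the running return at each
    # episode boundary, as the _update method below does.
    returns = []
    g = 0.0
    for reward, terminal in zip(reversed(rewards), reversed(is_terminals)):
        if terminal:
            g = 0.0
        g = reward + gamma * g
        returns.insert(0, g)
    return np.array(returns, dtype=np.float32)

# Two episodes of lengths 2 and 1, stored back to back in the memory buffer.
rewards = [1.0, 1.0, 2.0]
is_terminals = [False, True, True]
print(discounted_returns(rewards, is_terminals, gamma=0.99))  # [1.99 1.   2.  ]
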
- """ - del kwargs - n_episodes_to_run = budget - count = 0 - while count < n_episodes_to_run: - self._run_episode() - count += 1 - - def _run_episode(self): - # interact for H steps - episode_rewards = 0 - observation, info = self.env.reset() - for _ in range(self.horizon): - # running policy - action = self.policy(observation) - next_observation, reward, terminated, truncated, info = self.env.step( - action - ) - done = terminated or truncated - - # check whether to use bonus - bonus = 0.0 - if self.use_bonus_if_available: - if info is not None and "exploration_bonus" in info: - bonus = info["exploration_bonus"] - - # save in batch - self.memory.states.append(observation) - self.memory.actions.append(action) - self.memory.rewards.append(reward + bonus) # add bonus here - self.memory.is_terminals.append(done) - episode_rewards += reward - - if done: - break - - # update observation - observation = next_observation - - # update - self.episode += 1 - - # - if self.writer is not None: - self.writer.add_scalar("episode_rewards", episode_rewards, self.episode) - - # - if self.episode % self.batch_size == 0: - self._update() - self.memory.clear_memory() - - return episode_rewards - - def _normalize(self, x): - return (x - x.mean()) / (x.std() + 1e-5) - - def _update(self): - # monte carlo estimate of rewards - rewards = [] - discounted_reward = 0 - for reward, is_terminal in zip( - reversed(self.memory.rewards), reversed(self.memory.is_terminals) - ): - if is_terminal: - discounted_reward = 0 - discounted_reward = reward + (self.gamma * discounted_reward) - rewards.insert(0, discounted_reward) - - # convert list to tensor - states = torch.FloatTensor(np.array(self.memory.states)).to(self.device) - actions = torch.LongTensor(self.memory.actions).to(self.device) - rewards = torch.FloatTensor(rewards).to(self.device) - if self.normalize: - rewards = self._normalize(rewards) - - # evaluate logprobs - action_dist = self.policy_net(states) - logprobs = action_dist.log_prob(actions) - dist_entropy = action_dist.entropy() - - # compute loss - loss = -logprobs * rewards - self.entr_coef * dist_entropy - - # take gradient step - self.policy_optimizer.zero_grad() - - loss.mean().backward() - - self.policy_optimizer.step() - - # - # For hyperparameter optimization - # - @classmethod - def sample_parameters(cls, trial): - batch_size = trial.suggest_categorical("batch_size", [1, 4, 8, 16, 32]) - gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.99]) - learning_rate = trial.suggest_float("learning_rate", 1e-5, 1, log=True) - - entr_coef = trial.suggest_float("entr_coef", 1e-8, 0.1, log=True) - - return { - "batch_size": batch_size, - "gamma": gamma, - "learning_rate": learning_rate, - "entr_coef": entr_coef, - } diff --git a/rlberry/agents/torch/sac/__init__.py b/rlberry/agents/torch/sac/__init__.py deleted file mode 100644 index 5a7ff1963..000000000 --- a/rlberry/agents/torch/sac/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .sac import SACAgent diff --git a/rlberry/agents/torch/sac/sac.py b/rlberry/agents/torch/sac/sac.py deleted file mode 100644 index 1828dc566..000000000 --- a/rlberry/agents/torch/sac/sac.py +++ /dev/null @@ -1,543 +0,0 @@ -import time - -import gymnasium.spaces as spaces -import numpy as np -import rlberry -import torch -import torch.nn as nn -import torch.optim as optim -from rlberry.agents import AgentTorch, AgentWithSimplePolicy -from rlberry.agents.torch.sac.sac_utils import default_policy_net_fn, default_q_net_fn -from rlberry.agents.torch.utils.training import 
optimizer_factory -from rlberry.agents.utils.replay import ReplayBuffer -from rlberry.utils.factory import load -from rlberry.utils.torch import choose_device - -logger = rlberry.logger - - -class SACAgent(AgentTorch, AgentWithSimplePolicy): - """ - Soft Actor Critic Agent. - - SAC, or SOFT Actor Critic, an offpolicy actor-critic deep RL algorithm - based on the maximum entropy reinforcement learning framework. In this - framework, the actor aims to maximize expected reward while also - maximizing entropy. - - Parameters - ---------- - env : Model - Online model with continuous (Box) state space and continuous actions - batch_size : int - Number of episodes to wait before updating the policy. - gamma : double - Discount factor in [0, 1]. - learning_rate : double - Learning rate. - buffer_capacity : int - Capacity of the replay buffer - optimizer_type: str - Type of optimizer. 'ADAM' by defaut. - tau : double - Target smoothing coefficient - policy frequency - Policy training frequency (Delayed TD3 update) - alpha - Entropy regularization coefficient - autotunealpha - Automatic tuning of alpha - learning start - Timesteps done before training starts - policy_net_fn : function(env, **kwargs) - Function that returns an instance of a policy network (pytorch). - If None, a default net is used. - policy_net_kwargs : dict - kwargs for policy_net_fn - q_net_constructor : Callable, str or None - Function/constructor that returns a torch module for the Q-network - q_net_kwargs : optional, dict - Parameters for q_net_constructor. - device : str - Device to put the tensors on - writer_frequency : int - Frequency of tensorboard logging - - References - ---------- - Haarnoja, Tuomas, et al. "Soft actor-critic algorithms and applications." - arXiv preprint arXiv:1812.05905 (2018). 
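The docstring above describes SAC informally; in standard notation (consistent with the Haarnoja et al. reference and with the `_update` method below), the maximum-entropy objective and the soft critic target are:

```latex
J(\pi) = \sum_{t} \mathbb{E}_{(s_t, a_t) \sim \rho_\pi}
         \Big[ r(s_t, a_t) + \alpha \, \mathcal{H}\big(\pi(\cdot \mid s_t)\big) \Big]

y = r + \gamma \, (1 - d) \,
    \Big( \min_{i \in \{1, 2\}} Q_{\theta_i'}(s', a') - \alpha \log \pi_\phi(a' \mid s') \Big),
\qquad a' \sim \pi_\phi(\cdot \mid s')
```

Here d is the done flag and the primed parameters are the target critics; this matches `next_q_value = rewards + (1 - dones) * gamma * (min(Q1', Q2') - alpha * log_pi)` in `_update`.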
- """ - - name = "SAC" - - def __init__( - self, - env, - batch_size=256, - gamma=0.99, - q_learning_rate=1e-3, - policy_learning_rate=3e-4, - buffer_capacity: int = int(1e6), - optimizer_type="ADAM", - tau=0.005, - policy_frequency=2, - alpha=0.2, - autotune_alpha=True, - learning_start=5e3, - policy_net_fn=None, - policy_net_kwargs=None, - q_net_constructor=None, - q_net_kwargs=None, - writer_frequency=100, - device="cuda:best", - **kwargs - ): - AgentWithSimplePolicy.__init__(self, env, **kwargs) - - # check environment - assert isinstance(self.env.observation_space, spaces.Box) - assert isinstance(self.env.action_space, spaces.Box) - - # Setup cuda device - self.device = choose_device(device) - - # Hyperparameters - self.batch_size = batch_size - self.gamma = gamma - self.q_learning_rate = q_learning_rate - self.policy_learning_rate = policy_learning_rate - self.buffer_capacity = buffer_capacity - self.learning_start = learning_start - self.policy_frequency = policy_frequency - self.tau = tau - self.optimizer_type = optimizer_type - - # Setup Actor - self.policy_net_kwargs = policy_net_kwargs or {} - self.policy_net_fn = policy_net_fn or default_policy_net_fn - self.policy_optimizer_kwargs = { - "optimizer_type": self.optimizer_type, - "lr": policy_learning_rate, - } - - # Setup Q networks and their targets - if isinstance(q_net_constructor, str): - q_net_constructor = load(q_net_constructor) - elif q_net_constructor is None: - q_net_constructor = default_q_net_fn - else: - q_net_constructor = q_net_constructor - q_net_kwargs = q_net_kwargs or {} - self.q_net_kwargs = q_net_kwargs - self.q_net_constructor = q_net_constructor - self.q1 = q_net_constructor(self.env, **q_net_kwargs).to(self.device) - self.q2 = q_net_constructor(self.env, **q_net_kwargs).to(self.device) - self.q1_target = q_net_constructor(self.env, **q_net_kwargs).to(self.device) - self.q2_target = q_net_constructor(self.env, **q_net_kwargs).to(self.device) - self.q_optimizer_kwargs = { - "optimizer_type": self.optimizer_type, - "lr": q_learning_rate, - } - - # Setup tensorboard writer - self.writer_frequency = writer_frequency - - # Setup Actor action scaling - self.action_scale = torch.tensor( - (self.env.action_space.high - self.env.action_space.low) / 2.0, - dtype=torch.float32, - ).to(self.device) - self.action_bias = torch.tensor( - (self.env.action_space.high + self.env.action_space.low) / 2.0, - dtype=torch.float32, - ).to(self.device) - - # Autotune alpha or use a fixed default value - self.autotune_alpha = autotune_alpha - if not self.autotune_alpha: - self.alpha = alpha - - # initialize - self.reset() - - def reset(self, **kwargs): - """ - Reset the agent. - This function resets the agent by initializing the necessary components and parameters for training. 
- """ - - # Initialize the rollout buffer - self.memory = ReplayBuffer(max_replay_size=self.buffer_capacity, rng=self.rng) - self.memory.setup_entry("states", dtype=np.float32) - self.memory.setup_entry("next_states", dtype=np.float32) - self.memory.setup_entry("actions", dtype=np.float32) - self.memory.setup_entry("rewards", dtype=np.float32) - self.memory.setup_entry("dones", dtype=np.float32) - - # Intialize the Actor - self.cont_policy = self.policy_net_fn(self.env, **self.policy_net_kwargs).to( - self.device - ) - self.policy_optimizer = optimizer_factory( - self.cont_policy.parameters(), **self.policy_optimizer_kwargs - ) - self.cont_policy.load_state_dict(self.cont_policy.state_dict()) - - # Intialize the Q networks and their targets - self.q1 = self.q_net_constructor(self.env, **self.q_net_kwargs) - self.q2 = self.q_net_constructor(self.env, **self.q_net_kwargs) - self.q1_target = self.q_net_constructor(self.env, **self.q_net_kwargs) - self.q2_target = self.q_net_constructor(self.env, **self.q_net_kwargs) - self.q1.to(self.device) - self.q2.to(self.device) - self.q1_target.to(self.device) - self.q2_target.to(self.device) - self.q1_optimizer = optimizer_factory( - self.q1.parameters(), **self.q_optimizer_kwargs - ) - self.q2_optimizer = optimizer_factory( - self.q2.parameters(), **self.q_optimizer_kwargs - ) - self.q1_target_optimizer = optimizer_factory( - self.q1.parameters(), **self.q_optimizer_kwargs - ) - self.q2_target_optimizer = optimizer_factory( - self.q2.parameters(), **self.q_optimizer_kwargs - ) - # Define the loss - self.mse_loss = nn.MSELoss() - - # Automatic entropy tuning - if self.autotune_alpha: - self.target_entropy = -torch.prod( - torch.Tensor(self.env.action_space.shape).to(self.device) - ).item() - self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) - self.alpha = self.log_alpha.exp().item() - self.a_optimizer = optim.Adam([self.log_alpha], lr=self.q_learning_rate) - - # initialize episode, steps and time counters - self.total_episodes = 0 - self.total_timesteps = 0 - self.time = time.time() - - def policy(self, state): - assert self.cont_policy is not None - state = np.array([state]) - state = torch.FloatTensor(state).to(self.device) - - # Get the mean and log standard deviation of the action distribution from the policy network - action_dist = self.cont_policy(state) - mean, log_std = action_dist - - # Compute the standard deviation and - # create a normal distribution with the computed mean and standard deviation - std = log_std.exp() - action_dist = torch.distributions.Normal(mean, std) - - # Sample an action using the reparameterization trick - x_t = action_dist.rsample() - y_t = torch.tanh(x_t) - - # Apply scaling and bias to the action - action = y_t * self.action_scale + self.action_bias - return action.detach().cpu().numpy()[0] - - def fit(self, budget: int, **kwargs): - """ - Train the agent using the provided environment. - - Parameters - ---------- - budget: int - number of episodes. Each episode runs for self.horizon unless it - enconters a terminal state in which case it stops early. 
- """ - - # Intialize environment and get first observation - state, _ = self.env.reset() - - while self.total_timesteps < budget: - # Select action - if self.total_timesteps < self.learning_start: - # In order to improve exploration, before "learning_start" - # actions are sampled from a uniform random distribution over valid actions - action = np.array(self.env.action_space.sample()) - else: - # SAC action selection - tensor_state = np.array([state]) - action, _ = self._select_action(tensor_state) - action = action.detach().cpu().numpy()[0] - - # Step through the environment - next_state, reward, next_terminated, next_truncated, info = self.env.step( - action - ) - done = np.logical_or(next_terminated, next_truncated) - - # End of episode logging - if "episode" in info.keys(): - self.writer.add_scalar( - "episode/episode_rewards", - info["episode"]["r"], - self.total_timesteps, - ) - self.writer.add_scalar( - "episode/episode_length", info["episode"]["l"], self.total_timesteps - ) - - # Add experience to replay buffer - self.memory.append( - { - "states": state, - "next_states": next_state, - "actions": action, - "rewards": reward, - "dones": done, - } - ) - - # Update current state - state = next_state - - # Reset the environment if episode is over - if done: - state, _ = self.env.reset() - self.memory.end_episode() - - # Learning starts when there are enough samples in replay buffer - if self.total_timesteps > self.learning_start: - self._update() - - self.total_timesteps += 1 - - def _select_action(self, state): - """ - Select an action to take based on the current state. - - This function selects an action to take based on the current state. - The action is sampled from a squashed Gaussian distribution defined by the policy network. - - Parameters - ---------- - state: numpy.ndarray or torch.Tensor - The current state of the environment - - Returns - ------- - action torch.Tensor - The selected action - log_prob torch.Tensor - The log probability of the selected action - """ - - # Convert the state to a torch.Tensor if it's not already - state = torch.FloatTensor(state).to(self.device) - - # Get the mean and log standard deviation of the action distribution from the policy network - action_dist = self.cont_policy(state) - mean, log_std = action_dist - - # Compute the standard deviation and - # create a normal distribution with the computed mean and standard deviation - std = log_std.exp() - action_dist = torch.distributions.Normal(mean, std) - - # Sample an action using the reparameterization trick - x_t = action_dist.rsample() - y_t = torch.tanh(x_t) - - # Apply scaling and bias to the action - # and compute the log probability of the selected action - action = y_t * self.action_scale + self.action_bias - log_prob = action_dist.log_prob(x_t) - - # Enforce Action Bound - log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + 1e-6) - log_prob = log_prob.sum(1, keepdim=True) - return action, log_prob - - def _update(self): - """ - Perform an update step for the SAC agent. - - It updates the Q-networks and the policy network based on the collected - experiences from the replay buffer. 
- """ - - # Sample a batch from replay buffer - memory_data = self.memory.sample(self.batch_size, 1).data - states = ( - torch.tensor(memory_data["states"]) - .view(self.batch_size, -1) - .to(self.device) - ) - next_state = ( - torch.tensor(memory_data["next_states"]) - .view(self.batch_size, -1) - .to(self.device) - ) - actions = ( - torch.tensor(memory_data["actions"]) - .view(self.batch_size, -1) - .to(self.device) - ) - rewards = ( - torch.tensor(memory_data["rewards"]) - .view(self.batch_size, -1) - .to(self.device) - ) - dones = ( - torch.tensor(memory_data["dones"]).view(self.batch_size, -1).to(self.device) - ) - - with torch.no_grad(): - # Select action using the current policy - next_state_actions, next_state_log_pi = self._select_action( - next_state.detach().cpu().numpy() - ) - # Compute the next state's Q-values - q1_next_target = self.q1_target( - torch.cat([next_state, next_state_actions], dim=-1) - ) - q2_next_target = self.q2_target( - torch.cat([next_state, next_state_actions], dim=-1) - ) - # Compute Q targets: - # - Compute the minimum Q-values between Q1 and Q2 - # - Entropy regularization term is subtracted from the Q-values - # This term encourages exploration by penalizing overly certain or deterministic actions. - min_q_next_target = ( - torch.min(q1_next_target, q2_next_target) - - self.alpha * next_state_log_pi - ) - # Compute the target Q-values using the Bellman equation with entropy regularization - next_q_value = rewards.flatten() + (1 - dones.flatten()) * self.gamma * ( - min_q_next_target - ).view(-1) - - # Compute Q loss - q1_v = self.q1(torch.cat([states, actions], dim=-1)) - q2_v = self.q2(torch.cat([states, actions], dim=-1)) - q1_loss_v = self.mse_loss(q1_v.squeeze(), next_q_value) - q2_loss_v = self.mse_loss(q2_v.squeeze(), next_q_value) - q_loss_v = q1_loss_v + q2_loss_v - - # Update Q networks - self.q1_optimizer.zero_grad() - self.q2_optimizer.zero_grad() - q_loss_v.backward() - self.q1_optimizer.step() - self.q2_optimizer.step() - - act_loss = None - alpha_loss = None - state_log_pi = None - # TD3 Delayed update of the policy network - if self.total_timesteps % self.policy_frequency == 0: - # Compensate for the delay by doing more than one update - for _ in range(self.policy_frequency): - # Select action using the current policy - state_action, state_log_pi = self._select_action( - states.detach().cpu().numpy() - ) - # Compute the next state's Q-values - q_out_v1 = self.q1(torch.cat([states, state_action], dim=-1)) - q_out_v2 = self.q2(torch.cat([states, state_action], dim=-1)) - # Select the minimum Q to reduce over estimation and improve stability - q_out_v = torch.min(q_out_v1, q_out_v2) - # Compute policy loss: - # - Maximize the expected return of the policy : improves action selection - # - Maximize the entropy of the policy : improves exploration - # Alpha is used to balance the trade-off between exploration and exploitation - act_loss = ((self.alpha * state_log_pi) - q_out_v).mean() - - # Update policy network - self.policy_optimizer.zero_grad() - act_loss.backward() - self.policy_optimizer.step() - - # Update alpha if autotuning is enabled - if self.autotune_alpha: - with torch.no_grad(): - state_action, state_log_pi = self._select_action( - states.detach().cpu().numpy() - ) - alpha_loss = ( - -self.log_alpha * (state_log_pi + self.target_entropy) - ).mean() - - self.a_optimizer.zero_grad() - alpha_loss.backward() - self.a_optimizer.step() - self.alpha = self.log_alpha.exp().item() - - # Target Q networks update by polyak averaging - for 
param, target_param in zip( - self.q1.parameters(), self.q1_target.parameters() - ): - target_param.data.copy_( - self.tau * param.data + (1 - self.tau) * target_param.data - ) - for param, target_param in zip( - self.q2.parameters(), self.q2_target.parameters() - ): - target_param.data.copy_( - self.tau * param.data + (1 - self.tau) * target_param.data - ) - - # Log metrics - if ( - self.writer is not None - and self.total_timesteps % self.writer_frequency == 0 - ): - self.writer.add_scalar( - "fit/loss_q1", float(q1_loss_v.detach()), self.total_timesteps - ) - self.writer.add_scalar( - "fit/loss_q2", float(q2_loss_v.detach()), self.total_timesteps - ) - self.writer.add_scalar( - "fit/value_q1", float(q1_v.mean().detach()), self.total_timesteps - ) - self.writer.add_scalar( - "fit/value_q2", float(q2_v.mean().detach()), self.total_timesteps - ) - if act_loss: - self.writer.add_scalar( - "fit/loss_act", float(act_loss.detach()), self.total_timesteps - ) - self.writer.add_scalar( - "fit/alpha", float(self.alpha), self.total_timesteps - ) - self.writer.add_scalar( - "episode/SPS", - int(self.total_timesteps / (time.time() - self.time)), - self.total_timesteps, - ) - if self.autotune_alpha and alpha_loss: - self.writer.add_scalar( - "fit/alpha_loss", float(alpha_loss.detach()), self.total_timesteps - ) - - # - # For hyperparameter optimization - # - @classmethod - def sample_parameters(cls, trial): - batch_size = trial.suggest_categorical("batch_size", [128, 256, 512, 1024]) - gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.99]) - q_learning_rate = trial.suggest_loguniform("q_learning_rate", 1e-5, 1) - policy_learning_rate = trial.suggest_loguniform( - "policy_learning_rate", 1e-6, 1e-1 - ) - policy_frequency = trial.suggest_categorical("policy_frequency", [1, 2, 3, 5]) - - return { - "batch_size": batch_size, - "gamma": gamma, - "q_learning_rate": q_learning_rate, - "policy_learning_rate": policy_learning_rate, - "policy_frequency": policy_frequency, - } diff --git a/rlberry/agents/torch/sac/sac_utils.py b/rlberry/agents/torch/sac/sac_utils.py deleted file mode 100644 index f644dfdf1..000000000 --- a/rlberry/agents/torch/sac/sac_utils.py +++ /dev/null @@ -1,38 +0,0 @@ -from rlberry.agents.torch.utils.training import model_factory - - -def default_q_net_fn(env, **kwargs): - """ - Returns a default Q value network. - """ - model_config = { - "type": "MultiLayerPerceptron", - "layer_sizes": (256, 256), - "reshape": True, - "in_size": env.observation_space.shape[0] + env.action_space.shape[0], - "out_size": 1, - } - if kwargs: - for k, v in kwargs.items(): - model_config[k] = v - return model_factory(**model_config) - - -def default_policy_net_fn(env, **kwargs): - """ - Returns a default Q value network. 
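The two loops above implement the soft (Polyak) target update theta_target <- tau * theta + (1 - tau) * theta_target. A minimal standalone sketch of the same update on a toy pair of networks (not part of the patch):

```python
# Illustrative sketch of the Polyak target update used for q1/q2 above.
import torch.nn as nn


def soft_update(net: nn.Module, target: nn.Module, tau: float = 0.005) -> None:
    for param, target_param in zip(net.parameters(), target.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)


q = nn.Linear(4, 1)
q_target = nn.Linear(4, 1)
q_target.load_state_dict(q.state_dict())  # targets start as exact copies
soft_update(q, q_target, tau=0.005)
```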
- """ - model_config = { - "type": "MultiLayerPerceptron", - "in_size": env.observation_space.shape[0], - "layer_sizes": [256, 256], - "out_size": env.action_space.shape[0], - "reshape": True, - "is_policy": True, - "ctns_actions": True, - "squashed_policy": True, - } - if kwargs: - for k, v in kwargs.items(): - model_config[k] = v - return model_factory(**model_config) diff --git a/rlberry/agents/torch/tests/__init__.py b/rlberry/agents/torch/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/agents/torch/tests/test_a2c.py b/rlberry/agents/torch/tests/test_a2c.py deleted file mode 100644 index 057649705..000000000 --- a/rlberry/agents/torch/tests/test_a2c.py +++ /dev/null @@ -1,122 +0,0 @@ -from rlberry.envs import Wrapper -from rlberry.agents.torch import A2CAgent -from rlberry.manager import ExperimentManager, evaluate_agents -from rlberry.envs.benchmarks.ball_exploration import PBall2D -from gymnasium import make - - -def test_a2c(): - env = "CartPole-v1" - mdp = make(env) - env_ctor = Wrapper - env_kwargs = dict(env=mdp) - - a2crlberry_stats = ExperimentManager( - A2CAgent, - (env_ctor, env_kwargs), - fit_budget=int(100), - eval_kwargs=dict(eval_horizon=2), - init_kwargs=dict(batch_size=100), - n_fit=1, - agent_name="A2C_rlberry_" + env, - ) - - a2crlberry_stats.fit() - - output = evaluate_agents([a2crlberry_stats], n_simulations=2, plot=False) - a2crlberry_stats.clear_output_dir() - env = "Pendulum-v1" - mdp = make(env) - env_ctor = Wrapper - env_kwargs = dict(env=mdp) - - a2crlberry_stats = ExperimentManager( - A2CAgent, - (env_ctor, env_kwargs), - fit_budget=int(100), - eval_kwargs=dict(eval_horizon=2), - init_kwargs=dict(batch_size=100), - n_fit=1, - agent_name="A2C_rlberry_" + env, - ) - - a2crlberry_stats.fit() - - output = evaluate_agents([a2crlberry_stats], n_simulations=2, plot=False) - a2crlberry_stats.clear_output_dir() - - env = "Acrobot-v1" - mdp = make(env) - env_ctor = Wrapper - env_kwargs = dict(env=mdp) - - a2crlberry_stats = ExperimentManager( - A2CAgent, - (env_ctor, env_kwargs), - fit_budget=int(100), - eval_kwargs=dict(eval_horizon=2), - init_kwargs=dict(batch_size=100), - n_fit=1, - agent_name="A2C_rlberry_" + env, - ) - - a2crlberry_stats.fit() - - output = evaluate_agents([a2crlberry_stats], n_simulations=2, plot=False) - a2crlberry_stats.clear_output_dir() - - env_ctor = PBall2D - env_kwargs = dict() - - a2crlberry_stats = ExperimentManager( - A2CAgent, - (env_ctor, env_kwargs), - fit_budget=int(100), - eval_kwargs=dict(eval_horizon=2), - init_kwargs=dict(batch_size=100), - n_fit=1, - agent_name="A2C_rlberry_" + "PBall2D", - ) - - a2crlberry_stats.fit() - - output = evaluate_agents([a2crlberry_stats], n_simulations=2, plot=False) - a2crlberry_stats.clear_output_dir() - - # test also non default - env = "CartPole-v1" - mdp = make(env) - env_ctor = Wrapper - env_kwargs = dict(env=mdp) - - a2crlberry_stats = ExperimentManager( - A2CAgent, - (env_ctor, env_kwargs), - fit_budget=int(100), - eval_kwargs=dict(eval_horizon=2), - init_kwargs=dict( - batch_size=100, - policy_net_fn="rlberry.agents.torch.utils.training.model_factory_from_env", - policy_net_kwargs=dict( - type="MultiLayerPerceptron", - layer_sizes=(256,), - reshape=False, - is_policy=True, - ), - value_net_fn="rlberry.agents.torch.utils.training.model_factory_from_env", - value_net_kwargs=dict( - type="MultiLayerPerceptron", - layer_sizes=[ - 512, - ], - reshape=False, - out_size=1, - ), - ), - n_fit=1, - agent_name="A2C_rlberry_" + env, - ) - a2crlberry_stats.fit() 
- - output = evaluate_agents([a2crlberry_stats], n_simulations=2, plot=False) - a2crlberry_stats.clear_output_dir() diff --git a/rlberry/agents/torch/tests/test_dqn.py b/rlberry/agents/torch/tests/test_dqn.py deleted file mode 100644 index 5b6848fb7..000000000 --- a/rlberry/agents/torch/tests/test_dqn.py +++ /dev/null @@ -1,138 +0,0 @@ -import pytest -from rlberry.envs import gym_make -from rlberry.agents.torch.dqn import DQNAgent -from rlberry.agents.torch.utils.training import model_factory -from rlberry.manager import ExperimentManager -import os -import pathlib - -import tempfile - - -@pytest.mark.parametrize( - "use_double_dqn, use_prioritized_replay", [(False, False), (True, True)] -) -def test_dqn_agent(use_double_dqn, use_prioritized_replay): - env = gym_make("CartPole-v1") - agent = DQNAgent( - env, - learning_starts=5, - eval_interval=75, - train_interval=2, - gradient_steps=-1, - use_double_dqn=use_double_dqn, - use_prioritized_replay=use_prioritized_replay, - ) - agent.fit(budget=500) - - model_configs = { - "type": "MultiLayerPerceptron", - "layer_sizes": (5, 5), - "reshape": False, - } - - def mlp(env, **kwargs): - """ - Returns a default Q value network. - """ - kwargs["in_size"] = env.observation_space.shape[0] - kwargs["out_size"] = env.action_space.n - return model_factory(**kwargs) - - new_agent = DQNAgent( - env, q_net_constructor=mlp, q_net_kwargs=model_configs, learning_starts=100 - ) - new_agent.fit(budget=2000) - - -def test_dqn_classic_env(): - env = gym_make("CartPole-v1") - agent = DQNAgent( - env, - learning_starts=5, - eval_interval=75, - train_interval=2, - gradient_steps=-1, - use_double_dqn=True, - use_prioritized_replay=True, - ) - agent.fit(budget=200) - - with tempfile.TemporaryDirectory() as tmpdirname: - saving_path = tmpdirname + "/agent_test_dqn_classic_env.pickle" - - # test the save function - agent.save(saving_path) - assert os.path.exists(saving_path) - - # test the loading function - test_load_env = gym_make("CartPole-v1") - loaded_agent = DQNAgent.load(saving_path, **dict(env=test_load_env)) - assert loaded_agent - - # test the agent - observation, info = test_load_env.reset() - for tt in range(100): - action = loaded_agent.policy(observation) - next_observation, reward, terminated, truncated, info = test_load_env.step( - action - ) - done = terminated or truncated - if done: - next_observation, info = test_load_env.reset() - observation = next_observation - - -def test_dqn_experiment_manager_classic_env(): - # saving_path = "rlberry/agents/torch/tests/agentmanager_test_dqn_classic_env" - - with tempfile.TemporaryDirectory() as tmpdirname: - saving_path = tmpdirname + "/agentmanager_test_dqn_classic_env" - - test_experiment_manager = ExperimentManager( - DQNAgent, # The Agent class. - ( - gym_make, - dict( - id="CartPole-v1", - ), - ), # The Environment to solve. - init_kwargs=dict( # Where to put the agent's hyperparameters - learning_starts=5, - eval_interval=75, - train_interval=2, - gradient_steps=-1, - use_double_dqn=True, - use_prioritized_replay=True, - chunk_size=1, - ), - fit_budget=200, # The number of interactions between the agent and the environment during training. - eval_kwargs=dict( - eval_horizon=50 - ), # The number of interactions between the agent and the environment during evaluations. - n_fit=1, # The number of agents to train. Usually, it is good to do more than 1 because the training is stochastic. - agent_name="test_dqn_classic_env", # The agent's name. 
- output_dir=saving_path, - ) - - test_experiment_manager.fit(budget=200) - - # test the save function - test_experiment_manager.save() - assert os.path.exists(saving_path) - - # test the loading function - test_load_env = gym_make("CartPole-v1") - path_to_load = next(pathlib.Path(saving_path).glob("**/*.pickle")) - loaded_experiment_manager = ExperimentManager.load(path_to_load) - assert loaded_experiment_manager - - # test the agent - state, info = test_load_env.reset() - for tt in range(50): - action = loaded_experiment_manager.get_agent_instances()[0].policy(state) - next_s, _, terminated, truncated, test = test_load_env.step(action) - done = terminated or truncated - if done: - break - state = next_s diff --git a/rlberry/agents/torch/tests/test_factory.py b/rlberry/agents/torch/tests/test_factory.py deleted file mode 100644 index f1dddb92b..000000000 --- a/rlberry/agents/torch/tests/test_factory.py +++ /dev/null @@ -1,23 +0,0 @@ -import pytest -from rlberry.agents.torch.utils.training import model_factory - - -@pytest.mark.parametrize( - "ntype", - [ - "MultiLayerPerceptron", - "ConvolutionalNetwork", - "DuelingNetwork", - "Table", - ], -) -def test_dqn_agent(ntype): - if ntype == "MultiLayerPerceptron": - nkwargs = {"in_size": 5, "layer_sizes": [5, 5]} - elif ntype == "ConvolutionalNetwork": - nkwargs = dict(in_channels=10, in_height=20, in_width=30, out_size=15) - elif ntype == "DuelingNetwork": - nkwargs = {"in_size": 5, "out_size": 3} - elif ntype == "Table": - nkwargs = dict(state_size=5, action_size=3) - network = model_factory(ntype, **nkwargs) diff --git a/rlberry/agents/torch/tests/test_mdqn.py b/rlberry/agents/torch/tests/test_mdqn.py deleted file mode 100644 index b327b8599..000000000 --- a/rlberry/agents/torch/tests/test_mdqn.py +++ /dev/null @@ -1,40 +0,0 @@ -import pytest -from rlberry.envs import gym_make -from rlberry.agents.torch.dqn import MunchausenDQNAgent -from rlberry.agents.torch.utils.training import model_factory - - -@pytest.mark.parametrize("use_prioritized_replay", [(False), (True)]) -def test_mdqn_agent(use_prioritized_replay): - env = gym_make("CartPole-v1") - agent = MunchausenDQNAgent( - env, - learning_starts=5, - batch_size=5, - eval_interval=2, - train_interval=2, - gradient_steps=-1, - use_prioritized_replay=use_prioritized_replay, - ) - agent.fit(budget=50) - - model_configs = { - "type": "MultiLayerPerceptron", - "layer_sizes": (5, 5), - "reshape": False, - } - - def mlp(env, **kwargs): - """ - Returns a default Q value network. 
- """ - kwargs["in_size"] = env.observation_space.shape[0] - kwargs["out_size"] = env.action_space.n - return model_factory(**kwargs) - - new_agent = MunchausenDQNAgent( - env, q_net_constructor=mlp, q_net_kwargs=model_configs, learning_starts=100 - ) - new_agent.fit(budget=200) - observation, info = env.reset() - new_agent.policy(observation) diff --git a/rlberry/agents/torch/tests/test_ppo.py b/rlberry/agents/torch/tests/test_ppo.py deleted file mode 100644 index ed31465cb..000000000 --- a/rlberry/agents/torch/tests/test_ppo.py +++ /dev/null @@ -1,201 +0,0 @@ -# from rlberry.envs import gym_make -# from rlberry.agents.torch.ppo import PPOAgent - - -# env = (gym_make, dict(id="Acrobot-v1")) -# # env = gym_make(id="Acrobot-v1") -# ppo = PPOAgent(env) -# ppo.fit(4096) - -import pytest -from rlberry.envs import Wrapper -from rlberry.agents.torch import PPOAgent -from rlberry.manager import ExperimentManager, evaluate_agents -from rlberry.envs.benchmarks.ball_exploration import PBall2D -from gymnasium import make -from rlberry.agents.torch.utils.training import model_factory_from_env -import sys - - -@pytest.mark.timeout(300) -@pytest.mark.xfail(sys.platform == "win32", reason="bug with windows???") -def test_ppo(): - env = "CartPole-v1" - mdp = make(env) - env_ctor = Wrapper - env_kwargs = dict(env=mdp) - - pporlberry_stats = ExperimentManager( - PPOAgent, - (env_ctor, env_kwargs), - fit_budget=int(132), - eval_kwargs=dict(eval_horizon=2), - init_kwargs=dict(batch_size=24, n_steps=96, device="cpu"), - n_fit=1, - agent_name="PPO_rlberry_" + env, - ) - - pporlberry_stats.fit() - - output = evaluate_agents([pporlberry_stats], n_simulations=2, plot=False) - pporlberry_stats.clear_output_dir() - - env = "Pendulum-v1" - mdp = make(env) - env_ctor = Wrapper - env_kwargs = dict(env=mdp) - - pporlberry_stats = ExperimentManager( - PPOAgent, - (env_ctor, env_kwargs), - fit_budget=int(132), - eval_kwargs=dict(eval_horizon=2), - init_kwargs=dict(batch_size=24, n_steps=96, device="cpu"), - n_fit=1, - agent_name="PPO_rlberry_" + env, - ) - - pporlberry_stats.fit() - - output = evaluate_agents([pporlberry_stats], n_simulations=2, plot=False) - pporlberry_stats.clear_output_dir() - - env = "Acrobot-v1" - mdp = make(env) - env_ctor = Wrapper - env_kwargs = dict(env=mdp) - - pporlberry_stats = ExperimentManager( - PPOAgent, - (env_ctor, env_kwargs), - fit_budget=int(132), - eval_kwargs=dict(eval_horizon=2), - init_kwargs=dict(batch_size=24, n_steps=96, device="cpu"), - n_fit=1, - agent_name="PPO_rlberry_" + env, - ) - - pporlberry_stats.fit() - - output = evaluate_agents([pporlberry_stats], n_simulations=2, plot=False) - pporlberry_stats.clear_output_dir() - - env_ctor = PBall2D - env_kwargs = dict() - pporlberry_stats = ExperimentManager( - PPOAgent, - (env_ctor, env_kwargs), - fit_budget=int(132), - eval_kwargs=dict(eval_horizon=2), - init_kwargs=dict(batch_size=24, n_steps=96, device="cpu"), - n_fit=1, - agent_name="PPO_rlberry_" + "PBall2D", - ) - - pporlberry_stats.fit() - - output = evaluate_agents([pporlberry_stats], n_simulations=2, plot=False) - pporlberry_stats.clear_output_dir() - - # test also non default - env = "CartPole-v1" - mdp = make(env) - env_ctor = Wrapper - env_kwargs = dict(env=mdp) - - pporlberry_stats = ExperimentManager( - PPOAgent, - (env_ctor, env_kwargs), - fit_budget=int(132), - eval_kwargs=dict(eval_horizon=2), - init_kwargs=dict( - batch_size=24, - n_steps=96, - policy_net_fn="rlberry.agents.torch.utils.training.model_factory_from_env", - policy_net_kwargs=dict( - 
type="MultiLayerPerceptron", - layer_sizes=(256,), - reshape=False, - is_policy=True, - ), - value_net_fn="rlberry.agents.torch.utils.training.model_factory_from_env", - value_net_kwargs=dict( - type="MultiLayerPerceptron", - layer_sizes=[ - 512, - ], - reshape=False, - out_size=1, - ), - ), - n_fit=1, - agent_name="PPO_rlberry_" + env, - ) - pporlberry_stats.fit() - - pporlberry_stats = ExperimentManager( - PPOAgent, - (env_ctor, env_kwargs), - fit_budget=int(132), - eval_kwargs=dict(eval_horizon=2), - init_kwargs=dict( - batch_size=24, - n_steps=96, - policy_net_fn=model_factory_from_env, - policy_net_kwargs=dict( - type="MultiLayerPerceptron", - layer_sizes=(256,), - reshape=False, - is_policy=True, - ), - value_net_fn=model_factory_from_env, - value_net_kwargs=dict( - type="MultiLayerPerceptron", - layer_sizes=[ - 512, - ], - reshape=False, - out_size=1, - ), - ), - n_fit=1, - agent_name="PPO_rlberry_" + env, - ) - pporlberry_stats.fit() - - output = evaluate_agents([pporlberry_stats], n_simulations=2, plot=False) - pporlberry_stats.clear_output_dir() - - env_ctor = PBall2D - env_kwargs = dict() - pporlberry_stats = ExperimentManager( - PPOAgent, - (env_ctor, env_kwargs), - fit_budget=int(132), - eval_kwargs=dict(eval_horizon=2), - init_kwargs=dict(batch_size=24, n_steps=96, device="cpu"), - n_fit=1, - agent_name="PPO_rlberry_" + "PBall2D", - ) - - pporlberry_stats.fit() - - output = evaluate_agents([pporlberry_stats], n_simulations=2, plot=False) - pporlberry_stats.clear_output_dir() - - pporlberry_stats = ExperimentManager( - PPOAgent, - (env_ctor, env_kwargs), - fit_budget=int(132), - eval_kwargs=dict(eval_horizon=2), - init_kwargs=dict( - batch_size=24, n_steps=96, normalize_advantages=True, device="cpu" - ), - n_fit=1, - agent_name="PPO_rlberry_" + "PBall2D", - ) - - pporlberry_stats.fit() - - output = evaluate_agents([pporlberry_stats], n_simulations=2, plot=False) - pporlberry_stats.clear_output_dir() diff --git a/rlberry/agents/torch/tests/test_reinforce.py b/rlberry/agents/torch/tests/test_reinforce.py deleted file mode 100644 index 5df650288..000000000 --- a/rlberry/agents/torch/tests/test_reinforce.py +++ /dev/null @@ -1,49 +0,0 @@ -from rlberry.agents.torch import REINFORCEAgent -from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env -from rlberry.exploration_tools.discrete_counter import DiscreteCounter -from rlberry.wrappers.uncertainty_estimator_wrapper import UncertaintyEstimatorWrapper - - -def test_reinforce_agent(): - _env = get_benchmark_env(level=1) - n_episodes = 50 - horizon = 30 - - # - def uncertainty_estimator_fn(observation_space, action_space): - counter = DiscreteCounter(observation_space, action_space, n_bins_obs=20) - return counter - - env = UncertaintyEstimatorWrapper( - _env, uncertainty_estimator_fn, bonus_scale_factor=1.0 - ) - # - agent = REINFORCEAgent( - env, - horizon=horizon, - gamma=0.99, - learning_rate=0.001, - use_bonus_if_available=True, - ) - agent.fit(budget=n_episodes) - agent.policy(env.observation_space.sample()) - - -def test_reinforce_agent_partial_fit(): - env = get_benchmark_env(level=1) - n_episodes = 10 - horizon = 30 - - agent = REINFORCEAgent( - env, - horizon=horizon, - gamma=0.99, - learning_rate=0.001, - use_bonus_if_available=False, - ) - agent.fit(budget=n_episodes // 2) - agent.policy(env.observation_space.sample()) - assert agent.episode == 5 - agent.fit(budget=n_episodes // 2) - assert agent.episode == 10 - agent.policy(env.observation_space.sample()) diff --git 
a/rlberry/agents/torch/tests/test_sac.py b/rlberry/agents/torch/tests/test_sac.py deleted file mode 100644 index db5f5067f..000000000 --- a/rlberry/agents/torch/tests/test_sac.py +++ /dev/null @@ -1,68 +0,0 @@ -import sys - -import pytest -from gymnasium import make -from rlberry.agents.torch.sac import SACAgent -from rlberry.envs import Wrapper -from rlberry.manager import AgentManager, evaluate_agents - - -@pytest.mark.timeout(300) -@pytest.mark.xfail(sys.platform == "win32", reason="bug with windows???") -def test_sac(): - env = "Pendulum-v1" - mdp = make(env) - env_ctor = Wrapper - env_kwargs = dict(env=mdp) - - sacrlberry_stats = AgentManager( - SACAgent, - (env_ctor, env_kwargs), - fit_budget=int(132), - eval_kwargs=dict(eval_horizon=2), - init_kwargs=dict(batch_size=24, device="cpu"), - n_fit=1, - agent_name="SAC_rlberry_" + env, - ) - - sacrlberry_stats.fit() - - output = evaluate_agents([sacrlberry_stats], n_simulations=2, plot=False) - sacrlberry_stats.clear_output_dir() - - # test also non default - env = "Pendulum-v1" - mdp = make(env) - env_ctor = Wrapper - env_kwargs = dict(env=mdp) - - sacrlberry_stats = AgentManager( - SACAgent, - (env_ctor, env_kwargs), - fit_budget=int(1024), - eval_kwargs=dict(eval_horizon=2), - init_kwargs=dict( - learning_start=int(512), - autotune_alpha=False, - batch_size=24, - policy_net_kwargs=dict( - type="MultiLayerPerceptron", - layer_sizes=(256,), - reshape=False, - is_policy=True, - ), - q_net_kwargs=dict( - type="MultiLayerPerceptron", - layer_sizes=[ - 512, - ], - reshape=False, - out_size=1, - ), - ), - n_fit=1, - agent_name="SAC_rlberry_" + env, - ) - sacrlberry_stats.fit() - output = evaluate_agents([sacrlberry_stats], n_simulations=2, plot=False) - sacrlberry_stats.clear_output_dir() diff --git a/rlberry/agents/torch/tests/test_torch_atari.py b/rlberry/agents/torch/tests/test_torch_atari.py deleted file mode 100644 index bb7629c78..000000000 --- a/rlberry/agents/torch/tests/test_torch_atari.py +++ /dev/null @@ -1,287 +0,0 @@ -from rlberry.manager import ExperimentManager -from rlberry.agents.torch.dqn.dqn import DQNAgent -from rlberry.envs.gym_make import atari_make - -from rlberry.agents.torch import PPOAgent -from rlberry.agents.torch.utils.training import model_factory_from_env -import pathlib -import numpy as np -import pytest -import os - -import tempfile - - -def test_forward_dqn(): - mlp_configs = { - "type": "MultiLayerPerceptron", # A network architecture - "layer_sizes": [32], # Network dimensions - "reshape": False, - "is_policy": False, # The network should output a distribution - # over actions - } - - cnn_configs = { - "type": "ConvolutionalNetwork", # A network architecture - "activation": "RELU", - "in_channels": 4, - "in_height": 84, - "in_width": 84, - "head_mlp_kwargs": mlp_configs, - "transpose_obs": False, - "is_policy": False, # The network should output a distribution - } - - tuned_agent = ExperimentManager( - DQNAgent, # The Agent class. - ( - atari_make, - # uncomment when rlberry will manage vectorized env - # dict(id="ALE/Breakout-v5", n_envs=3), - dict(id="ALE/Breakout-v5", n_envs=1), - ), # The Environment to solve. - init_kwargs=dict( # Where to put the agent's hyperparameters - q_net_constructor="rlberry.agents.torch.utils.training.model_factory_from_env", - q_net_kwargs=cnn_configs, - max_replay_size=100, - batch_size=32, - learning_starts=100, - gradient_steps=1, - epsilon_final=0.01, - learning_rate=1e-4, # Size of the policy gradient descent steps. 
- chunk_size=5, - ), - fit_budget=200, # The number of interactions between the agent and the environment during training. - eval_kwargs=dict( - eval_horizon=10 - ), # The number of interactions between the agent and the environment during evaluations. - n_fit=1, # The number of agents to train. Usually, it is good to do more than 1 because the training is stochastic. - agent_name="DQN_test", # The agent's name. - ) - - tuned_agent.fit() - - -def test_forward_empty_input_dim(): - mlp_configs = { - "type": "MultiLayerPerceptron", # A network architecture - "layer_sizes": [32], # Network dimensions - "reshape": False, - "is_policy": False, # The network should output a distribution - # over actions - } - - cnn_configs = { - "type": "ConvolutionalNetwork", # A network architecture - "activation": "RELU", - "head_mlp_kwargs": mlp_configs, - "transpose_obs": False, - "is_policy": False, # The network should output a distribution - } - - tuned_agent = ExperimentManager( - DQNAgent, # The Agent class. - ( - atari_make, - # uncomment when rlberry will manage vectorized env - # dict(id="ALE/Breakout-v5", n_envs=3), - dict(id="ALE/Breakout-v5", n_envs=1), - ), # The Environment to solve. - init_kwargs=dict( # Where to put the agent's hyperparameters - q_net_constructor="rlberry.agents.torch.utils.training.model_factory_from_env", - q_net_kwargs=cnn_configs, - max_replay_size=100, - batch_size=32, - learning_starts=100, - gradient_steps=1, - epsilon_final=0.01, - learning_rate=1e-4, # Size of the policy gradient descent steps. - chunk_size=5, - ), - fit_budget=10, # The number of interactions between the agent and the environment during training. - eval_kwargs=dict( - eval_horizon=10 - ), # The number of interactions between the agent and the environment during evaluations. - n_fit=1, # The number of agents to train. Usually, it is good to do more than 1 because the training is stochastic. - agent_name="DQN_test", # The agent's name. - ) - - tuned_agent.fit() - - -@pytest.mark.parametrize("num_envs", [1, 3]) -def test_ppo_vectorized_atari_env(num_envs): - policy_mlp_configs = { - "type": "MultiLayerPerceptron", # A network architecture - "layer_sizes": [32], # Network dimensions - "reshape": False, - "is_policy": True, # The network should output a distribution - # over actions - } - - critic_mlp_configs = { - "type": "MultiLayerPerceptron", - "layer_sizes": [32], - "reshape": False, - "out_size": 1, # The critic network is an approximator of - # a value function V: States -> |R - } - - policy_configs = { - "type": "ConvolutionalNetwork", # A network architecture - "activation": "RELU", - "in_channels": 4, - "in_height": 84, - "in_width": 84, - "head_mlp_kwargs": policy_mlp_configs, - "transpose_obs": False, - "is_policy": True, # The network should output a distribution - } - - critic_configs = { - "type": "ConvolutionalNetwork", - "layer_sizes": "RELU", - "in_channels": 4, - "in_height": 84, - "in_width": 84, - "head_mlp_kwargs": critic_mlp_configs, - "transpose_obs": False, - "out_size": 1, - } - - agent = PPOAgent( - ( - atari_make, - dict(id="ALE/Freeway-v5"), - ), - optimizer_type="ADAM", # What optimizer to use for policy gradient descent steps. - learning_rate=1e-4, # Size of the policy gradient descent steps. - policy_net_fn=model_factory_from_env, # A policy network constructor - policy_net_kwargs=policy_configs, # Policy network's architecure - value_net_fn=model_factory_from_env, # A Critic network constructor - value_net_kwargs=critic_configs, # Critic network's architecure. 
- n_envs=num_envs, - n_steps=64, - batch_size=128, - # **dict(eval_env=(atari_make,dict(id="ALE/Freeway-v5",n_envs=1))) - ) - agent.fit(budget=500) - - with tempfile.TemporaryDirectory() as tmpdirname: - saving_path = tmpdirname + "/agent_test_ppo_vect_env.pickle" - - # test the save function - agent.save(saving_path) - assert os.path.exists(saving_path) - - # test the loading function - test_load_env = atari_make("ALE/Freeway-v5") - test_load_env.reset() - loaded_agent = PPOAgent.load( - saving_path, **dict(env=test_load_env), copy_env=False - ) - assert loaded_agent - - # test the agent - observation, info = test_load_env.reset() - for tt in range(100): - action = loaded_agent.policy(observation) - next_observation, reward, terminated, truncated, info = test_load_env.step( - action - ) - done = terminated or truncated - if done: - next_observation, info = test_load_env.reset() - observation = next_observation - - -@pytest.mark.parametrize("num_envs", [1, 3]) -def test_ppo_experiment_manager_vectorized_atari_env(num_envs): - with tempfile.TemporaryDirectory() as tmpdirname: - saving_path = tmpdirname + "/agentmanager_test_ppo_vectorized_env" - - policy_mlp_configs = { - "type": "MultiLayerPerceptron", # A network architecture - "layer_sizes": [32], # Network dimensions - "reshape": False, - "is_policy": True, # The network should output a distribution - # over actions - } - - critic_mlp_configs = { - "type": "MultiLayerPerceptron", - "layer_sizes": [32], - "reshape": False, - "out_size": 1, # The critic network is an approximator of - # a value function V: States -> |R - } - - policy_configs = { - "type": "ConvolutionalNetwork", # A network architecture - "activation": "RELU", - "in_channels": 4, - "in_height": 84, - "in_width": 84, - "head_mlp_kwargs": policy_mlp_configs, - "transpose_obs": False, - "is_policy": True, # The network should output a distribution - } - - critic_configs = { - "type": "ConvolutionalNetwork", - "layer_sizes": "RELU", - "in_channels": 4, - "in_height": 84, - "in_width": 84, - "head_mlp_kwargs": critic_mlp_configs, - "transpose_obs": False, - "out_size": 1, - } - - test_experiment_manager = ExperimentManager( - PPOAgent, # The Agent class. - ( - atari_make, - dict(id="ALE/Atlantis-v5"), - ), # The Environment to solve. - init_kwargs=dict( # Where to put the agent's hyperparameters - optimizer_type="ADAM", # What optimizer to use for policy gradient descent steps. - learning_rate=1e-4, # Size of the policy gradient descent steps. - policy_net_fn=model_factory_from_env, # A policy network constructor - policy_net_kwargs=policy_configs, # Policy network's architecure - value_net_fn=model_factory_from_env, # A Critic network constructor - value_net_kwargs=critic_configs, # Critic network's architecure. - n_envs=num_envs, - n_steps=64, - batch_size=128, - ), - fit_budget=200, # The number of interactions between the agent and the environment during training. - eval_kwargs=dict( - eval_horizon=50 - ), # The number of interactions between the agent and the environment during evaluations. - n_fit=1, # The number of agents to train. Usually, it is good to do more than 1 because the training is stochastic. - agent_name="test_ppo_vectorized_env", # The agent's name. 
- output_dir=saving_path, - # eval_env = (atari_make,dict(id="ALE/Atlantis-v5",n_envs=1)) - ) - test_experiment_manager.fit(budget=500) - - # test the save function - test_experiment_manager.save() - assert os.path.exists(saving_path) - - # test the loading function - test_load_env = atari_make("ALE/Atlantis-v5") - test_load_env.reset() - path_to_load = next(pathlib.Path(saving_path).glob("**/*.pickle")) - loaded_experiment_manager = ExperimentManager.load(path_to_load) - assert loaded_experiment_manager - - # test the agent - obs, infos = test_load_env.reset() - for tt in range(50): - actions = loaded_experiment_manager.get_agent_instances()[0].policy(obs) - obs, reward, terminated, truncated, info = test_load_env.step(actions) - done = np.logical_or(terminated, truncated) - if done: - break diff --git a/rlberry/agents/torch/tests/test_torch_models.py b/rlberry/agents/torch/tests/test_torch_models.py deleted file mode 100644 index 9bc692294..000000000 --- a/rlberry/agents/torch/tests/test_torch_models.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -TODO: Test attention modules -""" - -import torch -from rlberry.agents.torch.utils.models import MultiLayerPerceptron -from rlberry.agents.torch.utils.models import ConvolutionalNetwork, DuelingNetwork - - -def test_mlp(): - model = MultiLayerPerceptron( - in_size=5, layer_sizes=[10, 10, 10], out_size=10, reshape=False - ) - x = torch.rand(1, 5) - y = model.forward(x) - assert y.shape[1] == 10 - - -def test_mlp_policy(): - model = MultiLayerPerceptron( - in_size=5, layer_sizes=[10, 10, 10], out_size=10, reshape=False, is_policy=True - ) - x = torch.rand(1, 5) - scores = model.action_scores(x) - assert scores.shape[1] == 10 - - -def test_cnn(): - model = ConvolutionalNetwork(in_channels=10, in_height=20, in_width=30, out_size=15) - x = torch.rand(1, 10, 20, 30) - y = model.forward(x) - assert y.shape[1] == 15 - - -def test_dueling_network(): - model = DuelingNetwork(in_size=10, out_size=15) - x = torch.rand(1, 10) - y = model.forward(x) - - -def test_cnn_policy(): - model = ConvolutionalNetwork( - in_channels=10, in_height=20, in_width=30, out_size=15, is_policy=True - ) - x = torch.rand(1, 10, 20, 30) - scores = model.action_scores(x) - assert scores.shape[1] == 15 diff --git a/rlberry/agents/torch/tests/test_torch_training.py b/rlberry/agents/torch/tests/test_torch_training.py deleted file mode 100644 index fe5fb722c..000000000 --- a/rlberry/agents/torch/tests/test_torch_training.py +++ /dev/null @@ -1,32 +0,0 @@ -import torch -from rlberry.agents.torch.utils.training import loss_function_factory, optimizer_factory -from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env -from rlberry.agents.torch.utils.models import default_policy_net_fn - -# loss_function_factory -assert isinstance(loss_function_factory("l2"), torch.nn.MSELoss) -assert isinstance(loss_function_factory("l1"), torch.nn.L1Loss) -assert isinstance(loss_function_factory("smooth_l1"), torch.nn.SmoothL1Loss) -assert isinstance(loss_function_factory("bce"), torch.nn.BCELoss) - -# optimizer_factory -env = get_benchmark_env(level=1) -assert ( - optimizer_factory(default_policy_net_fn(env).parameters(), "ADAM").defaults["lr"] - == 0.001 -) -assert optimizer_factory(default_policy_net_fn(env).parameters(), "ADAM").defaults[ - "betas" -] == (0.9, 0.999) -assert ( - optimizer_factory(default_policy_net_fn(env).parameters(), "RMS_PROP").defaults[ - "lr" - ] - == 0.01 -) -assert ( - optimizer_factory(default_policy_net_fn(env).parameters(), "RMS_PROP").defaults[ - "alpha" - ] 
- == 0.99 -) diff --git a/rlberry/agents/torch/utils/__init__.py b/rlberry/agents/torch/utils/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/agents/torch/utils/models.py b/rlberry/agents/torch/utils/models.py deleted file mode 100644 index 709493995..000000000 --- a/rlberry/agents/torch/utils/models.py +++ /dev/null @@ -1,534 +0,0 @@ -# -# Simple MLP and CNN models -# -from functools import partial - - -from gymnasium import spaces -from gymnasium.vector.sync_vector_env import SyncVectorEnv -from gymnasium.vector.async_vector_env import AsyncVectorEnv -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.distributions import Categorical, Normal - -from rlberry.agents.torch.utils.training import model_factory, activation_factory - - -def default_twinq_net_fn(env): - """ - Returns a default Twinq network - """ - assert isinstance(env.action_space, spaces.Discrete) - if isinstance(env.observation_space, spaces.Box): - obs_shape = env.observation_space.shape - elif isinstance(env.observation_space, spaces.Tuple): - obs_shape = env.observation_space.spaces[0].shape - else: - raise ValueError( - "Incompatible observation space: {}".format(env.observation_space) - ) - # Assume CHW observation space - - if len(obs_shape) == 1: - model_config = { - "type": "MultiLayerPerceptron", - "in_size": int(obs_shape[0]) + int(env.action_space.n), - "layer_sizes": [64, 64], - } - else: - raise ValueError( - "Incompatible observation shape: {}".format(env.observation_space.shape) - ) - - model_config["out_size"] = 1 - - q1 = model_factory(**model_config) - q2 = model_factory(**model_config) - - return (q1, q2) - - -def default_policy_net_fn(env): - """ - Returns a default policy network. 
- """ - - # remove potential wrappers - while type(env) in [SyncVectorEnv, AsyncVectorEnv]: - env = env.envs[0] - - if isinstance(env.observation_space, spaces.Box): - obs_shape = env.observation_space.shape - elif isinstance(env.observation_space, spaces.Tuple): - obs_shape = env.observation_space.spaces[0].shape - else: - raise ValueError( - "Incompatible observation space: {}".format(env.observation_space) - ) - - if len(obs_shape) == 3: - if obs_shape[0] < obs_shape[1] and obs_shape[0] < obs_shape[2]: - # Assume CHW observation space - model_config = { - "type": "ConvolutionalNetwork", - "is_policy": True, - "in_channels": int(obs_shape[0]), - "in_height": int(obs_shape[1]), - "in_width": int(obs_shape[2]), - } - elif obs_shape[2] < obs_shape[0] and obs_shape[2] < obs_shape[1]: - # Assume WHC observation space - model_config = { - "type": "ConvolutionalNetwork", - "is_policy": True, - "transpose_obs": True, - "in_channels": int(obs_shape[2]), - "in_height": int(obs_shape[1]), - "in_width": int(obs_shape[0]), - } - elif len(obs_shape) == 2: - model_config = { - "type": "ConvolutionalNetwork", - "is_policy": True, - "in_channels": int(1), - "in_height": int(obs_shape[0]), - "in_width": int(obs_shape[1]), - } - elif len(obs_shape) == 1: - model_config = { - "type": "MultiLayerPerceptron", - "in_size": int(obs_shape[0]), - "layer_sizes": [64, 64], - "reshape": False, - "is_policy": True, - } - else: - raise ValueError( - "Incompatible observation shape: {}".format(env.observation_space.shape) - ) - - if isinstance(env.action_space, spaces.Discrete): - model_config["out_size"] = env.action_space.n - model_config["ctns_actions"] = False - elif isinstance(env.action_space, spaces.Tuple): - model_config["out_size"] = env.action_space.spaces[0].n - model_config["ctns_actions"] = False - elif isinstance(env.action_space, spaces.Box): - model_config["out_size"] = env.action_space.shape[0] - model_config["ctns_actions"] = True - - return model_factory(**model_config) - - -def default_value_net_fn(env): - """ - Returns a default value network. 
- """ - - # remove potential wrappers - while type(env) in [SyncVectorEnv, AsyncVectorEnv]: - env = env.envs[0] - - if isinstance(env.observation_space, spaces.Box): - obs_shape = env.observation_space.shape - elif isinstance(env.observation_space, spaces.Tuple): - obs_shape = env.observation_space.spaces[0].shape - else: - raise ValueError( - "Incompatible observation space: {}".format(env.observation_space) - ) - # Assume CHW observation space - if len(obs_shape) == 3: - model_config = { - "type": "ConvolutionalNetwork", - "in_channels": int(obs_shape[0]), - "in_height": int(obs_shape[1]), - "in_width": int(obs_shape[2]), - } - elif len(obs_shape) == 2: - model_config = { - "type": "ConvolutionalNetwork", - "in_channels": int(1), - "in_height": int(obs_shape[0]), - "in_width": int(obs_shape[1]), - } - elif len(obs_shape) == 1: - model_config = { - "type": "MultiLayerPerceptron", - "in_size": int(obs_shape[0]), - "layer_sizes": [64, 64], - } - else: - raise ValueError( - "Incompatible observation shape: {}".format(env.observation_space.shape) - ) - - model_config["out_size"] = 1 - - return model_factory(**model_config) - - -class Net(nn.Module): - def __init__(self, obs_size, hidden_size, n_actions): - super(Net, self).__init__() - self.net = nn.Sequential( - nn.Linear(obs_size, hidden_size), - nn.ReLU(), - nn.Linear(hidden_size, n_actions), - ) - - def forward(self, x): - return self.net(x) - - -class BaseModule(torch.nn.Module): - """ - Base torch.nn.Module implementing basic features: - - initialization factory - - normalization parameters - """ - - def __init__(self, activation_type="RELU", reset_type="xavier"): - super().__init__() - self.activation = activation_factory(activation_type) - self.reset_type = reset_type - - def _init_weights(self, m, param=None, put_bias_to_zero=False): - if hasattr(m, "weight"): - if self.reset_type == "xavier": - torch.nn.init.xavier_uniform_(m.weight.data) - elif self.reset_type == "zeros": - torch.nn.init.constant_(m.weight.data, 0.0) - elif self.reset_type == "orthogonal": - torch.nn.init.orthogonal_(m.weight.data, gain=param) - else: - raise ValueError("Unknown reset type") - if put_bias_to_zero: - if hasattr(m, "bias") and m.bias is not None: - torch.nn.init.constant_(m.bias.data, 0.0) - - def reset(self): - self.apply(self._init_weights) - - -class Table(torch.nn.Module): - """Torch module for a policy for discrete state-action spaces. - - Parameters - ---------- - state_size: int - Number of states - action_size: int - Number of actions - """ - - def __init__(self, state_size, action_size): - super().__init__() - self.policy = nn.Embedding.from_pretrained( - torch.zeros(state_size, action_size), freeze=False - ) - self.softmax = nn.Softmax(dim=-1) - - def forward(self, x): - action_probs = self.softmax(self.action_scores(x)) - return Categorical(action_probs) - - def action_scores(self, x): - return self.policy(x.long()) - - -class MultiLayerPerceptron(BaseModule): - """Torch module for an MLP. - - Parameters - ---------- - in_size: int - Input size - layer_sizes: Sequence[int] - Dimensions of each hidden layer. - reshape: bool, default = True - If True, input tensors are reshaped to (batch_size, dim) - out_size: int, optional - Output size. If None, the output size is given by the last - element of layer_sizes. - activation: {"RELU", "TANH", "ELU"} - Activation function. - is_policy: bool, default=False - If true, the :meth:`forward` method returns a distribution over the - output. 
- ctns_actions: bool, default=False - If true, the :meth:`forward` method returns a normal distribution - corresponding to the output. Otherwise, a categorical distribution - is returned. - std0: float, default=1.0 - Initial standard deviation for the normal distribution. Only used - if ctns_actions and is_policy are True. - reset_type: {"xavier", "orthogonal", "zeros"}, default="orthogonal" - Type of weight initialization. - pred_init_scale: float, default="auto" - Scale of the initial weights of the output layer. If "auto", the - scale is set to 0.01 for policy networks and 1.0 otherwise. - """ - - def __init__( - self, - in_size=None, - layer_sizes=None, - reshape=False, - out_size=None, - activation="RELU", - is_policy=False, - ctns_actions=False, - std0=1.0, - reset_type="orthogonal", - pred_init_scale="auto", - squashed_policy=False, - **kwargs - ): - super().__init__(reset_type=reset_type, **kwargs) - - self.reshape = reshape - self.layer_sizes = layer_sizes or [64, 64] - self.layer_sizes = list(self.layer_sizes) - self.out_size = out_size - self.activation = activation_factory(activation) - self.is_policy = is_policy - self.ctns_actions = ctns_actions - self.std0 = std0 - self.squashed_policy = squashed_policy - - # Set pred_init_scale - if pred_init_scale == "auto": - self.pred_init_scale = 0.01 if is_policy else 1.0 - else: - self.pred_init_scale = pred_init_scale - - # Instantiate parameters - sizes = [in_size] + self.layer_sizes - self.layers = nn.ModuleList( - [nn.Linear(sizes[i], sizes[i + 1]) for i in range(len(sizes) - 1)] - ) - if out_size: - if squashed_policy: - self.fc_mean = nn.Linear(256, out_size) - self.fc_logstd = nn.Linear(256, out_size) - if ctns_actions: - self.logstd = nn.Parameter(np.log(std0) * torch.ones(out_size)) - self.predict = nn.Linear(sizes[-1], out_size) - - # Initialize parameters - self.reset() - - def reset(self): - self.apply(partial(self._init_weights, param=np.log(2))) - if self.out_size: - if self.ctns_actions: - self.logstd.data.fill_(np.log(self.std0)) - self.apply( - partial(self._init_weights, param=np.log(2), put_bias_to_zero=True) - ) - self._init_weights(self.predict, param=self.pred_init_scale) - - def forward(self, x): - if self.reshape: - x = x.reshape(x.shape[0], -1) # We expect a batch of vectors - for layer in self.layers: - x = self.activation(layer(x.float())) - if self.squashed_policy: - mean = self.fc_mean(x) - log_std = self.fc_logstd(x) - log_std = torch.tanh(log_std) - LOG_STD_MAX = 2 - LOG_STD_MIN = -5 - log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) - return (mean, log_std) - if self.out_size: - x = self.predict(x) - if self.is_policy: - if self.ctns_actions: - std = torch.exp(self.logstd.expand_as(x)) - dist = Normal(x, std) - else: - action_probs = F.softmax(x, dim=-1) - dist = Categorical(action_probs) - return dist - return x - - def action_scores(self, x): - if self.is_policy: - if self.reshape: - x = x.reshape(x.shape[0], -1) # We expect a batch of vectors - for layer in self.layers: - x = self.activation(layer(x.float())) - if self.out_size: - action_scores = self.predict(x) - return action_scores - - -class DuelingNetwork(BaseModule): - """Torch module for a DQN dueling network based on a MultiLayerPerceptron. - - Parameters - ----------- - in_size: int - Input size - base_module_kwargs: dict - Parameters for :func:`~rlberry.agents.torch.utils.training.model_factory` - to build shared (MLP) architecture for the advantage and value nets. 
- value_kwargs: dict - Parameters for :func:`~rlberry.agents.torch.utils.training.model_factory` - to build value network (MLP). - advantage_kwargs: dict - Parameters for :func:`~rlberry.agents.torch.utils.training.model_factory` - to build advantage network (MLP). - out_size: int - Output size. - """ - - def __init__( - self, - in_size=None, - base_module_kwargs=None, - value_kwargs=None, - advantage_kwargs=None, - out_size=None, - ): - super().__init__() - self.out_size = out_size - base_module_kwargs = base_module_kwargs or {} - base_module_kwargs["in_size"] = in_size - self.base_module = model_factory(**base_module_kwargs) - value_kwargs = value_kwargs or {} - value_kwargs["in_size"] = self.base_module.layer_sizes[-1] - value_kwargs["out_size"] = 1 - self.value = model_factory(**value_kwargs) - advantage_kwargs = advantage_kwargs or {} - advantage_kwargs["in_size"] = self.base_module.layer_sizes[-1] - advantage_kwargs["out_size"] = out_size - self.advantage = model_factory(**advantage_kwargs) - - def forward(self, x): - x = self.base_module(x) - value = self.value(x).expand(-1, self.out_size) - advantage = self.advantage(x) - return ( - value + advantage - advantage.mean(1).unsqueeze(1).expand(-1, self.out_size) - ) - - -class ConvolutionalNetwork(nn.Module): - """Torch module for a CNN. - - Expects inputs of shape BCHW, where - B = batch size; - C = number of channels; - H = height; - W = width. - - For the CNN forward, if the tensor has more than 4 dimensions (not BCHW), it keeps the 3 last dimension as CHW and merge all first ones into 1 (Batch). Go through the CNN + MLP, then split the first dimension as before. - - Parameters - ---------- - activation: {"RELU", "TANH", "ELU"} - Activation function. - in_channels: int - Number of input channels C - in_height: int - Input height H - in_width: int - Input width W - head_mlp_kwargs: dict, optional - Parameters to build an MLP - (:class:`~rlberry.agents.torch.utils.models.MultiLayerPerceptron`) - using the factory - :func:`~rlberry.agents.torch.utils.training.model_factory` - - """ - - def __init__( - self, - activation="RELU", - in_channels=None, - in_height=None, - in_width=None, - head_mlp_kwargs=None, - out_size=None, - is_policy=False, - transpose_obs=False, - **kwargs - ): - super().__init__() - self.activation = activation_factory(activation) - self.conv1 = nn.Conv2d(in_channels, 16, kernel_size=2, stride=2) - self.conv2 = nn.Conv2d(16, 32, kernel_size=2, stride=2) - self.conv3 = nn.Conv2d(32, 64, kernel_size=2, stride=2) - - # MLP Head - self.head_mlp_kwargs = head_mlp_kwargs or {} - self.head_mlp_kwargs["in_size"] = self._get_conv_out_size( - [in_channels, in_height, in_width] - ) # Number of Linear input connections depends on output of conv layers - self.head_mlp_kwargs["out_size"] = out_size - self.head_mlp_kwargs["is_policy"] = is_policy - self.head = model_factory(**self.head_mlp_kwargs) - - self.is_policy = is_policy - self.transpose_obs = transpose_obs - - def _get_conv_out_size(self, shape): - """ - Computes the output dimensions of the convolution network. 
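The core of the deleted `DuelingNetwork.forward` is the standard dueling recombination Q(s, a) = V(s) + A(s, a) - mean_a A(s, a). A tiny sketch with placeholder tensors standing in for the value and advantage heads:

```python
import torch

batch, n_actions = 4, 3
value = torch.randn(batch, 1)               # V(s), from the value head
advantage = torch.randn(batch, n_actions)   # A(s, a), from the advantage head

q_values = value + advantage - advantage.mean(dim=1, keepdim=True)
assert q_values.shape == (batch, n_actions)
# Subtracting the mean advantage makes the V/A decomposition identifiable.
```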
- Shape : dimension of the input of the CNN - """ - conv_result = self.activation((self.conv1(torch.zeros(1, *shape)))) - conv_result = self.activation((self.conv2(conv_result))) - conv_result = self.activation((self.conv3(conv_result))) - return int(np.prod(conv_result.size())) - - def convolutions(self, x): - x = x.float() - # if there is no batch (CHW), add one dimension to specify batch of 1 (and get format BCHW) - if len(x.shape) == 3: - x = x.unsqueeze(0) - if self.transpose_obs: - x = torch.transpose(x, -1, -3) - x = self.activation((self.conv1(x))) - x = self.activation((self.conv2(x))) - x = self.activation((self.conv3(x))) - x = x.view(x.size(0), -1) # flatten - return x - - def forward(self, x): - """ - Forward convolutional network - - Parameters - ---------- - x: torch.tensor - Tensor of shape BCHW (Batch,Chanel,Height,Width : if more than 4 dimensions, merge all the first in batch dimension) - """ - flag_view_to_change = False - - if len(x.shape) > 4: - flag_view_to_change = True - dim_to_retore = x.shape[:-3] - inputview_size = tuple((-1,)) + tuple(x.shape[-3:]) - outputview_size = tuple(dim_to_retore) + tuple( - (self.head_mlp_kwargs["out_size"],) - ) - x = x.view(inputview_size) - - conv_result = self.convolutions(x) - output_result = self.head( - conv_result.view(conv_result.size()[0], -1) - ) # give the 'conv_result' flattenned in 2 dimensions (batch and other) to the MLP (head) - - if flag_view_to_change: - output_result = output_result.view(outputview_size) - - return output_result - - def action_scores(self, x): - return self.head.action_scores(self.convolutions(x)) diff --git a/rlberry/agents/torch/utils/training.py b/rlberry/agents/torch/utils/training.py deleted file mode 100644 index ed338b3bb..000000000 --- a/rlberry/agents/torch/utils/training.py +++ /dev/null @@ -1,148 +0,0 @@ -import numpy as np -import torch -from gymnasium import spaces -from torch import nn as nn -from torch.nn import functional as F - - -def loss_function_factory(loss_function, **kwargs): - if loss_function == "l2": - return torch.nn.MSELoss(**kwargs) - elif loss_function == "l1": - return torch.nn.L1Loss(**kwargs) - elif loss_function == "smooth_l1": - return torch.nn.SmoothL1Loss(**kwargs) - elif loss_function == "bce": - return torch.nn.BCELoss(**kwargs) - else: - raise ValueError("Unknown loss function : {}".format(loss_function)) - - -def optimizer_factory(params, optimizer_type="ADAM", **kwargs): - if optimizer_type == "ADAM": - return torch.optim.Adam(params=params, **kwargs) - elif optimizer_type == "RMS_PROP": - return torch.optim.RMSprop(params=params, **kwargs) - else: - raise ValueError("Unknown optimizer type: {}".format(optimizer_type)) - - -def model_factory_from_env(env, **kwargs): - """Returns a torch module after setting up input/output dimensions according to an env. - - Parameters - ---------- - env: gym.Env - Environment - **kwargs: Dict - Parameters to be updated, used to call :func:`~rlberry.agents.torch.utils.training.model_factory`. - """ - kwargs = size_model_config(env, **kwargs) - return model_factory(**kwargs) - - -def model_factory(type="MultiLayerPerceptron", **kwargs) -> nn.Module: - """Build a neural net of a given type. - - Parameters - ---------- - type: {"MultiLayerPerceptron", - "ConvolutionalNetwork", - "DuelingNetwork", - "Table"}, default = "MultiLayerPerceptron" - Type of neural network. 
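The `forward` of the deleted `ConvolutionalNetwork` handles inputs with more than four dimensions by folding every leading dimension into the batch, running the conv stack plus MLP head on BCHW, and then restoring the leading dimensions on the output. A hedged sketch of that reshape trick, with a small `nn.Sequential` standing in for the actual conv + head (sizes illustrative):

```python
import torch
import torch.nn as nn

C, H, W, out_size = 3, 16, 16, 5
# stand-in for the conv stack + MLP head of ConvolutionalNetwork
net = nn.Sequential(
    nn.Conv2d(C, 8, kernel_size=3, stride=2), nn.ReLU(),
    nn.Flatten(),
    nn.LazyLinear(out_size),
)

x = torch.randn(2, 7, C, H, W)          # e.g. (n_envs, time, C, H, W)
lead = x.shape[:-3]                     # leading dimensions to restore later
out = net(x.reshape(-1, C, H, W))       # merge everything but CHW into the batch
out = out.view(*lead, out_size)         # back to (2, 7, out_size)
assert out.shape == (2, 7, out_size)
```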
- **kwargs: dict - Parameters that vary according to each neural net type, see - - * :class:`~rlberry.agents.torch.utils.models.MultiLayerPerceptron` - - * :class:`~rlberry.agents.torch.utils.models.ConvolutionalNetwork` - - * :class:`~rlberry.agents.torch.utils.models.DuelingNetwork` - - * :class:`~rlberry.agents.torch.utils.models.Table` - """ - from rlberry.agents.torch.utils.models import ( - MultiLayerPerceptron, - DuelingNetwork, - ConvolutionalNetwork, - Table, - ) - - if type == "MultiLayerPerceptron": - return MultiLayerPerceptron(**kwargs) - elif type == "DuelingNetwork": - return DuelingNetwork(**kwargs) - elif type == "ConvolutionalNetwork": - return ConvolutionalNetwork(**kwargs) - elif type == "Table": - return Table(**kwargs) - else: - raise ValueError("Unknown model type") - - -def size_model_config(env, **model_config): - """ - Setup input/output dimensions for the configuration of - a model depending on the environment observation/action spaces. - - Parameters - ---------- - env : gym.Env - An environment. - model_config : dict - Parameters to be updated, used to call :func:`~rlberry.agents.torch.utils.training.model_factory`. - If "out_size" is not given in model_config, assumes - that the output dimension of the neural net is equal to the number - of actions in the environment. - """ - - if isinstance(env.observation_space, spaces.Box): - obs_shape = env.observation_space.shape - elif isinstance(env.observation_space, spaces.Tuple): - obs_shape = env.observation_space.spaces[0].shape - elif isinstance(env.observation_space, spaces.Discrete): - return model_config - - # Assume CHW observation space - if "type" in model_config and model_config["type"] == "ConvolutionalNetwork": - if "transpose_obs" in model_config and not model_config["transpose_obs"]: - # Assume CHW observation space - if "in_channels" not in model_config: - model_config["in_channels"] = int(obs_shape[0]) - if "in_height" not in model_config: - model_config["in_height"] = int(obs_shape[1]) - if "in_width" not in model_config: - model_config["in_width"] = int(obs_shape[2]) - else: - # Assume WHC observation space to transpose - if "in_channels" not in model_config: - model_config["in_channels"] = int(obs_shape[2]) - if "in_height" not in model_config: - model_config["in_height"] = int(obs_shape[1]) - if "in_width" not in model_config: - model_config["in_width"] = int(obs_shape[0]) - else: - model_config["in_size"] = int(np.prod(obs_shape)) - - if "out_size" not in model_config: - if isinstance(env.action_space, spaces.Discrete): - model_config["out_size"] = env.action_space.n - elif isinstance(env.action_space, spaces.Tuple): - model_config["out_size"] = env.action_space.spaces[0].n - return model_config - - -def activation_factory(activation_type): - if activation_type == "RELU": - return F.relu - elif activation_type == "TANH": - return torch.tanh - elif activation_type == "ELU": - return nn.ELU() - else: - raise ValueError("Unknown activation_type: {}".format(activation_type)) - - -def trainable_parameters(model): - return sum(p.numel() for p in model.parameters() if p.requires_grad) diff --git a/rlberry/agents/ucbvi/__init__.py b/rlberry/agents/ucbvi/__init__.py deleted file mode 100644 index 031f77a6f..000000000 --- a/rlberry/agents/ucbvi/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .ucbvi import UCBVIAgent diff --git a/rlberry/agents/ucbvi/ucbvi.py b/rlberry/agents/ucbvi/ucbvi.py deleted file mode 100644 index d5dfc4e67..000000000 --- a/rlberry/agents/ucbvi/ucbvi.py +++ /dev/null @@ -1,332 +0,0 @@ 
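For the common Box-observation / Discrete-action case, the deleted `size_model_config` + `model_factory` pair amounts to: flatten the observation shape into `in_size`, use the number of actions as `out_size`, and build an MLP. A hedged equivalent using gymnasium and a plain `nn.Sequential` instead of the rlberry factory (the environment id and layer sizes are illustrative):

```python
import numpy as np
import gymnasium as gym
import torch.nn as nn

env = gym.make("CartPole-v1")
in_size = int(np.prod(env.observation_space.shape))   # flattened Box observation
out_size = int(env.action_space.n)                     # one output per discrete action

model = nn.Sequential(
    nn.Linear(in_size, 64), nn.ReLU(),
    nn.Linear(64, 64), nn.ReLU(),
    nn.Linear(64, out_size),
)
# model_factory_from_env(env, type="MultiLayerPerceptron") produced an
# equivalent network, with hidden sizes taken from ``layer_sizes``.
```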
-import numpy as np - -import gymnasium.spaces as spaces -from rlberry.agents import AgentWithSimplePolicy -from rlberry.agents.ucbvi.utils import ( - update_value_and_get_action, - update_value_and_get_action_sd, -) -from rlberry.exploration_tools.discrete_counter import DiscreteCounter -from rlberry.agents.dynprog.utils import ( - backward_induction_sd, - backward_induction_reward_sd, -) -from rlberry.agents.dynprog.utils import backward_induction_in_place - -import rlberry - -logger = rlberry.logger - - -class UCBVIAgent(AgentWithSimplePolicy): - """ - UCBVI [1]_ with custom exploration bonus. - - Notes - ----- - The recommended policy after all the episodes is computed without - exploration bonuses. - - Parameters - ---------- - env : gym.Env - Environment with discrete states and actions. - gamma : double, default: 1.0 - Discount factor in [0, 1]. If gamma is 1.0, the problem is set to - be finite-horizon. - horizon : int - Horizon of the objective function. If None and gamma<1, set to - 1/(1-gamma). - bonus_scale_factor : double, default: 1.0 - Constant by which to multiply the exploration bonus, controls - the level of exploration. - bonus_type : {"simplified_bernstein"} - Type of exploration bonus. Currently, only "simplified_bernstein" - is implemented. If `reward_free` is true, this parameter is ignored - and the algorithm uses 1/n bonuses. - reward_free : bool, default: False - If true, ignores rewards and uses only 1/n bonuses. - stage_dependent : bool, default: False - If true, assume that transitions and rewards can change with the stage h. - real_time_dp : bool, default: False - If true, uses real-time dynamic programming [2]_ instead of full backward induction - for the sampling policy. - - References - ---------- - .. [1] Azar et al., 2017 - Minimax Regret Bounds for Reinforcement Learning - https://arxiv.org/abs/1703.05449 - - .. [2] Efroni, Yonathan, et al. - Tight regret bounds for model-based reinforcement learning with greedy policies. - Advances in Neural Information Processing Systems. 2019. - https://papers.nips.cc/paper/2019/file/25caef3a545a1fff2ff4055484f0e758-Paper.pdf - """ - - name = "UCBVI" - - def __init__( - self, - env, - gamma=1.0, - horizon=100, - bonus_scale_factor=1.0, - bonus_type="simplified_bernstein", - reward_free=False, - stage_dependent=False, - real_time_dp=False, - **kwargs - ): - # init base class - AgentWithSimplePolicy.__init__(self, env, **kwargs) - - self.gamma = gamma - self.horizon = horizon - self.bonus_scale_factor = bonus_scale_factor - self.bonus_type = bonus_type - self.reward_free = reward_free - self.stage_dependent = stage_dependent - self.real_time_dp = real_time_dp - - # check environment - assert isinstance(self.env.observation_space, spaces.Discrete) - assert isinstance(self.env.action_space, spaces.Discrete) - - # other checks - assert gamma >= 0 and gamma <= 1.0 - if self.horizon is None: - assert gamma < 1.0, "If no horizon is given, gamma must be smaller than 1." - self.horizon = int(np.ceil(1.0 / (1.0 - gamma))) - - # maximum value - r_range = self.env.reward_range[1] - self.env.reward_range[0] - if r_range == np.inf or r_range == 0.0: - logger.warning( - "{}: Reward range is zero or infinity. ".format(self.name) - + "Setting it to 1." 
- ) - r_range = 1.0 - - self.v_max = np.zeros(self.horizon) - self.v_max[-1] = r_range - for hh in reversed(range(self.horizon - 1)): - self.v_max[hh] = r_range + self.gamma * self.v_max[hh + 1] - - # initialize - self.reset() - - def reset(self, **kwargs): - H = self.horizon - S = self.env.observation_space.n - A = self.env.action_space.n - - if self.stage_dependent: - shape_hsa = (H, S, A) - shape_hsas = (H, S, A, S) - else: - shape_hsa = (S, A) - shape_hsas = (S, A, S) - - # visit counter - self.N_sa = np.zeros(shape_hsa) - # bonus - self.B_sa = np.zeros((H, S, A)) - - # MDP estimator - self.R_hat = np.zeros(shape_hsa) - self.P_hat = np.ones(shape_hsas) * 1.0 / S - - # Value functions - self.V = np.ones((H, S)) - self.Q = np.zeros((H, S, A)) - # for rec. policy - self.V_policy = np.zeros((H, S)) - self.Q_policy = np.zeros((H, S, A)) - - # Init V and bonus - for hh in range(self.horizon): - self.B_sa[hh, :, :] = self.v_max[hh] - self.V[hh, :] = self.v_max[hh] - - # ep counter - self.episode = 0 - - # useful object to compute total number of visited states & entropy of visited states - self.counter = DiscreteCounter( - self.env.observation_space, self.env.action_space - ) - - # update name - if self.real_time_dp: - self.name = "UCBVI-RTDP" - - def policy(self, observation): - state = observation - assert self.Q_policy is not None - return self.Q_policy[0, state, :].argmax() - - def _get_action(self, state, hh=0): - """Sampling policy.""" - if not self.real_time_dp: - assert self.Q is not None - return self.Q[hh, state, :].argmax() - else: - if self.stage_dependent: - update_fn = update_value_and_get_action_sd - else: - update_fn = update_value_and_get_action - return update_fn( - state, - hh, - self.V, - self.R_hat, - self.P_hat, - self.B_sa, - self.gamma, - self.v_max, - ) - - def _compute_bonus(self, n, hh): - # reward-free - if self.reward_free: - bonus = 1.0 / n - return bonus - - # not reward-free - if self.bonus_type == "simplified_bernstein": - bonus = self.bonus_scale_factor * np.sqrt(1.0 / n) + self.v_max[hh] / n - bonus = min(bonus, self.v_max[hh]) - return bonus - else: - raise ValueError( - "Error: bonus type {} not implemented".format(self.bonus_type) - ) - - def _update(self, state, action, next_state, reward, hh): - if self.stage_dependent: - self.N_sa[hh, state, action] += 1 - - nn = self.N_sa[hh, state, action] - prev_r = self.R_hat[hh, state, action] - prev_p = self.P_hat[hh, state, action, :] - - self.R_hat[hh, state, action] = ( - 1.0 - 1.0 / nn - ) * prev_r + reward * 1.0 / nn - - self.P_hat[hh, state, action, :] = (1.0 - 1.0 / nn) * prev_p - self.P_hat[hh, state, action, next_state] += 1.0 / nn - - self.B_sa[hh, state, action] = self._compute_bonus(nn, hh) - - else: - self.N_sa[state, action] += 1 - - nn = self.N_sa[state, action] - prev_r = self.R_hat[state, action] - prev_p = self.P_hat[state, action, :] - - self.R_hat[state, action] = (1.0 - 1.0 / nn) * prev_r + reward * 1.0 / nn - - self.P_hat[state, action, :] = (1.0 - 1.0 / nn) * prev_p - self.P_hat[state, action, next_state] += 1.0 / nn - - self.B_sa[hh, state, action] = self._compute_bonus(nn, hh) - - def _run_episode(self): - # interact for H steps - episode_rewards = 0 - observation, info = self.env.reset() - for hh in range(self.horizon): - action = self._get_action(observation, hh) - next_observation, reward, terminated, truncated, info = self.env.step( - action - ) - done = terminated or truncated - episode_rewards += reward # used for logging only - - self.counter.update(observation, action) - - if 
self.reward_free: - reward = 0.0 # set to zero before update if reward_free - - self._update(observation, action, next_observation, reward, hh) - - observation = next_observation - if done: - break - - # run backward induction - if not self.real_time_dp: - if self.stage_dependent: - backward_induction_sd( - self.Q, - self.V, - self.R_hat + self.B_sa, - self.P_hat, - self.gamma, - self.v_max[0], - ) - else: - backward_induction_reward_sd( - self.Q, - self.V, - self.R_hat + self.B_sa, - self.P_hat, - self.gamma, - self.v_max[0], - ) - - # update info - self.episode += 1 - - # writer - if self.writer is not None: - self.writer.add_scalar("episode_rewards", episode_rewards, self.episode) - self.writer.add_scalar( - "n_visited_states", self.counter.get_n_visited_states(), self.episode - ) - - # return sum of rewards collected in the episode - return episode_rewards - - def fit(self, budget: int, **kwargs): - """ - - Train the agent using the provided environment. - - Parameters - ---------- - budget: int - number of episodes - **kwargs - Extra arguments. Not used for this agent. - """ - del kwargs - n_episodes_to_run = budget - count = 0 - while count < n_episodes_to_run: - self._run_episode() - count += 1 - - # compute Q function for the recommended policy - if self.stage_dependent: - backward_induction_sd( - self.Q_policy, - self.V_policy, - self.R_hat, - self.P_hat, - self.gamma, - self.v_max[0], - ) - else: - backward_induction_in_place( - self.Q_policy, - self.V_policy, - self.R_hat, - self.P_hat, - self.horizon, - self.gamma, - self.v_max[0], - ) diff --git a/rlberry/agents/ucbvi/utils.py b/rlberry/agents/ucbvi/utils.py deleted file mode 100644 index 9e4823475..000000000 --- a/rlberry/agents/ucbvi/utils.py +++ /dev/null @@ -1,83 +0,0 @@ -from rlberry.utils.jit_setup import numba_jit - - -@numba_jit -def update_value_and_get_action(state, hh, V, R_hat, P_hat, B_sa, gamma, v_max): - """ - state : int - hh : int - V : np.ndarray - shape (H, S) - R_hat : np.ndarray - shape (S, A) - P_hat : np.ndarray - shape (S, A, S) - B_sa : np.ndarray - shape (H, S, A) - gamma : double - v_max : np.ndarray - shape (H,) - """ - H = V.shape[0] - S, A = R_hat.shape[-2:] - best_action = 0 - max_val = 0 - previous_value = V[hh, state] - - for aa in range(A): - q_aa = R_hat[state, aa] + B_sa[hh, state, aa] - - if hh < H - 1: - for sn in range(S): - q_aa += gamma * P_hat[state, aa, sn] * V[hh + 1, sn] - - if aa == 0 or q_aa > max_val: - max_val = q_aa - best_action = aa - - V[hh, state] = max_val - V[hh, state] = min(v_max[hh], V[hh, state]) - V[hh, state] = min(previous_value, V[hh, state]) - - return best_action - - -@numba_jit -def update_value_and_get_action_sd(state, hh, V, R_hat, P_hat, B_sa, gamma, v_max): - """ - state : int - hh : int - V : np.ndarray - shape (H, S) - R_hat : np.ndarray - shape (H, S, A) - P_hat : np.ndarray - shape (H, S, A, S) - B_sa : np.ndarray - shape (S, A) - gamma : double - v_max : np.ndarray - shape (H,) - """ - H = V.shape[0] - S, A = R_hat.shape[-2:] - best_action = 0 - max_val = 0 - previous_value = V[hh, state] - - for aa in range(A): - q_aa = R_hat[hh, state, aa] + B_sa[hh, state, aa] - - if hh < H - 1: - for sn in range(S): - q_aa += gamma * P_hat[hh, state, aa, sn] * V[hh + 1, sn] - - if aa == 0 or q_aa > max_val: - max_val = q_aa - best_action = aa - - V[hh, state] = max_val - V[hh, state] = min(v_max[hh], V[hh, state]) - V[hh, state] = min(previous_value, V[hh, state]) - - return best_action diff --git a/rlberry/agents/utils/memories.py 
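The heart of the deleted UCBVI agent is the incremental empirical model update (`_update`) combined with the simplified Bernstein bonus (`_compute_bonus`). A hedged numpy sketch of those two steps for a single (s, a) visit; the sizes and the `v_max` values are illustrative placeholders:

```python
import numpy as np

S, A, H = 5, 2, 10
N_sa = np.zeros((S, A))
R_hat = np.zeros((S, A))
P_hat = np.full((S, A, S), 1.0 / S)
v_max = np.linspace(1.0, 10.0, H)[::-1]   # illustrative per-stage value upper bounds
bonus_scale = 1.0

def update(s, a, s_next, reward, hh):
    N_sa[s, a] += 1
    n = N_sa[s, a]
    R_hat[s, a] += (reward - R_hat[s, a]) / n      # running mean of rewards
    P_hat[s, a, :] *= 1.0 - 1.0 / n                # running mean of transitions
    P_hat[s, a, s_next] += 1.0 / n
    bonus = bonus_scale * np.sqrt(1.0 / n) + v_max[hh] / n
    return min(bonus, v_max[hh])                   # clipped exploration bonus

print(update(s=0, a=1, s_next=3, reward=0.5, hh=2))
```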
b/rlberry/agents/utils/memories.py deleted file mode 100644 index 1677efd19..000000000 --- a/rlberry/agents/utils/memories.py +++ /dev/null @@ -1,59 +0,0 @@ -import numpy as np -from collections import namedtuple - -Transition = namedtuple( - "Transition", ("state", "action", "reward", "next_state", "terminal", "info") -) - - -class ReplayMemory(object): - """ - Container that stores and samples transitions. - """ - - def __init__(self, capacity=10000, **kwargs): - self.capacity = int(capacity) - self.memory = [] - self.position = 0 - - def push(self, item): - """Saves a thing.""" - if len(self.memory) < self.capacity: - self.memory.append(item) - else: - self.memory[self.position] = item - # Faster than append and pop - self.position = (self.position + 1) % self.capacity - - def _encode_sample(self, idxes): - return [self.memory[idx] for idx in idxes] - - def sample(self, batch_size): - batch_size = min(batch_size, len(self)) - idxes = np.random.choice(len(self.memory), size=batch_size) - return self._encode_sample(idxes), idxes - - def __len__(self): - return len(self.memory) - - def is_full(self): - return len(self.memory) == self.capacity - - def is_empty(self): - return len(self.memory) == 0 - - -class Memory: - def __init__(self): - self.actions = [] - self.states = [] - self.logprobs = [] - self.rewards = [] - self.is_terminals = [] - - def clear_memory(self): - del self.actions[:] - del self.states[:] - del self.logprobs[:] - del self.rewards[:] - del self.is_terminals[:] diff --git a/rlberry/colab_utils/__init__.py b/rlberry/colab_utils/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/colab_utils/display_setup.py b/rlberry/colab_utils/display_setup.py deleted file mode 100644 index 302e589eb..000000000 --- a/rlberry/colab_utils/display_setup.py +++ /dev/null @@ -1,37 +0,0 @@ -# -# Code to visualize the environments. -# - -import base64 -from pyvirtualdisplay import Display -from IPython import display as ipythondisplay - -# from IPython.display import clear_output -from pathlib import Path - - -def show_video(filename=None, directory="./videos"): - """ - Either show all videos in a directory (if filename is None) or - show video corresponding to filename. - """ - html = [] - if filename is not None: - files = Path("./").glob(filename) - else: - files = Path(directory).glob("*.mp4") - for mp4 in files: - video_b64 = base64.b64encode(mp4.read_bytes()) - html.append( - """""".format( - mp4, video_b64.decode("ascii") - ) - ) - ipythondisplay.display(ipythondisplay.HTML(data="
".join(html))) - - -display = Display(visible=0, size=(1400, 900)) -display.start() diff --git a/rlberry/envs/__init__.py b/rlberry/envs/__init__.py index 96d4442f0..dd360af93 100644 --- a/rlberry/envs/__init__.py +++ b/rlberry/envs/__init__.py @@ -1,6 +1,5 @@ from .gym_make import gym_make, atari_make from .basewrapper import Wrapper -from .classic_control import Acrobot, MountainCar, Pendulum, SpringCartPole -from .finite import Chain, FiniteMDP, GridWorld from .interface import Model from .pipeline import PipelineEnv +from .finite_mdp import FiniteMDP diff --git a/rlberry/envs/bandits/__init__.py b/rlberry/envs/bandits/__init__.py deleted file mode 100644 index ca602a3af..000000000 --- a/rlberry/envs/bandits/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .bandit_base import AdversarialBandit, Bandit -from .corrupted_bandits import CorruptedLaws, CorruptedNormalBandit -from .stochastic_bandits import BernoulliBandit, NormalBandit diff --git a/rlberry/envs/bandits/bandit_base.py b/rlberry/envs/bandits/bandit_base.py deleted file mode 100644 index 95ceeb1a2..000000000 --- a/rlberry/envs/bandits/bandit_base.py +++ /dev/null @@ -1,115 +0,0 @@ -from collections import deque - - -from rlberry.envs.interface import Model -import rlberry.spaces as spaces - -import rlberry - -logger = rlberry.logger - - -class Bandit(Model): - """ - Base class for a stochastic multi-armed bandit. - - Parameters - ---------- - laws: list of laws. - laws of the arms. can either be a frozen scipy law or any class that - has a method .rvs(). - - **kwargs: keywords arguments - additional arguments sent to :class:`~rlberry.envs.interface.Model` - - Attributes - ---------- - laws: list - laws of the arms. can either be a frozen scipy law or any class that - has a method .rvs(). - n_arms: int - Number of arms. - action_space: spaces.Discrete - Action space when viewing the bandit as a single-state MDP. - rewards: list - For each arm, pre-sample 10 times. - n_rewards: list - Reward counter per arm. - """ - - name = "" - - def __init__(self, laws=[], **kwargs): - Model.__init__(self, **kwargs) - self.laws = laws - self.n_arms = len(self.laws) - self.action_space = spaces.Discrete(self.n_arms) - - # Pre-sample 10 samples - self.rewards = [ - deque(self.laws[action].rvs(size=10, random_state=self.rng)) - for action in range(self.n_arms) - ] - self.n_rewards = [10] * self.n_arms - - def step(self, action): - """ - Sample the reward associated to the action. - """ - # test that the action exists - assert action < self.n_arms - - reward = self.laws[action].rvs(random_state=self.rng, size=1)[0] - terminated = True - truncated = False - - return 0, reward, terminated, truncated, {} - - def reset(self, seed=None, option=None): - """ - Reset the environment to a default state. - """ - return 0, {} - - -class AdversarialBandit(Model): - """ - Base class for a adversarial multi-armed bandit with oblivious - opponent, i.e all rewards are fixed in advance at the start of the run. - - Parameters - ---------- - rewards: list of rewards, shape (T, A). - Possible rewards up to horizon T for each of the A arms. - - **kwargs: keywords arguments - additional arguments sent to :class:`~rlberry.envs.interface.Model` - - """ - - name = "" - - def __init__(self, rewards=[], **kwargs): - Model.__init__(self, **kwargs) - self.n_arms = rewards.shape[1] - self.rewards = deque(rewards) - self.action_space = spaces.Discrete(self.n_arms) - - def step(self, action): - """ - Sample the reward associated to the action. 
- """ - # test that the action exists - assert action < self.n_arms - - rewards = self.rewards.popleft() - reward = rewards[action] - terminated = True - truncated = False - return 0, reward, terminated, truncated, {} - - def reset(self, seed=None, option=None): - """ - Reset the environment to a default state. - """ - return 0, {} diff --git a/rlberry/envs/bandits/corrupted_bandits.py b/rlberry/envs/bandits/corrupted_bandits.py deleted file mode 100644 index 2ac703588..000000000 --- a/rlberry/envs/bandits/corrupted_bandits.py +++ /dev/null @@ -1,90 +0,0 @@ -import numpy as np -from scipy import stats - -from rlberry.envs.bandits import Bandit - - -class CorruptedLaws: - """ - Class for corrupted laws. - - Parameters - ---------- - law: law - Can either be a frozen scipy law or any class that - has a method .rvs() to sample according to the given law. - - - cor_prop: float in (0,1/2) - Proportion of corruption - - cor_law: law - Laws of corruption. - """ - - def __init__(self, law, cor_prop, cor_law): - self.law = law - self.cor_prop = cor_prop - self.cor_law = cor_law - - def rvs(self, size, random_state): - is_corrupted = random_state.binomial(1, self.cor_prop, size=size) - cor_sample = self.cor_law.rvs(size=size, random_state=random_state) - noncor_sample = self.law.rvs(size=size, random_state=random_state) - return is_corrupted * cor_sample + (1 - is_corrupted) * noncor_sample - - def mean(self): - return ( - 1 - self.cor_prop - ) * self.law.mean() + self.cor_prop * self.cor_law.mean() - - -class CorruptedNormalBandit(Bandit): - """ - Class for Bandits corrupted by nature. - - Parameters - ---------- - means: array-like of size n_arms, default=array([0,1]) - means of the law of inliers of each of the arms. - - stds: array-like of size n_arms or None, default=None - stds of the law of inliers of each of the arms. If None, use array with - all ones. - - cor_prop: float in (0,1/2), default=0.05 - proportion of corruption - - cor_laws: list of scipy frozen laws or None, default=None - laws of corruption on each arm. If None, all the arms are corrupted by - a normal of mean 1000 and std 1. - """ - - def __init__( - self, - means=np.array([0, 1]), - stds=None, - cor_prop=0.05, - cor_laws=None, - ): - laws = self.make_laws(means, stds, cor_prop, cor_laws) - Bandit.__init__(self, laws=laws) - - def make_laws(self, means, stds, cor_prop, cor_laws): - if cor_laws is not None: - self.cor_laws = cor_laws - else: - self.cor_laws = [stats.norm(loc=1000) for a in range(len(means))] - if stds is None: - self.stds = np.ones(len(means)) - else: - self.stds = stds - assert len(means) == len(self.stds) - assert cor_prop <= 0.5 - inlier_laws = [ - stats.norm(loc=means[a], scale=self.stds[a]) for a in range(len(means)) - ] - return [ - CorruptedLaws(inlier_laws[a], cor_prop, self.cor_laws[a]) - for a in range(len(means)) - ] diff --git a/rlberry/envs/bandits/stochastic_bandits.py b/rlberry/envs/bandits/stochastic_bandits.py deleted file mode 100644 index e4ecf4f88..000000000 --- a/rlberry/envs/bandits/stochastic_bandits.py +++ /dev/null @@ -1,58 +0,0 @@ -import numpy as np -from scipy import stats - -from rlberry.envs.bandits import Bandit - - -class NormalBandit(Bandit): - """ - Class for Normal Bandits - - Parameters - ---------- - means: array-like of size n_arms, default=array([0,1]) - means of the law of each of the arms. - - stds: array-like of size n_arms or None, default=None - stds of the law of each of the arms. If None, use array with - all ones. 
- - """ - - def __init__( - self, - means=np.array([0, 1]), - stds=None, - ): - laws = self.make_laws(means, stds) - Bandit.__init__(self, laws=laws) - - def make_laws(self, means, stds): - if stds is None: - self.stds = np.ones(len(means)) - else: - self.stds = stds - assert len(means) == len(self.stds) - return [stats.norm(loc=means[a], scale=self.stds[a]) for a in range(len(means))] - - -class BernoulliBandit(Bandit): - """ - Class for Bernoulli Bandits - - Parameters - ---------- - p: array-like of size n_arms, default=array([0.1,0.9]) - means of the law of inliers of each of the arms. - - """ - - def __init__( - self, - p=np.array([0.1, 0.9]), - ): - laws = self.make_laws(p) - Bandit.__init__(self, laws=laws) - - def make_laws(self, p): - return [stats.binom(n=1, p=p[a]) for a in range(len(p))] diff --git a/rlberry/envs/benchmarks/__init__.py b/rlberry/envs/benchmarks/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/envs/benchmarks/ball_exploration/__init__.py b/rlberry/envs/benchmarks/ball_exploration/__init__.py deleted file mode 100644 index 390066140..000000000 --- a/rlberry/envs/benchmarks/ball_exploration/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .pball import PBall, PBall2D, SimplePBallND diff --git a/rlberry/envs/benchmarks/ball_exploration/ball2d.py b/rlberry/envs/benchmarks/ball_exploration/ball2d.py deleted file mode 100644 index bb1b43d7e..000000000 --- a/rlberry/envs/benchmarks/ball_exploration/ball2d.py +++ /dev/null @@ -1,220 +0,0 @@ -""" -This files provides a set of 2D environments with increasing difficulty -of exploration. - -The difficulty is ranked by the level. - -Important: - * To create instances, use the function get_benchmark_env(level). - * The horizon H is also set as an attribute of the environment. -""" - -import numpy as np - -from rlberry.wrappers.autoreset import AutoResetWrapper -from rlberry.envs.benchmarks.ball_exploration.pball import PBall2D - - -def get_benchmark_env(level=1): - if level == 0: - env = _get_autoreset_env(BallLevel0()) - return env - elif level == 1: - env = _get_autoreset_env(BallLevel1()) - return env - elif level == 2: - env = _get_autoreset_env(BallLevel2()) - return env - elif level == 3: - env = _get_autoreset_env(BallLevel3()) - return env - elif level == 4: - env = _get_autoreset_env(BallLevel4()) - return env - elif level == 5: - env = _get_autoreset_env(BallLevel5()) - return env - else: - raise NotImplementedError("Invalid benchmark level.") - - -def _get_autoreset_env(env): - horizon = env.horizon - return AutoResetWrapper(env, horizon) - - -# -# Level 0 (reward free!) 
-# -class BallLevel0(PBall2D): - """ - Reward-free (0 reward) - """ - - def __init__(self): - self.horizon = 30 - # - self.p = 2 - self.action_list = [ - np.array([0.0, 0.0]), - 0.05 * np.array([1.0, 0.0]), - -0.05 * np.array([1.0, 0.0]), - 0.05 * np.array([0.0, 1.0]), - -0.05 * np.array([0.0, 1.0]), - ] - - self.reward_amplitudes = [] - self.reward_smoothness = [] - self.reward_centers = [] - self.A = np.eye(2) - self.B = np.eye(2) - self.sigma = 0.01 - self.sigma_init = 0.001 - self.mu_init = np.array([0.0, 0.0]) - - PBall2D.__init__( - self, - self.p, - self.action_list, - self.reward_amplitudes, - self.reward_smoothness, - self.reward_centers, - self.A, - self.B, - self.sigma, - self.sigma_init, - self.mu_init, - ) - self.name = "Ball Exploration Benchmark - Level 0 (Reward-Free)" - - -# -# Level 1 -# - - -class BallLevel1(PBall2D): - """ - Dense rewards - """ - - def __init__(self): - self.horizon = 30 - # - self.p = 2 - self.action_list = [ - np.array([0.0, 0.0]), - 0.05 * np.array([1.0, 0.0]), - -0.05 * np.array([1.0, 0.0]), - 0.05 * np.array([0.0, 1.0]), - -0.05 * np.array([0.0, 1.0]), - ] - - self.reward_amplitudes = np.array([1.0]) - self.reward_smoothness = np.array([0.5 * np.sqrt(2)]) - self.reward_centers = [np.array([0.5, 0.5])] - self.A = np.eye(2) - self.B = np.eye(2) - self.sigma = 0.01 - self.sigma_init = 0.001 - self.mu_init = np.array([0.0, 0.0]) - - PBall2D.__init__( - self, - self.p, - self.action_list, - self.reward_amplitudes, - self.reward_smoothness, - self.reward_centers, - self.A, - self.B, - self.sigma, - self.sigma_init, - self.mu_init, - ) - self.name = "Ball Exploration Benchmark - Level 1" - - -# -# Level 2 -# - - -class BallLevel2(BallLevel1): - """ - Sparse rewards - """ - - def __init__(self): - BallLevel1.__init__(self) - self.reward_amplitudes = np.array([1.0]) - self.reward_smoothness = np.array([0.2]) - self.reward_centers = [np.array([0.5, 0.5])] - self.name = "Ball Exploration Benchmark - Level 2" - - -# -# Level 3 -# - - -class BallLevel3(BallLevel2): - """ - Sparse rewards, noisier - """ - - def __init__(self): - BallLevel2.__init__(self) - self.sigma = 0.025 - self.name = "Ball Exploration Benchmark - Level 3" - - -# -# Level 4 -# - - -class BallLevel4(BallLevel1): - """ - Far sparse reward (as lvl 2) + dense suboptimal rewards - """ - - def __init__(self): - BallLevel1.__init__(self) - - self.reward_amplitudes = np.array([1.0, 0.1]) - self.reward_smoothness = np.array([0.2, 0.5 * np.sqrt(2)]) - self.reward_centers = [ - np.array([-0.5, -0.5]), # far sparse - np.array([0.5, 0.5]), - ] # dense - self.name = "Ball Exploration Benchmark - Level 4" - - -# -# Level 5 -# - - -class BallLevel5(BallLevel4): - """ - Far sparse reward (as lvl 2) + dense suboptimal rewards, noisier - """ - - def __init__(self): - BallLevel4.__init__(self) - self.sigma = 0.025 - self.name = "Ball Exploration Benchmark - Level 5" - - -# if __name__ == '__main__': -# env = get_benchmark_env(1) -# env.enable_rendering() -# for ii in range(100): -# # env.step(1) -# # env.step(3) -# # env.step(env.action_space.sample()) -# # env.step(0) -# env.step(4) - -# env.render() diff --git a/rlberry/envs/benchmarks/ball_exploration/pball.py b/rlberry/envs/benchmarks/ball_exploration/pball.py deleted file mode 100644 index 4f7e9c479..000000000 --- a/rlberry/envs/benchmarks/ball_exploration/pball.py +++ /dev/null @@ -1,482 +0,0 @@ -import numpy as np - - -import rlberry.spaces as spaces -from rlberry.envs.interface import Model -from rlberry.rendering import Scene, GeometricPrimitive, 
RenderInterface2D - -import rlberry - -logger = rlberry.logger - - -def projection_to_pball(x, p): - """ - Solve the problem: - min_z ||x-z||_2^2 - s.t. ||z||_p <= 1 - for p = 2 or p = np.inf - - If p is not 2 or np.inf, it returns x/norm_p(x) if norm_p(x) > 1 - - WARNING: projection_to_pball is not actually a projection for p!=2 - or p=!np.inf - """ - if np.linalg.norm(x, ord=p) <= 1.0: - return x - - if p == 2: - z = x / np.linalg.norm(x, ord=p) - return z - - if p == np.inf: - z = np.minimum(1.0, np.maximum(x, -1.0)) - return z - - # below it is not a projection - return x / np.linalg.norm(x, ord=p) - - -class PBall(Model): - """ - Parametric family of environments whose state space is a unit sphere - according to the p-norm in R^d. - - Note: - The projection function is only a true projection for - p in {2, infinity}. - - ---------------------------------------------------------------------- - State space: - x in R^d: norm_p (x) <= 1 - - implemented as rlberry.spaces.Box representing [0, 1]^d - ---------------------------------------------------------------------- - Action space: - {u_1, ..., u_m} such that u_i in R^d' for i = 1, ..., m - - implemented as rlberry.spaces.Discrete(m) - ---------------------------------------------------------------------- - Reward function (independent of the actions): - r(x) = sum_{i=1}^n b_i max( 0, 1 - norm_p( x - x_i )/c_i ) - - requirements: - c_i >= 0 - b_i in [0, 1] - ---------------------------------------------------------------------- - Transitions: - x_{t+1} = A x_t + B u_t + N - - where - A: square matrix of size d - B: matrix of size (d, d') - N: d-dimensional Gaussian noise with zero mean and covariance - matrix sigma*I - ---------------------------------------------------------------------- - Initial state: - d-dimensional Gaussian with mean mu_init and covariance matrix - sigma_init*I - ---------------------------------------------------------------------- - - Default parameters are provided for a 2D environment, PBall2D - """ - - name = "LP-Ball" - - def __init__( - self, - p, - action_list, - reward_amplitudes, - reward_smoothness, - reward_centers, - A, - B, - sigma, - sigma_init, - mu_init, - ): - """ - Parameters - ----------- - p : int - parameter of the p-norm - action_list : list - list of actions {u_1, ..., u_m}, each action u_i is a - d'-dimensional array - reward_amplitudes: list - list of reward amplitudes: {b_1, ..., b_n} - reward_smoothness : list - list of reward smoothness: {c_1, ..., c_n} - reward_centers : list - list of reward centers: {x_1, ..., x_n} - A : numpy.ndarray - array A of size (d, d) - B : numpy.ndarray - array B of size (d, d') - sigma : double - transition noise sigma - sigma_init : double - initial state noise sigma_init - mu_init : numpy.ndarray - array of size (d,) containing the mean of the initial state - """ - Model.__init__(self) - - assert p >= 1, "PBall requires p>=1" - if p not in [2, np.inf]: - logger.warning( - "For p!=2 or p!=np.inf, PBall \ -does not make true projections onto the lp ball." 
- ) - self.p = p - self.d, self.dp = B.shape # d and d' - self.m = len(action_list) - self.action_list = action_list - self.reward_amplitudes = reward_amplitudes - self.reward_smoothness = reward_smoothness - self.reward_centers = reward_centers - self.A = A - self.B = B - self.sigma = sigma - self.sigma_init = sigma_init - self.mu_init = mu_init - - # State and action spaces - low = -1.0 * np.ones(self.d, dtype=np.float64) - high = np.ones(self.d, dtype=np.float64) - self.observation_space = spaces.Box(low, high) - self.action_space = spaces.Discrete(self.m) - - # reward range - assert len(self.reward_amplitudes) == len(self.reward_smoothness) - assert len(self.reward_amplitudes) == len(self.reward_centers) - if len(self.reward_amplitudes) > 0: - assert ( - self.reward_amplitudes.max() <= 1.0 - and self.reward_amplitudes.min() >= 0.0 - ), "reward amplitudes b_i must be in [0, 1]" - assert ( - self.reward_smoothness.min() > 0.0 - ), "reward smoothness c_i must be > 0" - self.reward_range = (0, 1.0) - - # - self.name = "Lp-Ball" - - # Initalize state - self.reset() - - def reset(self, state=None, seed=None, options=None): - if state is not None: - self.state = state - else: - self.state = self.mu_init + self.sigma_init * self.seeder.rng.normal( - size=self.d - ) - # projection to unit ball - self.state = projection_to_pball(self.state, self.p) - return self.state.copy(), {} - - def sample(self, state, action): - assert self.action_space.contains(action) - assert self.observation_space.contains(state) - - # next state - action_vec = self.action_list[action] - next_s = ( - self.A.dot(state) - + self.B.dot(action_vec) - + self.sigma * self.rng.normal(size=self.d) - ) - next_s = projection_to_pball(next_s, self.p) - - # done and reward - terminated = False - truncated = False - reward = self.compute_reward_at(state) - - return next_s, reward, terminated, truncated, {} - - def step(self, action): - next_s, reward, terminated, truncated, info = self.sample(self.state, action) - self.state = next_s.copy() - return next_s, reward, terminated, truncated, info - - def compute_reward_at(self, x): - reward = 0.0 - for ii, b_ii in enumerate(self.reward_amplitudes): - c_ii = self.reward_smoothness[ii] - x_ii = self.reward_centers[ii] - dist = np.linalg.norm(x - x_ii, ord=self.p) - reward += b_ii * max(0.0, 1.0 - dist / c_ii) - return reward - - def get_reward_lipschitz_constant(self): - ratios = self.reward_amplitudes / self.reward_smoothness - Lr = ratios.max() - return Lr - - def get_transitions_lipschitz_constant(self): - """ - note: considers a fixed action, returns Lipschitz constant - w.r.t. to states. - - If p!=1, p!=2 or p!=np.inf, returns an upper bound on the induced norm - """ - if self.p == 1: - order = np.inf - else: - order = self.p / (self.p - 1.0) - - if order in [1, 2]: - return np.linalg.norm(self.A, ord=order) - - # If p!=1, p!=2 or p!=np.inf, return upper bound on the induced norm. - return np.power(self.d, 1.0 / self.p) * np.linalg.norm(self.A, ord=np.inf) - - -class PBall2D(RenderInterface2D, PBall): - """ - Parametric family of environments whose state space is a unit sphere - according to the p-norm in R^d. - - Parameters - ---------- - p : int, default = 2 - value of p for which p-norm Sphere is considered. - - action_list : list, default = [array([0.05, 0.]), array([- 0.05, - 0.]), - array([0., 0.05]), array([- 0., - 0.05])] - list of actions described as segment in 2D. - - reward_amplitudes: array, default = array([1.]). - See reward function. 
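One step of the deleted `PBall` environment computes x' = proj_p(A x + B u + sigma * N(0, I)) and the state-dependent reward r(x) = sum_i b_i * max(0, 1 - ||x - x_i||_p / c_i). A hedged numpy sketch of a single transition with the default 2D parameters (the `project` helper mirrors `projection_to_pball`, which is exact only for p = 2 or p = inf):

```python
import numpy as np

rng = np.random.default_rng(0)
p, A, B, sigma = 2, np.eye(2), np.eye(2), 0.01
amplitudes, smoothness, centers = np.array([1.0]), np.array([0.25]), [np.array([0.75, 0.0])]

def project(x, p):
    norm = np.linalg.norm(x, ord=p)
    return x if norm <= 1.0 else x / norm          # exact projection for p = 2

def reward(x):
    return sum(
        b * max(0.0, 1.0 - np.linalg.norm(x - c, ord=p) / s)
        for b, s, c in zip(amplitudes, smoothness, centers)
    )

x = np.zeros(2)
u = 0.05 * np.array([1.0, 0.0])                    # one of the four default actions
x_next = project(A @ x + B @ u + sigma * rng.normal(size=2), p)
print(x_next, reward(x))
```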
- - reward_smoothness: array, default = array([0.25]) - See reward function. - - reward_centers: list of arrays, default = [array([0.75, 0.])] - See reward function. - - A: 2D array, default = array([[1., 0.], [0., 1.]]) - See Transition function. - - B: 2D array, default = array([[1., 0.], [0., 1.]]) - See Transition function. - - sigma: float, default = 0.01 - See Transition function. - - sigma_init: float, default = 0.001 - See Initial state. - - mu_init: array of length 2, default = array([0., 0.]) - See Initial state. - - Note: - The projection function is only a true projection for - p in {2, infinity}. - - ---------------------------------------------------------------------- - State space: - x in R^d: norm_p (x) <= 1 - - implemented as rlberry.spaces.Box representing [0, 1]^2 - ---------------------------------------------------------------------- - Action space: - {u_1, ..., u_m} such that u_i in R^2 for i = 1, ..., m - - implemented as rlberry.spaces.Discrete(m) - ---------------------------------------------------------------------- - Reward function (independent of the actions): - r(x) = sum_{i=1}^n b_i max( 0, 1 - norm_p( x - x_i )/c_i ) - - requirements: - c_i >= 0 - b_i in [0, 1] - ---------------------------------------------------------------------- - Transitions: - x_{t+1} = A x_t + B u_t + N - - where - A: square matrix of size 2 - B: matrix of size (2, 2) - N: d-dimensional Gaussian noise with zero mean and covariance - matrix sigma*I - ---------------------------------------------------------------------- - Initial state: - 2-dimensional Gaussian with mean mu_init and covariance matrix - sigma_init*I - ---------------------------------------------------------------------- - - """ - - def __init__( - self, - p=2, - action_list=[ - 0.05 * np.array([1, 0]), - -0.05 * np.array([1, 0]), - 0.05 * np.array([0, 1]), - -0.05 * np.array([0, 1]), - ], - reward_amplitudes=np.array([1.0]), - reward_smoothness=np.array([0.25]), - reward_centers=[np.array([0.75, 0.0])], - A=np.eye(2), - B=np.eye(2), - sigma=0.01, - sigma_init=0.001, - mu_init=np.array([0.0, 0.0]), - ): - # Initialize PBall - PBall.__init__( - self, - p, - action_list, - reward_amplitudes, - reward_smoothness, - reward_centers, - A, - B, - sigma, - sigma_init, - mu_init, - ) - - # Render interface - RenderInterface2D.__init__(self) - - # rendering info - self.set_clipping_area((-1, 1, -1, 1)) - self.set_refresh_interval(50) # in milliseconds - - def step(self, action): - # save state for rendering - if self.is_render_enabled(): - self.append_state_for_rendering(self.state.copy()) - return PBall.step(self, action) - - # - # Code for rendering - # - - def _get_ball_shape(self, xcenter, radius): - shape = GeometricPrimitive("POLYGON") - n_points = 200 - theta_vals = np.linspace(0.0, 2 * np.pi, n_points) - for theta in theta_vals: - pp = np.array([2.0 * np.cos(theta), 2.0 * np.sin(theta)]) - pp = xcenter + radius * projection_to_pball(pp, self.p) - # project to the main ball after translation - pp = projection_to_pball(pp, self.p) - shape.add_vertex((pp[0], pp[1])) - return shape - - def get_background(self): - bg = Scene() - - # ball shape - contour = self._get_ball_shape(np.zeros(2), 1.0) - contour.set_color((0.0, 0.0, 0.5)) - bg.add_shape(contour) - - # reward position - for ii, ampl in enumerate(self.reward_amplitudes): - contour = self._get_ball_shape( - self.reward_centers[ii], self.reward_smoothness[ii] - ) - ampl = 1.0 - ampl # dark violet = more reward - contour.set_color((0.5, 0.0, 0.5 * (1.0 + ampl))) - 
bg.add_shape(contour) - - return bg - - def get_scene(self, state): - scene = Scene() - - agent = GeometricPrimitive("QUADS") - agent.set_color((0.75, 0.0, 0.5)) - size = 0.05 - x = state[0] - y = state[1] - agent.add_vertex((x - size / 4.0, y - size)) - agent.add_vertex((x + size / 4.0, y - size)) - agent.add_vertex((x + size / 4.0, y + size)) - agent.add_vertex((x - size / 4.0, y + size)) - - agent.add_vertex((x - size, y - size / 4.0)) - agent.add_vertex((x + size, y - size / 4.0)) - agent.add_vertex((x + size, y + size / 4.0)) - agent.add_vertex((x - size, y + size / 4.0)) - - scene.add_shape(agent) - return scene - - -class SimplePBallND(PBall): - """ - PBall environment in d dimensions with simple dynamics. - """ - - def __init__( - self, - p=2, - dim=2, - action_amplitude=0.05, - r_smoothness=0.25, - sigma=0.01, - sigma_init=0.001, - mu_init=None, - ): - # Action list - action_list = [] - for dd in range(dim): - aux = np.zeros(dim) - aux[dd] = action_amplitude - action_list.append(aux) - action_list.append(-1 * aux) - - # Rewards - reward_amplitudes = np.array([1.0]) - reward_smoothness = np.array([r_smoothness]) - reward_centers = [np.zeros(dim)] - reward_centers[0][0] = 0.8 - - # Transitions - A = np.eye(dim) - B = np.eye(dim) - - # Initial position - if mu_init is None: - mu_init = np.zeros(dim) - - # Initialize PBall - PBall.__init__( - self, - p, - action_list, - reward_amplitudes, - reward_smoothness, - reward_centers, - A, - B, - sigma, - sigma_init, - mu_init, - ) - - -# if __name__ == '__main__': -# env = PBall2D(p=5) -# print(env.get_transitions_lipschitz_constant()) -# print(env.get_reward_lipschitz_constant()) - -# env.enable_rendering() - -# for ii in range(100): -# env.step(1) -# env.step(3) - -# env.render() diff --git a/rlberry/envs/benchmarks/generalization/__init__.py b/rlberry/envs/benchmarks/generalization/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/envs/benchmarks/generalization/twinrooms.py b/rlberry/envs/benchmarks/generalization/twinrooms.py deleted file mode 100644 index f0619e96b..000000000 --- a/rlberry/envs/benchmarks/generalization/twinrooms.py +++ /dev/null @@ -1,185 +0,0 @@ -import numpy as np -import rlberry.spaces as spaces -from rlberry.envs import Model -from rlberry.rendering import Scene, GeometricPrimitive, RenderInterface2D -from rlberry.rendering.common_shapes import circle_shape - -import rlberry - -logger = rlberry.logger - - -class TwinRooms(RenderInterface2D, Model): - """ - Two continuous grid worlds, side by side, separated by a wall. - Both are identical (or almost identical), and the agent has equal probability to - start in any of the two rooms. - - It can be used to test the generalization capability of agents: - a policy learned in one of the rooms can be used to learn faster - a policy in the other room. - - There are 4 actions, one for each direction (left/right/up/down). - - Parameters - ---------- - noise_room1: double, default: 0.01 - Noise in the transitions of the first room. - noise_room2: double, default: 0.01 - Noise in the transitions of the second room. - - Notes - ----- - The function env.sample() does not handle conversions to array states - when array_observation is True. Only the functions env.reset() and - env.step() are covered. 
- """ - - name = "TwinRooms" - - def __init__(self, noise_room1=0.01, noise_room2=0.01): - Model.__init__(self) - RenderInterface2D.__init__(self) - - self.noise_room1 = noise_room1 - self.noise_room2 = noise_room2 - - self.observation_space = spaces.Box( - low=np.array([0.0, 0.0]), - high=np.array([2.0, 1.0]), - ) - self.action_space = spaces.Discrete(4) - self.reward_range = (0.0, 1.0) - - self.room_noises = [noise_room1, noise_room2] - - # environment parameters - self.action_displacement = 0.1 - self.wall_eps = 0.05 - - # base reward position - self.base_reward_pos = np.array([0.8, 0.8]) - - # rendering info - self.set_clipping_area((0, 2, 0, 1)) - self.set_refresh_interval(100) # in milliseconds - self.renderer_type = "opengl" - - # reset - self.reset() - - def reset(self, seed=None, options=None): - self.current_room = self.seeder.rng.integers(2) - if self.current_room == 0: - self.state = np.array([0.1, 0.1]) - else: - self.state = np.array([1.1, 0.1]) - return self.state.copy(), {} - - def _reward_fn(self, state): - # max reward at (x, y) = reward_pos - reward_pos = self.base_reward_pos - if self.current_room == 1: - reward_pos = reward_pos + np.array([1.0, 0.0]) - xr, yr = reward_pos - - dist = np.sqrt((state[0] - xr) ** 2.0 + (state[1] - yr) ** 2.0) - reward = max(0.0, 1.0 - dist / 0.1) - return reward - - def _clip_to_room(self, state): - state[1] = max(0.0, state[1]) - state[1] = min(1.0, state[1]) - if self.current_room == 0: - state[0] = max(0.0, state[0]) - state[0] = min(1.0 - self.wall_eps, state[0]) - else: - state[0] = max(1.0 + self.wall_eps, state[0]) - state[0] = min(2.0, state[0]) - return state - - def step(self, action): - assert self.action_space.contains(action), "Invalid action!" - - # save state for rendering - if self.is_render_enabled(): - self.append_state_for_rendering(self.state) - - next_state, reward, terminated, truncated, info = self.sample( - self.state, action - ) - self.state = next_state - return self.state.copy(), reward, terminated, truncated, info - - def sample(self, state, action): - delta = self.action_displacement - if action == 0: - displacement = np.array([delta, 0.0]) - elif action == 1: - displacement = np.array([-delta, 0.0]) - elif action == 2: - displacement = np.array([0.0, delta]) - elif action == 3: - displacement = np.array([0.0, -delta]) - else: - raise ValueError("Invalid action") - - next_state = ( - state - + displacement - + self.room_noises[self.current_room] * self.rng.normal(size=2) - ) - - # clip to room - next_state = self._clip_to_room(next_state) - - reward = self._reward_fn(state) - terminated = False - truncated = False - info = {} - - return next_state, reward, terminated, truncated, info - - # - # Code for rendering - # - - def get_background(self): - """ - Returne a scene (list of shapes) representing the background - """ - bg = Scene() - - # wall - eps = self.wall_eps - shape = GeometricPrimitive("POLYGON") - shape.set_color((0.25, 0.25, 0.25)) - shape.add_vertex((1 - eps, 0)) - shape.add_vertex((1 - eps, 1)) - shape.add_vertex((1 + eps, 1)) - shape.add_vertex((1 + eps, 0)) - bg.add_shape(shape) - - # rewards - for x, y in [ - self.base_reward_pos, - self.base_reward_pos + np.array([1.0, 0.0]), - ]: - reward = circle_shape((x, y), 0.1, n_points=50) - reward.type = "POLYGON" - reward.set_color((0.0, 0.5, 0.0)) - bg.add_shape(reward) - - return bg - - def get_scene(self, state): - """ - Return scene (list of shapes) representing a given state - """ - x, y = state - scene = Scene() - agent = circle_shape((x, y), 
0.02, n_points=5) - agent.type = "POLYGON" - agent.set_color((0.75, 0.0, 0.5)) - scene.add_shape(agent) - return scene diff --git a/rlberry/envs/benchmarks/grid_exploration/__init__.py b/rlberry/envs/benchmarks/grid_exploration/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/envs/benchmarks/grid_exploration/apple_gold.py b/rlberry/envs/benchmarks/grid_exploration/apple_gold.py deleted file mode 100644 index 4a4599156..000000000 --- a/rlberry/envs/benchmarks/grid_exploration/apple_gold.py +++ /dev/null @@ -1,180 +0,0 @@ -import numpy as np -import rlberry.spaces as spaces -from rlberry.envs.finite import GridWorld -from rlberry.rendering import Scene, GeometricPrimitive - -import rlberry - -logger = rlberry.logger - - -class AppleGold(GridWorld): - """ - AppleGold with six rooms: this is merely a slightly modified - version of SixRoom. - - Parameters - ---------- - reward_free : bool, default=False - If true, no rewards are given to the agent. - array_observation: - If true, the observations are converted to an array (x, y) - instead of a discrete index. - - Notes - ----- - The function env.sample() does not handle conversions to array states - when array_observation is True. Only the functions env.reset() and - env.step() are covered. - - Reference - --------- - .. seaalso:: - Guo et al.: Self-Imitation Learning via - Trajectory-Conditioned Policy - for Hard-Exploration Tasks - arXiv preprint arXiv:1907.10247 - """ - - name = "AppleGold" - - def __init__(self, reward_free=False, array_observation=False): - self.reward_free = reward_free - self.array_observation = array_observation - - # Common parameters - nrows = 13 - ncols = 17 - start_coord = (5, 1) - terminal_states = ((7, 7),) - success_probability = 0.95 - # - walls = () - for ii in range(13): - walls += ((ii, 0),) - walls += ((ii, 16),) - for jj in range(17): - walls += ((0, jj),) - walls += ((12, jj),) - for ii in range(13): - if ii not in [1, 11]: - walls += ((ii, 6),) - walls += ((ii, 10),) - walls += ((11, 6),) - for jj in range(17): - if jj not in [1, 15]: - walls += ((6, jj),) - - # Default reward according to the difficulty - default_reward = 0 - - # Rewards according to the difficulty - if self.reward_free: - reward_at = {} - else: - reward_at = {(7, 7): 10.0, (8, 2): 1.0, (10, 3): 1.0} - for jj in range(7, 16): - for ii in range(1, 12): - if (ii, jj) not in walls and (ii, jj) != (7, 7): - reward_at[(ii, jj)] = -0.05 - - # Init base class - GridWorld.__init__( - self, - nrows=nrows, - ncols=ncols, - start_coord=start_coord, - terminal_states=terminal_states, - success_probability=success_probability, - reward_at=reward_at, - walls=walls, - default_reward=default_reward, - ) - - # spaces - if self.array_observation: - self.observation_space = spaces.Box(0.0, 1.0, shape=(2,)) - - def _convert_index_to_float_coord(self, state_index): - yy, xx = self.index2coord[state_index] - - # centering - xx = xx + 0.5 - yy = yy + 0.5 - # map to [0, 1] - xx = xx / self.ncols - yy = yy / self.nrows - return np.array([xx, yy]) - - def reset(self, seed=None, options=None): - self.state = self.coord2index[self.start_coord] - state_to_return = self.state - if self.array_observation: - state_to_return = self._convert_index_to_float_coord(self.state) - if self.render_mode == "human": - self.render() - return state_to_return, {} - - def step(self, action): - assert self.action_space.contains(action), "Invalid action!" 
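When `array_observation=True`, the deleted grid environments convert a discrete cell into the centre of that cell in [0, 1] x [0, 1] via `_convert_index_to_float_coord`. A hedged sketch of that mapping using the AppleGold grid size (13 x 17):

```python
import numpy as np

nrows, ncols = 13, 17                      # AppleGold layout

def coord_to_float(row, col):
    # centre of the cell, rescaled to the unit square, returned as (x, y)
    return np.array([(col + 0.5) / ncols, (row + 0.5) / nrows])

print(coord_to_float(5, 1))                # start cell (5, 1) -> approx [0.088, 0.423]
```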
- - # save state for rendering - if self.is_render_enabled(): - self.append_state_for_rendering(self.state) - - # take step - next_state, reward, terminated, truncated, info = self.sample( - self.state, action - ) - self.state = next_state - - state_to_return = self.state - if self.array_observation: - state_to_return = self._convert_index_to_float_coord(self.state) - if self.render_mode == "human": - self.render() - return state_to_return, reward, terminated, truncated, info - - def get_background(self): - """ - Returne a scene (list of shapes) representing the background - """ - bg = Scene() - - # walls - for wall in self.walls: - y, x = wall - shape = GeometricPrimitive("POLYGON") - shape.set_color((0.25, 0.25, 0.25)) - shape.add_vertex((x, y)) - shape.add_vertex((x + 1, y)) - shape.add_vertex((x + 1, y + 1)) - shape.add_vertex((x, y + 1)) - bg.add_shape(shape) - - # rewards - for y, x in self.reward_at: - rwd = self.reward_at[(y, x)] - if rwd == -0.05: - rock = GeometricPrimitive("POLYGON") - rock.set_color((0.6, 0.6, 0.6)) - rock.add_vertex((x, y)) - rock.add_vertex((x + 1, y)) - rock.add_vertex((x + 1, y + 1)) - rock.add_vertex((x, y + 1)) - bg.add_shape(rock) - else: - flag = GeometricPrimitive("POLYGON") - if rwd == 10: - flag.set_color((0.0, 0.5, 0.0)) - elif rwd == 1: - flag.set_color((0.0, 0.0, 0.5)) - - x += 0.5 - y += 0.25 - flag.add_vertex((x, y)) - flag.add_vertex((x + 0.25, y + 0.5)) - flag.add_vertex((x - 0.25, y + 0.5)) - bg.add_shape(flag) - - return bg diff --git a/rlberry/envs/benchmarks/grid_exploration/four_room.py b/rlberry/envs/benchmarks/grid_exploration/four_room.py deleted file mode 100644 index b4e2d67a5..000000000 --- a/rlberry/envs/benchmarks/grid_exploration/four_room.py +++ /dev/null @@ -1,130 +0,0 @@ -import numpy as np -import rlberry.spaces as spaces -from rlberry.envs.finite import GridWorld - -import rlberry - -logger = rlberry.logger - - -class FourRoom(GridWorld): - """ - GridWorld with four rooms. - - Parameters - ---------- - reward_free : bool, default=False - If true, no rewards are given to the agent. - difficulty: int, {0, 1 or 2} - Difficulty 0: reward in one location - Difficulty 1: easy suboptimal reward, hard optimal reward - Difficulty 2: easy suboptimal reward, hard optimal reward, - negative rewards by default. - Note: this parameter is ignored if reward_free is True. - array_observation: - If true, the observations are converted to an array (x, y) - instead of a discrete index. - - Notes - ----- - The function env.sample() does not handle conversions to array states - when array_observation is True. Only the functions env.reset() and - env.step() are covered. 
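The FourRoom docstring above describes three difficulty levels; the constructor that follows maps them to a default step reward and a `reward_at` dictionary for the underlying GridWorld. A hedged summary of that mapping as a small helper (the function name is illustrative):

```python
def four_room_rewards(difficulty, reward_free=False):
    """Return (default_reward, reward_at) as configured by FourRoom.__init__."""
    default_reward = -0.005 if difficulty == 2 else 0.0
    if reward_free:
        reward_at = {}
    elif difficulty == 0:
        reward_at = {(8, 0): 1.0}            # single goal reward
    else:                                    # difficulty 1 or 2
        reward_at = {(8, 0): 1.0, (3, 3): 0.1}   # easy suboptimal + hard optimal
    return default_reward, reward_at

print(four_room_rewards(2))   # (-0.005, {(8, 0): 1.0, (3, 3): 0.1})
```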
- """ - - name = "FourRoom" - - def __init__(self, reward_free=False, difficulty=0, array_observation=False): - self.reward_free = reward_free - self.difficulty = difficulty - self.array_observation = array_observation - - if difficulty not in [0, 1, 2]: - raise ValueError("FourRoom difficulty must be in [0, 1, 2]") - - # Common parameters - nrows = 9 - ncols = 9 - start_coord = (0, 0) - terminal_states = ((8, 0),) - success_probability = 0.95 - # - walls = () - for ii in range(9): - if ii not in [2, 6]: - walls += ((ii, 4),) - for jj in range(9): - if jj != 7: - walls += ((4, jj),) - - # Default reward according to the difficulty - if difficulty in [0, 1]: - default_reward = 0.0 - elif difficulty == 2: - default_reward = -0.005 - - # Rewards according to the difficulty - if self.reward_free: - reward_at = {} - else: - if difficulty == 0: - reward_at = {(8, 0): 1.0} - elif difficulty in [1, 2]: - reward_at = { - (8, 0): 1.0, - (3, 3): 0.1, - } - - # Init base class - GridWorld.__init__( - self, - nrows=nrows, - ncols=ncols, - start_coord=start_coord, - terminal_states=terminal_states, - success_probability=success_probability, - reward_at=reward_at, - walls=walls, - default_reward=default_reward, - ) - - # spaces - if self.array_observation: - self.observation_space = spaces.Box(0.0, 1.0, shape=(2,)) - - def _convert_index_to_float_coord(self, state_index): - yy, xx = self.index2coord[state_index] - - # centering - xx = xx + 0.5 - yy = yy + 0.5 - # map to [0, 1] - xx = xx / self.ncols - yy = yy / self.nrows - return np.array([xx, yy]) - - def reset(self, seed=None, options=None): - self.state = self.coord2index[self.start_coord] - state_to_return = self.state - if self.array_observation: - state_to_return = self._convert_index_to_float_coord(self.state) - return state_to_return, {} - - def step(self, action): - assert self.action_space.contains(action), "Invalid action!" - - # save state for rendering - if self.is_render_enabled(): - self.append_state_for_rendering(self.state) - - # take step - next_state, reward, terminated, truncated, info = self.sample( - self.state, action - ) - self.state = next_state - - state_to_return = self.state - if self.array_observation: - state_to_return = self._convert_index_to_float_coord(self.state) - - return state_to_return, reward, terminated, truncated, info diff --git a/rlberry/envs/benchmarks/grid_exploration/nroom.py b/rlberry/envs/benchmarks/grid_exploration/nroom.py deleted file mode 100644 index 51cc0f279..000000000 --- a/rlberry/envs/benchmarks/grid_exploration/nroom.py +++ /dev/null @@ -1,305 +0,0 @@ -import math -import numpy as np -import rlberry.spaces as spaces -from rlberry.envs.finite import GridWorld -from rlberry.rendering import Scene, GeometricPrimitive - -import rlberry - -logger = rlberry.logger - - -# def get_nroom_state_coord(state_index, nroom_env): -# yy, xx = nroom_env.index2coord[state_index] -# # centering -# xx = xx + 0.5 -# yy = yy + 0.5 -# # map to [0, 1] -# xx = xx / nroom_env.ncols -# yy = yy / nroom_env.nrows -# return np.array([xx, yy]) - - -class NRoom(GridWorld): - """ - GridWorld with N rooms of size L x L. The agent starts in the middle room. - - There is one small and easy reward in the first room, - one big reward in the last room and zero reward elsewhere. - - There is a 5% error probability in the transitions when taking an action. - - Parameters - ---------- - nrooms : int - Number of rooms. - reward_free : bool, default=False - If true, no rewards are given to the agent. 
- array_observation: - If true, the observations are converted to an array (x, y) - instead of a discrete index. - The underlying discrete space is saved in env.discrete_observation_space. - room_size : int - Dimension (L) of each room (L x L). - success_probability : double, default: 0.95 - Sucess probability of an action. A failure is going to the wrong direction. - remove_walls : bool, default: False - If True, remove walls. Useful for debug. - initial_state_distribution: {'center', 'uniform'} - If 'center', always start at the center. - If 'uniform', start anywhere with uniform probability. - include_traps: bool, default: False - If true, each room will have a terminal state (a "trap"). - Notes - ----- - The function env.sample() does not handle conversions to array states - when array_observation is True. Only the functions env.reset() and - env.step() are covered. - """ - - name = "N-Room" - - def __init__( - self, - nrooms=7, - reward_free=False, - array_observation=False, - room_size=5, - success_probability=0.95, - remove_walls=False, - initial_state_distribution="center", - include_traps=False, - ): - assert nrooms > 0, "nrooms must be > 0" - assert initial_state_distribution in ("center", "uniform") - - self.reward_free = reward_free - self.array_observation = array_observation - self.nrooms = nrooms - self.room_size = room_size - self.success_probability = success_probability - self.remove_walls = remove_walls - self.initial_state_distribution = initial_state_distribution - self.include_traps = include_traps - - # Max number of rooms/columns per row - self.max_rooms_per_row = 5 - - # Room size (default = 5x5) - self.room_size = room_size - - # Grid size - self.room_nrows = math.ceil(nrooms / self.max_rooms_per_row) - if self.room_nrows > 1: - self.room_ncols = self.max_rooms_per_row - else: - self.room_ncols = nrooms - nrows = self.room_size * self.room_nrows + (self.room_nrows - 1) - ncols = self.room_size * self.room_ncols + (self.room_ncols - 1) - - # # walls - walls = [] - for room_col in range(self.room_ncols - 1): - col = (room_col + 1) * (self.room_size + 1) - 1 - for jj in range(nrows): - if (jj % (self.room_size + 1)) != (self.room_size // 2): - walls.append((jj, col)) - - for room_row in range(self.room_nrows - 1): - row = (room_row + 1) * (self.room_size + 1) - 1 - for jj in range(ncols): - walls.append((row, jj)) - - # process each room - start_coord = None - terminal_state = None - self.traps = [] - count = 0 - for room_r in range(self.room_nrows): - if room_r % 2 == 0: - cols_iterator = range(self.room_ncols) - else: - cols_iterator = reversed(range(self.room_ncols)) - for room_c in cols_iterator: - # existing rooms - if count < self.nrooms: - # remove top wall - if ((room_c == self.room_ncols - 1) and (room_r % 2 == 0)) or ( - (room_c == 0) and (room_r % 2 == 1) - ): - if room_r != self.room_nrows - 1: - wall_to_remove = self._convert_room_coord_to_global( - room_r, room_c, self.room_size, self.room_size // 2 - ) - if wall_to_remove in walls: - walls.remove(wall_to_remove) - # rooms to remove - else: - for ii in range(-1, self.room_size + 1): - for jj in range(-1, self.room_size + 1): - wall_to_include = self._convert_room_coord_to_global( - room_r, room_c, ii, jj - ) - if ( - wall_to_include[0] >= 0 - and wall_to_include[0] < nrows - and wall_to_include[1] >= 0 - and wall_to_include[1] < ncols - and (wall_to_include not in walls) - ): - walls.append(wall_to_include) - pass - - # start coord - if count == nrooms // 2: - start_coord = 
self._convert_room_coord_to_global( - room_r, room_c, self.room_size // 2, self.room_size // 2 - ) - # terminal state - if count == nrooms - 1: - terminal_state = self._convert_room_coord_to_global( - room_r, room_c, self.room_size // 2, self.room_size // 2 - ) - # trap - if include_traps: - self.traps.append( - self._convert_room_coord_to_global( - room_r, - room_c, - self.room_size // 2 + 1, - self.room_size // 2 + 1, - ) - ) - count += 1 - - terminal_states = (terminal_state,) + tuple(self.traps) - - if self.reward_free: - reward_at = {} - else: - reward_at = { - terminal_state: 1.0, - start_coord: 0.01, - (self.room_size // 2, self.room_size // 2): 0.1, - } - - # Check remove_walls - if remove_walls: - walls = () - - # Init base class - GridWorld.__init__( - self, - nrows=nrows, - ncols=ncols, - start_coord=start_coord, - terminal_states=terminal_states, - success_probability=success_probability, - reward_at=reward_at, - walls=walls, - default_reward=0.0, - ) - - # Check initial distribution - if initial_state_distribution == "uniform": - distr = np.ones(self.observation_space.n) / self.observation_space.n - self.set_initial_state_distribution(distr) - - # spaces - if self.array_observation: - self.discrete_observation_space = self.observation_space - self.observation_space = spaces.Box(0.0, 1.0, shape=(2,)) - - def _convert_room_coord_to_global( - self, room_row, room_col, room_coord_row, room_coord_col - ): - col_offset = (self.room_size + 1) * room_col - row_offset = (self.room_size + 1) * room_row - - row = room_coord_row + row_offset - col = room_coord_col + col_offset - return (row, col) - - def _convert_index_to_float_coord(self, state_index): - yy, xx = self.index2coord[state_index] - - # centering - xx = xx + 0.5 - yy = yy + 0.5 - # map to [0, 1] - xx = xx / self.ncols - yy = yy / self.nrows - return np.array([xx, yy]) - - def reset(self, seed=None, options=None): - self.state, info = GridWorld.reset(self, seed=seed, options=options) - state_to_return = self.state - if self.array_observation: - state_to_return = self._convert_index_to_float_coord(self.state) - return state_to_return, info - - def step(self, action): - assert self.action_space.contains(action), "Invalid action!" 
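A quick construction sketch for the NRoom environment above, using a few of the options documented in its docstring (illustration only, not diff content; pre-removal import path):

    from rlberry.envs.benchmarks.grid_exploration.nroom import NRoom

    # Five 3x3 rooms, uniform initial state, one trap (extra terminal state) per
    # room; observations stay discrete indices because array_observation is False.
    env = NRoom(
        nrooms=5,
        room_size=3,
        initial_state_distribution="uniform",
        include_traps=True,
    )
    obs, info = env.reset()
    obs, reward, terminated, truncated, info = env.step(env.action_space.sample())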
- - # save state for rendering - if self.is_render_enabled(): - self.append_state_for_rendering(self.state) - - # take step - next_state, reward, terminated, truncated, info = self.sample( - self.state, action - ) - self.state = next_state - - state_to_return = self.state - if self.array_observation: - state_to_return = self._convert_index_to_float_coord(self.state) - - return state_to_return, reward, terminated, truncated, info - - def get_background(self): - """ - Returne a scene (list of shapes) representing the background - """ - bg = Scene() - - # traps - for y, x in self.traps: - shape = GeometricPrimitive("POLYGON") - shape.set_color((0.5, 0.0, 0.0)) - shape.add_vertex((x, y)) - shape.add_vertex((x + 1, y)) - shape.add_vertex((x + 1, y + 1)) - shape.add_vertex((x, y + 1)) - bg.add_shape(shape) - - # walls - for wall in self.walls: - y, x = wall - shape = GeometricPrimitive("POLYGON") - shape.set_color((0.25, 0.25, 0.25)) - shape.add_vertex((x, y)) - shape.add_vertex((x + 1, y)) - shape.add_vertex((x + 1, y + 1)) - shape.add_vertex((x, y + 1)) - bg.add_shape(shape) - - # rewards - for y, x in self.reward_at: - flag = GeometricPrimitive("POLYGON") - rwd = self.reward_at[(y, x)] - if rwd == 1.0: - flag.set_color((0.0, 0.5, 0.0)) - elif rwd == 0.1: - flag.set_color((0.0, 0.0, 0.5)) - else: - flag.set_color((0.5, 0.0, 0.0)) - - x += 0.5 - y += 0.25 - flag.add_vertex((x, y)) - flag.add_vertex((x + 0.25, y + 0.5)) - flag.add_vertex((x - 0.25, y + 0.5)) - bg.add_shape(flag) - - return bg diff --git a/rlberry/envs/benchmarks/grid_exploration/six_room.py b/rlberry/envs/benchmarks/grid_exploration/six_room.py deleted file mode 100644 index 4af6fdb28..000000000 --- a/rlberry/envs/benchmarks/grid_exploration/six_room.py +++ /dev/null @@ -1,151 +0,0 @@ -import numpy as np -import rlberry.spaces as spaces -from rlberry.envs.finite import GridWorld -from rlberry.rendering import Scene, GeometricPrimitive - -import rlberry - -logger = rlberry.logger - - -class SixRoom(GridWorld): - """ - GridWorld with six rooms. - - Parameters - ---------- - reward_free : bool, default=False - If true, no rewards are given to the agent. - array_observation: - If true, the observations are converted to an array (x, y) - instead of a discrete index. - - Notes - ----- - The function env.sample() does not handle conversions to array states - when array_observation is True. Only the functions env.reset() and - env.step() are covered. 
- """ - - name = "SixRoom" - - def __init__(self, reward_free=False, array_observation=False): - self.reward_free = reward_free - self.array_observation = array_observation - - # Common parameters - nrows = 11 - ncols = 17 - start_coord = (0, 0) - terminal_states = ((10, 0),) - success_probability = 0.95 - # - walls = () - for ii in range(11): - if ii not in [2, 8]: - walls += ((ii, 5),) - walls += ((ii, 11),) - for jj in range(17): - if jj != 15: - walls += ((5, jj),) - - # Default reward according to the difficulty - default_reward = -0.001 - - # Rewards according to the difficulty - if self.reward_free: - reward_at = {} - else: - reward_at = { - (10, 0): 10.0, - (4, 4): 0.1, - } - - # Init base class - GridWorld.__init__( - self, - nrows=nrows, - ncols=ncols, - start_coord=start_coord, - terminal_states=terminal_states, - success_probability=success_probability, - reward_at=reward_at, - walls=walls, - default_reward=default_reward, - ) - - # spaces - if self.array_observation: - self.observation_space = spaces.Box(0.0, 1.0, shape=(2,)) - - def _convert_index_to_float_coord(self, state_index): - yy, xx = self.index2coord[state_index] - - # centering - xx = xx + 0.5 - yy = yy + 0.5 - # map to [0, 1] - xx = xx / self.ncols - yy = yy / self.nrows - return np.array([xx, yy]) - - def reset(self, seed=None, options=None): - self.state = self.coord2index[self.start_coord] - state_to_return = self.state - if self.array_observation: - state_to_return = self._convert_index_to_float_coord(self.state) - return state_to_return, {} - - def step(self, action): - assert self.action_space.contains(action), "Invalid action!" - - # save state for rendering - if self.is_render_enabled(): - self.append_state_for_rendering(self.state) - - # take step - next_state, reward, terminated, truncated, info = self.sample( - self.state, action - ) - self.state = next_state - - state_to_return = self.state - if self.array_observation: - state_to_return = self._convert_index_to_float_coord(self.state) - - return state_to_return, reward, terminated, truncated, info - - def get_background(self): - """ - Returne a scene (list of shapes) representing the background - """ - bg = Scene() - - # walls - for wall in self.walls: - y, x = wall - shape = GeometricPrimitive("POLYGON") - shape.set_color((0.25, 0.25, 0.25)) - shape.add_vertex((x, y)) - shape.add_vertex((x + 1, y)) - shape.add_vertex((x + 1, y + 1)) - shape.add_vertex((x, y + 1)) - bg.add_shape(shape) - - # rewards - for y, x in self.reward_at: - flag = GeometricPrimitive("POLYGON") - rwd = self.reward_at[(y, x)] - if rwd == 10: - flag.set_color((0.0, 0.5, 0.0)) - else: - flag.set_color((0.0, 0.0, 0.5)) - - x += 0.5 - y += 0.25 - flag.add_vertex((x, y)) - flag.add_vertex((x + 0.25, y + 0.5)) - flag.add_vertex((x - 0.25, y + 0.5)) - bg.add_shape(flag) - - return bg diff --git a/rlberry/envs/bullet3/data/__init__.py b/rlberry/envs/bullet3/data/__init__.py deleted file mode 100644 index fa3615af0..000000000 --- a/rlberry/envs/bullet3/data/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -import os - - -def getDataPath(): - resdir = os.path.join(os.path.dirname(__file__)) - return resdir diff --git a/rlberry/envs/bullet3/data/mjcf/pendulum.xml b/rlberry/envs/bullet3/data/mjcf/pendulum.xml deleted file mode 100644 index e27fc88f4..000000000 --- a/rlberry/envs/bullet3/data/mjcf/pendulum.xml +++ /dev/null @@ -1,28 +0,0 @@ - - - - - - - - - diff --git a/rlberry/envs/bullet3/data/pendulum.urdf b/rlberry/envs/bullet3/data/pendulum.urdf deleted file mode 100644 index 
af450f425..000000000 --- a/rlberry/envs/bullet3/data/pendulum.urdf +++ /dev/null @@ -1,51 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/rlberry/envs/bullet3/pybullet_envs/__init__.py b/rlberry/envs/bullet3/pybullet_envs/__init__.py deleted file mode 100644 index 093f8d9eb..000000000 --- a/rlberry/envs/bullet3/pybullet_envs/__init__.py +++ /dev/null @@ -1,40 +0,0 @@ -import gymnasium as gym -from gym.envs.registration import registry, make, spec - - -def register(id, *args, **kvargs): - if id in registry.env_specs: - return - else: - return gym.envs.registration.register(id, *args, **kvargs) - - -# ------------bullet------------- - -register( - id="PendulumBulletEnv-v0", - entry_point="rlberry.envs.bullet3.pybullet_envs.gym_pendulum_envs:PendulumBulletEnv", - max_episode_steps=1000, - reward_threshold=950.0, -) - -register( - id="PendulumSwingupBulletEnv-v0", - entry_point="rlberry.envs.bullet3.pybullet_envs.gym_pendulum_envs:PendulumSwingupBulletEnv", - max_episode_steps=1000, - reward_threshold=800.0, -) - -register( - id="DiscretePendulumBulletEnv-v0", - entry_point="rlberry.envs.bullet3.pybullet_envs.gym_pendulum_envs:DiscretePendulumBulletEnv", - max_episode_steps=1000, - reward_threshold=950.0, -) - -register( - id="DiscretePendulumSwingupBulletEnv-v0", - entry_point="rlberry.envs.bullet3.pybullet_envs.gym_pendulum_envs:DiscretePendulumSwingupBulletEnv", - max_episode_steps=1000, - reward_threshold=800.0, -) diff --git a/rlberry/envs/bullet3/pybullet_envs/gym_pendulum_envs.py b/rlberry/envs/bullet3/pybullet_envs/gym_pendulum_envs.py deleted file mode 100644 index 32ce80c6a..000000000 --- a/rlberry/envs/bullet3/pybullet_envs/gym_pendulum_envs.py +++ /dev/null @@ -1,80 +0,0 @@ -from gym import spaces -from pybullet_envs.env_bases import MJCFBaseBulletEnv -from pybullet_envs.gym_pendulum_envs import InvertedPendulumBulletEnv -from pybullet_envs.scene_abstract import SingleRobotEmptyScene - -from rlberry.envs.bullet3.pybullet_envs.robot_pendula import Pendulum, PendulumSwingup -import numpy as np - - -class PendulumBulletEnv(InvertedPendulumBulletEnv): - """Simple pendulum""" - - def __init__(self): - self.robot = Pendulum() - MJCFBaseBulletEnv.__init__(self, self.robot) - self.stateId = -1 - - def create_single_player_scene(self, bullet_client): - return SingleRobotEmptyScene( - bullet_client, gravity=9.81, timestep=0.02, frame_skip=1 - ) - - def step(self, a): - self.robot.apply_action(a) - self.scene.global_step() - state = self.robot.calc_state() # sets self.pos_x self.pos_y - if self.robot.swingup: - reward = np.cos(self.robot.theta) - done = False - else: - reward = 1.0 - done = np.abs(self.robot.theta) > 0.2 - self.rewards = [float(reward)] - self.HUD(state, a, done) - return state, sum(self.rewards), done, {} - - -class PendulumSwingupBulletEnv(PendulumBulletEnv): - def __init__(self): - self.robot = PendulumSwingup() - MJCFBaseBulletEnv.__init__(self, self.robot) - self.stateId = -1 - - -class DiscretePendulumBulletEnv(PendulumBulletEnv): - """pybullet's InvertedPendulum with discrete actions""" - - def __init__(self): - super().__init__() - self.continuous_action_space = self.action_space - self.action_space = spaces.Discrete(3) - - def step(self, a): - if a == 0: - return super().step(self.continuous_action_space.low) - elif a == 1: - return super().step(self.continuous_action_space.high) - elif a == 2: - return super().step(np.zeros(self.continuous_action_space.shape)) - else: - raise IndexError - - 
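The discrete pendulum variants in this file map Discrete(3) actions onto the low, high, and zero points of the wrapped continuous action space. The same pattern as a standalone gymnasium ActionWrapper, for illustration only (the wrapper name is hypothetical and not part of the diff):

    import numpy as np
    import gymnasium as gym


    class ThreeActionDiscretizer(gym.ActionWrapper):
        """Map Discrete(3) actions onto {low, high, 0} of a 1-D Box action space."""

        def __init__(self, env):
            super().__init__(env)
            self.continuous_action_space = env.action_space
            self.action_space = gym.spaces.Discrete(3)

        def action(self, a):
            # 0 -> minimum torque, 1 -> maximum torque, 2 -> no torque
            if a == 0:
                return self.continuous_action_space.low
            if a == 1:
                return self.continuous_action_space.high
            return np.zeros(self.continuous_action_space.shape)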
-class DiscretePendulumSwingupBulletEnv(PendulumSwingupBulletEnv): - """pybullet's InvertedPendulumSwingup with discrete actions""" - - def __init__(self): - super().__init__() - self.continuous_action_space = self.action_space - self.action_space = spaces.Discrete(3) - - def step(self, a): - if a == 0: - return super().step(self.continuous_action_space.low) - elif a == 1: - return super().step(self.continuous_action_space.high) - elif a == 2: - return super().step(np.zeros(self.continuous_action_space.shape)) - else: - raise IndexError diff --git a/rlberry/envs/bullet3/pybullet_envs/robot_bases.py b/rlberry/envs/bullet3/pybullet_envs/robot_bases.py deleted file mode 100644 index d2dc50e75..000000000 --- a/rlberry/envs/bullet3/pybullet_envs/robot_bases.py +++ /dev/null @@ -1,123 +0,0 @@ -import os -import pybullet -from pybullet_envs.robot_bases import MJCFBasedRobot, URDFBasedRobot - -# Use our custom data -from rlberry.envs.bullet3 import data - - -class MJCFBasedRobot2(MJCFBasedRobot): - def reset(self, bullet_client): - self._p = bullet_client - # print("Created bullet_client with id=", self._p._client) - if self.doneLoading == 0: - self.ordered_joints = [] - self.doneLoading = 1 - if self.self_collision: - self.objects = self._p.loadMJCF( - os.path.join(data.getDataPath(), "mjcf", self.model_xml), - flags=pybullet.URDF_USE_SELF_COLLISION - | pybullet.URDF_USE_SELF_COLLISION_EXCLUDE_ALL_PARENTS - | pybullet.URDF_GOOGLEY_UNDEFINED_COLORS, - ) - ( - self.parts, - self.jdict, - self.ordered_joints, - self.robot_body, - ) = self.addToScene(self._p, self.objects) - else: - self.objects = self._p.loadMJCF( - os.path.join( - data.getDataPath(), - "mjcf", - self.model_xml, - flags=pybullet.URDF_GOOGLEY_UNDEFINED_COLORS, - ) - ) - ( - self.parts, - self.jdict, - self.ordered_joints, - self.robot_body, - ) = self.addToScene(self._p, self.objects) - self.robot_specific_reset(self._p) - - s = ( - self.calc_state() - ) # optimization: calc_state() can calculate something in self.* for calc_potential() to use - - return s - - -class URDFBasedRobot2(URDFBasedRobot): - def __init__( - self, - model_urdf, - robot_name, - action_dim, - obs_dim, - basePosition=[0, 0, 0], - baseOrientation=[0, 0, 0, 1], - fixed_base=False, - self_collision=False, - ): - super().__init__( - model_urdf, - robot_name, - action_dim, - obs_dim, - basePosition, - baseOrientation, - fixed_base, - self_collision, - ) - self.doneLoading = 0 - - def reset(self, bullet_client): - self._p = bullet_client - if self.doneLoading == 0: - self.ordered_joints = [] - self.doneLoading = 1 - if self.self_collision: - ( - self.parts, - self.jdict, - self.ordered_joints, - self.robot_body, - ) = self.addToScene( - self._p, - self._p.loadURDF( - os.path.join(data.getDataPath(), self.model_urdf), - basePosition=self.basePosition, - baseOrientation=self.baseOrientation, - useFixedBase=self.fixed_base, - flags=pybullet.URDF_USE_SELF_COLLISION - | pybullet.URDF_GOOGLEY_UNDEFINED_COLORS, - ), - ) - else: - ( - self.parts, - self.jdict, - self.ordered_joints, - self.robot_body, - ) = self.addToScene( - self._p, - self._p.loadURDF( - os.path.join(data.getDataPath(), self.model_urdf), - basePosition=self.basePosition, - baseOrientation=self.baseOrientation, - useFixedBase=self.fixed_base, - flags=pybullet.URDF_GOOGLEY_UNDEFINED_COLORS, - ), - ) - - self.robot_specific_reset(self._p) - - s = ( - self.calc_state() - ) # optimization: calc_state() can calculate something in self.* for calc_potential() to use - self.potential = self.calc_potential() - - 
return s diff --git a/rlberry/envs/bullet3/pybullet_envs/robot_pendula.py b/rlberry/envs/bullet3/pybullet_envs/robot_pendula.py deleted file mode 100644 index aca31c7c6..000000000 --- a/rlberry/envs/bullet3/pybullet_envs/robot_pendula.py +++ /dev/null @@ -1,46 +0,0 @@ -import gymnasium as gym -import numpy as np - -from rlberry.envs.bullet3.pybullet_envs.robot_bases import URDFBasedRobot2 - - -class Pendulum(URDFBasedRobot2): - swingup = False - - def __init__(self): - # MJCFBasedRobot2.__init__(self, 'pendulum.xml', 'pole', action_dim=1, obs_dim=2) - URDFBasedRobot2.__init__(self, "pendulum.urdf", "pole", action_dim=1, obs_dim=2) - self.action_space = gym.spaces.Box(shape=(1,), low=-20, high=20) - - def robot_specific_reset(self, bullet_client): - self._p = bullet_client - self.pole = self.parts["pole"] - self.j1 = self.jdict["hinge"] - u = self.np_random.uniform(low=-0.1, high=0.1) - self.j1.reset_current_position(u if not self.swingup else np.pi + u, 0) - self.j1.set_motor_torque(0) - - def apply_action(self, a): - assert np.isfinite(a).all() - if not np.isfinite(a).all(): - print("a is inf") - a[0] = 0 - self.j1.set_motor_torque( - np.clip(a[0], self.action_space.low, self.action_space.high) - ) - - def calc_state(self): - self.theta, theta_dot = self.j1.current_position() - if not np.isfinite(self.theta): - print("theta is inf") - self.theta = 0 - - if not np.isfinite(theta_dot): - print("theta_dot is inf") - theta_dot = 0 - - return np.array([self.theta, theta_dot]) - - -class PendulumSwingup(Pendulum): - swingup = True diff --git a/rlberry/envs/classic_control/SpringCartPole.py b/rlberry/envs/classic_control/SpringCartPole.py deleted file mode 100644 index 4bfd5f634..000000000 --- a/rlberry/envs/classic_control/SpringCartPole.py +++ /dev/null @@ -1,604 +0,0 @@ -""" -SpringCartPole environment introduced in J-F. Hren PhD thesis. -""" - -import numpy as np -import rlberry.spaces as spaces -from rlberry.envs.interface import Model -from rlberry.rendering import Scene, GeometricPrimitive, RenderInterface2D -from rlberry.rendering.common_shapes import bar_shape, circle_shape - - -class SpringCartPole(RenderInterface2D, Model): - """ - SpringCartPole is an extension of the CartPole environment proposed in - PhD thesis J-F. Hren. It consists of two carts connected by a spring. - - Parameters - ---------- - dt : float, default=0.02 - Time step of the simulation. - obs_trans : bool, default=True - If True, state has dimension 10: - State: - 'Cart position1', 'Cart velocity1', 'Pole cos1', 'Pole sin1', 'Pole angular velocity1', - 'Cart position2', 'Cart velocity2', 'Pole cos2', 'Pole sin2', 'Pole angular velocity2' - If False, state has dimension 8: - State: - 'Cart position1', 'Cart velocity1', 'Pole angle1', 'Pole angular velocity1', - 'Cart position2', 'Cart velocity2', 'Pole angle2', 'Pole angular velocity2' - swing_up : bool, default=False - If True, the pole starting position is at the bottom - If False, the pole starting position is at the top - random_init : bool, default=True - If True, the noise is added to the carts and poles starting positions - - Notes - ----- - State: - The state consists of the position of cart 1, its speed, the angle - of pole 1 (expressed in radians or in a tuple of cos() and sin()) and - its angular speed, and the same set of values for cart 2 and pole 2. - For both poles, the angle of 0 corresponds to the vertical position, - the positive angles correspond to a counterclockwise rotation. 
- - Actions: - The action is either 0, 1, 2, or 3, corresponding to the four possible - actions: - LL = 0, move cart 1 to the left, cart 2 to the left - RR = 1, move cart 1 to the right, cart 2 to the right - LR = 2, move cart 1 to the left, cart 2 to the right - RL = 3, move cart 1 to the right, cart 2 to the left - The magnitude of actions is fixed to 2.0. - - Reward: - If spring is not deformed (its length is within [self.min_spring_length, self.max_spring_length]) - and the carts stay on the track (|Cart Position| <= self.track_length / 2) - then the reward is ((1 + Pole cos1) + (1 + Pole cos2)) / 4, else reward is 0. - - Reference: - .. seealso:: - J-F. Hren: Planification optimiste pour systèmes déterministes, PhD thesis - .. warning:: - This version of the domain uses the Runge-Kutta method for integrating - the system dynamics and is more realistic than Euler method - """ - - name = "SpringCartPole" - - ACT_RNG = 2.0 - AVAIL_TORQUES = [ - np.array([-2.0, -2.0]), - np.array([2.0, 2.0]), - np.array([-2.0, 2.0]), - np.array([2.0, -2.0]), - ] - - book_or_nips = "book" - action_arrow = None - domain_fig = None - actions_num = 4 - - def __init__(self, dt=0.02, obs_trans=True, swing_up=False, random_init=True): - Model.__init__(self) - RenderInterface2D.__init__(self) - - self.dt = dt - self.gravity = 9.81 - self.track_length = 2.0 - self.L = 0.5 * self.track_length - self.pole_length = 1.0 - self.l = 0.5 * self.pole_length - self.masspole = 0.1 - self.masscart = 1.0 - self.cart_friction = 5e-4 - self.pole_friction = 2e-6 - self.spring = 2.0 - self.normal_spring_length = 0.5 - self.min_spring_length = 0.1 - self.max_spring_length = 1.5 - self.max_velocity = 15.0 - self.ang_velocity = 10.0 - self.force_mag = self.ACT_RNG - self.swing_up = swing_up - self.random_init = random_init - self.obs_trans = obs_trans - - if self.obs_trans: - self.obs_shape = 10 - else: - self.obs_shape = 8 - - # init base classes - self.reward_range = (0.0, 1.0) - - # rendering info - boundy = self.pole_length * 2 + 0.2 - boundx = self.track_length + self.pole_length * 2 + 0.2 - # (left, right, bottom, top) - self.set_clipping_area((-boundx, boundx, -boundy, boundy)) - self.set_refresh_interval(10) # in milliseconds - - # observation and action spaces - if self.obs_trans: - high = np.array( - [ - self.track_length, - np.finfo(np.float32).max, - 1, - 1, - np.finfo(np.float32).max, - self.track_length, - np.finfo(np.float32).max, - 1, - 1, - np.finfo(np.float32).max, - ] - ) - else: - high = np.array( - [ - self.track_length, - np.finfo(np.float32).max, - 2 * np.pi, - np.finfo(np.float32).max, - self.track_length, - np.finfo(np.float32).max, - 2 * np.pi, - np.finfo(np.float32).max, - ] - ) - low = -high - self.observation_space = spaces.Box(low=low, high=high) - self.action_space = spaces.Discrete(4) - - # initialize - self.state = None # state in pos or angles - self.state_ = None # state in angles - self.reset() - - def transform_states(self, state): - """Transform state with dim=8 to the state with dim=10""" - assert state.shape[-1] == 8, "State has wrong shape, should be 8" - shape = list(state.shape) - shape[-1] = 10 - state_ = np.zeros(shape) - state_[..., :2] = state[..., :2] - state_[..., 4:7] = state[..., 3:6] - state_[..., -1] = state[..., -1] - theta1 = state[..., 2] - theta2 = state[..., 6] - state_[..., 2] = np.cos(theta1) - state_[..., 3] = np.sin(theta1) - state_[..., 7] = np.cos(theta2) - state_[..., 8] = np.sin(theta2) - return state_ - - # def trigonometric2angle(self, costheta, sintheta): - # C = 
costheta**2 + sintheta**2 - # costheta, sintheta = costheta / C, sintheta / C - # theta = np.arctan2(sintheta / C, costheta / C) - # return theta - - def reset(self): - if self.random_init: - rand_state = self.rng.uniform(low=-0.1, high=0.1, size=(8,)) - else: - rand_state = np.zeros((8,)) - rand_state[4] += self.normal_spring_length - if self.swing_up: - rand_state[2] += np.pi - rand_state[6] += np.pi - if self.obs_trans: - self.state = self.transform_states(rand_state) - else: - self.state = rand_state - self.state_ = rand_state - return self.state, {} - - def _reward(self): - state = self.state - if state.shape[-1] == 10: - ( - _, - _, - cos1, - sin1, - _, - _, - _, - cos2, - sin2, - _, - ) = np.split(state, 10, axis=-1) - C1 = np.sqrt(cos1**2 + sin1**2) - C2 = np.sqrt(cos2**2 + sin2**2) - cos1 = cos1 / C1 - sin1 = sin1 / C1 - cos2 = cos2 / C2 - sin2 = sin2 / C2 - else: - _, _, theta1, _, _, _, theta2, _ = np.split(state, 8, axis=-1) - cos1 = np.cos(theta1) - sin1 = np.sin(theta1) - cos2 = np.cos(theta2) - sin2 = np.sin(theta2) - - bad_condition = self._terminal() - - pos_reward = (1 + cos1) / 4 + (1 + cos2) / 4 - neg_reward = 0.0 - - return np.where(bad_condition, neg_reward, pos_reward) - - def bound_states(self, state): - assert state.shape[-1] == 8, "state must be of shape (8,)" - x1, x1dot, theta1, theta1dot, x2, x2dot, theta2, theta2dot = np.split( - state, 8, axis=-1 - ) - theta1 = np.asarray(wrap(theta1, -np.pi, np.pi)) - theta2 = np.asarray(wrap(theta2, -np.pi, np.pi)) - x1dot = np.asarray(bound(x1dot, [-self.max_velocity, self.max_velocity])) - x2dot = np.asarray(bound(x2dot, -self.max_velocity, self.max_velocity)) - theta1dot = np.asarray(bound(theta1dot, -self.ang_velocity, self.ang_velocity)) - theta2dot = np.asarray(bound(theta2dot, -self.max_velocity, self.ang_velocity)) - state = np.concatenate( - [x1, x1dot, theta1, theta1dot, x2, x2dot, theta2, theta2dot], axis=-1 - ) - return state - - def step(self, action): - assert self.action_space.contains(action), "%r (%s) invalid" % ( - action, - type(action), - ) - - # save state for rendering - if self.is_render_enabled(): - self.append_state_for_rendering(np.array(self.state_)) - - s = self.state_ - torque = self.AVAIL_TORQUES[action] - - # # Add noise to the force action - # if self.torque_noise_max > 0: - # torque += self.rng.uniform(-self.torque_noise_max, self.torque_noise_max) - - # Now, augment the state with our force action so it can be passed to - # _dsdt - s_augmented = np.append(s, torque) - - try: - from scipy.integrate import solve_ivp - - ns = solve_ivp(lambda t, y: self._dsdt(y, t), [0, self.dt], s_augmented) - ns = ns.y[:, -1] # final timestep - except: - print("Can't import scipy library, use rk4 function") - ns = rk4(self._dsdt, s_augmented, [0, self.dt]) - # only care about final timestep of integration returned by integrator - ns = ns[-1] - - ns = ns[:-2] # omit action - - ns = self.bound_states(ns) - self.state_ = ns - if self.obs_trans: - self.state = self.transform_states(ns) - else: - self.state = ns - terminated = self._terminal() - truncated = False - reward = self._reward()[0] - return self.state, reward, terminated, truncated, {} - - def _terminal(self): - s = self.state_ - x1 = s[0] - x2 = s[4] - bad_condition = False - bad_condition += np.abs(x1) > self.L - bad_condition += np.abs(x2) > self.L - bad_condition += x2 <= x1 - bad_condition += np.abs(x1 - x2) < self.min_spring_length - bad_condition += np.abs(x1 - x2) > self.max_spring_length - - return bool(bad_condition) - - def _dsdt(self, sa, 
t): - assert sa.shape[-1] == 10, "state + action must be of shape (10,)" - x1, x1dot, theta1, theta1dot, x2, x2dot, theta2, theta2dot, a1, a2 = np.split( - sa, 10, axis=-1 - ) - cos1 = np.cos(theta1) - sin1 = np.sin(theta1) - cos2 = np.cos(theta2) - sin2 = np.sin(theta2) - # x1 - size [N, 1] or [L, N, 1] - - f1 = a1 + self.spring * (self.normal_spring_length - np.abs(x1 - x2)) - f2 = a2 + self.spring * (self.normal_spring_length - np.abs(x1 - x2)) - - a11 = 4 * self.l / 3 - a22 = -self.masscart - self.masspole - - a121 = -cos1 - a122 = -cos2 - a211 = self.l * self.masspole * cos1 - a212 = self.l * self.masspole * cos2 - - b11 = ( - self.gravity * sin1 - - self.pole_friction * theta1dot / self.l / self.masspole - ) - b12 = ( - self.gravity * sin2 - - self.pole_friction * theta2dot / self.l / self.masspole - ) - - b21 = ( - self.l * self.masspole * sin1 * theta1dot**2 - - f1 - + self.cart_friction * np.sign(x1dot) - ) - b22 = ( - self.l * self.masspole * sin2 * theta2dot**2 - - f2 - + self.cart_friction * np.sign(x2dot) - ) - - theta1acc = (a121 * b21 - a22 * b11) / (a121 * a211 - a11 * a22) - theta2acc = (a122 * b22 - a22 * b12) / (a122 * a212 - a11 * a22) - - x1acc = (b11 - a11 * theta1acc) / a121 - x2acc = (b12 - a11 * theta2acc) / a122 - - a1dot = np.zeros_like(a1) - a2dot = np.zeros_like(a2) - - return np.concatenate( - [ - x1dot, - x1acc, - theta1dot, - theta1acc, - x2dot, - x2acc, - theta2dot, - theta2acc, - a1dot, - a2dot, - ], - axis=-1, - ) - - # - # Below: code for rendering - # - - def get_background(self): - bg = Scene() - return bg - - def get_scene(self, state): - scene = Scene() - SCALE = 3 - - assert state.shape[-1] == 8, "state must be of shape (8,)" - - x1 = state[0] - x2 = state[4] - theta1 = state[2] - theta2 = state[6] - - cartx1 = x1 * SCALE # MIDDLE OF CART 1 - - cartx2 = x2 * SCALE # MIDDLE OF CART 2 - - cartwidth = 0.05 * SCALE - - c1p1 = ( - cartx1 - cartwidth / 2, - 0, - ) - c1p2 = ( - cartx1 + cartwidth / 2, - 0, - ) - - c2p1 = ( - cartx2 - cartwidth / 2, - 0, - ) - c2p2 = ( - cartx2 + cartwidth / 2, - 0, - ) - - p1 = ( - cartx1 - np.sin(theta1) * self.pole_length * SCALE, - np.cos(theta1) * self.pole_length * SCALE, - ) - - p01 = (cartx1, 0) - p02 = (cartx2, 0) - - p2 = ( - cartx2 - np.sin(theta2) * self.pole_length * SCALE, - np.cos(theta2) * self.pole_length * SCALE, - ) - - cart1 = bar_shape(c1p1, c1p2, 0.02 * SCALE) - cart1.set_color((255 / 255, 100 / 255, 0 / 255)) - - cart2 = bar_shape(c2p1, c2p2, 0.02 * SCALE) - cart2.set_color((255 / 255, 100 / 255, 0 / 255)) - - pole1 = bar_shape(p01, p1, 0.01 * SCALE) - pole1.set_color((255 / 255, 215 / 255, 0 / 255)) - - pole2 = bar_shape(p02, p2, 0.01 * SCALE) - pole2.set_color((255 / 255, 215 / 255, 0 / 255)) - - spring = bar_shape( - p01, - p02, - 0.03 - * np.sqrt(self.normal_spring_length) - * SCALE - / np.sqrt(cartx2 - cartx1), - ) - spring.set_color((50 / 255, 50 / 255, 50 / 255)) - - joint1 = circle_shape(p01, 0.03) - joint1.set_color((0 / 255, 255 / 255, 0 / 255)) - - joint2 = circle_shape(p02, 0.03) - joint2.set_color((0 / 255, 255 / 255, 0 / 255)) - - track_line = GeometricPrimitive("LINES") - track_line.add_vertex((-self.track_length / 2 * SCALE, -0.02 * SCALE)) - track_line.add_vertex((self.track_length / 2 * SCALE, -0.02 * SCALE)) - - axis1 = GeometricPrimitive("LINES") - axis1.add_vertex((cartx1, 0)) - axis1.add_vertex((cartx1, self.pole_length * SCALE)) - axis1.set_color((250 / 255, 250 / 255, 250 / 255)) - - axis2 = GeometricPrimitive("LINES") - axis2.add_vertex((cartx2, 0)) - axis2.add_vertex((cartx2, 
self.pole_length * SCALE)) - axis2.set_color((250 / 255, 250 / 255, 250 / 255)) - - scene.add_shape(cart1) - scene.add_shape(cart2) - scene.add_shape(pole1) - scene.add_shape(pole2) - scene.add_shape(joint1) - scene.add_shape(joint2) - scene.add_shape(spring) - scene.add_shape(track_line) - - return scene - - -def wrap(x, m, M): - """Wraps ``x`` so m <= x <= M; but unlike ``bound()`` which - truncates, ``wrap()`` wraps x around the coordinate system defined - by m, M. - For example, m = -180, M = 180 (degrees), x = 360 --> returns 0. - - Parameters - ---------- - x: a scalar - m: - minimum possible value in range - M: - maximum possible value in range - - Returns - ------- - x: - a scalar, wrapped - """ - diff = M - m - while x > M: - x = x - diff - while x < m: - x = x + diff - return x - - -def bound(x, m, M=None): - """Either have m as scalar, so bound(x,m,M) which returns m <= x <= M *OR* - have m as length 2 vector, bound(x,m, ) returns m[0] <= x <= m[1]. - - Parameters - ---------- - x: - scalar - - Returns - ------- - x: - scalar, bound between min (m) and Max (M) - """ - if M is None: - M = m[1] - m = m[0] - # bound x between min (m) and Max (M) - return np.clip(x, m, M) - - -def rk4(derivs, y0, t, *args, **kwargs): - """ - Integrate 1D or ND system of ODEs using 4-th order Runge-Kutta. - This is a toy implementation which may be useful if you find - yourself stranded on a system w/o scipy. Otherwise use - :func:`scipy.integrate`. - - Parameters: - ----------- - derivs: - the derivative of the system and has the signature - ``dy = derivs(yi, ti)`` - y0: - initial state vector - t: - sample times - args: - additional arguments passed to the derivative function - kwargs: - additional keyword arguments passed to the derivative function - - Returns - ------- - yout: - Runge-Kutta approximation of the ODE - - Examples - -------- - Example 1:: - ## 2D system - def derivs6(x,t): - d1 = x[0] + 2*x[1] - d2 = -3*x[0] + 4*x[1] - return (d1, d2) - dt = 0.0005 - t = arange(0.0, 2.0, dt) - y0 = (1,2) - yout = rk4(derivs6, y0, t) - - Example 2:: - ## 1D system - alpha = 2 - def derivs(x,t): - return -alpha*x + exp(-t) - y0 = 1 - yout = rk4(derivs, y0, t) - - If you have access to scipy, you should probably be using the - scipy.integrate tools rather than this function. 
- """ - - try: - Ny = len(y0) - except TypeError: - yout = np.zeros((len(t),), np.float_) - else: - yout = np.zeros((len(t), Ny), np.float_) - - yout[0] = y0 - - for i in np.arange(len(t) - 1): - thist = t[i] - dt = t[i + 1] - thist - dt2 = dt / 2.0 - y0 = yout[i] - - k1 = np.asarray(derivs(y0, thist, *args, **kwargs)) - k2 = np.asarray(derivs(y0 + dt2 * k1, thist + dt2, *args, **kwargs)) - k3 = np.asarray(derivs(y0 + dt2 * k2, thist + dt2, *args, **kwargs)) - k4 = np.asarray(derivs(y0 + dt * k3, thist + dt, *args, **kwargs)) - yout[i + 1] = y0 + dt / 6.0 * (k1 + 2 * k2 + 2 * k3 + k4) - return yout diff --git a/rlberry/envs/classic_control/__init__.py b/rlberry/envs/classic_control/__init__.py deleted file mode 100644 index a6cd76c14..000000000 --- a/rlberry/envs/classic_control/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .mountain_car import MountainCar -from .acrobot import Acrobot -from .pendulum import Pendulum -from .SpringCartPole import SpringCartPole diff --git a/rlberry/envs/classic_control/acrobot.py b/rlberry/envs/classic_control/acrobot.py deleted file mode 100644 index 2404b66e1..000000000 --- a/rlberry/envs/classic_control/acrobot.py +++ /dev/null @@ -1,394 +0,0 @@ -""" -Acrobot environment adapted from OpenAI gym [1]. (updated to gymnasium template [2]) - -Modifications: -* define reward_range -* render function follows the rlberry rendering interface. - -[1] https://github.com/openai/gym/blob/master/gym/ -[2] https://gymnasium.farama.org/api/env/ -envs/classic_control/acrobot.py -""" - -import numpy as np -import rlberry.spaces as spaces -from rlberry.envs.interface import Model -from rlberry.rendering import Scene, GeometricPrimitive, RenderInterface2D -from rlberry.rendering.common_shapes import bar_shape, circle_shape - -__copyright__ = "Copyright 2013, RLPy http://acl.mit.edu/RLPy" -__credits__ = [ - "Alborz Geramifard", - "Robert H. Klein", - "Christoph Dann", - "William Dabney", - "Jonathan P. How", -] -__license__ = "BSD 3-Clause" -__author__ = "Christoph Dann " - - -# SOURCE: -# https://github.com/rlpy/rlpy/blob/master/rlpy/Domains/Acrobot.py - - -class Acrobot(RenderInterface2D, Model): - """ - Acrobot is a 2-link pendulum with only the second joint actuated. - Initially, both links point downwards. The goal is to swing the - end-effector at a height at least the length of one link above the base. - Both links can swing freely and can pass by each other, i.e., they don't - collide when they have the same angle. - - Notes - ----- - State: - The state consists of the sin() and cos() of the two rotational joint - angles and the joint angular velocities: - [cos(theta1) sin(theta1) cos(theta2) sin(theta2) thetaDot1 thetaDot2]. - For the first link, an angle of 0 corresponds to the link pointing - downwards. - The angle of the second link is relative to the angle of the first link. - An angle of 0 corresponds to having the same angle between the two links. - A state of [1, 0, 1, 0, ..., ...] means that both links point downwards. - - Actions: - The action is either applying +1, 0 or -1 torque on the joint between - the two pendulum links. - .. note:: - The dynamics equations were missing some terms in the NIPS paper which - are present in the book. R. Sutton confirmed in personal correspondence - that the experimental results shown in the paper and the book were - generated with the equations shown in the book. - However, there is the option to run the domain with the paper equations - by setting book_or_nips = 'nips' - - Reference: - .. seealso:: - R. 
Sutton: Generalization in Reinforcement Learning: - Successful Examples Using Sparse Coarse Coding (NIPS 1996) - .. seealso:: - R. Sutton and A. G. Barto: - Reinforcement learning: An introduction. - Cambridge: MIT press, 1998. - .. warning:: - This version of the domain uses the Runge-Kutta method for integrating - the system dynamics and is more realistic, but also considerably harder - than the original version which employs Euler integration, - see the AcrobotLegacy class. - """ - - name = "Acrobot" - - dt = 0.2 - - LINK_LENGTH_1 = 1.0 # [m] - LINK_LENGTH_2 = 1.0 # [m] - LINK_MASS_1 = 1.0 #: [kg] mass of link 1 - LINK_MASS_2 = 1.0 #: [kg] mass of link 2 - LINK_COM_POS_1 = 0.5 #: [m] position of the center of mass of link 1 - LINK_COM_POS_2 = 0.5 #: [m] position of the center of mass of link 2 - LINK_MOI = 1.0 #: moments of inertia for both links - - MAX_VEL_1 = 4 * np.pi - MAX_VEL_2 = 9 * np.pi - - AVAIL_TORQUE = [-1.0, 0.0, +1] - - torque_noise_max = 0.0 - - #: use dynamics equations from the nips paper or the book - book_or_nips = "book" - action_arrow = None - domain_fig = None - actions_num = 3 - - def __init__(self): - # init base classes - Model.__init__(self) - RenderInterface2D.__init__(self) - self.reward_range = (-1.0, 0.0) - - # rendering info - bound = self.LINK_LENGTH_1 + self.LINK_LENGTH_2 + 0.2 - # (left, right, bottom, top) - self.set_clipping_area((-bound, bound, -bound, bound)) - self.set_refresh_interval(10) # in milliseconds - - # observation and action spaces - high = np.array([1.0, 1.0, 1.0, 1.0, self.MAX_VEL_1, self.MAX_VEL_2]) - low = -high - self.observation_space = spaces.Box(low=low, high=high) - self.action_space = spaces.Discrete(3) - - # initialize - self.state = None - self.reset() - - def reset(self, seed=None, options=None): - self.state = self.rng.uniform(low=-0.1, high=0.1, size=(4,)) - return self._get_ob(), {} - - def step(self, action): - assert self.action_space.contains(action), "%r (%s) invalid" % ( - action, - type(action), - ) - - # save state for rendering - if self.is_render_enabled(): - self.append_state_for_rendering(np.array(self.state)) - - s = self.state - torque = self.AVAIL_TORQUE[action] - - # Add noise to the force action - if self.torque_noise_max > 0: - torque += self.rng.uniform(-self.torque_noise_max, self.torque_noise_max) - - # Now, augment the state with our force action so it can be passed to - # _dsdt - s_augmented = np.append(s, torque) - - ns = rk4(self._dsdt, s_augmented, [0, self.dt]) - # only care about final timestep of integration returned by integrator - ns = ns[-1] - ns = ns[:4] # omit action - # ODEINT IS TOO SLOW! 
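The Acrobot docstring above describes a 6-D observation [cos(theta1), sin(theta1), cos(theta2), sin(theta2), thetaDot1, thetaDot2] and a goal of raising the end-effector one link length above the base. As an illustrative helper (not diff content; the function name is hypothetical), the corresponding tip height can be recovered from that observation, and the environment terminates once it exceeds 1.0:

    import numpy as np

    def tip_height(obs, l1=1.0, l2=1.0):
        """Height of the Acrobot end-effector above the pivot, from the 6-D observation."""
        theta1 = np.arctan2(obs[1], obs[0])
        theta2 = np.arctan2(obs[3], obs[2])
        return -l1 * np.cos(theta1) - l2 * np.cos(theta1 + theta2)

    # The environment's terminal test is equivalent to: tip_height(obs) > 1.0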
- # ns_continuous = integrate.odeint(self._dsdt, self.s_continuous, - # [0, self.dt]) - # self.s_continuous = ns_continuous[-1] # We only care about the state - # at the ''final timestep'', self.dt - - ns[0] = wrap(ns[0], -np.pi, np.pi) - ns[1] = wrap(ns[1], -np.pi, np.pi) - ns[2] = bound(ns[2], -self.MAX_VEL_1, self.MAX_VEL_1) - ns[3] = bound(ns[3], -self.MAX_VEL_2, self.MAX_VEL_2) - self.state = ns - terminated = self._terminal() - truncated = False - reward = -1.0 if not terminated else 0.0 - return self._get_ob(), reward, terminated, truncated, {} - - def _get_ob(self): - s = self.state - return np.array( - [np.cos(s[0]), np.sin(s[0]), np.cos(s[1]), np.sin(s[1]), s[2], s[3]] - ) - - def _terminal(self): - s = self.state - return bool(-np.cos(s[0]) - np.cos(s[1] + s[0]) > 1.0) - - def _dsdt(self, s_augmented, t): - m1 = self.LINK_MASS_1 - m2 = self.LINK_MASS_2 - l1 = self.LINK_LENGTH_1 - lc1 = self.LINK_COM_POS_1 - lc2 = self.LINK_COM_POS_2 - I1 = self.LINK_MOI - I2 = self.LINK_MOI - g = 9.8 - a = s_augmented[-1] - s = s_augmented[:-1] - theta1 = s[0] - theta2 = s[1] - dtheta1 = s[2] - dtheta2 = s[3] - d1 = ( - m1 * lc1**2 - + m2 * (l1**2 + lc2**2 + 2 * l1 * lc2 * np.cos(theta2)) - + I1 - + I2 - ) - d2 = m2 * (lc2**2 + l1 * lc2 * np.cos(theta2)) + I2 - phi2 = m2 * lc2 * g * np.cos(theta1 + theta2 - np.pi / 2.0) - phi1 = ( - -m2 * l1 * lc2 * dtheta2**2 * np.sin(theta2) - - 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * np.sin(theta2) - + (m1 * lc1 + m2 * l1) * g * np.cos(theta1 - np.pi / 2) - + phi2 - ) - if self.book_or_nips == "nips": - # the following line is consistent with the description in the - # paper - ddtheta2 = (a + d2 / d1 * phi1 - phi2) / (m2 * lc2**2 + I2 - d2**2 / d1) - else: - # the following line is consistent with the java implementation - # and the book - ddtheta2 = ( - a - + d2 / d1 * phi1 - - m2 * l1 * lc2 * dtheta1**2 * np.sin(theta2) - - phi2 - ) / (m2 * lc2**2 + I2 - d2**2 / d1) - ddtheta1 = -(d2 * ddtheta2 + phi1) / d1 - return (dtheta1, dtheta2, ddtheta1, ddtheta2, 0.0) - - # - # Below: code for rendering - # - - def get_background(self): - bg = Scene() - return bg - - def get_scene(self, state): - scene = Scene() - - p0 = (0.0, 0.0) - - p1 = ( - self.LINK_LENGTH_1 * np.sin(state[0]), - -self.LINK_LENGTH_1 * np.cos(state[0]), - ) - p2 = ( - p1[0] + self.LINK_LENGTH_2 * np.sin(state[0] + state[1]), - p1[1] - self.LINK_LENGTH_2 * np.cos(state[0] + state[1]), - ) - - link1 = bar_shape(p0, p1, 0.1) - link1.set_color((255 / 255, 140 / 255, 0 / 255)) - - link2 = bar_shape(p1, p2, 0.1) - link2.set_color((210 / 255, 105 / 255, 30 / 255)) - - joint1 = circle_shape(p0, 0.075) - joint1.set_color((255 / 255, 215 / 255, 0 / 255)) - - joint2 = circle_shape(p1, 0.075) - joint2.set_color((255 / 255, 215 / 255, 0 / 255)) - - goal_line = GeometricPrimitive("LINES") - goal_line.add_vertex((-5, 1)) - goal_line.add_vertex((5, 1)) - - scene.add_shape(link1) - scene.add_shape(link2) - scene.add_shape(joint1) - scene.add_shape(joint2) - scene.add_shape(goal_line) - - return scene - - -def wrap(x, m, M): - """Wraps ``x`` so m <= x <= M; but unlike ``bound()`` which - truncates, ``wrap()`` wraps x around the coordinate system defined - by m, M. - For example, m = -180, M = 180 (degrees), x = 360 --> returns 0. 
- - Parameters - ---------- - x: a scalar - m: - minimum possible value in range - M: - maximum possible value in range - - Returns - ------- - x: - a scalar, wrapped - """ - diff = M - m - while x > M: - x = x - diff - while x < m: - x = x + diff - return x - - -def bound(x, m, M=None): - """Either have m as scalar, so bound(x,m,M) which returns m <= x <= M *OR* - have m as length 2 vector, bound(x,m, ) returns m[0] <= x <= m[1]. - - Parameters - ---------- - x: - scalar - - Returns - ------- - x: - scalar, bound between min (m) and Max (M) - """ - if M is None: - M = m[1] - m = m[0] - # bound x between min (m) and Max (M) - return min(max(x, m), M) - - -def rk4(derivs, y0, t, *args, **kwargs): - """ - Integrate 1D or ND system of ODEs using 4-th order Runge-Kutta. - This is a toy implementation which may be useful if you find - yourself stranded on a system w/o scipy. Otherwise use - :func:`scipy.integrate`. - - Parameters: - ----------- - derivs: - the derivative of the system and has the signature - ``dy = derivs(yi, ti)`` - y0: - initial state vector - t: - sample times - args: - additional arguments passed to the derivative function - kwargs: - additional keyword arguments passed to the derivative function - - Returns - ------- - yout: - Runge-Kutta approximation of the ODE - - Examples - -------- - Example 1:: - ## 2D system - def derivs6(x,t): - d1 = x[0] + 2*x[1] - d2 = -3*x[0] + 4*x[1] - return (d1, d2) - dt = 0.0005 - t = arange(0.0, 2.0, dt) - y0 = (1,2) - yout = rk4(derivs6, y0, t) - - Example 2:: - ## 1D system - alpha = 2 - def derivs(x,t): - return -alpha*x + exp(-t) - y0 = 1 - yout = rk4(derivs, y0, t) - - If you have access to scipy, you should probably be using the - scipy.integrate tools rather than this function. - """ - - try: - Ny = len(y0) - except TypeError: - yout = np.zeros((len(t),), np.float_) - else: - yout = np.zeros((len(t), Ny), np.float_) - - yout[0] = y0 - - for i in np.arange(len(t) - 1): - thist = t[i] - dt = t[i + 1] - thist - dt2 = dt / 2.0 - y0 = yout[i] - - k1 = np.asarray(derivs(y0, thist, *args, **kwargs)) - k2 = np.asarray(derivs(y0 + dt2 * k1, thist + dt2, *args, **kwargs)) - k3 = np.asarray(derivs(y0 + dt2 * k2, thist + dt2, *args, **kwargs)) - k4 = np.asarray(derivs(y0 + dt * k3, thist + dt, *args, **kwargs)) - yout[i + 1] = y0 + dt / 6.0 * (k1 + 2 * k2 + 2 * k3 + k4) - return yout diff --git a/rlberry/envs/classic_control/mountain_car.py b/rlberry/envs/classic_control/mountain_car.py deleted file mode 100644 index ff3cb1335..000000000 --- a/rlberry/envs/classic_control/mountain_car.py +++ /dev/null @@ -1,202 +0,0 @@ -""" -Mountain Car environment adapted from OpenAI gym [1]. (updated to gymnasium template [2]) - -* default reward is 0 (instead of -1) -* reward in goal state is 1 (instead of 0) -* also implemented as a generative model (in addition to an online model) -* render function follows the rlberry rendering interface. - -[1] https://github.com/openai/gym/blob/master/gym/envs/ -[2] https://gymnasium.farama.org/api/env/ -classic_control/mountain_car.py -""" - -import math - -import numpy as np - -import rlberry.spaces as spaces -from rlberry.envs.interface import Model -from rlberry.rendering import Scene, GeometricPrimitive, RenderInterface2D - - -class MountainCar(RenderInterface2D, Model): - """ - The agent (a car) is started at the bottom of a valley. For any given - state the agent may choose to accelerate to the left, right or cease - any acceleration. 
- - Notes - ----- - Source: - The environment appeared first in Andrew Moore's PhD Thesis (1990). - - Observation: - Type: Box(2) - Num Observation Min Max - 0 Car Position -1.2 0.6 - 1 Car Velocity -0.07 0.07 - - Actions: - Type: Discrete(3) - Num Action - 0 Accelerate to the Left - 1 Don't accelerate - 2 Accelerate to the Right - - Note: This does not affect the amount of velocity affected by the - gravitational pull acting on the car. - - Reward: - Reward of 1 is awarded if the agent reached the flag (position = 0.5) - on top of the mountain. - Reward of 0 is awarded if the position of the agent is less than 0.5. - - Starting State: - The position of the car is assigned a uniform random value in - [-0.6 , -0.4]. - The starting velocity of the car is always assigned to 0. - - Episode Termination: - The car position is more than 0.5 - """ - - name = "MountainCar" - - def __init__(self, goal_velocity=0): - # init base classes - Model.__init__(self) - RenderInterface2D.__init__(self) - - self.min_position = -1.2 - self.max_position = 0.6 - self.max_speed = 0.07 - self.goal_position = 0.5 - self.goal_velocity = goal_velocity - - self.force = 0.001 - self.gravity = 0.0025 - - self.low = np.array([self.min_position, -self.max_speed]) - self.high = np.array([self.max_position, self.max_speed]) - - self.action_space = spaces.Discrete(3) - self.observation_space = spaces.Box(self.low, self.high) - - self.reward_range = (0.0, 1.0) - - # rendering info - self.set_clipping_area((-1.2, 0.6, -0.2, 1.1)) - self.set_refresh_interval(10) # in milliseconds - - # initial reset - self.reset() - - def step(self, action): - assert self.action_space.contains(action), "%r (%s) invalid" % ( - action, - type(action), - ) - - # save state for rendering - if self.is_render_enabled(): - self.append_state_for_rendering(np.array(self.state)) - - next_state, reward, terminated, truncated, info = self.sample( - self.state, action - ) - self.state = next_state.copy() - - return next_state, reward, terminated, truncated, info - - def reset(self, seed=None, options=None): - self.state = np.array([self.rng.uniform(low=-0.6, high=-0.4), 0]) - return self.state.copy(), {} - - def sample(self, state, action): - if not isinstance(state, np.ndarray): - state = np.array(state) - assert self.observation_space.contains( - state - ), "Invalid state as argument of reset()." 
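The sample() method above provides the generative-model interface mentioned in the module docstring: transitions can be queried from arbitrary states without stepping the environment's own state. A short sketch (illustration only, not diff content; import path taken from the removed classic_control/__init__.py):

    import numpy as np
    from rlberry.envs.classic_control import MountainCar

    env = MountainCar()
    state = np.array([-0.5, 0.0])  # (position, velocity) inside the observation space
    # action 2 = accelerate to the right; sample() does not modify env.state
    next_state, reward, terminated, truncated, info = env.sample(state, 2)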
- assert self.action_space.contains(action), "%r (%s) invalid" % ( - action, - type(action), - ) - - position = state[0] - velocity = state[1] - velocity += (action - 1) * self.force + math.cos(3 * position) * (-self.gravity) - velocity = np.clip(velocity, -self.max_speed, self.max_speed) - position += velocity - position = np.clip(position, self.min_position, self.max_position) - if position == self.min_position and velocity < 0: - velocity = 0 - - terminated = bool( - position >= self.goal_position and velocity >= self.goal_velocity - ) - truncated = False - done = terminated or truncated - reward = 0.0 - if done: - reward = 1.0 - - next_state = np.array([position, velocity]) - return next_state, reward, terminated, truncated, {} - - @staticmethod - def _height(xs): - return np.sin(3 * xs) * 0.45 + 0.55 - - # - # Below: code for rendering - # - - def get_background(self): - bg = Scene() - mountain = GeometricPrimitive("TRIANGLE_FAN") - flag = GeometricPrimitive("TRIANGLES") - mountain.set_color((0.6, 0.3, 0.0)) - flag.set_color((0.0, 0.5, 0.0)) - - # Mountain - mountain.add_vertex((-0.3, -1.0)) - mountain.add_vertex((0.6, -1.0)) - - n_points = 50 - obs_range = self.observation_space.high[0] - self.observation_space.low[0] - eps = obs_range / (n_points - 1) - for ii in reversed(range(n_points)): - x = self.observation_space.low[0] + ii * eps - y = self._height(x) - mountain.add_vertex((x, y)) - mountain.add_vertex((-1.2, -1.0)) - - # Flag - goal_x = self.goal_position - goal_y = self._height(goal_x) - flag.add_vertex((goal_x, goal_y)) - flag.add_vertex((goal_x + 0.025, goal_y + 0.075)) - flag.add_vertex((goal_x - 0.025, goal_y + 0.075)) - - bg.add_shape(mountain) - bg.add_shape(flag) - - return bg - - def get_scene(self, state): - scene = Scene() - - agent = GeometricPrimitive("QUADS") - agent.set_color((0.0, 0.0, 0.0)) - size = 0.025 - x = state[0] - y = self._height(x) - agent.add_vertex((x - size, y - size)) - agent.add_vertex((x + size, y - size)) - agent.add_vertex((x + size, y + size)) - agent.add_vertex((x - size, y + size)) - - scene.add_shape(agent) - return scene diff --git a/rlberry/envs/classic_control/pendulum.py b/rlberry/envs/classic_control/pendulum.py deleted file mode 100644 index 972db1ceb..000000000 --- a/rlberry/envs/classic_control/pendulum.py +++ /dev/null @@ -1,132 +0,0 @@ -""" -Pendulum environment adapted from OpenAI gym [1]. (updated to gymnasium template [2]) - -Modifications: -* render function follows the rlberry rendering interface - -[1] https://github.com/openai/gym/blob/master/gym/ -[2] https://gymnasium.farama.org/api/env/ -envs/classic_control/pendulum.py -""" - -import numpy as np -import rlberry.spaces as spaces -from rlberry.envs.interface import Model -from rlberry.rendering import Scene, RenderInterface2D -from rlberry.rendering.common_shapes import bar_shape, circle_shape - - -class Pendulum(RenderInterface2D, Model): - """ - The inverted pendulum swingup problem is a classic problem - in the control literature. In this version of the problem, - the pendulum starts in a random position, and the goal - is to swing it up so it stays upright. 
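A minimal interaction sketch for the Pendulum environment described above (illustration only, not diff content); it assumes the Box torque action space and the [cos(theta), sin(theta), theta_dot] observation defined in the class body:

    from rlberry.envs.classic_control import Pendulum

    env = Pendulum()
    obs, info = env.reset()  # obs = [cos(theta), sin(theta), theta_dot]
    for _ in range(50):
        torque = env.action_space.sample()  # shape-(1,) array in [-max_torque, max_torque]
        obs, reward, terminated, truncated, info = env.step(torque)  # reward = -cost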
- """ - - name = "Pendulum" - - def __init__(self): - # init base classes - Model.__init__(self) - RenderInterface2D.__init__(self) - - # environment parameters - self.max_speed = 8.0 - self.max_torque = 2.0 - self.dt = 0.5 - self.gravity = 10.0 - self.mass = 1.0 - self.length = 1.0 - - # rendering info - self.set_clipping_area((-2.2, 2.2, -2.2, 2.2)) - self.set_refresh_interval(10) - - # observation and action spaces - high = np.array([1.0, 1.0, self.max_speed]) - low = -high - self.action_space = spaces.Box( - low=-self.max_torque, high=self.max_torque, shape=(1,) - ) - self.observation_space = spaces.Box(low=low, high=high) - - # initialize - self.reset() - - def reset(self, seed=None, options=None): - high = np.array([np.pi, 1]) - low = -high - self.state = self.rng.uniform(low=low, high=high) - self.last_action = None - return self._get_ob(), {} - - def step(self, action): - assert self.action_space.contains(action), "%r (%s) invalid" % ( - action, - type(action), - ) - - # save state for rendering - if self.is_render_enabled(): - self.append_state_for_rendering(np.array(self.state)) - - theta, thetadot = self.state - gravity = self.gravity - mass = self.mass - length = self.length - dt = self.dt - - action = np.clip(action, -self.max_torque, self.max_torque)[0] - self.last_action = action # for rendering - costs = ( - angle_normalize(theta) ** 2 + 0.1 * thetadot**2 + 0.001 * (action**2) - ) - - # compute the next state after action - newthetadot = ( - thetadot - + ( - -3 * gravity / (2 * length) * np.sin(theta + np.pi) - + 3.0 / (mass * length**2) * action - ) - * dt - ) - newtheta = theta + newthetadot * dt - newthetadot = np.clip(newthetadot, -self.max_speed, self.max_speed) - - self.state = np.array([newtheta, newthetadot]) - return self._get_ob(), -costs, False, False, {} - - def _get_ob(self): - theta, thetadot = self.state - return np.array([np.cos(theta), np.sin(theta), thetadot]) - - # - # Below code for rendering - # - - def get_background(self): - bg = Scene() - return bg - - def get_scene(self, state): - scene = Scene() - - p0 = (0.0, 0.0) - p1 = (self.length * np.sin(state[0]), -self.length * np.cos(state[0])) - - link = bar_shape(p0, p1, 0.1) - link.set_color((255 / 255, 105 / 255, 30 / 255)) - - joint = circle_shape(p0, 0.075) - joint.set_color((255 / 255, 215 / 255, 0 / 255)) - - scene.add_shape(link) - scene.add_shape(joint) - - return scene - - -def angle_normalize(x): - return ((x + np.pi) % (2 * np.pi)) - np.pi diff --git a/rlberry/envs/finite/__init__.py b/rlberry/envs/finite/__init__.py deleted file mode 100644 index 036e4520a..000000000 --- a/rlberry/envs/finite/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .finite_mdp import FiniteMDP -from .gridworld import GridWorld -from .chain import Chain diff --git a/rlberry/envs/finite/chain.py b/rlberry/envs/finite/chain.py deleted file mode 100644 index da333d713..000000000 --- a/rlberry/envs/finite/chain.py +++ /dev/null @@ -1,132 +0,0 @@ -import numpy as np - -from rlberry.envs.finite import FiniteMDP -from rlberry.rendering import RenderInterface2D, Scene, GeometricPrimitive - - -class Chain(RenderInterface2D, FiniteMDP): - """ - Simple chain environment. - Reward 0.05 in initial state, reward 1.0 in final state. 
- - Parameters - ---------- - L : int - length of the chain - fail_prob : double - fail probability - """ - - name = "Chain" - - def __init__(self, L=5, fail_prob=0.1): - assert L >= 2 - self.L = L - self.fail_prob = fail_prob - - # transition probabilities - P = np.zeros((L, 2, L)) - for ss in range(L): - for _ in range(2): - if ss == 0: - P[ss, 0, ss] = 1.0 - fail_prob # action 0 = don't move - P[ss, 1, ss + 1] = 1.0 - fail_prob # action 1 = right - P[ss, 0, ss + 1] = fail_prob - P[ss, 1, ss] = fail_prob - elif ss == L - 1: - P[ss, 0, ss - 1] = 1.0 - fail_prob # action 0 = left - P[ss, 1, ss] = 1.0 - fail_prob # action 1 = don't move - P[ss, 0, ss] = fail_prob - P[ss, 1, ss - 1] = fail_prob - else: - P[ss, 0, ss - 1] = 1.0 - fail_prob # action 0 = left - P[ss, 1, ss + 1] = 1.0 - fail_prob # action 1 = right - P[ss, 0, ss + 1] = fail_prob - P[ss, 1, ss - 1] = fail_prob - - # mean reward - S = L - A = 2 - R = np.zeros((S, A)) - R[L - 1, :] = 1.0 - R[0, :] = 0.05 - - # init base classes - FiniteMDP.__init__(self, R, P, initial_state_distribution=0) - RenderInterface2D.__init__(self) - self.reward_range = (0.0, 1.0) - - # rendering info - self.set_clipping_area((0, L, 0, 1)) - self.set_refresh_interval(100) # in milliseconds - - def step(self, action): - assert action in self._actions, "Invalid action!" - - # save state for rendering - if self.is_render_enabled(): - self.append_state_for_rendering(self.state) - - # take step - next_state, reward, terminated, truncated, info = self.sample( - self.state, action - ) - - self.state = next_state - return next_state, reward, terminated, truncated, info - - # - # Code for rendering - # - - def get_background(self): - """ - Returne a scene (list of shapes) representing the background - """ - bg = Scene() - colors = [(0.8, 0.8, 0.8), (0.9, 0.9, 0.9)] - for ii in range(self.L): - shape = GeometricPrimitive("QUADS") - shape.add_vertex((ii, 0)) - shape.add_vertex((ii + 1, 0)) - shape.add_vertex((ii + 1, 1)) - shape.add_vertex((ii, 1)) - shape.set_color(colors[ii % 2]) - bg.add_shape(shape) - - flag = GeometricPrimitive("TRIANGLES") - flag.set_color((0.0, 0.5, 0.0)) - x = self.L - 0.5 - y = 0.25 - flag.add_vertex((x, y)) - flag.add_vertex((x + 0.25, y + 0.5)) - flag.add_vertex((x - 0.25, y + 0.5)) - bg.add_shape(flag) - - return bg - - def get_scene(self, state): - """ - Return scene (list of shapes) representing a given state - """ - scene = Scene() - - agent = GeometricPrimitive("QUADS") - agent.set_color((0.75, 0.0, 0.5)) - - size = 0.25 - x = state + 0.5 - y = 0.5 - - agent.add_vertex((x - size / 4.0, y - size)) - agent.add_vertex((x + size / 4.0, y - size)) - agent.add_vertex((x + size / 4.0, y + size)) - agent.add_vertex((x - size / 4.0, y + size)) - - agent.add_vertex((x - size, y - size / 4.0)) - agent.add_vertex((x + size, y - size / 4.0)) - agent.add_vertex((x + size, y + size / 4.0)) - agent.add_vertex((x - size, y + size / 4.0)) - - scene.add_shape(agent) - return scene diff --git a/rlberry/envs/finite/gridworld.py b/rlberry/envs/finite/gridworld.py deleted file mode 100644 index ce585317d..000000000 --- a/rlberry/envs/finite/gridworld.py +++ /dev/null @@ -1,490 +0,0 @@ -import matplotlib -import numpy as np - -import matplotlib.pyplot as plt -from matplotlib import cm - -from rlberry.envs.finite import FiniteMDP -from rlberry.envs.finite import gridworld_utils -from rlberry.rendering import Scene, GeometricPrimitive, RenderInterface2D -from rlberry.rendering.common_shapes import circle_shape - - -import rlberry - -logger = rlberry.logger - - 
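# --- Illustrative usage sketch (not part of the patch) -------------------------
# The Chain environment deleted above is moved out of rlberry; the updated test
# imports later in this patch pull it from the rlberry-research package instead.
# A minimal sketch under that assumption, reusing the constructor arguments and
# the gymnasium-style step API shown in the deleted code:
from rlberry_research.envs.finite import Chain

env = Chain(L=5, fail_prob=0.1)              # 5-state chain, 10% slip probability
observation, info = env.reset()
for _ in range(10):
    action = env.action_space.sample()       # 0: left (or stay), 1: right (or stay)
    observation, reward, terminated, truncated, info = env.step(action)
# -------------------------------------------------------------------------------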
-class GridWorld(RenderInterface2D, FiniteMDP): - """ - Simple GridWorld environment. - - Parameters - ----------- - nrows : int - number of rows - ncols : int - number of columns - start_coord : tuple - tuple with coordinates of initial position - terminal_states : tuple - ((row_0, col_0), (row_1, col_1), ...) = coordinates of - terminal states - success_probability : double - probability of moving in the chosen direction - reward_at: dict - dictionary, keys = tuple containing coordinates, values = reward - at each coordinate - walls : tuple - ((row_0, col_0), (row_1, col_1), ...) = coordinates of walls - default_reward : double - reward received at states not in 'reward_at' - - """ - - name = "GridWorld" - - def __init__( - self, - nrows=5, - ncols=5, - start_coord=(0, 0), - terminal_states=None, - success_probability=0.9, - reward_at=None, - walls=((1, 1), (2, 2)), - default_reward=0.0, - ): - # Grid dimensions - self.nrows = nrows - self.ncols = ncols - - # Reward parameters - self.default_reward = default_reward - - # Default config - if reward_at is not None: - self.reward_at = reward_at - else: - self.reward_at = {(nrows - 1, ncols - 1): 1} - if walls is not None: - self.walls = walls - else: - self.walls = () - if terminal_states is not None: - self.terminal_states = terminal_states - else: - self.terminal_states = () - - # Probability of going left/right/up/down when choosing the - # correspondent action - # The remaining probability mass is distributed uniformly to other - # available actions - self.success_probability = success_probability - - # Start coordinate - self.start_coord = tuple(start_coord) - - # Actions (string to index & index to string) - self.a_str2idx = {"left": 0, "right": 1, "down": 2, "up": 3} - self.a_idx2str = {0: "left", 1: "right", 2: "down", 3: "up"} - - # -------------------------------------------- - # The variables below are defined in _build() - # -------------------------------------------- - - # Mappings (state index) <-> (state coordinate) - self.index2coord = {} - self.coord2index = {} - - # MDP parameters for base class - self.P = None - self.R = None - self.Ns = None - self.Na = 4 - - # Build - self._build() - init_state_idx = self.coord2index[start_coord] - FiniteMDP.__init__( - self, self.R, self.P, initial_state_distribution=init_state_idx - ) - RenderInterface2D.__init__(self) - self.reset() - self.reward_range = (self.R.min(), self.R.max()) - - # rendering info - self.set_clipping_area((0, self.ncols, 0, self.nrows)) - self.set_refresh_interval(100) # in milliseconds - self.renderer_type = "pygame" - - @classmethod - def from_layout( - cls, layout: str = gridworld_utils.DEFAULT_LAYOUT, success_probability=0.95 - ): - """ - Create GridWorld instance from a layout. 
- - Layout symbols: - - '#' : wall - 'r' : reward of 1, terminal state - 'R' : reward of 1, non-terminal state - 'T' : terminal state - 'I' : initial state (if several, start uniformly among I) - 'O' : empty state - any other char : empty state - - Layout example: - - IOOOO # OOOOO O OOOOR - OOOOO # OOOOO # OOOOO - OOOOO O OOOOO # OOOOO - OOOOO # OOOOO # OOOOO - IOOOO # OOOOO # OOOOr - """ - info = gridworld_utils.get_layout_info(layout) - nrows = info["nrows"] - ncols = info["ncols"] - walls = info["walls"] - reward_at = info["reward_at"] - terminal_states = info["terminal_states"] - initial_states_coord = info["initial_states"] - - # Init base class - env = cls( - nrows=nrows, - ncols=ncols, - terminal_states=terminal_states, - success_probability=success_probability, - reward_at=reward_at, - walls=walls, - default_reward=0.0, - ) - - # Set initial distribution - distr = np.zeros(env.observation_space.n) - for init_coord in initial_states_coord: - init_index = env.coord2index[init_coord] - distr[init_index] = 1.0 - distr = distr / distr.sum() - env.set_initial_state_distribution(distr) - - return env - - def is_terminal(self, state): - state_coord = self.index2coord[state] - return state_coord in self.terminal_states - - def reward_fn(self, state, action, next_state): - row, col = self.index2coord[state] - if (row, col) in self.reward_at: - return self.reward_at[(row, col)] - if (row, col) in self.walls: - return 0.0 - return self.default_reward - - def _build(self): - self._build_state_mappings_and_states() - self._build_transition_probabilities() - self._build_mean_rewards() - - def _build_state_mappings_and_states(self): - index = 0 - for rr in range(self.nrows): - for cc in range(self.ncols): - if (rr, cc) in self.walls: - self.coord2index[(rr, cc)] = -1 - else: - self.coord2index[(rr, cc)] = index - self.index2coord[index] = (rr, cc) - index += 1 - states = np.arange(index).tolist() - self.Ns = len(states) - - def _build_mean_rewards(self): - S = self.Ns - A = self.Na - self.R = np.zeros((S, A)) - for ss in range(S): - for aa in range(A): - mean_r = 0 - for ns in range(S): - mean_r += self.reward_fn(ss, aa, ns) * self.P[ss, aa, ns] - self.R[ss, aa] = mean_r - - def _build_transition_probabilities(self): - Ns = self.Ns - Na = self.Na - self.P = np.zeros((Ns, Na, Ns)) - for s in range(Ns): - s_coord = self.index2coord[s] - neighbors = self._get_neighbors(*s_coord) - valid_neighbors = [neighbors[nn][0] for nn in neighbors if neighbors[nn][1]] - n_valid = len(valid_neighbors) - for a in range(Na): # each action corresponds to a direction - for nn in neighbors: - next_s_coord = neighbors[nn][0] - if next_s_coord in valid_neighbors: - next_s = self.coord2index[next_s_coord] - if a == nn: # action is successful - self.P[s, a, next_s] = self.success_probability + ( - 1 - self.success_probability - ) * (n_valid == 1) - elif neighbors[a][0] not in valid_neighbors: - self.P[s, a, s] = 1.0 - else: - if n_valid > 1: - self.P[s, a, next_s] = ( - 1.0 - self.success_probability - ) / (n_valid - 1) - - def _get_neighbors(self, row, col): - aux = {} - aux["left"] = (row, col - 1) # left - aux["right"] = (row, col + 1) # right - aux["up"] = (row - 1, col) # up - aux["down"] = (row + 1, col) # down - neighbors = {} - for direction_str in aux: - direction = self.a_str2idx[direction_str] - next_s = aux[direction_str] - neighbors[direction] = (next_s, self._is_valid(*next_s)) - return neighbors - - def get_transition_support(self, state): - row, col = self.index2coord[state] - neighbors = [(row, col - 1), 
(row, col + 1), (row - 1, col), (row + 1, col)] - return [ - self.coord2index[coord] for coord in neighbors if self._is_valid(*coord) - ] - - def _is_valid(self, row, col): - if (row, col) in self.walls: - return False - elif row < 0 or row >= self.nrows: - return False - elif col < 0 or col >= self.ncols: - return False - return True - - def _build_ascii(self): - grid = [[""] * self.ncols for rr in range(self.nrows)] - grid_idx = [[""] * self.ncols for rr in range(self.nrows)] - for rr in range(self.nrows): - for cc in range(self.ncols): - if (rr, cc) in self.walls: - grid[rr][cc] = "x " - else: - grid[rr][cc] = "o " - grid_idx[rr][cc] = str(self.coord2index[(rr, cc)]).zfill(3) - - for rr, cc in self.reward_at: - rwd = self.reward_at[(rr, cc)] - if rwd > 0: - grid[rr][cc] = "+ " - if rwd < 0: - grid[rr][cc] = "-" - - grid[self.start_coord[0]][self.start_coord[1]] = "I " - - # current position of the agent - x, y = self.index2coord[self.state] - grid[x][y] = "A " - - # - grid_ascii = "" - for rr in range(self.nrows + 1): - if rr < self.nrows: - grid_ascii += str(rr).zfill(2) + 2 * " " + " ".join(grid[rr]) + "\n" - else: - grid_ascii += 3 * " " + " ".join( - [str(jj).zfill(2) for jj in range(self.ncols)] - ) - - self.grid_ascii = grid_ascii - self.grid_idx = grid_idx - return self.grid_ascii - - def display_values(self, values): - assert len(values) == self.Ns - grid_values = [["X".ljust(9)] * self.ncols for ii in range(self.nrows)] - for s_idx in range(self.Ns): - v = values[s_idx] - row, col = self.index2coord[s_idx] - grid_values[row][col] = ("%0.2f" % v).ljust(9) - - grid_values_ascii = "" - for rr in range(self.nrows + 1): - if rr < self.nrows: - grid_values_ascii += ( - str(rr).zfill(2) + 2 * " " + " ".join(grid_values[rr]) + "\n" - ) - else: - grid_values_ascii += 4 * " " + " ".join( - [str(jj).zfill(2).ljust(9) for jj in range(self.ncols)] - ) - logger.info(grid_values_ascii) - - def print_transition_at(self, row, col, action): - s_idx = self.coord2index[(row, col)] - if s_idx < 0: - logger.info("wall!") - return - a_idx = self.a_str2idx[action] - for next_s_idx, prob in enumerate(self.P[s_idx, a_idx]): - if prob > 0: - logger.info( - "to (%d, %d) with prob %f" - % (self.index2coord[next_s_idx] + (prob,)) - ) - - def render_ascii(self): - print(self._build_ascii()) - - def step(self, action): - assert self.action_space.contains(action), "Invalid action!" - - # save state for rendering - if self.is_render_enabled(): - self.append_state_for_rendering(self.state) - - # take step - next_state, reward, terminated, truncated, info = self.sample( - self.state, action - ) - self.state = next_state - return next_state, reward, terminated, truncated, info - - # - # Code for rendering - # - def get_layout_array(self, state_data=None, fill_walls_with=np.nan): - """ - Returns an array 'layout' of shape (nrows, ncols) such that: - - layout[row, col] = state_data[self.coord2idx[row, col]] - - If (row, col) is a wall: - - layout[row, col] = fill_walls_with - - Parameters - ---------- - state_data : np.array, default = None - Array of shape (self.observation_space.n,) - fill_walls_with : float, default: np.nan - Value to set in the layout in the coordinates corresponding to walls. - - Returns - ------- - Gridworld layout array of shape (nrows, ncols). 
- """ - layout = np.zeros((self.nrows, self.ncols)) - if state_data is not None: - assert state_data.shape == (self.observation_space.n,) - data_rows = [self.index2coord[idx][0] for idx in self.index2coord] - data_cols = [self.index2coord[idx][1] for idx in self.index2coord] - layout[data_rows, data_cols] = state_data - else: - state_rr, state_cc = self.index2coord[self.state] - layout[state_rr, state_cc] = 1.0 - - walls_rows = [ww[0] for ww in self.walls] - walls_cols = [ww[1] for ww in self.walls] - layout[walls_rows, walls_cols] = fill_walls_with - return layout - - def get_layout_img( - self, state_data=None, colormap_name="cool", wall_color=(0.0, 0.0, 0.0) - ): - """ - Returns an image array representing the value of `state_data` on - the gridworld layout. - - Parameters - ---------- - state_data : np.array, default = None - Array of shape (self.observation_space.n,) - colormap_name : str, default = 'cool' - Colormap name. - See https://matplotlib.org/tutorials/colors/colormaps.html - wall_color : tuple - RGB color for walls. - Returns - ------- - Gridworld image array of shape (nrows, ncols, 3). - """ - # map data to [0.0, 1.0] - if state_data is not None: - state_data = state_data - state_data.min() - if state_data.max() > 0.0: - state_data = state_data / state_data.max() - - colormap_fn = plt.get_cmap(colormap_name) - layout = self.get_layout_array(state_data, fill_walls_with=np.nan) - norm = matplotlib.colors.Normalize(vmin=0.0, vmax=1.0) - scalar_map = cm.ScalarMappable(norm=norm, cmap=colormap_fn) - img = np.zeros(layout.shape + (3,)) - for rr in range(layout.shape[0]): - for cc in range(layout.shape[1]): - if np.isnan(layout[rr, cc]): - img[self.nrows - 1 - rr, cc, :] = wall_color - else: - img[self.nrows - 1 - rr, cc, :3] = scalar_map.to_rgba( - layout[rr, cc] - )[:3] - return img - - def get_background(self): - """ - Return a scene (list of shapes) representing the background - """ - bg = Scene() - - # walls - for wall in self.walls: - y, x = wall - shape = GeometricPrimitive("POLYGON") - shape.set_color((0.25, 0.25, 0.25)) - shape.add_vertex((x, y)) - shape.add_vertex((x + 1, y)) - shape.add_vertex((x + 1, y + 1)) - shape.add_vertex((x, y + 1)) - bg.add_shape(shape) - - # rewards - for y, x in self.reward_at: - flag = GeometricPrimitive("POLYGON") - rwd = self.reward_at[(y, x)] - color = 0.5 * np.abs(rwd) / self.reward_range[1] - if rwd > 0: - flag.set_color((0.0, color, 0.0)) - if rwd < 0: - flag.set_color((color, 0.0, 0.0)) - - x += 0.5 - y += 0.25 - flag.add_vertex((x, y)) - flag.add_vertex((x + 0.25, y + 0.5)) - flag.add_vertex((x - 0.25, y + 0.5)) - bg.add_shape(flag) - - return bg - - def get_scene(self, state): - """ - Return scene (list of shapes) representing a given state - """ - y, x = self.index2coord[state] - x = x + 0.5 # centering - y = y + 0.5 # centering - - scene = Scene() - - agent = circle_shape((x, y), 0.25, n_points=5) - agent.type = "POLYGON" - agent.set_color((0.75, 0.0, 0.5)) - - scene.add_shape(agent) - return scene diff --git a/rlberry/envs/finite/gridworld_utils.py b/rlberry/envs/finite/gridworld_utils.py deleted file mode 100644 index ce0390f10..000000000 --- a/rlberry/envs/finite/gridworld_utils.py +++ /dev/null @@ -1,70 +0,0 @@ -import numpy as np - -WALL_SYMBOL = "#" -REWARD_TERMINAL_SYMBOL = "r" -REWARD_SYMBOL = "R" -TERMINAL_STATE_SYMBOL = "T" -INITIAL_STATE_SYMBOL = "I" - - -# spaces are ignored -DEFAULT_LAYOUT = """ -IOOOO # OOOOO O OOOOR -OOOOO # OOOOO # OOOOO -OOOOO O OOOOO # OOOOO -OOOOO # OOOOO # OOOOO -IOOOO # OOOOO # OOOOr -""" - - 
-def _preprocess_layout(layout): - layout = layout.replace(" ", "") # remove spaces - # remove first and last line breaks - if layout[0] == "\n": - layout = layout[1:] - if layout[-1] == "\n": - layout = layout[:-1] - - # make sure all lines have the same length - lines = layout.split("\n") - len_lines = [len(line) for line in lines] - max_len = np.max(len_lines) - # below, also reverse lines (so that render is not inversed in the y-direction) - adjusted_lines = [ - line.ljust(max_len, "O") for line in reversed(lines) - ] # fill with empty state - layout = "\n".join(adjusted_lines) - return layout - - -def get_layout_info(layout): - layout = _preprocess_layout(layout) - lines = layout.split("\n") - nrows = len(lines) - ncols = len(lines[0]) - walls = [] - initial_states = [] - terminal_states = [] - reward_at = dict() - for rr in range(nrows): - line = lines[rr] - for cc in range(ncols): - symbol = line[cc] - state_coord = (rr, cc) - if symbol == WALL_SYMBOL: - walls.append(state_coord) - if symbol == TERMINAL_STATE_SYMBOL or symbol == REWARD_TERMINAL_SYMBOL: - terminal_states.append(state_coord) - if symbol == REWARD_SYMBOL or symbol == REWARD_TERMINAL_SYMBOL: - reward_at[state_coord] = 1.0 - if symbol == INITIAL_STATE_SYMBOL: - initial_states.append(state_coord) - info = dict( - nrows=nrows, - ncols=ncols, - initial_states=tuple(initial_states), - terminal_states=tuple(terminal_states), - walls=tuple(walls), - reward_at=reward_at, - ) - return info diff --git a/rlberry/envs/finite/finite_mdp.py b/rlberry/envs/finite_mdp.py similarity index 100% rename from rlberry/envs/finite/finite_mdp.py rename to rlberry/envs/finite_mdp.py diff --git a/rlberry/envs/tests/test_bandits.py b/rlberry/envs/tests/test_bandits.py deleted file mode 100644 index 0e35fccf4..000000000 --- a/rlberry/envs/tests/test_bandits.py +++ /dev/null @@ -1,61 +0,0 @@ -import numpy as np -from rlberry.seeding import safe_reseed -from rlberry.seeding import Seeder -from rlberry.envs.bandits import ( - AdversarialBandit, - BernoulliBandit, - NormalBandit, - CorruptedNormalBandit, -) - - -TEST_SEED = 42 - - -def test_bernoulli(): - env = BernoulliBandit(p=[0.05, 0.95]) - safe_reseed(env, Seeder(TEST_SEED)) - - sample = [env.step(1)[1] for f in range(1000)] - - safe_reseed(env, Seeder(TEST_SEED)) - - sample2 = [env.step(1)[1] for f in range(1000)] - - assert np.abs(np.mean(sample) - 0.95) < 0.1 - assert np.mean(sample) == np.mean(sample2), "Not reproducible" - - -def test_normal(): - env = NormalBandit(means=[0, 1]) - safe_reseed(env, Seeder(TEST_SEED)) - - sample = [env.step(1)[1] for f in range(1000)] - safe_reseed(env, Seeder(TEST_SEED)) - - sample2 = [env.step(1)[1] for f in range(1000)] - - assert np.abs(np.mean(sample) - 1) < 0.1 - assert np.abs(sample[0] - sample2[0]) < 0.01, "Not reproducible" - - -def test_cor_normal(): - env = CorruptedNormalBandit(means=[0, 1], cor_prop=0.1) - safe_reseed(env, Seeder(TEST_SEED)) - - sample = [env.step(1)[1] for f in range(1000)] - assert np.abs(np.median(sample) - 1) < 0.5 - - -def test_adversarial(): - r1 = np.concatenate((2 * np.ones((500, 1)), np.ones((500, 1))), axis=1) - - r2 = np.concatenate((np.ones((500, 1)), 2 * np.ones((500, 1))), axis=1) - - rewards = np.concatenate((r1, r2)) - - env = AdversarialBandit(rewards=rewards) - safe_reseed(env, Seeder(TEST_SEED)) - - sample = [env.step(1)[1] for f in range(1000)] - assert np.abs(np.mean(sample) - 1.5) < 1e-10 diff --git a/rlberry/envs/tests/test_env_seeding.py b/rlberry/envs/tests/test_env_seeding.py index 26682e286..336dae652 
100644 --- a/rlberry/envs/tests/test_env_seeding.py +++ b/rlberry/envs/tests/test_env_seeding.py @@ -3,13 +3,13 @@ import rlberry.seeding as seeding from copy import deepcopy -from rlberry.envs.classic_control import MountainCar, Acrobot, Pendulum -from rlberry.envs.finite import Chain -from rlberry.envs.finite import GridWorld -from rlberry.envs.benchmarks.grid_exploration.four_room import FourRoom -from rlberry.envs.benchmarks.grid_exploration.six_room import SixRoom -from rlberry.envs.benchmarks.grid_exploration.apple_gold import AppleGold -from rlberry.envs.benchmarks.ball_exploration import PBall2D, SimplePBallND +from rlberry_research.envs.classic_control import MountainCar, Acrobot, Pendulum +from rlberry_research.envs.finite import Chain +from rlberry_research.envs.finite import GridWorld +from rlberry_research.envs.benchmarks.grid_exploration.four_room import FourRoom +from rlberry_research.envs.benchmarks.grid_exploration.six_room import SixRoom +from rlberry_research.envs.benchmarks.grid_exploration.apple_gold import AppleGold +from rlberry_research.envs.benchmarks.ball_exploration import PBall2D, SimplePBallND classes = [ MountainCar, diff --git a/rlberry/envs/tests/test_gym_make.py b/rlberry/envs/tests/test_gym_make.py index 9ad80d2ee..e7d53be85 100644 --- a/rlberry/envs/tests/test_gym_make.py +++ b/rlberry/envs/tests/test_gym_make.py @@ -23,11 +23,11 @@ def test_atari_make(): def test_rendering_with_atari_make(): from rlberry.manager import ExperimentManager - from rlberry.agents.torch import PPOAgent + from rlberry_research.agents.torch import PPOAgent from gymnasium.wrappers.record_video import RecordVideo import os from rlberry.envs.gym_make import atari_make - from rlberry.agents.torch.utils.training import model_factory_from_env + from rlberry_research.agents.torch.utils.training import model_factory_from_env import tempfile with tempfile.TemporaryDirectory() as tmpdirname: diff --git a/rlberry/envs/tests/test_instantiation.py b/rlberry/envs/tests/test_instantiation.py deleted file mode 100644 index d66722484..000000000 --- a/rlberry/envs/tests/test_instantiation.py +++ /dev/null @@ -1,252 +0,0 @@ -import numpy as np -import pytest - -from rlberry.envs import gym_make, PipelineEnv -from rlberry.envs.classic_control import MountainCar, Acrobot, Pendulum -from rlberry.envs.finite import Chain -from rlberry.envs.finite import GridWorld -from rlberry.envs.benchmarks.ball_exploration import PBall2D, SimplePBallND -from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env -from rlberry.envs.benchmarks.grid_exploration.four_room import FourRoom -from rlberry.envs.benchmarks.grid_exploration.six_room import SixRoom -from rlberry.envs.benchmarks.grid_exploration.nroom import NRoom -from rlberry.envs.benchmarks.grid_exploration.apple_gold import AppleGold -from rlberry.rendering.render_interface import RenderInterface2D - -classes = [ - MountainCar, - GridWorld, - Chain, - PBall2D, - SimplePBallND, - Acrobot, - Pendulum, - FourRoom, - SixRoom, - AppleGold, - NRoom, -] - - -@pytest.mark.parametrize("ModelClass", classes) -def test_instantiation(ModelClass): - env = ModelClass() - - if env.is_online(): - for _ in range(2): - state, info = env.reset() - for _ in range(50): - assert env.observation_space.contains(state) - action = env.action_space.sample() - next_s, _, _, _, _ = env.step(action) - state = next_s - - if env.is_generative(): - for _ in range(100): - state = env.observation_space.sample() - action = env.action_space.sample() - next_s, _, _, _, _ = 
env.sample(state, action) - assert env.observation_space.contains(next_s) - - -@pytest.mark.parametrize("ModelClass", classes) -def test_rendering_calls(ModelClass): - env = ModelClass() - if isinstance(env, RenderInterface2D): - _ = env.get_background() - _ = env.get_scene(env.observation_space.sample()) - - -def test_gridworld_aux_functions(): - env = GridWorld( - nrows=5, ncols=8, walls=((1, 1),), reward_at={(4, 4): 1, (4, 3): -1} - ) - env.log() # from FiniteMDP - env.render_ascii() # from GridWorld - vals = np.arange(env.observation_space.n) - env.display_values(vals) - env.print_transition_at(0, 0, "up") - - layout = env.get_layout_array(vals, fill_walls_with=np.inf) - for rr in range(env.nrows): - for cc in range(env.ncols): - if (rr, cc) in env.walls: - assert layout[rr, cc] == np.inf - else: - assert layout[rr, cc] == vals[env.coord2index[(rr, cc)]] - - -def test_gridworld_from_layout(): - layout = """ - IOOOO # OOOOO O OOOOR - OOOOO # OOOOO # OOOOO - OOOOO O OOOOO # OOTOO - OOOOO # OOOOO # OOOOO - IOOOO # OOOOO # OOOOr""" - env = GridWorld.from_layout(layout) - env.reset() - - -def test_ball2d_benchmark_instantiation(): - for level in [0, 1, 2, 3, 4, 5]: - env = get_benchmark_env(level) - for aa in range(env.action_space.n): - env.step(aa) - env.sample(env.observation_space.sample(), aa) - - -@pytest.mark.parametrize("p", [1, 2, 3, 4, 5, np.inf]) -def test_pball_env(p): - env = PBall2D(p=p) - env.get_reward_lipschitz_constant() - env.get_transitions_lipschitz_constant() - - -@pytest.mark.parametrize( - "reward_free, difficulty, array_observation", - [ - (True, 0, False), - (False, 0, False), - (False, 0, True), - (False, 1, False), - (False, 1, True), - (False, 2, False), - (False, 2, True), - ], -) -def test_four_room(reward_free, difficulty, array_observation): - env = FourRoom( - reward_free=reward_free, - difficulty=difficulty, - array_observation=array_observation, - ) - - initial_state, info = env.reset() - next_state, reward, _, _, _ = env.step(1) - - assert env.observation_space.contains(initial_state) - assert env.observation_space.contains(next_state) - - if reward_free: - assert env.reward_at == {} - - if difficulty == 2: - assert reward < 0.0 - - if array_observation: - assert isinstance(initial_state, np.ndarray) - assert isinstance(next_state, np.ndarray) - - -@pytest.mark.parametrize( - "reward_free, array_observation", - [ - (False, False), - (False, True), - (True, False), - (True, True), - ], -) -def test_six_room(reward_free, array_observation): - env = SixRoom(reward_free=reward_free, array_observation=array_observation) - - initial_state, info = env.reset() - next_state, reward, _, _, _ = env.step(1) - - assert env.observation_space.contains(initial_state) - assert env.observation_space.contains(next_state) - - if reward_free: - assert env.reward_at == {} - - if array_observation: - assert isinstance(initial_state, np.ndarray) - assert isinstance(next_state, np.ndarray) - - -@pytest.mark.parametrize( - "reward_free, array_observation", - [ - (False, False), - (False, True), - (True, False), - (True, True), - ], -) -def test_apple_gold(reward_free, array_observation): - env = AppleGold(reward_free=reward_free, array_observation=array_observation) - - initial_state, info = env.reset() - next_state, reward, _, _, _ = env.step(1) - assert env.observation_space.contains(initial_state) - assert env.observation_space.contains(next_state) - - if reward_free: - assert env.reward_at == {} - - if array_observation: - assert isinstance(initial_state, np.ndarray) - assert 
isinstance(next_state, np.ndarray) - - -@pytest.mark.parametrize( - "reward_free, array_observation, initial_state_distribution", - [ - (False, False, "center"), - (False, True, "center"), - (True, False, "center"), - (True, True, "center"), - (True, False, "uniform"), - ], -) -def test_n_room(reward_free, array_observation, initial_state_distribution): - env = NRoom( - reward_free=reward_free, - array_observation=array_observation, - initial_state_distribution=initial_state_distribution, - ) - - initial_state, info = env.reset() - next_state, reward, _, _, _ = env.step(1) - - if initial_state_distribution == "uniform": - assert env.initial_state_distribution[0] == 1.0 / env.observation_space.n - - assert env.observation_space.contains(initial_state) - assert env.observation_space.contains(next_state) - - if reward_free: - assert env.reward_at == {} - - if array_observation: - assert isinstance(initial_state, np.ndarray) - assert isinstance(next_state, np.ndarray) - - -def test_pipeline(): - from rlberry.wrappers import RescaleRewardWrapper - from rlberry.wrappers.discretize_state import DiscretizeStateWrapper - - env_ctor, env_kwargs = PipelineEnv, { - "env_ctor": gym_make, - "env_kwargs": {"id": "Acrobot-v1"}, - "wrappers": [(RescaleRewardWrapper, {"reward_range": (0, 1)})], - } - env = env_ctor(**env_kwargs) - _, reward, _, _, _ = env.step(0) - assert (reward <= 1) and (reward >= 0) - - env_ctor, env_kwargs = PipelineEnv, { - "env_ctor": gym_make, - "env_kwargs": {"id": "Acrobot-v1"}, - "wrappers": [ - (RescaleRewardWrapper, {"reward_range": (0, 1)}), - (DiscretizeStateWrapper, {"n_bins": 10}), - ], - } - env = env_ctor(**env_kwargs) - # check that wrapped in the right order - assert isinstance( - env.env, RescaleRewardWrapper - ), "the environments in Pipeline env may not be wrapped in order" - assert isinstance(env.env.env, DiscretizeStateWrapper) diff --git a/rlberry/envs/tests/test_spring_env.py b/rlberry/envs/tests/test_spring_env.py deleted file mode 100644 index 9809cd55b..000000000 --- a/rlberry/envs/tests/test_spring_env.py +++ /dev/null @@ -1,104 +0,0 @@ -import numpy as np -from rlberry.envs import SpringCartPole -from rlberry.envs.classic_control.SpringCartPole import rk4 - - -# # actions -# LL = 0 -# RR = 1 -# LR = 2 -# RL = 3 - -# action_dict = {0: "LL", 1: "RR", 2: "LR", 3: "RL"} - - -HORIZON = 50 - - -def test_spring_cartpole(): - # test 1 - default - env = SpringCartPole() - - _, info = env.reset() - for _ in range(2): - action = np.random.randint(0, env.action_space.n) - next_observation, reward, terminated, truncated, info = env.step(action) - done = terminated or truncated - # if done: - # next_observation,info = env.reset() - # observation = next_observation - - # test 2 - obs_trans = True and random_init = False - env = SpringCartPole(obs_trans=True, random_init=False) - - _, info = env.reset() - for _ in range(2): - action = np.random.randint(0, env.action_space.n) - next_observation, reward, terminated, truncated, info = env.step(action) - done = terminated or truncated - # if done: - # next_observation,info = env.reset() - # observation = next_observation - - # # test 3 - swingup = False and random_init = False - # env = SpringCartPole(dt=0.01, swing_up=False, random_init=False) - # # env.enable_rendering() - - # observation,info = env.reset() - # for tt in range(5): - # if observation[2] > 0: - # if observation[6] > 0: - # action = LL - # else: - # action = LR - # else: - # if observation[6] > 0: - # action = RL - # else: - # action = RR - # # print("Time: ", 
tt, "Action: ", action_dict[action], "Angle1: ", observation[2], "Angle2: ", observation[6]) - # next_observation, reward, terminated, truncated, info= env.step(action) - # done = terminated or truncated - # if done: - # next_observation,info = env.reset() - # observation = next_observation - - # test 4 - swingup = False and rendering = True - - env = SpringCartPole(dt=0.02, swing_up=False, obs_trans=True) - env.enable_rendering() - - _, info = env.reset() - action = 0 - for _ in range(2 * HORIZON): - next_observation, reward, terminated, truncated, info = env.step(action) - done = terminated or truncated - if done: - action += 1 - if action >= 4: - action = 0 - next_observation, info = env.reset() - _ = next_observation - - _ = env.get_video() - - -def test_rk4(): - """ - Test of the rk4 utils defined in speingcartpole - """ - - ## 2D system - def derivs6(x, t): - d1 = x[0] + 2 * x[1] - d2 = -3 * x[0] + 4 * x[1] - return (d1, d2) - - dt = 0.0005 - t = np.arange(0.0, 2.0, dt) - y0 = (1, 2) - yout = rk4(derivs6, y0, t) - assert np.abs(yout[0][0] - 1) < 1e-2 - assert np.abs(yout[0][1] - 2) < 1e-2 - assert np.abs(yout[-1][0] + 238.087) < 1e-2 - assert np.abs(yout[-1][1] + 220.827) < 1e-2 diff --git a/rlberry/experiment/tests/room.yaml b/rlberry/experiment/tests/room.yaml index 8667278d5..702fbaa38 100644 --- a/rlberry/experiment/tests/room.yaml +++ b/rlberry/experiment/tests/room.yaml @@ -1,4 +1,4 @@ -constructor: 'rlberry.envs.benchmarks.grid_exploration.nroom.NRoom' +constructor: 'rlberry_research.envs.benchmarks.grid_exploration.nroom.NRoom' params: reward_free: false array_observation: true diff --git a/rlberry/experiment/tests/rsucbvi.yaml b/rlberry/experiment/tests/rsucbvi.yaml index 4c9273e1b..c35777881 100644 --- a/rlberry/experiment/tests/rsucbvi.yaml +++ b/rlberry/experiment/tests/rsucbvi.yaml @@ -1,4 +1,4 @@ -agent_class: 'rlberry.agents.kernel_based.rs_ucbvi.RSUCBVIAgent' +agent_class: 'rlberry_research.agents.kernel_based.rs_ucbvi.RSUCBVIAgent' init_kwargs: gamma: 1.0 lp_metric: 2 diff --git a/rlberry/experiment/tests/test_experiment_generator.py b/rlberry/experiment/tests/test_experiment_generator.py index 2c5297198..be361ea57 100644 --- a/rlberry/experiment/tests/test_experiment_generator.py +++ b/rlberry/experiment/tests/test_experiment_generator.py @@ -1,5 +1,5 @@ from rlberry.experiment import experiment_generator -from rlberry.agents.kernel_based.rs_ucbvi import RSUCBVIAgent +from rlberry_research.agents.kernel_based.rs_ucbvi import RSUCBVIAgent import numpy as np diff --git a/rlberry/experiment/yaml_utils.py b/rlberry/experiment/yaml_utils.py index 581b254ec..ff26c21dc 100644 --- a/rlberry/experiment/yaml_utils.py +++ b/rlberry/experiment/yaml_utils.py @@ -30,7 +30,7 @@ def read_agent_config(config_path): Example: ``` myagent.yaml - agent_class: 'rlberry.agents.kernel_based.rs_ucbvi.RSUCBVIAgent' + agent_class: 'rlberry_research.agents.kernel_based.rs_ucbvi.RSUCBVIAgent' gamma: 1.0 lp_metric: 2 min_dist: 0.0 @@ -76,7 +76,7 @@ def read_env_config(config_path): Example: ``` env.yaml - constructor: 'rlberry.envs.benchmarks.grid_exploration.nroom.NRoom' + constructor: 'rlberry_research.envs.benchmarks.grid_exploration.nroom.NRoom' params: reward_free: false array_observation: true diff --git a/rlberry/exploration_tools/__init__.py b/rlberry/exploration_tools/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/exploration_tools/discrete_counter.py b/rlberry/exploration_tools/discrete_counter.py deleted file mode 100644 index 549a39955..000000000 
--- a/rlberry/exploration_tools/discrete_counter.py +++ /dev/null @@ -1,100 +0,0 @@ -import numpy as np -from rlberry.exploration_tools.uncertainty_estimator import UncertaintyEstimator -from rlberry.exploration_tools.typing import preprocess_args -from rlberry.spaces import Discrete -from rlberry.utils.space_discretizer import Discretizer - - -class DiscreteCounter(UncertaintyEstimator): - """ - Parameters - ---------- - observation_space : spaces.Box or spaces.Discrete - action_space : spaces.Box or spaces.Discrete - n_bins_obs: int - number of bins to discretize observation space - n_bins_actions: int - number of bins to discretize action space - rate_power : float - Returns bonuses in 1/n ** rate_power. - """ - - def __init__( - self, - observation_space, - action_space, - n_bins_obs=10, - n_bins_actions=10, - rate_power=0.5, - **kwargs - ): - UncertaintyEstimator.__init__(self, observation_space, action_space) - - self.rate_power = rate_power - - self.continuous_state = False - self.continuous_action = False - - if isinstance(observation_space, Discrete): - self.n_states = observation_space.n - else: - self.continuous_state = True - self.state_discretizer = Discretizer(self.observation_space, n_bins_obs) - self.n_states = self.state_discretizer.discrete_space.n - - if isinstance(action_space, Discrete): - self.n_actions = action_space.n - else: - self.continuous_action = True - self.action_discretizer = Discretizer(self.action_space, n_bins_actions) - self.n_actions = self.action_discretizer.discrete_space.n - - self.N_sa = np.zeros((self.n_states, self.n_actions)) - - def _preprocess(self, state, action): - if self.continuous_state: - state = self.state_discretizer.discretize(state) - if self.continuous_action: - action = self.action_discretizer.discretize(action) - return state, action - - def reset(self): - self.N_sa = np.zeros((self.n_states, self.n_actions)) - - @preprocess_args(expected_type="numpy") - def update(self, state, action, next_state=None, reward=None, **kwargs): - state, action = self._preprocess(state, action) - self.N_sa[state, action] += 1 - - @preprocess_args(expected_type="numpy") - def measure(self, state, action, **kwargs): - state, action = self._preprocess(state, action) - n = np.maximum(1.0, self.N_sa[state, action]) - return np.power(1.0 / n, self.rate_power) - - def count(self, state, action): - state, action = self._preprocess(state, action) - return self.N_sa[state, action] - - def get_n_visited_states(self): - """ - Returns the number of different states sent to the .update() function. - For continuous state spaces, counts the number of different discretized states. - """ - n_visited_states = (self.N_sa.sum(axis=1) > 0).sum() - return n_visited_states - - def get_entropy(self): - """ - Returns the entropy of the empirical distribution over states, induced by the state counts. - Uses log2. 
- """ - visited = self.N_sa.sum(axis=1) > 0 - if visited.sum() == 0.0: - return 0.0 - # number of visits of visited states only - n_visits = self.N_sa[visited, :].sum(axis=1) - # empirical distribution - dist = n_visits / n_visits.sum() - entropy = (-dist * np.log2(dist)).sum() - return entropy diff --git a/rlberry/exploration_tools/online_discretization_counter.py b/rlberry/exploration_tools/online_discretization_counter.py deleted file mode 100644 index 575114df5..000000000 --- a/rlberry/exploration_tools/online_discretization_counter.py +++ /dev/null @@ -1,189 +0,0 @@ -import numpy as np -from rlberry.utils.jit_setup import numba_jit -from rlberry.exploration_tools.uncertainty_estimator import UncertaintyEstimator -from rlberry.exploration_tools.typing import preprocess_args -from gymnasium.spaces import Box, Discrete -from rlberry.utils.metrics import metric_lp - -import rlberry - -logger = rlberry.logger - - -@numba_jit -def map_to_representative( - state, - lp_metric, - representative_states, - n_representatives, - min_dist, - scaling, - accept_new_repr, -): - """ - Map state to representative state. - """ - dist_to_closest = np.inf - argmin = -1 - for ii in range(n_representatives): - dist = metric_lp(state, representative_states[ii, :], lp_metric, scaling) - if dist < dist_to_closest: - dist_to_closest = dist - argmin = ii - - max_representatives = representative_states.shape[0] - if ( - dist_to_closest > min_dist - and n_representatives < max_representatives - and accept_new_repr - ): - new_index = n_representatives - representative_states[new_index, :] = state - return new_index, 0.0 - return argmin, dist_to_closest - - -class OnlineDiscretizationCounter(UncertaintyEstimator): - """ - Note: currently, only implemented for continuous (Box) states and - discrete actions. - - Parameters - ---------- - observation_space : spaces.Box - action_space : spaces.Discrete - lp_metric: int - The metric on the state space is the one induced by the p-norm, - where p = lp_metric. Default = 2, for the Euclidean metric. - scaling: numpy.ndarray - Must have the same size as state array, used to scale the states - before computing the metric. - If None, set to: - - (env.observation_space.high - env.observation_space.low) if high - and low are bounded - - np.ones(env.observation_space.shape[0]) if high or low are - unbounded - min_dist: double - Minimum distance between two representative states - max_repr: int - Maximum number of representative states. - If None, it is set to (sqrt(d)/min_dist)**d, where d - is the dimension of the state space - rate_power : float - returns bonuses in n^power. 
- """ - - def __init__( - self, - observation_space, - action_space, - lp_metric=2, - min_dist=0.1, - max_repr=1000, - scaling=None, - rate_power=1, - **kwargs - ): - UncertaintyEstimator.__init__(self, observation_space, action_space) - - assert isinstance(action_space, Discrete) - assert isinstance(observation_space, Box) - - self.lp_metric = lp_metric - self.min_dist = min_dist - self.max_repr = max_repr - self.state_dim = self.observation_space.shape[0] - self.n_actions = self.action_space.n - self.rate_power = rate_power - - # compute scaling, if it is None - if scaling is None: - # if high and low are bounded - if self.observation_space.is_bounded(): - scaling = self.observation_space.high - self.observation_space.low - # if high or low are unbounded - else: - scaling = np.ones(self.state_dim) - else: - assert scaling.ndim == 1 - assert scaling.shape[0] == self.state_dim - self.scaling = scaling - - # initialize - self.n_representatives = None - self.representative_states = None - self.N_sa = None - self.reset() - - def reset(self): - self.n_representatives = 0 - self.representative_states = np.zeros((self.max_repr, self.state_dim)) - self.N_sa = np.zeros((self.max_repr, self.n_actions)) - - self._overflow_warning = False - - def _get_representative_state(self, state, accept_new_repr=True): - state_idx, dist_to_closest = map_to_representative( - state, - self.lp_metric, - self.representative_states, - self.n_representatives, - self.min_dist, - self.scaling, - accept_new_repr, - ) - # check if new representative state - if state_idx == self.n_representatives: - self.n_representatives += 1 - - if self.n_representatives >= self.max_repr and (not self._overflow_warning): - logger.warning( - "OnlineDiscretizationCounter reached \ -the maximum number of representative states." - ) - self._overflow_warning = True - - return state_idx, dist_to_closest - - @preprocess_args(expected_type="numpy") - def update(self, state, action, next_state=None, reward=None, **kwargs): - state_idx, _ = self._get_representative_state(state) - self.N_sa[state_idx, action] += 1 - - @preprocess_args(expected_type="numpy") - def measure(self, state, action, **kwargs): - n = np.maximum(1.0, self.count(state, action)) - return np.power(1 / n, self.rate_power) - - def count(self, state, action): - state_idx, dist_to_closest = self._get_representative_state( - state, accept_new_repr=False - ) - # if state is too far from the closest representative, - # its count is zero. - if dist_to_closest > self.min_dist: - return 0.0 - return self.N_sa[state_idx, action] - - def get_n_visited_states(self): - """ - Returns the number of different states sent to the .update() function. - For continuous state spaces, counts the number of different discretized states. - """ - n_visited_states = (self.N_sa.sum(axis=1) > 0).sum() - return n_visited_states - - def get_entropy(self): - """ - Returns the entropy of the empirical distribution over states, induced by the state counts. - Uses log2. 
- """ - visited = self.N_sa.sum(axis=1) > 0 - if visited.sum() == 0.0: - return 0.0 - # number of visits of visited states only - n_visits = self.N_sa[visited, :].sum(axis=1) - # empirical distribution - dist = n_visits / n_visits.sum() - entropy = (-dist * np.log2(dist)).sum() - return entropy diff --git a/rlberry/exploration_tools/tests/__init__.py b/rlberry/exploration_tools/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/exploration_tools/tests/test_discrete_counter.py b/rlberry/exploration_tools/tests/test_discrete_counter.py deleted file mode 100644 index ad6c1f2bb..000000000 --- a/rlberry/exploration_tools/tests/test_discrete_counter.py +++ /dev/null @@ -1,113 +0,0 @@ -import pytest -import numpy as np -from rlberry.envs import GridWorld -from rlberry.envs import MountainCar -from rlberry.envs.benchmarks.grid_exploration.nroom import NRoom -from rlberry.exploration_tools.discrete_counter import DiscreteCounter -from rlberry.exploration_tools.online_discretization_counter import ( - OnlineDiscretizationCounter, -) - - -@pytest.mark.parametrize("rate_power", [0.5, 1]) -def test_discrete_env(rate_power): - env = GridWorld() - counter = DiscreteCounter( - env.observation_space, env.action_space, rate_power=rate_power - ) - - for N in range(10, 20): - assert counter.get_n_visited_states() == 0 - assert counter.get_entropy() == 0.0 - - for ss in range(env.observation_space.n): - for aa in range(env.action_space.n): - for _ in range(N): - ns, rr, _, _, _ = env.sample(ss, aa) - counter.update(ss, aa, ns, rr) - assert counter.N_sa[ss, aa] == N - assert counter.count(ss, aa) == N - if rate_power == pytest.approx(1): - assert np.allclose(counter.measure(ss, aa), 1.0 / N) - elif rate_power == pytest.approx(0.5): - assert np.allclose(counter.measure(ss, aa), np.sqrt(1.0 / N)) - - assert counter.get_n_visited_states() == env.observation_space.n - assert np.allclose(counter.get_entropy(), np.log2(env.observation_space.n)) - - counter.reset() - - -@pytest.mark.parametrize("rate_power", [0.5, 1]) -def test_continuous_state_env(rate_power): - env = MountainCar() - counter = DiscreteCounter( - env.observation_space, env.action_space, rate_power=rate_power - ) - - for N in [10, 20]: - for _ in range(10): - ss = env.observation_space.sample() - aa = env.action_space.sample() - for _ in range(N): - ns, rr, _, _, _ = env.sample(ss, aa) - counter.update(ss, aa, ns, rr) - - dss = counter.state_discretizer.discretize(ss) - assert counter.N_sa[dss, aa] == N - assert counter.count(ss, aa) == N - if rate_power == pytest.approx(1): - assert np.allclose(counter.measure(ss, aa), 1.0 / N) - elif rate_power == pytest.approx(0.5): - assert np.allclose(counter.measure(ss, aa), np.sqrt(1.0 / N)) - counter.reset() - - -@pytest.mark.parametrize("rate_power", [True, False]) -def test_continuous_state_env_2(rate_power): - env = MountainCar() - counter = OnlineDiscretizationCounter( - env.observation_space, env.action_space, rate_power=rate_power - ) - - for N in [10, 20]: - for _ in range(10): - ss = env.observation_space.sample() - aa = env.action_space.sample() - for nn in range(N): - ns, rr, _, _, _ = env.sample(ss, aa) - counter.update(ss, aa, ns, rr) - assert counter.count(ss, aa) == N - if rate_power == pytest.approx(1): - assert np.allclose(counter.measure(ss, aa), 1.0 / N) - elif rate_power == pytest.approx(0.5): - assert np.allclose(counter.measure(ss, aa), np.sqrt(1.0 / N)) - counter.reset() - - -def test_continuous_state_env_3(): - env = NRoom(nrooms=3, 
array_observation=True) - counter = OnlineDiscretizationCounter( - env.observation_space, env.action_space, rate_power=0.5, min_dist=0.0 - ) - - for N in range(10, 20, 3): - assert counter.get_n_visited_states() == 0 - assert counter.get_entropy() == 0.0 - - for ss in range(env.discrete_observation_space.n): - for aa in range(env.action_space.n): - for _ in range(N): - ns, rr, _, _, _ = env.sample(ss, aa) - continuous_ss = env._convert_index_to_float_coord(ss) - counter.update(continuous_ss, aa, None, rr) - assert counter.N_sa[ss, aa] == N - assert counter.count(continuous_ss, aa) == N - assert np.allclose(counter.measure(continuous_ss, aa), np.sqrt(1.0 / N)) - - assert counter.get_n_visited_states() == env.discrete_observation_space.n - assert np.allclose( - counter.get_entropy(), np.log2(env.discrete_observation_space.n) - ) - - counter.reset() diff --git a/rlberry/exploration_tools/torch/__init__.py b/rlberry/exploration_tools/torch/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/exploration_tools/torch/rnd.py b/rlberry/exploration_tools/torch/rnd.py deleted file mode 100644 index ac6971c22..000000000 --- a/rlberry/exploration_tools/torch/rnd.py +++ /dev/null @@ -1,212 +0,0 @@ -from functools import partial - -import torch -import gymnasium.spaces as spaces -from torch.nn import functional as F - -from rlberry.agents.utils.memories import ReplayMemory -from rlberry.exploration_tools.uncertainty_estimator import UncertaintyEstimator -from rlberry.exploration_tools.typing import preprocess_args -from rlberry.agents.torch.utils.models import ConvolutionalNetwork -from rlberry.agents.torch.utils.models import MultiLayerPerceptron -from rlberry.utils.factory import load -from rlberry.utils.torch import choose_device - - -def get_network(shape, embedding_dim): - if len(shape) == 3: - if shape[2] < shape[0] and shape[2] < shape[1]: - W, H, C = shape - transpose_obs = True - elif shape[0] < shape[1] and shape[0] < shape[2]: - C, H, W = shape - transpose_obs = False - else: - raise ValueError("Unknown image convention") - - return ConvolutionalNetwork( - in_channels=C, - in_width=W, - in_height=H, - out_size=embedding_dim, - activation="ELU", - transpose_obs=transpose_obs, - is_policy=False, - ) - elif len(shape) == 2: - H, W = shape - return ConvolutionalNetwork( - in_channels=1, - in_width=W, - in_height=H, - activation="ELU", - out_size=embedding_dim, - ) - - elif len(shape) == 1: - return MultiLayerPerceptron( - in_size=shape[0], - activation="RELU", - layer_sizes=[64, 64], - out_size=embedding_dim, - ) - else: - raise ValueError("Incompatible observation shape: {}".format(shape)) - - -class RandomNetworkDistillation(UncertaintyEstimator): - """ - References - ---------- - Burda Yuri, Harrison Edwards, Amos Storkey, and Oleg Klimov. 2018. - "Exploration by random network distillation." - In International Conference on Learning Representations. 
- """ - - def __init__( - self, - observation_space, - action_space, - learning_rate=0.001, - update_period=100, - embedding_dim=10, - net_fn=None, - net_kwargs=None, - device="cuda:best", - rate_power=0.5, - batch_size=10, - memory_size=10000, - with_action=False, - **kwargs - ): - assert isinstance(observation_space, spaces.Box) - UncertaintyEstimator.__init__(self, observation_space, action_space) - self.learning_rate = learning_rate - self.loss_fn = F.mse_loss - self.update_period = update_period - self.embedding_dim = embedding_dim - out_size = embedding_dim * action_space.n if with_action else embedding_dim - self.net_fn = ( - load(net_fn) - if isinstance(net_fn, str) - else net_fn - or partial( - get_network, shape=observation_space.shape, embedding_dim=out_size - ) - ) - self.net_kwargs = net_kwargs or {} - if "out_size" in self.net_kwargs: - self.net_kwargs["out_size"] = out_size - self.device = choose_device(device) - self.rate_power = rate_power - self.batch_size = batch_size - self.memory = ReplayMemory(capacity=memory_size) - self.with_action = with_action - self.reset() - - def reset(self, **kwargs): - self.random_target_network = self.net_fn(**self.net_kwargs).to(self.device) - self.predictor_network = self.net_fn(**self.net_kwargs).to(self.device) - self.rnd_optimizer = torch.optim.Adam( - self.predictor_network.parameters(), - lr=self.learning_rate, - betas=(0.9, 0.999), - ) - - self.count = 0 - self.loss = torch.tensor(0.0).to(self.device) - - def _get_embeddings(self, state, action=None, batch=False, all_actions=False): - state = state.to(self.device) - if not batch: - state = state.unsqueeze(0) - - random_embedding = self.random_target_network(state) - predicted_embedding = self.predictor_network(state) - - if self.with_action: - random_embedding = random_embedding.view( - (state.shape[0], self.action_space.n, -1) - ) - predicted_embedding = predicted_embedding.view( - (state.shape[0], self.action_space.n, -1) - ) - if not all_actions: - action = action.long().to(self.device) - if not batch: - action = action.unsqueeze(0) - action = ( - action.unsqueeze(1) - .repeat(1, random_embedding.shape[-1]) - .unsqueeze(1) - ) - random_embedding = random_embedding.gather(1, action).squeeze(1) - predicted_embedding = predicted_embedding.gather(1, action).squeeze(1) - return random_embedding, predicted_embedding - - @preprocess_args(expected_type="torch") - def update(self, state, action=None, next_state=None, reward=None, **kwargs): - batch = [(state, action)] - if self.batch_size > 0 and not self.memory.is_empty(): - batch += self.memory.sample(self.batch_size) - self.memory.push((state, action)) - states, actions = zip(*batch) - states = torch.stack(states) - if self.with_action: - actions = torch.stack(actions) - - random_embedding, predicted_embedding = self._get_embeddings( - states, actions, batch=True - ) - - self.loss += self.loss_fn(random_embedding.detach(), predicted_embedding) - - self.count += 1 - if self.count % self.update_period == 0: - self.loss /= self.update_period - self.rnd_optimizer.zero_grad() - self.loss.backward() - self.rnd_optimizer.step() - self.loss = torch.tensor(0.0).to(self.device) - - @preprocess_args(expected_type="torch") - def measure(self, state, action=None, **kwargs): - random_embedding, predicted_embedding = self._get_embeddings( - state, action, batch=False - ) - error = torch.norm( - predicted_embedding.detach() - random_embedding.detach(), p=2, dim=-1 - ) - return error.pow(2 * self.rate_power).item() - - 
@preprocess_args(expected_type="torch") - def measure_batch(self, states, actions, **kwargs): - random_embedding, predicted_embedding = self._get_embeddings( - states, actions, batch=True - ) - error = torch.norm( - predicted_embedding.detach() - random_embedding.detach(), p=2, dim=-1 - ) - return error.pow(2 * self.rate_power) - - @preprocess_args(expected_type="torch") - def measure_batch_all_actions(self, states, **kwargs): - """ - Measure N(s,a) for all a in A. - - Parameters - ---------- - states: a batch of states, of shape [B x ] - - Returns - ------- - N(s,a): an array of shape B x A - """ - assert self.with_action - random_embedding, predicted_embedding = self._get_embeddings( - states, None, batch=True, all_actions=True - ) - error = torch.norm( - predicted_embedding.detach() - random_embedding.detach(), p=2, dim=-1 - ) - return error.pow(2 * self.rate_power) diff --git a/rlberry/exploration_tools/torch/tests/__init__.py b/rlberry/exploration_tools/torch/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/exploration_tools/torch/tests/test_rnd.py b/rlberry/exploration_tools/torch/tests/test_rnd.py deleted file mode 100644 index 5e8d506fa..000000000 --- a/rlberry/exploration_tools/torch/tests/test_rnd.py +++ /dev/null @@ -1,27 +0,0 @@ -from rlberry.exploration_tools.torch.rnd import RandomNetworkDistillation -from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env - - -def test_rnd(): - # Environment - env = get_benchmark_env(level=1) - - # RND - rnd = RandomNetworkDistillation( - env.observation_space, - env.action_space, - learning_rate=0.1, - update_period=100, - embedding_dim=2, - ) - - # Test - observation, info = env.reset() - for ii in range(1000): - action = env.action_space.sample() - next_observation, reward, terminated, truncated, info = env.step(action) - done = terminated or truncated - rnd.update(observation, action, next_observation, reward) - observation = next_observation - # measure uncertainty - _ = rnd.measure(observation, action) diff --git a/rlberry/exploration_tools/typing.py b/rlberry/exploration_tools/typing.py deleted file mode 100644 index b51aabf64..000000000 --- a/rlberry/exploration_tools/typing.py +++ /dev/null @@ -1,85 +0,0 @@ -import numpy as np - -_TORCH_INSTALLED = True -try: - import torch -except ImportError: - _TORCH_INSTALLED = False - - -def _get_type(arg): - if _TORCH_INSTALLED and isinstance(arg, torch.Tensor): - return "torch" - elif isinstance(arg, np.ndarray): - return "numpy" - else: - return type(arg) - - -def process_type(arg, expected_type): - """ - Utility function to preprocess numpy/torch arguments, - according to a expected type. - - For instance, if arg is numpy and expected_type is torch, - converts arg to torch.tensor. - - Parameters - ---------- - expected_type: {'torch', 'numpy'} - Desired type for output. - """ - if arg is None: - return None - - if expected_type == "torch": - assert _TORCH_INSTALLED, "expected_type is 'torch', but torch is not installed!" 
- if isinstance(arg, torch.Tensor): - return arg - elif isinstance(arg, np.ndarray): - return torch.from_numpy(arg) - elif np.issubdtype(type(arg), np.number): - return torch.tensor(arg) - else: - return arg - elif expected_type == "numpy": - if isinstance(arg, np.ndarray): - return arg - elif _TORCH_INSTALLED and isinstance(arg, torch.Tensor): - return arg.detach().cpu().numpy() - else: - return arg - else: - return arg - - -def preprocess_args(expected_type): - """ - Utility decorator for methods to preprocess numpy/torch arguments, - according to an expected type. - - Output type = input type of the first argument. - - For instance, if function args are numpy and expected_type is torch, - converts function args to torch.tensor. - - Parameters - ---------- - expected_type: {'torch', 'numpy'} - Desired type for output. - """ - - def decorator(func): - def inner(self, *args, **kwargs): - processed_args = () - for ii, arg in enumerate(args): - processed_args += (process_type(arg, expected_type),) - output = func(self, *processed_args, **kwargs) - # Process output according to first argument - ouput_expected_type = _get_type(args[0]) - processed_output = process_type(output, ouput_expected_type) - return processed_output - - return inner - - return decorator diff --git a/rlberry/exploration_tools/uncertainty_estimator.py b/rlberry/exploration_tools/uncertainty_estimator.py deleted file mode 100644 index 868b4c90e..000000000 --- a/rlberry/exploration_tools/uncertainty_estimator.py +++ /dev/null @@ -1,34 +0,0 @@ -from abc import ABC, abstractmethod -from rlberry.exploration_tools.typing import _get_type -import numpy as np - - -class UncertaintyEstimator(ABC): - def __init__(self, observation_space, action_space, **kwargs): - super().__init__() - self.observation_space = observation_space - self.action_space = action_space - - def reset(self, **kwargs): - pass - - @abstractmethod - def update(self, state, action, next_state, reward, **kwargs): - pass - - @abstractmethod - def measure(self, state, action, **kwargs): - pass - - def measure_batch(self, states, actions, **kwargs): - batch = [self.measure(s, a, **kwargs) for s, a in zip(states, actions)] - if _get_type(batch[0]) == "torch": - import torch - - return torch.FloatTensor(batch) - return np.array(batch) - - def measure_batch_all_actions(self, states): - return np.array( - [[self.measure(s, a) for a in range(self.action_space.n)] for s in states] - ) diff --git a/rlberry/manager/__init__.py b/rlberry/manager/__init__.py index 7bac68ba5..18ff0bcc2 100644 --- a/rlberry/manager/__init__.py +++ b/rlberry/manager/__init__.py @@ -1,9 +1,8 @@ -from .experiment_manager import ExperimentManager, preset_manager +from .experiment_manager import ExperimentManager +from .experiment_manager import preset_manager from .multiple_managers import MultipleManagers -from .remote_experiment_manager import RemoteExperimentManager from .evaluation import evaluate_agents, plot_writer_data, read_writer_data from .comparison import compare_agents # (Remote)AgentManager alias for the (Remote)ExperimentManager class, for backward compatibility AgentManager = ExperimentManager -RemoteAgentManager = RemoteExperimentManager diff --git a/rlberry/manager/experiment_manager.py b/rlberry/manager/experiment_manager.py index 13e12bda7..bdfc012e0 100644 --- a/rlberry/manager/experiment_manager.py +++ b/rlberry/manager/experiment_manager.py @@ -219,7 +219,9 @@ class ExperimentManager: If 'unique', data is saved to ``output_dir/manager_data/`` If 'timestamp', data is saved to 
``output_dir/manager_data/`` default_writer_kwargs : dict - Optional arguments for :class:`~rlberry.utils.writers.DefaultWriter`. + Optional arguments for :class:`~rlberry.utils.writers.DefaultWriter`. Typically one may + want to change the log style with default_writer_kwargs set to {"style_log":"progressbar"} or + {"style_log":"one_line"} init_kwargs_per_instance : List[dict] (optional) List of length ``n_fit`` containing the params to initialize each of the ``n_fit`` agent instances. It can be useful if different instances diff --git a/rlberry/manager/remote_experiment_manager.py b/rlberry/manager/remote_experiment_manager.py deleted file mode 100644 index 38335e2f2..000000000 --- a/rlberry/manager/remote_experiment_manager.py +++ /dev/null @@ -1,235 +0,0 @@ -import base64 -import dill -import io - -import pandas as pd -import pathlib -import pickle -import zipfile -from typing import Any, Mapping, Optional -from rlberry.network import interface -from rlberry.network.client import BerryClient - - -import rlberry - -logger = rlberry.logger - - -class RemoteExperimentManager: - """ - Class to define a client that handles an ExperimentManager instance in a remote BerryServer. - - Parameters - ---------- - client: BerryClient - Client instance, to communicate with a BerryServer. - **kwargs: - Parameters for ExperimentManager instance. - Some parameters (as agent_class, train_env, eval_env) can be defined using a ResourceRequest. - """ - - def __init__( - self, - client: BerryClient, - **kwargs: Mapping[str, Any], - ): - if client: - self._client = client - - # Create a remote ExperimentManager object and keep reference to the filename - # in the server where the object was saved. - msg = self._client.send( - interface.Message.create( - command=interface.Command.AGENT_MANAGER_CREATE_INSTANCE, - params=kwargs, - data=None, - ) - ) - if msg.command == interface.Command.RAISE_EXCEPTION: - raise Exception(msg.message) - - self._remote_experiment_manager_filename = pathlib.Path( - msg.info["filename"] - ) - - # get useful attributes - self.agent_name = msg.info["agent_name"] - self.output_dir = pathlib.Path(msg.info["output_dir"]) # to save locally - - def set_client(self, client: BerryClient): - self._client = client - - @property - def remote_file(self): - return str(self._remote_experiment_manager_filename) - - def get_writer_data(self): - """ - * Calls get_writer_data() in the remote ExperimentManager and returns the result locally. - * If tensorboard data is available in the remote ExperimentManager, the data is zipped, - received locally and unzipped. - """ - msg = self._client.send( - interface.Message.create( - command=interface.Command.AGENT_MANAGER_GET_WRITER_DATA, - params=dict(filename=self.remote_file), - ) - ) - if msg.command == interface.Command.RAISE_EXCEPTION: - raise Exception(msg.message) - raw_data = msg.data["writer_data"] - writer_data = dict() - for idx in raw_data: - csv_content = raw_data[idx] - writer_data[idx] = pd.read_csv(io.StringIO(csv_content), sep=",") - - # check if tensorboard data was received - # If so, read file and unzip it. 
- tensorboard_bin_data = msg.data["tensorboard_bin_data"] - if tensorboard_bin_data is not None: - tensorboard_bin_data = base64.b64decode( - tensorboard_bin_data.encode("ascii") - ) - zip_file = open(self.output_dir / "tensorboard_data.zip", "wb") - zip_file.write(tensorboard_bin_data) - zip_file.close() - with zipfile.ZipFile( - self.output_dir / "tensorboard_data.zip", "r" - ) as zip_ref: - zip_ref.extractall(self.output_dir) - return writer_data - - def fit(self, budget=None, **kwargs): - msg = self._client.send( - interface.Message.create( - command=interface.Command.AGENT_MANAGER_FIT, - params=dict( - filename=self.remote_file, budget=budget, extra_params=kwargs - ), - data=None, - ) - ) - if msg.command == interface.Command.RAISE_EXCEPTION: - raise Exception(msg.message) - - def eval_agents(self, n_simulations: Optional[int] = None): - msg = self._client.send( - interface.Message.create( - command=interface.Command.AGENT_MANAGER_EVAL, - params=dict(filename=self.remote_file, n_simulations=n_simulations), - data=None, - ) - ) - if msg.command == interface.Command.RAISE_EXCEPTION: - raise Exception(msg.message) - out = msg.data["output"] - return out - - def clear_output_dir(self): - msg = self._client.send( - interface.Message.create( - command=interface.Command.AGENT_MANAGER_CLEAR_OUTPUT_DIR, - params=dict(filename=self.remote_file), - data=None, - ) - ) - if msg.command == interface.Command.RAISE_EXCEPTION: - raise Exception(msg.message) - - def clear_handlers(self): - msg = self._client.send( - interface.Message.create( - command=interface.Command.AGENT_MANAGER_CLEAR_HANDLERS, - params=dict(filename=self.remote_file), - data=None, - ) - ) - if msg.command == interface.Command.RAISE_EXCEPTION: - raise Exception(msg.message) - - def set_writer(self, idx, writer_fn, writer_kwargs=None): - """Note: Use ResourceRequest for writer_fn.""" - params = dict(idx=idx, writer_fn=writer_fn, writer_kwargs=writer_kwargs) - msg = self._client.send( - interface.Message.create( - command=interface.Command.AGENT_MANAGER_SET_WRITER, - params=dict(filename=self.remote_file, kwargs=params), - data=None, - ) - ) - if msg.command == interface.Command.RAISE_EXCEPTION: - raise Exception(msg.message) - - def optimize_hyperparams(self, **kwargs): - msg = self._client.send( - interface.Message.create( - command=interface.Command.AGENT_MANAGER_OPTIMIZE_HYPERPARAMS, - params=dict(filename=self.remote_file, kwargs=kwargs), - data=None, - ) - ) - if msg.command == interface.Command.RAISE_EXCEPTION: - raise Exception(msg.message) - best_params_dict = msg.data - return best_params_dict - - def save(self): - """ - Save RemoteExperimentManager data to self.output_dir. - - Returns - ------- - filename where the ExperimentManager object was saved. 
- """ - # use self.output_dir - output_dir = self.output_dir - - # create dir if it does not exist - output_dir.mkdir(parents=True, exist_ok=True) - - # save - filename = pathlib.Path("remote_manager_obj").with_suffix(".pickle") - filename = output_dir / filename - filename.parent.mkdir(parents=True, exist_ok=True) - try: - with filename.open("wb") as ff: - pickle.dump(self.__dict__, ff) - logger.info( - "Saved RemoteExperimentManager({}) using pickle.".format( - self.agent_name - ) - ) - except Exception: - try: - with filename.open("wb") as ff: - dill.dump(self.__dict__, ff) - logger.info( - "Saved RemoteExperimentManager({}) using dill.".format( - self.agent_name - ) - ) - except Exception as ex: - logger.warning( - "[RemoteExperimentManager] Instance cannot be pickled: " + str(ex) - ) - - return filename - - @classmethod - def load(cls, filename): - filename = pathlib.Path(filename).with_suffix(".pickle") - - obj = cls(None) - try: - with filename.open("rb") as ff: - tmp_dict = pickle.load(ff) - logger.info("Loaded RemoteExperimentManager using pickle.") - except Exception: - with filename.open("rb") as ff: - tmp_dict = dill.load(ff) - logger.info("Loaded RemoteExperimentManager using dill.") - - obj.__dict__.clear() - obj.__dict__.update(tmp_dict) - return obj diff --git a/rlberry/manager/tests/test_comparisons.py b/rlberry/manager/tests/test_comparisons.py index b95020653..a671c7c60 100644 --- a/rlberry/manager/tests/test_comparisons.py +++ b/rlberry/manager/tests/test_comparisons.py @@ -1,5 +1,5 @@ import pytest -from rlberry.envs import GridWorld +from rlberry_research.envs import GridWorld from rlberry.agents import AgentWithSimplePolicy from rlberry.manager import AgentManager from rlberry.manager import compare_agents diff --git a/rlberry/manager/tests/test_experiment_manager.py b/rlberry/manager/tests/test_experiment_manager.py index 63a489623..870d321c1 100644 --- a/rlberry/manager/tests/test_experiment_manager.py +++ b/rlberry/manager/tests/test_experiment_manager.py @@ -2,7 +2,7 @@ import numpy as np import sys import os -from rlberry.envs import GridWorld +from rlberry_research.envs import GridWorld from rlberry.agents import AgentWithSimplePolicy from rlberry.manager import ( ExperimentManager, @@ -379,3 +379,27 @@ def test_compress(): ) stats.fit() evaluate_agents([stats], show=False) + + +@pytest.mark.parametrize("style_log", ["multi_line", "one_line", "progressbar"]) +def test_logs(style_log): + # Define train and evaluation envs + train_env = (GridWorld, {}) + + # Parameters + params = dict(hyperparameter1=-1, hyperparameter2=lambda x: 42) + eval_kwargs = dict(eval_horizon=10) + + # Run ExperimentManager + stats = ExperimentManager( + DummyAgent, + train_env, + fit_budget=15, + eval_kwargs=eval_kwargs, + init_kwargs=params, + default_writer_kwargs={"style_log": style_log, "log_interval": 0}, + n_fit=3, + seed=123, + ) + stats.fit() + evaluate_agents([stats], show=False) diff --git a/rlberry/manager/tests/test_experiment_manager_seeding.py b/rlberry/manager/tests/test_experiment_manager_seeding.py index d0e5a317c..c1c647ba0 100644 --- a/rlberry/manager/tests/test_experiment_manager_seeding.py +++ b/rlberry/manager/tests/test_experiment_manager_seeding.py @@ -1,8 +1,8 @@ from rlberry.envs.tests.test_env_seeding import get_env_trajectory, compare_trajectories from rlberry.envs import gym_make -from rlberry.envs.classic_control import MountainCar +from rlberry_research.envs.classic_control import MountainCar from rlberry.manager import ExperimentManager, MultipleManagers 
-from rlberry.agents.torch import A2CAgent +from rlberry_research.agents.torch import A2CAgent import gymnasium as gym import pytest diff --git a/rlberry/manager/tests/test_hyperparam_optim.py b/rlberry/manager/tests/test_hyperparam_optim.py index 2803adbcd..5ee62d3d0 100644 --- a/rlberry/manager/tests/test_hyperparam_optim.py +++ b/rlberry/manager/tests/test_hyperparam_optim.py @@ -1,6 +1,6 @@ -from rlberry.envs import GridWorld +from rlberry_research.envs import GridWorld from rlberry.agents import AgentWithSimplePolicy -from rlberry.agents.dynprog.value_iteration import ValueIterationAgent +from rlberry_scool.agents.dynprog.value_iteration import ValueIterationAgent from rlberry.manager import ExperimentManager from optuna.samplers import TPESampler import numpy as np @@ -86,7 +86,7 @@ def test_hyperparam_optim_random(parallelization, custom_eval_function, fit_frac DummyAgent, train_env, init_kwargs={}, - fit_budget=1, + fit_budget=50, eval_kwargs={"eval_horizon": 5}, n_fit=3, parallelization=parallelization, @@ -97,6 +97,7 @@ def test_hyperparam_optim_random(parallelization, custom_eval_function, fit_frac stats_agent.optimize_hyperparams( sampler_method="random", n_trials=3, + timeout=0.5, optuna_parallelization=parallelization, custom_eval_function=custom_eval_function, fit_fraction=fit_fraction, diff --git a/rlberry/manager/tests/test_plot.py b/rlberry/manager/tests/test_plot.py index 70533ac60..fe0acab86 100644 --- a/rlberry/manager/tests/test_plot.py +++ b/rlberry/manager/tests/test_plot.py @@ -6,9 +6,9 @@ import sys from rlberry.wrappers import WriterWrapper -from rlberry.envs import GridWorld +from rlberry_research.envs import GridWorld from rlberry.manager import plot_writer_data, ExperimentManager, read_writer_data -from rlberry.agents import UCBVIAgent +from rlberry_scool.agents import UCBVIAgent class VIAgent(UCBVIAgent): diff --git a/rlberry/network/__init__.py b/rlberry/network/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/network/client.py b/rlberry/network/client.py deleted file mode 100644 index 32d07177d..000000000 --- a/rlberry/network/client.py +++ /dev/null @@ -1,53 +0,0 @@ -import pprint -import socket -import json -from typing import List, Union -from rlberry.network import interface -from rlberry.network.utils import serialize_message - - -class BerryClient: - """ - rlberry client - - For now, works only on Linux systems - - Parameters - ---------- - host : - hostname, IP address or empty string. 
- port : int - Integer from 1-65535 - """ - - def __init__( - self, - host="127.0.0.1", - port: int = 65432, - ) -> None: - assert port >= 1 and port <= 65535 - self._host = host - self._port = port - - def send( - self, - *messages: interface.Message, - print_response: bool = False, - ) -> Union[List[interface.Message], interface.Message]: - returned_messages = [] - pp = pprint.PrettyPrinter(indent=4) - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.connect((self._host, self._port)) - for msg in messages: - msg_bytes = serialize_message(msg) - interface.send_data(s, msg_bytes) - received_bytes = interface.receive_data(s) - received_msg_dict = json.loads(received_bytes) - if print_response: - pp.pprint(received_msg_dict) - received_msg = interface.Message.from_dict(received_msg_dict) - returned_messages.append(received_msg) - - if len(messages) == 1: - return returned_messages[0] - return returned_messages diff --git a/rlberry/network/interface.py b/rlberry/network/interface.py deleted file mode 100644 index 929a3f366..000000000 --- a/rlberry/network/interface.py +++ /dev/null @@ -1,103 +0,0 @@ -import struct -from typing import Any, Dict, Mapping, NamedTuple, Optional - - -REQUEST_PREFIX = "ResourceRequest_" - - -class Command: - NONE = "NONE" - RAISE_EXCEPTION = "RAISE_EXCEPTION" - ECHO = "ECHO" - LIST_RESOURCES = "LIST_RESOURCES" - AGENT_MANAGER_CREATE_INSTANCE = "AGENT_MANAGER_CREATE_INSTANCE" - AGENT_MANAGER_FIT = "AGENT_MANAGER_FIT" - AGENT_MANAGER_EVAL = "AGENT_MANAGER_EVAL" - AGENT_MANAGER_CLEAR_OUTPUT_DIR = "AGENT_MANAGER_CLEAR_OUTPUT_DIR" - AGENT_MANAGER_CLEAR_HANDLERS = "AGENT_MANAGER_CLEAR_HANDLERS" - AGENT_MANAGER_SET_WRITER = "AGENT_MANAGER_SET_WRITER" - AGENT_MANAGER_OPTIMIZE_HYPERPARAMS = "AGENT_MANAGER_OPTIMIZE_HYPERPARAMS" - AGENT_MANAGER_GET_WRITER_DATA = "AGENT_MANAGER_GET_WRITER_DATA" - - -class BerryServerInfo: - host: str - port: int - - -class Message(NamedTuple): - message: Optional[str] = "" - command: Optional[Command] = None - params: Optional[Mapping[str, Any]] = None - data: Optional[Mapping[str, Any]] = None - info: Optional[Mapping[str, Any]] = None - - def to_dict(self): - return self._asdict() - - @classmethod - def create( - cls, - message: Optional[str] = "", - command: Optional[Command] = None, - params: Optional[Mapping[str, Any]] = None, - data: Optional[Mapping[str, Any]] = None, - info: Optional[Mapping[str, Any]] = None, - ): - command = command or "" - params = params or dict() - data = data or dict() - info = info or dict() - return cls( - message=message, - command=command, - params=params, - data=data, - info=info, - ) - - @classmethod - def from_dict(cls, dict_message): - return cls(**dict_message) - - -class ResourceItem(Dict): - obj: Any - description: str - - -Resources = Mapping[str, ResourceItem] - - -class ResourceRequest(NamedTuple): - name: str = "" - kwargs: Optional[Mapping[str, Any]] = None - - -def next_power_of_two(x: int): - return 1 << (x - 1).bit_length() - - -def send_data(socket, data): - """ - adapted from: https://stackoverflow.com/a/63532988 - """ - print(f"[rlberry.network] sending {len(data)} bytes...") - socket.sendall(struct.pack(">I", len(data)) + data) - - -def receive_data(socket): - """ - adapted from: https://stackoverflow.com/a/63532988 - """ - data_size_packed = socket.recv(4) - if not data_size_packed: - return data_size_packed - data_size = struct.unpack(">I", data_size_packed)[0] - received_data = b"" - remaining_size = min(next_power_of_two(data_size), 4096) - while remaining_size > 0: - 
received_data += socket.recv(remaining_size) - remaining_size = data_size - len(received_data) - print(f"[rlberry.network] ... received {len(received_data)}/{data_size} bytes.") - return received_data diff --git a/rlberry/network/server.py b/rlberry/network/server.py deleted file mode 100644 index e40bd632d..000000000 --- a/rlberry/network/server.py +++ /dev/null @@ -1,174 +0,0 @@ -import concurrent.futures -import logging -import multiprocessing -import socket -import json -import rlberry.network.server_utils as server_utils -from rlberry.network import interface -from rlberry.network.utils import ( - apply_fn_to_tree, - map_request_to_obj, - serialize_message, -) -from rlberry.envs import gym_make -from typing import Optional - - -import rlberry - -logger = rlberry.logger - - -class ClientHandler: - def __init__(self, client_socket, client_address, resources, timeout): - self._socket = client_socket - self._address = client_address - self._resources = resources - self._logger = logging.getLogger("ClientHandler") - self._timeout = timeout - - def _process_message(self, message: interface.Message): - """Replace resource requests in 'message' by available resources.""" - message = message.to_dict() - message = apply_fn_to_tree( - lambda key, val: map_request_to_obj(key, val, self._resources), - message, - apply_to_nodes=True, - ) - return interface.Message.from_dict(message) - - def _execute_message(self, message: interface.Message): - """Execute command in message and send response.""" - self._socket.settimeout(self._timeout) - try: - # Execute commands - response = server_utils.execute_message(message, self._resources) - # Send response - interface.send_data(self._socket, serialize_message(response)) - except Exception as ex: - response = interface.Message.create( - command=interface.Command.RAISE_EXCEPTION, message=str(ex) - ) - interface.send_data(self._socket, serialize_message(response)) - return 1 - return 0 - - def run(self): - with self._socket: - try: - while True: - print( - f"\n Handling client @ {self._address}" - ) - self._socket.settimeout(self._timeout) - message_bytes = interface.receive_data(self._socket) - if not message_bytes: - break - # process bytes - message = interface.Message.from_dict(json.loads(message_bytes)) - message = self._process_message(message) - print(f" Received message: \n{message}") - # execute message commands and send back a response - self._execute_message(message) - except Exception as ex: - print(f" [ERROR]: {ex}") - self._logger.exception(ex) - finally: - print(f" Finished client @ {self._address}") - - -class BerryServer: - """ - rlberry server - - Parameters - ---------- - host : - hostname, IP address or empty string. - port : int - Integer from 1 to 65535. - backlog : int - Number of unnaccepted connections allowed before refusing new connections. - resources : Resources - List of resources that can be requested by client. - client_socket_timeout : float, default: 120 - Timeout (in seconds) for client socket operations. - terminate_after : int - Number of received client sockets after which to terminate the server. If None, - does not terminate. 
- """ - - def __init__( - self, - host="127.0.0.1", - port: int = 65432, - backlog: int = 5, - resources: Optional[interface.Resources] = None, - client_socket_timeout: float = 120.0, - terminate_after: Optional[int] = None, - ) -> None: - assert port >= 1 and port <= 65535 - self._host = host - self._port = port - self._backlog = backlog - - self._resources = resources - self._client_socket_timeout = client_socket_timeout - self._terminate_after = terminate_after - self._client_socket_counter = 0 - - # Define basic resources - if resources is None: - self._resources = dict( - gym_make=interface.ResourceItem(obj=gym_make, description="gym_make"), - ) - else: - for _, val in resources.items(): - if set(val.keys()) != set(["obj", "description"]): - raise ValueError( - "resources items must be a dictionary with keys ['obj', 'description']." - f" Received: {list(val.keys())}" - ) - - def start(self): - print( - f"\n\nStarting BerryServer @ (host, port) = ({self._host}, {self._port}).\n\n" - ) - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind((self._host, self._port)) - s.listen(self._backlog) - with concurrent.futures.ProcessPoolExecutor( - mp_context=multiprocessing.get_context("spawn") - ) as executor: - futures = [] - while True: - print( - f" BerryServer({self._host}, {self._port}): waiting for connection..." - ) - client_socket, client_address = s.accept() # wait for connection - self._client_socket_counter += 1 - client_handler = ClientHandler( - client_socket, - client_address, - self._resources, - self._client_socket_timeout, - ) - print( - f" BerryServer({self._host}, {self._port}): " - f"new client @ {client_address}" - ) - futures.append(executor.submit(client_handler.run)) - if ( - self._terminate_after - and self._client_socket_counter >= self._terminate_after - ): - print( - " Terminating server (main process): " - "reached max number of client sockets." 
- ) - break - - -if __name__ == "__main__": - server = BerryServer() - server.start() diff --git a/rlberry/network/server_utils.py b/rlberry/network/server_utils.py deleted file mode 100644 index 75922a83f..000000000 --- a/rlberry/network/server_utils.py +++ /dev/null @@ -1,118 +0,0 @@ -import pathlib -from rlberry.network import interface -from rlberry.manager import ExperimentManager -from rlberry import metadata_utils -import rlberry.utils.io -import base64 - - -def execute_message( - message: interface.Message, resources: interface.Resources -) -> interface.Message: - response = interface.Message.create(command=interface.Command.ECHO) - # LIST_RESOURCES - if message.command == interface.Command.LIST_RESOURCES: - info = {} - for rr in resources: - info[rr] = resources[rr]["description"] - response = interface.Message.create(info=info) - # AGENT_MANAGER_CREATE_INSTANCE - elif message.command == interface.Command.AGENT_MANAGER_CREATE_INSTANCE: - params = message.params - base_dir = pathlib.Path(metadata_utils.RLBERRY_DEFAULT_DATA_DIR) - if "output_dir" in params: - params["output_dir"] = base_dir / "server_data" / params["output_dir"] - else: - params["output_dir"] = base_dir / "server_data/" - experiment_manager = ExperimentManager(**params) - filename = str(experiment_manager.save()) - response = interface.Message.create( - info=dict( - filename=filename, - agent_name=experiment_manager.agent_name, - output_dir=str(experiment_manager.output_dir).replace( - "server_data/", "client_data/" - ), - ) - ) - del experiment_manager - # AGENT_MANAGER_FIT - elif message.command == interface.Command.AGENT_MANAGER_FIT: - filename = message.params["filename"] - budget = message.params["budget"] - extra_params = message.params["extra_params"] - experiment_manager = ExperimentManager.load(filename) - experiment_manager.fit(budget, **extra_params) - experiment_manager.save() - response = interface.Message.create(command=interface.Command.ECHO) - del experiment_manager - # AGENT_MANAGER_EVAL - elif message.command == interface.Command.AGENT_MANAGER_EVAL: - filename = message.params["filename"] - experiment_manager = ExperimentManager.load(filename) - eval_output = experiment_manager.eval_agents(message.params["n_simulations"]) - response = interface.Message.create(data=dict(output=eval_output)) - del experiment_manager - # AGENT_MANAGER_CLEAR_OUTPUT_DIR - elif message.command == interface.Command.AGENT_MANAGER_CLEAR_OUTPUT_DIR: - filename = message.params["filename"] - experiment_manager = ExperimentManager.load(filename) - experiment_manager.clear_output_dir() - response = interface.Message.create( - message=f"Cleared output dir: {experiment_manager.output_dir}" - ) - del experiment_manager - # AGENT_MANAGER_CLEAR_HANDLERS - elif message.command == interface.Command.AGENT_MANAGER_CLEAR_HANDLERS: - filename = message.params["filename"] - experiment_manager = ExperimentManager.load(filename) - experiment_manager.clear_handlers() - experiment_manager.save() - response = interface.Message.create(message=f"Cleared handlers: {filename}") - del experiment_manager - # AGENT_MANAGER_SET_WRITER - elif message.command == interface.Command.AGENT_MANAGER_SET_WRITER: - filename = message.params["filename"] - experiment_manager = ExperimentManager.load(filename) - experiment_manager.set_writer(**message.params["kwargs"]) - experiment_manager.save() - del experiment_manager - # AGENT_MANAGER_OPTIMIZE_HYPERPARAMS - elif message.command == interface.Command.AGENT_MANAGER_OPTIMIZE_HYPERPARAMS: - filename = 
message.params["filename"] - experiment_manager = ExperimentManager.load(filename) - best_params_dict = experiment_manager.optimize_hyperparams( - **message.params["kwargs"] - ) - experiment_manager.save() - del experiment_manager - response = interface.Message.create(data=best_params_dict) - # AGENT_MANAGER_GET_WRITER_DATA - elif message.command == interface.Command.AGENT_MANAGER_GET_WRITER_DATA: - # writer scalar data - filename = message.params["filename"] - experiment_manager = ExperimentManager.load(filename) - writer_data = experiment_manager.get_writer_data() - writer_data = writer_data or dict() - for idx in writer_data: - writer_data[idx] = writer_data[idx].to_csv(index=False) - # tensoboard data - tensorboard_bin_data = None - if experiment_manager.tensorboard_dir is not None: - tensorboard_zip_file = rlberry.utils.io.zipdir( - experiment_manager.tensorboard_dir, - experiment_manager.output_dir / "tensorboard_data.zip", - ) - if tensorboard_zip_file is not None: - tensorboard_bin_data = open(tensorboard_zip_file, "rb").read() - tensorboard_bin_data = base64.b64encode(tensorboard_bin_data).decode( - "ascii" - ) - response = interface.Message.create( - data=dict( - writer_data=writer_data, tensorboard_bin_data=tensorboard_bin_data - ) - ) - del experiment_manager - # end - return response diff --git a/rlberry/network/tests/__init__.py b/rlberry/network/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/network/tests/conftest.py b/rlberry/network/tests/conftest.py deleted file mode 100644 index 91ffaff1f..000000000 --- a/rlberry/network/tests/conftest.py +++ /dev/null @@ -1,43 +0,0 @@ -# content of conftest.py -# This file is used to spawn a server to connect to in the tests from test_server.py - -import multiprocessing - -from rlberry.network.interface import ResourceItem -from rlberry.network.server import BerryServer -from rlberry.agents import ValueIterationAgent -from rlberry.agents.torch import REINFORCEAgent -from rlberry.envs import GridWorld, gym_make -from rlberry.utils.writers import DefaultWriter - -import sys - - -def print_err(s): - sys.stderr.write(s) - sys.stderr.flush() - - -def server(port): - # definition of server - resources = dict( - GridWorld=ResourceItem(obj=GridWorld, description="GridWorld constructor"), - gym_make=ResourceItem(obj=gym_make, description="gym_make"), - REINFORCEAgent=ResourceItem(obj=REINFORCEAgent, description="REINFORCEAgent"), - ValueIterationAgent=ResourceItem( - obj=ValueIterationAgent, - description="ValueIterationAgent constructor" + ValueIterationAgent.__doc__, - ), - DefaultWriter=ResourceItem( - obj=DefaultWriter, description="rlberry default writer" - ), - ) - server = BerryServer(resources=resources, port=port, client_socket_timeout=120.0) - server.start() - - -if __name__ == "__main__": - default_port = 4242 - p = multiprocessing.Process(target=server, args=(default_port,)) - p.start() - print_err("Server startup completed!") diff --git a/rlberry/network/tests/test_server.py b/rlberry/network/tests/test_server.py deleted file mode 100644 index 8f3bf3ca1..000000000 --- a/rlberry/network/tests/test_server.py +++ /dev/null @@ -1,91 +0,0 @@ -import sys - -import py -import pytest -from xprocess import ProcessStarter -import numpy as np - -from rlberry.network.client import BerryClient -from rlberry.network import interface -from rlberry.network.interface import Message, ResourceRequest -from rlberry.manager import RemoteExperimentManager -from rlberry.manager.evaluation import evaluate_agents 
- -server_name = "berry" - - -@pytest.fixture(autouse=True) -def start_server(xprocess): - python_executable_full_path = sys.executable - python_server_script_full_path = py.path.local(__file__).dirpath("conftest.py") - - class Starter(ProcessStarter): - pattern = "completed" - args = [python_executable_full_path, python_server_script_full_path] - - xprocess.ensure(server_name, Starter) - yield - xprocess.getinfo(server_name).terminate() - - -def test_client(): - port = 4242 - client = BerryClient(port=port) - # Send params for ExperimentManager - client.send( - Message.create( - command=interface.Command.AGENT_MANAGER_CREATE_INSTANCE, - params=dict( - agent_class=ResourceRequest(name="ValueIterationAgent"), - train_env=ResourceRequest(name="GridWorld", kwargs=dict(nrows=3)), - fit_budget=2, - init_kwargs=dict(gamma=0.95), - eval_kwargs=dict(eval_horizon=2, n_simulations=2), - n_fit=2, - seed=10, - ), - data=None, - ), - Message.create( - command=interface.Command.LIST_RESOURCES, params=dict(), data=dict() - ), - ) - - client.send( - Message.create( - command=interface.Command.NONE, - params=dict(), - data=dict(big_list=list(1.0 * np.arange(2**4))), - ), - print_response=True, - ) - - -def test_remote_manager(): - port = 4242 - client = BerryClient(port=port) - remote_manager = RemoteExperimentManager( - client, - agent_class=ResourceRequest(name="REINFORCEAgent"), - train_env=ResourceRequest(name="gym_make", kwargs=dict(id="CartPole-v1")), - fit_budget=10, - init_kwargs=dict(gamma=0.99), - eval_kwargs=dict(eval_horizon=2, n_simulations=2), - n_fit=2, - agent_name="REINFORCE(remote)", - ) - remote_manager.set_writer( - idx=0, - writer_fn=ResourceRequest(name="DefaultWriter"), - writer_kwargs=dict(name="debug_reinforce_writer"), - ) - - # Optimize hyperparams of remote agent - best_params = remote_manager.optimize_hyperparams(timeout=1) - print(f"best params = {best_params}") - - fname1 = remote_manager.save() - del remote_manager - remote_manager = RemoteExperimentManager.load(fname1) - remote_manager.fit(3) - evaluate_agents([remote_manager], n_simulations=2, show=False) diff --git a/rlberry/network/utils.py b/rlberry/network/utils.py deleted file mode 100644 index 67e2ae1f7..000000000 --- a/rlberry/network/utils.py +++ /dev/null @@ -1,83 +0,0 @@ -import json -from copy import deepcopy -from rlberry.network import interface -from typing import Any, Callable, Mapping, Optional, Tuple, Union - - -Tree = Union[Any, Tuple, Mapping[Any, "Tree"]] - - -def apply_fn_to_tree( - fn: Callable[[Any, Any], Tuple[Any, Any]], - tree: Tree, - is_leaf: Optional[Callable[[Any], Any]] = None, - apply_to_nodes: Optional[bool] = False, -): - """ - new_key, new_val = fn(key, my_dict[key]) - """ - is_leaf = is_leaf or ( - lambda x: not isinstance(x, Mapping) and not isinstance(x, Tuple) - ) - if is_leaf(tree): - return deepcopy(tree) - if isinstance(tree, Mapping): - new_tree = dict() - keys = list(tree.keys()) - for key in keys: - new_tree[key] = tree[key] - if apply_to_nodes or is_leaf(tree[key]): - new_key, new_val = fn(key, tree[key]) - new_tree.pop(key) - new_tree[new_key] = new_val - return { - key: apply_fn_to_tree(fn, val, is_leaf, apply_to_nodes) - for (key, val) in new_tree.items() - } - elif isinstance(tree, Tuple): - return tuple( - [apply_fn_to_tree(fn, val, is_leaf, apply_to_nodes) for val in tree] - ) - else: - raise RuntimeError("Tree is not a Mapping or Tuple.") - - -def _map_resource_request_to_dict(key, val): - if isinstance(val, interface.ResourceRequest): - assert isinstance(key, str) - 
new_key = interface.REQUEST_PREFIX + key - new_val = val._asdict() - return new_key, new_val - return key, val - - -def map_request_to_obj(key, val, resources: interface.Resources): - if key.startswith(interface.REQUEST_PREFIX): - new_key = key[len(interface.REQUEST_PREFIX) :] - resource_name = val["name"] - try: - resource_kwargs = val["kwargs"] - except KeyError: - resource_kwargs = None - if resource_name in resources: - if resource_kwargs: - new_val = (resources[resource_name]["obj"], resource_kwargs) - else: - new_val = resources[resource_name]["obj"] - return new_key, new_val - else: - raise RuntimeError(f"Unavailable requested resource: {resource_name}") - else: - return key, val - - -def serialize_message(message: interface.Message) -> bytes: - message = message.to_dict() - message = apply_fn_to_tree( - _map_resource_request_to_dict, message, apply_to_nodes=True - ) - - def default(obj): - return f"<>" - - return str.encode(json.dumps(message, default=default)) diff --git a/rlberry/rendering/__init__.py b/rlberry/rendering/__init__.py deleted file mode 100644 index 5bdd0e295..000000000 --- a/rlberry/rendering/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .core import Scene, GeometricPrimitive -from .render_interface import RenderInterface -from .render_interface import RenderInterface2D diff --git a/rlberry/rendering/common_shapes.py b/rlberry/rendering/common_shapes.py deleted file mode 100644 index 91f942c14..000000000 --- a/rlberry/rendering/common_shapes.py +++ /dev/null @@ -1,39 +0,0 @@ -import numpy as np -from rlberry.rendering import GeometricPrimitive - - -def bar_shape(p0, p1, width): - shape = GeometricPrimitive("QUADS") - - x0, y0 = p0 - x1, y1 = p1 - - direction = np.array([x1 - x0, y1 - y0]) - norm = np.sqrt((direction * direction).sum()) - direction = direction / norm - - # get vector perpendicular to direction - u_vec = np.zeros(2) - u_vec[0] = -direction[1] - u_vec[1] = direction[0] - - u_vec = u_vec * width / 2 - - shape.add_vertex((x0 + u_vec[0], y0 + u_vec[1])) - shape.add_vertex((x0 - u_vec[0], y0 - u_vec[1])) - shape.add_vertex((x1 - u_vec[0], y1 - u_vec[1])) - shape.add_vertex((x1 + u_vec[0], y1 + u_vec[1])) - return shape - - -def circle_shape(center, radius, n_points=50): - shape = GeometricPrimitive("POLYGON") - - x0, y0 = center - theta = np.linspace(0.0, 2 * np.pi, n_points) - for tt in theta: - xx = radius * np.cos(tt) - yy = radius * np.sin(tt) - shape.add_vertex((x0 + xx, y0 + yy)) - - return shape diff --git a/rlberry/rendering/core.py b/rlberry/rendering/core.py deleted file mode 100644 index 0cc5e92ef..000000000 --- a/rlberry/rendering/core.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -Provide classes for geometric primitives in OpenGL and scenes. -""" - - -class Scene: - """ - Class representing a scene, which is a vector of GeometricPrimitive objects - """ - - def __init__(self): - self.shapes = [] - - def add_shape(self, shape): - self.shapes.append(shape) - - -class GeometricPrimitive: - """ - Class representing an OpenGL geometric primitive. 
- - Primitive type (GL_LINE_LOOP by defaut) - - If using OpenGLRender2D, one of the following: - POINTS - LINES - LINE_STRIP - LINE_LOOP - POLYGON - TRIANGLES - TRIANGLE_STRIP - TRIANGLE_FAN - QUADS - QUAD_STRIP - - If using PyGameRender2D: - POLYGON - - - TODO: Add support to more pygame shapes, - see https://www.pygame.org/docs/ref/draw.html - """ - - def __init__(self, primitive_type="GL_LINE_LOOP"): - # primitive type - self.type = primitive_type - # color in RGB - self.color = (0.25, 0.25, 0.25) - # list of vertices. each vertex is a tuple with coordinates in space - self.vertices = [] - - def add_vertex(self, vertex): - self.vertices.append(vertex) - - def set_color(self, color): - self.color = color diff --git a/rlberry/rendering/opengl_render2d.py b/rlberry/rendering/opengl_render2d.py deleted file mode 100644 index 64ec79646..000000000 --- a/rlberry/rendering/opengl_render2d.py +++ /dev/null @@ -1,252 +0,0 @@ -""" -OpenGL code for 2D rendering, using pygame. -""" - -import numpy as np -from os import environ - -from rlberry.rendering import Scene - -import rlberry - -logger = rlberry.logger -environ["PYGAME_HIDE_SUPPORT_PROMPT"] = "1" - -_IMPORT_SUCESSFUL = True -_IMPORT_ERROR_MSG = "" -try: - import pygame as pg - from pygame.locals import DOUBLEBUF, OPENGL - - from OpenGL.GLU import gluOrtho2D - from OpenGL.GL import glMatrixMode, glLoadIdentity, glClearColor - from OpenGL.GL import glClear, glFlush, glBegin, glEnd - from OpenGL.GL import glColor3f, glVertex2f - from OpenGL.GL import glReadBuffer, glReadPixels - from OpenGL.GL import GL_PROJECTION, GL_COLOR_BUFFER_BIT - from OpenGL.GL import GL_POINTS, GL_LINES, GL_LINE_STRIP, GL_LINE_LOOP - from OpenGL.GL import GL_POLYGON, GL_TRIANGLES, GL_TRIANGLE_STRIP - from OpenGL.GL import GL_TRIANGLE_FAN, GL_QUADS, GL_QUAD_STRIP - from OpenGL.GL import GL_FRONT, GL_RGB, GL_UNSIGNED_BYTE - -except Exception as ex: - _IMPORT_SUCESSFUL = False - _IMPORT_ERROR_MSG = str(ex) - - -class OpenGLRender2D: - """ - Class to render a list of scenes using OpenGL and pygame. 
- """ - - def __init__(self): - # parameters - self.window_width = 800 - self.window_height = 800 # multiples of 16 are preferred - self.background_color = (0.6, 0.75, 1.0) - self.refresh_interval = 50 - self.window_name = "rlberry render" - self.clipping_area = (-1.0, 1.0, -1.0, 1.0) - - # time counter - self.time_count = 0 - - # background scene - self.background = Scene() - # data to be rendered (list of scenes) - self.data = [] - - def set_window_name(self, name): - self.window_name = name - - def set_refresh_interval(self, interval): - self.refresh_interval = interval - - def set_clipping_area(self, area): - """ - The clipping area is tuple with elements (left, right, bottom, top) - Default = (-1.0, 1.0, -1.0, 1.0) - """ - self.clipping_area = area - base_size = max(self.window_width, self.window_height) - width_range = area[1] - area[0] - height_range = area[3] - area[2] - base_range = max(width_range, height_range) - width_range /= base_range - height_range /= base_range - self.window_width = int(base_size * width_range) - self.window_height = int(base_size * height_range) - - # width and height must be divisible by 2 - if self.window_width % 2 == 1: - self.window_width += 1 - if self.window_height % 2 == 1: - self.window_height += 1 - - def set_data(self, data): - self.data = data - - def set_background(self, background): - self.background = background - - def initGL(self): - """ - initialize GL - """ - glMatrixMode(GL_PROJECTION) - glLoadIdentity() - gluOrtho2D( - self.clipping_area[0], - self.clipping_area[1], - self.clipping_area[2], - self.clipping_area[3], - ) - - def display(self): - """ - Callback function, handler for window re-paint - """ - # Set background color (clear background) - glClearColor( - self.background_color[0], - self.background_color[1], - self.background_color[2], - 1.0, - ) - glClear(GL_COLOR_BUFFER_BIT) - - # Display background - for shape in self.background.shapes: - self.draw_geometric2d(shape) - - # Display objects - if len(self.data) > 0: - idx = self.time_count % len(self.data) - for shape in self.data[idx].shapes: - self.draw_geometric2d(shape) - - self.time_count += 1 - glFlush() - - @staticmethod - def draw_geometric2d(shape): - """ - Draw a 2D shape, of type GeometricPrimitive - """ - if shape.type == "POINTS": - glBegin(GL_POINTS) - elif shape.type == "LINES": - glBegin(GL_LINES) - elif shape.type == "LINE_STRIP": - glBegin(GL_LINE_STRIP) - elif shape.type == "LINE_LOOP": - glBegin(GL_LINE_LOOP) - elif shape.type == "POLYGON": - glBegin(GL_POLYGON) - elif shape.type == "TRIANGLES": - glBegin(GL_TRIANGLES) - elif shape.type == "TRIANGLE_STRIP": - glBegin(GL_TRIANGLE_STRIP) - elif shape.type == "TRIANGLE_FAN": - glBegin(GL_TRIANGLE_FAN) - elif shape.type == "QUADS": - glBegin(GL_QUADS) - elif shape.type == "QUAD_STRIP": - glBegin(GL_QUAD_STRIP) - else: - logger.error("Invalid type for geometric primitive!") - raise NameError - - # set color - glColor3f(shape.color[0], shape.color[1], shape.color[2]) - - # create vertices - for vertex in shape.vertices: - glVertex2f(vertex[0], vertex[1]) - glEnd() - - def run_graphics(self, loop=True): - """ - Sequentially displays scenes in self.data - - If loop is True, keep rendering until user closes the window. 
- """ - global _IMPORT_SUCESSFUL - - if _IMPORT_SUCESSFUL: - pg.init() - display = (self.window_width, self.window_height) - pg.display.set_mode(display, DOUBLEBUF | OPENGL) - pg.display.set_caption(self.window_name) - self.initGL() - while True: - for event in pg.event.get(): - if event.type == pg.QUIT: - pg.quit() - return - # - self.display() - # - pg.display.flip() - pg.time.wait(self.refresh_interval) - - # if not loop, stop - if not loop: - pg.quit() - return - else: - logger.error( - f"Not possible to render the environment due to the following error: {_IMPORT_ERROR_MSG}" - ) - return - - def get_gl_image_str(self): - # see https://gist.github.com/Jerdak/7364746 - glReadBuffer(GL_FRONT) - pixels = glReadPixels( - 0, 0, self.window_width, self.window_height, GL_RGB, GL_UNSIGNED_BYTE - ) - return pixels - - def get_video_data(self): - """ - Stores scenes in self.data in a list of numpy arrays that can be used - to save a video. - """ - global _IMPORT_SUCESSFUL - - if _IMPORT_SUCESSFUL: - video_data = [] - - pg.init() - display = (self.window_width, self.window_height) - _ = pg.display.set_mode(display, DOUBLEBUF | OPENGL) - pg.display.set_caption(self.window_name) - self.initGL() - - self.time_count = 0 - while self.time_count <= len(self.data): - # - self.display() - # - pg.display.flip() - - # - # See https://stackoverflow.com/a/42754578/5691288 - # - string_image = self.get_gl_image_str() - temp_surf = pg.image.frombytes( - string_image, (self.window_width, self.window_height), "RGB" - ) - tmp_arr = pg.surfarray.array3d(temp_surf) - imgdata = np.moveaxis(tmp_arr, 0, 1) - imgdata = np.flipud(imgdata) - video_data.append(imgdata) - - pg.quit() - return video_data - else: - logger.error( - f"Not possible to render the environment due to the following error: {_IMPORT_ERROR_MSG}" - ) - return [] diff --git a/rlberry/rendering/pygame_render2d.py b/rlberry/rendering/pygame_render2d.py deleted file mode 100644 index a8d5b3990..000000000 --- a/rlberry/rendering/pygame_render2d.py +++ /dev/null @@ -1,197 +0,0 @@ -""" -Code for 2D rendering, using pygame (without OpenGL) -""" - -import numpy as np -from os import environ - -from rlberry.rendering import Scene - -import rlberry - -logger = rlberry.logger - -environ["PYGAME_HIDE_SUPPORT_PROMPT"] = "1" - -_IMPORT_SUCESSFUL = True -_IMPORT_ERROR_MSG = "" -try: - import pygame as pg - -except Exception as ex: - _IMPORT_SUCESSFUL = False - _IMPORT_ERROR_MSG = str(ex) - - -class PyGameRender2D: - """Class to render a list of scenes using pygame.""" - - def __init__(self): - # parameters - self.window_width = 800 - self.window_height = 800 # multiples of 16 are preferred - self.background_color = (150, 190, 255) - self.refresh_interval = 50 - self.window_name = "rlberry render" - self.clipping_area = (-1.0, 1.0, -1.0, 1.0) - - # time counter - self.time_count = 0 - - # background scene - self.background = Scene() - # data to be rendered (list of scenes) - self.data = [] - - def set_window_name(self, name): - self.window_name = name - - def set_refresh_interval(self, interval): - self.refresh_interval = interval - - def set_clipping_area(self, area): - """ - The clipping area is tuple with elements (left, right, bottom, top) - Default = (-1.0, 1.0, -1.0, 1.0) - """ - self.clipping_area = area - base_size = max(self.window_width, self.window_height) - width_range = area[1] - area[0] - height_range = area[3] - area[2] - base_range = max(width_range, height_range) - width_range /= base_range - height_range /= base_range - self.window_width = 
int(base_size * width_range) - self.window_height = int(base_size * height_range) - - # width and height must be divisible by 2 - if self.window_width % 2 == 1: - self.window_width += 1 - if self.window_height % 2 == 1: - self.window_height += 1 - - def set_data(self, data): - self.data = data - - def set_background(self, background): - self.background = background - - def display(self): - """ - Callback function, handler for window re-paint - """ - # Set background color (clear background) - self.screen.fill(self.background_color) - - # Display background - for shape in self.background.shapes: - self.draw_geometric2d(shape) - - # Display objects - if len(self.data) > 0: - idx = self.time_count % len(self.data) - for shape in self.data[idx].shapes: - self.draw_geometric2d(shape) - - self.time_count += 1 - - def draw_geometric2d(self, shape): - """ - Draw a 2D shape, of type GeometricPrimitive - """ - if shape.type in ["POLYGON"]: - area = self.clipping_area - width_range = area[1] - area[0] - height_range = area[3] - area[2] - - vertices = [] - for vertex in shape.vertices: - xx = vertex[0] * self.window_width / width_range - yy = vertex[1] * self.window_height / height_range - - # put origin at bottom left instead of top left - yy = self.window_height - yy - - pg_vertex = (xx, yy) - vertices.append(pg_vertex) - - color = (255 * shape.color[0], 255 * shape.color[1], 255 * shape.color[2]) - pg.draw.polygon(self.screen, color, vertices) - - else: - raise NotImplementedError( - "Shape type %s not implemented in pygame renderer." % shape.type - ) - - def run_graphics(self, loop=True): - """ - Sequentially displays scenes in self.data - """ - global _IMPORT_SUCESSFUL - - if _IMPORT_SUCESSFUL: - pg.init() - display = (self.window_width, self.window_height) - self.screen = pg.display.set_mode(display) - pg.display.set_caption(self.window_name) - while True: - for event in pg.event.get(): - if event.type == pg.QUIT: - pg.quit() - return - # - self.display() - # - pg.display.flip() - pg.time.wait(self.refresh_interval) - - # if not loop, stop - if not loop: - pg.quit() - return - else: - logger.error( - f"Not possible to render the environment due to the following error: {_IMPORT_ERROR_MSG}" - ) - return - - def get_video_data(self): - """ - Stores scenes in self.data in a list of numpy arrays that can be used - to save a video. - """ - global _IMPORT_SUCESSFUL - - if _IMPORT_SUCESSFUL: - video_data = [] - - pg.init() - display = (self.window_width, self.window_height) - self.screen = pg.display.set_mode(display) - pg.display.set_caption(self.window_name) - - self.time_count = 0 - while self.time_count <= len(self.data): - # - self.display() - # - pg.display.flip() - - # - # See https://stackoverflow.com/a/42754578/5691288 - # - string_image = pg.image.tobytes(self.screen, "RGB") - temp_surf = pg.image.frombytes( - string_image, (self.window_width, self.window_height), "RGB" - ) - tmp_arr = pg.surfarray.array3d(temp_surf) - imgdata = np.moveaxis(tmp_arr, 0, 1) - video_data.append(imgdata) - - pg.quit() - return video_data - else: - logger.error( - f"Not possible to render the environment due to the following error: {_IMPORT_ERROR_MSG}" - ) - return [] diff --git a/rlberry/rendering/render_interface.py b/rlberry/rendering/render_interface.py deleted file mode 100644 index af846cf33..000000000 --- a/rlberry/rendering/render_interface.py +++ /dev/null @@ -1,162 +0,0 @@ -""" -Interface that allows 2D rendering. 
-""" - - -from abc import ABC, abstractmethod - -from rlberry.rendering.opengl_render2d import OpenGLRender2D -from rlberry.rendering.pygame_render2d import PyGameRender2D -from rlberry.rendering.utils import video_write - -import rlberry - -logger = rlberry.logger - - -class RenderInterface(ABC): - """ - Common interface for rendering in rlberry. - """ - - def __init__(self): - self._rendering_enabled = False - - def is_render_enabled(self): - return self._rendering_enabled - - def enable_rendering(self): - self._rendering_enabled = True - - def disable_rendering(self): - self._rendering_enabled = False - - def save_video(self, filename, **kwargs): - """ - Save video file. - """ - pass - - def get_video(self, **kwargs): - """ - Get video data. - """ - pass - - @abstractmethod - def render(self, **kwargs): - """ - Display on screen. - """ - pass - - -class RenderInterface2D(RenderInterface): - """ - Interface for 2D rendering in rlberry. - """ - - def __init__(self): - RenderInterface.__init__(self) - self._rendering_enabled = False - self._rendering_type = "2d" - self._state_history_for_rendering = [] - self._refresh_interval = 50 # in milliseconds - self._clipping_area = (-1.0, 1.0, -1.0, 1.0) # (left,right,bottom,top) - - # rendering type, either 'pygame' or 'opengl' - self.renderer_type = "opengl" - - def get_renderer(self): - if self.renderer_type == "opengl": - return OpenGLRender2D() - elif self.renderer_type == "pygame": - return PyGameRender2D() - else: - raise NotImplementedError("Unknown renderer type.") - - @abstractmethod - def get_scene(self, state): - """ - Return scene (list of shapes) representing a given state - """ - pass - - @abstractmethod - def get_background(self): - """ - Returne a scene (list of shapes) representing the background - """ - pass - - def append_state_for_rendering(self, state): - self._state_history_for_rendering.append(state) - - def set_refresh_interval(self, interval): - self._refresh_interval = interval - - def clear_render_buffer(self): - self._state_history_for_rendering = [] - - def set_clipping_area(self, area): - self._clipping_area = area - - def _get_background_and_scenes(self): - # background - background = self.get_background() - - # data: convert states to scenes - scenes = [] - for state in self._state_history_for_rendering: - scene = self.get_scene(state) - scenes.append(scene) - return background, scenes - - def render(self, loop=True, **kwargs): - """ - Function to render an environment that implements the interface. 
- """ - - if self.is_render_enabled(): - # background and data - background, data = self._get_background_and_scenes() - - if len(data) == 0: - logger.info("No data to render.") - return - - # render - renderer = self.get_renderer() - - renderer.window_name = self.name - renderer.set_refresh_interval(self._refresh_interval) - renderer.set_clipping_area(self._clipping_area) - renderer.set_data(data) - renderer.set_background(background) - renderer.run_graphics(loop) - return 0 - else: - logger.info("Rendering not enabled for the environment.") - return 1 - - def get_video(self, framerate=25, **kwargs): - # background and data - background, data = self._get_background_and_scenes() - - if len(data) == 0: - logger.info("No data to save.") - return - - # get video data from renderer - renderer = self.get_renderer() - renderer.window_name = self.name - renderer.set_refresh_interval(self._refresh_interval) - renderer.set_clipping_area(self._clipping_area) - renderer.set_data(data) - renderer.set_background(background) - - return renderer.get_video_data() - - def save_video(self, filename, framerate=25, **kwargs): - video_data = self.get_video(framerate=framerate, **kwargs) - video_write(filename, video_data, framerate=framerate) diff --git a/rlberry/rendering/tests/__init__.py b/rlberry/rendering/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rlberry/rendering/tests/test_rendering_interface.py b/rlberry/rendering/tests/test_rendering_interface.py deleted file mode 100644 index f0c793700..000000000 --- a/rlberry/rendering/tests/test_rendering_interface.py +++ /dev/null @@ -1,125 +0,0 @@ -import os -import pytest -import sys - -from pyvirtualdisplay import Display -from rlberry.envs.classic_control import MountainCar -from rlberry.envs.classic_control import Acrobot -from rlberry.envs.classic_control import Pendulum -from rlberry.envs.finite import Chain -from rlberry.envs.finite import GridWorld -from rlberry.envs.benchmarks.grid_exploration.four_room import FourRoom -from rlberry.envs.benchmarks.grid_exploration.six_room import SixRoom -from rlberry.envs.benchmarks.grid_exploration.apple_gold import AppleGold -from rlberry.envs.benchmarks.ball_exploration import PBall2D, SimplePBallND -from rlberry.envs.benchmarks.generalization.twinrooms import TwinRooms -from rlberry.rendering import RenderInterface -from rlberry.rendering import RenderInterface2D -from rlberry.envs import Wrapper - -import tempfile - -try: - display = Display(visible=0, size=(1400, 900)) - display.start() -except Exception: - pass - -classes = [ - Acrobot, - Pendulum, - MountainCar, - GridWorld, - Chain, - PBall2D, - SimplePBallND, - FourRoom, - SixRoom, - AppleGold, - TwinRooms, -] - - -@pytest.mark.parametrize("ModelClass", classes) -def test_instantiation(ModelClass): - env = ModelClass() - - if isinstance(env, RenderInterface): - env.disable_rendering() - assert not env.is_render_enabled() - env.enable_rendering() - assert env.is_render_enabled() - - -@pytest.mark.xfail(sys.platform != "linux", reason="bug with mac and windows???") -@pytest.mark.parametrize("ModelClass", classes) -def test_render2d_interface(ModelClass): - env = ModelClass() - - if isinstance(env, RenderInterface2D): - env.enable_rendering() - - if env.is_online(): - for _ in range(2): - observation, info = env.reset() - for _ in range(5): - assert env.observation_space.contains(observation) - action = env.action_space.sample() - observation, _, _, _, _ = env.step(action) - env.render(loop=False) - - with 
tempfile.TemporaryDirectory() as tmpdirname: - saving_path = tmpdirname + "/test_video.mp4" - - env.save_video(saving_path) - env.clear_render_buffer() - - -@pytest.mark.xfail(sys.platform != "linux", reason="bug with mac and windows???") -@pytest.mark.parametrize("ModelClass", classes) -def test_render2d_interface_wrapped(ModelClass): - env = Wrapper(ModelClass()) - - if isinstance(env.env, RenderInterface2D): - env.enable_rendering() - if env.is_online(): - for _ in range(2): - observation, info = env.reset() - for _ in range(5): - assert env.observation_space.contains(observation) - action = env.action_space.sample() - observation, _, _, _, _ = env.step(action) - env.render(loop=False) - - with tempfile.TemporaryDirectory() as tmpdirname: - saving_path = tmpdirname + "/test_video.mp4" - env.save_video(saving_path) - env.clear_render_buffer() - try: - os.remove("test_video.mp4") - except Exception: - pass - - -def test_render_appelGold(): - env = AppleGold() - env.render_mode = "human" - env = Wrapper(env) - - if env.is_online(): - for _ in range(2): - observation, info = env.reset() - for _ in range(5): - assert env.observation_space.contains(observation) - action = env.action_space.sample() - observation, _, _, _, _ = env.step(action) - env.render(loop=False) - - with tempfile.TemporaryDirectory() as tmpdirname: - saving_path = tmpdirname + "/test_video.mp4" - env.save_video(saving_path) - env.clear_render_buffer() - try: - os.remove("test_video.mp4") - except Exception: - pass diff --git a/rlberry/rendering/utils.py b/rlberry/rendering/utils.py deleted file mode 100644 index bf09963d3..000000000 --- a/rlberry/rendering/utils.py +++ /dev/null @@ -1,73 +0,0 @@ -import numpy as np - - -_FFMPEG_INSTALLED = True -try: - import ffmpeg -except Exception: - _FFMPEG_INSTALLED = False - -import rlberry - -logger = rlberry.logger - - -def video_write(fn, images, framerate=60, vcodec="libx264"): - """ - Save list of images to a video file. - - Source: - https://github.com/kkroening/ffmpeg-python/issues/246#issuecomment-520200981 - Modified so that framerate is given to .input(), as suggested in the - thread, to avoid - skipping frames. - - Parameters - ---------- - fn : string - filename - images : list or np.array - list of images to save to a video. 
- framerate : int - """ - global _FFMPEG_INSTALLED - - try: - if len(images) == 0: - logger.warning("Calling video_write() with empty images.") - return - - if not _FFMPEG_INSTALLED: - logger.error( - "video_write(): Unable to save video, ffmpeg-python \ - package required (https://github.com/kkroening/ffmpeg-python)" - ) - return - - if not isinstance(images, np.ndarray): - images = np.asarray(images) - _, height, width, channels = images.shape - process = ( - ffmpeg.input( - "pipe:", - format="rawvideo", - pix_fmt="rgb24", - s="{}x{}".format(width, height), - r=framerate, - ) - .output(fn, pix_fmt="yuv420p", vcodec=vcodec) - .overwrite_output() - .run_async(pipe_stdin=True) - ) - for frame in images: - process.stdin.write(frame.astype(np.uint8).tobytes()) - process.stdin.close() - process.wait() - - except Exception as ex: - logger.warning( - "Not possible to save \ -video, due to exception: {}".format( - str(ex) - ) - ) diff --git a/rlberry/tests/test_agent_extra.py b/rlberry/tests/test_agent_extra.py index 61cfcdba6..be55e7d5a 100644 --- a/rlberry/tests/test_agent_extra.py +++ b/rlberry/tests/test_agent_extra.py @@ -1,13 +1,13 @@ import pytest -import rlberry.agents as agents -import rlberry.agents.torch as torch_agents +import rlberry_scool.agents as agents_scool +import rlberry_research.agents.torch as torch_agents from rlberry.utils.check_agent import ( check_rl_agent, check_rlberry_agent, check_vectorized_env_agent, check_hyperparam_optimisation_agent, ) -from rlberry.agents.features import FeatureMap +from rlberry_scool.agents.features import FeatureMap import numpy as np import sys @@ -25,12 +25,12 @@ def map(self, observation, action): # LSVIUCBAgent needs a feature map function to work. -class OneHotLSVI(agents.LSVIUCBAgent): +class OneHotLSVI(agents_scool.LSVIUCBAgent): def __init__(self, env, **kwargs): def feature_map_fn(_env): return OneHotFeatureMap(5, 2) # values for Chain - agents.LSVIUCBAgent.__init__( + agents_scool.LSVIUCBAgent.__init__( self, env, feature_map_fn=feature_map_fn, horizon=10, **kwargs ) diff --git a/rlberry/tests/test_agents_base.py b/rlberry/tests/test_agents_base.py index a9c65ee9f..89378f24d 100644 --- a/rlberry/tests/test_agents_base.py +++ b/rlberry/tests/test_agents_base.py @@ -11,8 +11,9 @@ import numpy as np import sys -import rlberry.agents as agents -from rlberry.agents.features import FeatureMap +import rlberry_research.agents as agents_research +import rlberry_scool.agents as agents_scool +from rlberry_scool.agents.features import FeatureMap from rlberry.utils.check_agent import ( check_rl_agent, @@ -33,32 +34,32 @@ def map(self, observation, action): # LSVIUCBAgent needs a feature map function to work. 
-class OneHotLSVI(agents.LSVIUCBAgent): +class OneHotLSVI(agents_scool.LSVIUCBAgent): def __init__(self, env, **kwargs): def feature_map_fn(_env): return OneHotFeatureMap(5, 2) # values for Chain - agents.LSVIUCBAgent.__init__( + agents_scool.LSVIUCBAgent.__init__( self, env, feature_map_fn=feature_map_fn, horizon=10, **kwargs ) FINITE_MDP_AGENTS = [ - agents.QLAgent, - agents.SARSAAgent, - agents.ValueIterationAgent, - agents.MBQVIAgent, - agents.UCBVIAgent, - agents.OptQLAgent, - agents.PSRLAgent, - agents.RLSVIAgent, + agents_scool.QLAgent, + agents_scool.SARSAAgent, + agents_scool.ValueIterationAgent, + agents_scool.MBQVIAgent, + agents_scool.UCBVIAgent, + agents_research.OptQLAgent, + agents_research.PSRLAgent, + agents_research.RLSVIAgent, OneHotLSVI, ] CONTINUOUS_STATE_AGENTS = [ - agents.RSUCBVIAgent, - agents.RSKernelUCBVIAgent, + agents_research.RSUCBVIAgent, + agents_research.RSKernelUCBVIAgent, ] diff --git a/rlberry/tests/test_envs.py b/rlberry/tests/test_envs.py index 9519de04f..36321d20d 100644 --- a/rlberry/tests/test_envs.py +++ b/rlberry/tests/test_envs.py @@ -1,11 +1,11 @@ from rlberry.utils.check_env import check_env, check_rlberry_env -from rlberry.envs import Acrobot -from rlberry.envs.benchmarks.ball_exploration import PBall2D -from rlberry.envs.benchmarks.generalization.twinrooms import TwinRooms -from rlberry.envs.benchmarks.grid_exploration.apple_gold import AppleGold -from rlberry.envs.benchmarks.grid_exploration.nroom import NRoom -from rlberry.envs.classic_control import MountainCar, SpringCartPole -from rlberry.envs.finite import Chain, GridWorld +from rlberry_research.envs import Acrobot +from rlberry_research.envs.benchmarks.ball_exploration import PBall2D +from rlberry_research.envs.benchmarks.generalization.twinrooms import TwinRooms +from rlberry_research.envs.benchmarks.grid_exploration.apple_gold import AppleGold +from rlberry_research.envs.benchmarks.grid_exploration.nroom import NRoom +from rlberry_research.envs.classic_control import MountainCar, SpringCartPole +from rlberry_research.envs.finite import Chain, GridWorld import pytest ALL_ENVS = [ diff --git a/rlberry/tests/test_rlberry_main_agents_and_env.py b/rlberry/tests/test_rlberry_main_agents_and_env.py new file mode 100644 index 000000000..6a6bb6258 --- /dev/null +++ b/rlberry/tests/test_rlberry_main_agents_and_env.py @@ -0,0 +1,133 @@ +""" +=============================================== +Tests some agent and env from rlberry only (no rlberry-scool or rlberry research) +=============================================== + +""" + +from rlberry.utils.check_env import check_env, check_gym_env +from rlberry.utils.check_agent import check_rl_agent +from rlberry.envs import gym_make, atari_make +from rlberry.agents.stable_baselines import StableBaselinesAgent +from stable_baselines3 import A2C + +from stable_baselines3 import DQN +import pytest + + +import gymnasium as gym +import numpy as np +from typing import Tuple + + +class CustomDummyEnv(gym.Env): + def __init__(self): + obs_dict = dict( + board=gym.spaces.Box(low=0, high=1, shape=(8 * 8,), dtype=bool), + player=gym.spaces.Discrete(8), + ) + self.observation_space = gym.spaces.Dict(obs_dict) + self.action_space = gym.spaces.MultiDiscrete([8, 8]) + self.has_reset_before_step_dummy = False + + def reset(self): + self.has_reset_before_step_dummy = True + return self._obs(), {} + + def _obs(self): + return {"board": np.zeros(shape=(8, 8), dtype=bool).flatten(), "player": 1} + + def step(self, action: Tuple[int, int]): + if not 
self.has_reset_before_step_dummy: + raise AssertionError("Cannot call env.step() before calling reset()") + reward = 0.2 + terminated = False + truncated = False + info = {} + return self._obs(), reward, terminated, truncated, info + + def render(self): + print("hi") + + def reseed(self, seed): + print("reseed") + + +class CustomDummyEnvBox1(CustomDummyEnv): + def __init__(self): + CustomDummyEnv.__init__(self) + self.action_space = gym.spaces.Box(-np.inf, np.inf) + + +class CustomDummyEnvBox2(CustomDummyEnv): + def __init__(self): + CustomDummyEnv.__init__(self) + self.action_space = gym.spaces.Box(5, 5) + + +FROZEN_LAKE_CONSTR = ( + gym_make, + dict(id="FrozenLake-v1", wrap_spaces=True, is_slippery=False), +) +CART_POLE_CONSTR = (gym_make, dict(id="CartPole-v1", wrap_spaces=True)) +PENDULUM_CONSTR = (gym_make, dict(id="Pendulum-v1", wrap_spaces=True)) +ASTEROIDS_CONSTR = (atari_make, dict(id="ALE/Asteroids-v5", wrap_spaces=True)) +CUSTOM_CONSTR = (CustomDummyEnv, {}) + + +TEST_ENV_SUCCES = [ + FROZEN_LAKE_CONSTR, + CART_POLE_CONSTR, + PENDULUM_CONSTR, + ASTEROIDS_CONSTR, + CUSTOM_CONSTR, +] + + +@pytest.mark.parametrize("Env", TEST_ENV_SUCCES) +def test_env(Env): + current_env = Env[0](**Env[1]) + if not isinstance(current_env, CustomDummyEnv): + check_env(current_env) + check_gym_env(current_env) + + +CUSTOM_BOX_CONSTR1 = (CustomDummyEnvBox1, {}) +CUSTOM_BOX_CONSTR2 = (CustomDummyEnvBox2, {}) + +TEST_ENV_FAIL = [ + CUSTOM_CONSTR, + CUSTOM_BOX_CONSTR1, + CUSTOM_BOX_CONSTR2, +] + + +@pytest.mark.parametrize("Env", TEST_ENV_FAIL) +def test_errors_env(Env): + current_env = Env[0](**Env[1]) + had_exception_step_before_reset = False + try: + current_env.step(0) + except Exception as ex: + had_exception_step_before_reset = True + + assert had_exception_step_before_reset + check_gym_env(current_env) + + +A2C_INIT_KWARGS = {"algo_cls": A2C, "policy": "MlpPolicy", "verbose": 1} +DQN_INIT_KWARGS = {"algo_cls": DQN, "policy": "MlpPolicy", "verbose": 1} + +AGENTS_WITH_ENV = [ + (A2C_INIT_KWARGS, PENDULUM_CONSTR), + (DQN_INIT_KWARGS, CART_POLE_CONSTR), +] + + +@pytest.mark.parametrize("agent_kwargs,env", AGENTS_WITH_ENV) +def test_rlberry_agent(agent_kwargs, env): + check_rl_agent( + StableBaselinesAgent, + env=env, + init_kwargs=agent_kwargs, + ) diff --git a/rlberry/utils/__init__.py b/rlberry/utils/__init__.py index f70c962c1..b2a8ff62c 100644 --- a/rlberry/utils/__init__.py +++ b/rlberry/utils/__init__.py @@ -1,4 +1,3 @@ -from .check_bandit_agent import check_bandit_agent from .check_agent import ( check_rl_agent, check_save_load, diff --git a/rlberry/utils/check_agent.py b/rlberry/utils/check_agent.py index f4a02976c..d0fc9e961 100644 --- a/rlberry/utils/check_agent.py +++ b/rlberry/utils/check_agent.py @@ -1,5 +1,5 @@ -from rlberry.envs import Chain, Pendulum -from rlberry.envs.benchmarks.ball_exploration import PBall2D +from rlberry_research.envs import Chain, Pendulum +from rlberry_research.envs.benchmarks.ball_exploration import PBall2D from rlberry.manager import ExperimentManager import numpy as np from rlberry.seeding import set_external_seed @@ -61,7 +61,13 @@ def _fit_experiment_manager(agent, env="continuous_state", init_kwargs=None): train_env = _make_tuple_env(env) try: agent = ExperimentManager( - agent, train_env, fit_budget=5, n_fit=1, seed=SEED, init_kwargs=init_kwargs + agent, + train_env, + agent_name="test_agent", + fit_budget=5, + n_fit=1, + seed=SEED, + init_kwargs=init_kwargs, ) agent.fit() except Exception as exc: diff --git a/rlberry/utils/check_bandit_agent.py 
b/rlberry/utils/check_bandit_agent.py deleted file mode 100644 index 89389b77f..000000000 --- a/rlberry/utils/check_bandit_agent.py +++ /dev/null @@ -1,62 +0,0 @@ -from rlberry.envs.bandits import BernoulliBandit -from rlberry.manager import ExperimentManager - - -def check_bandit_agent(Agent, environment=BernoulliBandit, seed=42): - """ - Function used to check a bandit agent in rlberry on a Gaussian bandit problem. - - Parameters - ---------- - Agent: rlberry agent module - Agent class that we want to test. - - environment: rlberry env module - Environment (i.e bandit instance) on which to test the agent. - - seed : Seed sequence from which to spawn the random number generator. - - - Returns - ------- - result : bool - Whether the agent is a valid/compatible bandit agent. - - Examples - -------- - >>> from rlberry.agents.bandits import IndexAgent - >>> from rlberry.utils import check_bandit_agent - >>> import numpy as np - >>> class UCBAgent(IndexAgent): - >>> name = "UCB" - >>> def __init__(self, env, **kwargs): - >>> def index(r, t): - >>> return np.mean(r) + np.sqrt(np.log(t**2) / (2 * len(r))) - >>> IndexAgent.__init__(self, env, index, **kwargs) - >>> check_bandit_agent(UCBAgent) - True - - """ - env_ctor = environment - env_kwargs = {} - - agent1 = ExperimentManager( - Agent, (env_ctor, env_kwargs), fit_budget=10, n_fit=1, seed=seed - ) - agent2 = ExperimentManager( - Agent, (env_ctor, env_kwargs), fit_budget=10, n_fit=1, seed=seed - ) - - agent1.fit() - agent2.fit() - env = env_ctor(**env_kwargs) - state, info = env.reset() - result = True - for _ in range(5): - # test reproducibility on 5 actions - action1 = agent1.agent_handlers[0].policy(state) - action2 = agent2.agent_handlers[0].policy(state) - if action1 != action2: - result = False - - return result diff --git a/rlberry/utils/io.py b/rlberry/utils/io.py deleted file mode 100644 index cb269f29a..000000000 --- a/rlberry/utils/io.py +++ /dev/null @@ -1,33 +0,0 @@ -import os -import zipfile -import pathlib - - -def zipdir(dir_path, ouput_fname): - """ - Zip a directory. - - Parameters - ---------- - dir_path : Path or str - Directory to be compressed. - output_fname : str - Name of output zip file. - - Returns - ------- - path to zip file, or None if dir_path does not exist. 
- """ - dir_path = pathlib.Path(dir_path) - if not dir_path.exists(): - return None - ouput_fname = pathlib.Path(ouput_fname).with_suffix(".zip") - zipf = zipfile.ZipFile(ouput_fname, "w", zipfile.ZIP_DEFLATED) - for root, _, files in os.walk(dir_path): - for file in files: - zipf.write( - os.path.join(root, file), - os.path.relpath(os.path.join(root, file), os.path.join(dir_path, "..")), - ) - zipf.close() - return ouput_fname diff --git a/rlberry/utils/tests/test_check.py b/rlberry/utils/tests/test_check.py index 070b580d0..c7c926296 100644 --- a/rlberry/utils/tests/test_check.py +++ b/rlberry/utils/tests/test_check.py @@ -1,6 +1,6 @@ import numpy as np import pytest -from rlberry.envs import GridWorld, Chain +from rlberry_research.envs import GridWorld, Chain from rlberry.utils.check_env import check_env from rlberry.utils.check_agent import ( check_rl_agent, @@ -9,7 +9,7 @@ ) from rlberry.spaces import Box, Dict, Discrete import gymnasium as gym -from rlberry.agents import ValueIterationAgent, UCBVIAgent +from rlberry_scool.agents import ValueIterationAgent, UCBVIAgent class ActionDictTestEnv(gym.Env): diff --git a/rlberry/utils/tests/test_writer.py b/rlberry/utils/tests/test_writer.py index 1345649d2..504301800 100644 --- a/rlberry/utils/tests/test_writer.py +++ b/rlberry/utils/tests/test_writer.py @@ -1,5 +1,5 @@ import time -from rlberry.envs import GridWorld +from rlberry_research.envs import GridWorld from rlberry.agents import AgentWithSimplePolicy from rlberry.manager import ExperimentManager @@ -20,8 +20,9 @@ def fit(self, budget, **kwargs): self.total_budget += budget for ii in range(budget): if self.writer is not None: - self.writer.add_scalar("a", 42, ii) + self.writer.add_scalar("a", ii, ii) time.sleep(1) + return None def policy(self, observation): @@ -42,6 +43,11 @@ def test_myoutput(capsys): # or use "capfd" for fd-level ) agent.fit(budget=3) + assert agent.agent_handlers[0].writer.summary_writer == None + assert list(agent.agent_handlers[0].writer.read_tag_value("a")) == [0, 1, 2] + assert agent.agent_handlers[0].writer.read_first_tag_value("a") == 0 + assert agent.agent_handlers[0].writer.read_last_tag_value("a") == 2 + captured = capsys.readouterr() # test that what is written to stderr is longer than 50 char, assert ( diff --git a/rlberry/utils/writers.py b/rlberry/utils/writers.py index 2b3504df2..5bc9ae7bf 100644 --- a/rlberry/utils/writers.py +++ b/rlberry/utils/writers.py @@ -35,7 +35,7 @@ class DefaultWriter: log_interval : int Minimum number of seconds between consecutive logs (with logging module). style_log: str - Possible values are "multi_line" and "one_line". Define the style of the logs. + Possible values are "multi_line", "one_line" and "progressbar". Define the style of the logs. tensorboard_kwargs : Optional[dict] Parameters for tensorboard SummaryWriter. 
If provided, DefaultWriter will behave as tensorboard.SummaryWriter, and will keep utilities to handle @@ -430,6 +430,8 @@ def __init__(self, *args, desc="", **kwargs): def set_description(self, desc=None, refresh=True): screen_width, _ = _screen_shape_wrapper()(sys.stdout) + if screen_width is None: + screen_width = 600 max_len = screen_width if len(desc) > 1: if not self.subbar: diff --git a/rlberry/wrappers/tests/old_env/old_acrobot.py b/rlberry/wrappers/tests/old_env/old_acrobot.py index 8ee5d24f2..35408486d 100644 --- a/rlberry/wrappers/tests/old_env/old_acrobot.py +++ b/rlberry/wrappers/tests/old_env/old_acrobot.py @@ -12,8 +12,8 @@ import numpy as np import rlberry.spaces as spaces from rlberry.envs.interface import Model -from rlberry.rendering import Scene, GeometricPrimitive, RenderInterface2D -from rlberry.rendering.common_shapes import bar_shape, circle_shape +from rlberry_research.rendering import Scene, GeometricPrimitive, RenderInterface2D +from rlberry_research.rendering.common_shapes import bar_shape, circle_shape __copyright__ = "Copyright 2013, RLPy http://acl.mit.edu/RLPy" __credits__ = [ diff --git a/rlberry/wrappers/tests/old_env/old_apple_gold.py b/rlberry/wrappers/tests/old_env/old_apple_gold.py index 9006c990c..31cc45c87 100644 --- a/rlberry/wrappers/tests/old_env/old_apple_gold.py +++ b/rlberry/wrappers/tests/old_env/old_apple_gold.py @@ -1,7 +1,7 @@ import numpy as np import rlberry.spaces as spaces from rlberry.wrappers.tests.old_env.old_gridworld import Old_GridWorld -from rlberry.rendering import Scene, GeometricPrimitive +from rlberry_research.rendering import Scene, GeometricPrimitive import rlberry diff --git a/rlberry/wrappers/tests/old_env/old_gridworld.py b/rlberry/wrappers/tests/old_env/old_gridworld.py index 4de564bac..774a08a74 100644 --- a/rlberry/wrappers/tests/old_env/old_gridworld.py +++ b/rlberry/wrappers/tests/old_env/old_gridworld.py @@ -5,9 +5,9 @@ from matplotlib import cm from rlberry.wrappers.tests.old_env.old_finite_mdp import Old_FiniteMDP -from rlberry.envs.finite import gridworld_utils -from rlberry.rendering import Scene, GeometricPrimitive, RenderInterface2D -from rlberry.rendering.common_shapes import circle_shape +from rlberry_research.envs.finite import gridworld_utils +from rlberry_research.rendering import Scene, GeometricPrimitive, RenderInterface2D +from rlberry_research.rendering.common_shapes import circle_shape import rlberry diff --git a/rlberry/wrappers/tests/old_env/old_mountain_car.py b/rlberry/wrappers/tests/old_env/old_mountain_car.py index dc40b31db..1e163f950 100644 --- a/rlberry/wrappers/tests/old_env/old_mountain_car.py +++ b/rlberry/wrappers/tests/old_env/old_mountain_car.py @@ -16,7 +16,7 @@ import rlberry.spaces as spaces from rlberry.envs.interface import Model -from rlberry.rendering import Scene, GeometricPrimitive, RenderInterface2D +from rlberry_research.rendering import Scene, GeometricPrimitive, RenderInterface2D class Old_MountainCar(RenderInterface2D, Model): diff --git a/rlberry/wrappers/tests/old_env/old_nroom.py b/rlberry/wrappers/tests/old_env/old_nroom.py index 6820ee780..f3b9cd9c5 100644 --- a/rlberry/wrappers/tests/old_env/old_nroom.py +++ b/rlberry/wrappers/tests/old_env/old_nroom.py @@ -2,7 +2,7 @@ import numpy as np import rlberry.spaces as spaces from rlberry.wrappers.tests.old_env.old_gridworld import Old_GridWorld -from rlberry.rendering import Scene, GeometricPrimitive +from rlberry_research.rendering import Scene, GeometricPrimitive import rlberry diff --git 
a/rlberry/wrappers/tests/old_env/old_pball.py b/rlberry/wrappers/tests/old_env/old_pball.py index acc7ee29d..ae183de47 100644 --- a/rlberry/wrappers/tests/old_env/old_pball.py +++ b/rlberry/wrappers/tests/old_env/old_pball.py @@ -3,7 +3,7 @@ import rlberry.spaces as spaces from rlberry.envs.interface import Model -from rlberry.rendering import Scene, GeometricPrimitive, RenderInterface2D +from rlberry_research.rendering import Scene, GeometricPrimitive, RenderInterface2D import rlberry diff --git a/rlberry/wrappers/tests/old_env/old_pendulum.py b/rlberry/wrappers/tests/old_env/old_pendulum.py index e8e93ca01..1ce1151fe 100644 --- a/rlberry/wrappers/tests/old_env/old_pendulum.py +++ b/rlberry/wrappers/tests/old_env/old_pendulum.py @@ -11,8 +11,8 @@ import numpy as np import rlberry.spaces as spaces from rlberry.envs.interface import Model -from rlberry.rendering import Scene, RenderInterface2D -from rlberry.rendering.common_shapes import bar_shape, circle_shape +from rlberry_research.rendering import Scene, RenderInterface2D +from rlberry_research.rendering.common_shapes import bar_shape, circle_shape class Old_Pendulum(RenderInterface2D, Model): diff --git a/rlberry/wrappers/tests/old_env/old_six_room.py b/rlberry/wrappers/tests/old_env/old_six_room.py index a51905d2d..a38368819 100644 --- a/rlberry/wrappers/tests/old_env/old_six_room.py +++ b/rlberry/wrappers/tests/old_env/old_six_room.py @@ -1,7 +1,7 @@ import numpy as np import rlberry.spaces as spaces from rlberry.wrappers.tests.old_env.old_gridworld import Old_GridWorld -from rlberry.rendering import Scene, GeometricPrimitive +from rlberry_research.rendering import Scene, GeometricPrimitive import rlberry diff --git a/rlberry/wrappers/tests/old_env/old_twinrooms.py b/rlberry/wrappers/tests/old_env/old_twinrooms.py index c9ffa09a5..1c6078b9a 100644 --- a/rlberry/wrappers/tests/old_env/old_twinrooms.py +++ b/rlberry/wrappers/tests/old_env/old_twinrooms.py @@ -1,8 +1,8 @@ import numpy as np import rlberry.spaces as spaces from rlberry.envs import Model -from rlberry.rendering import Scene, GeometricPrimitive, RenderInterface2D -from rlberry.rendering.common_shapes import circle_shape +from rlberry_research.rendering import Scene, GeometricPrimitive, RenderInterface2D +from rlberry_research.rendering.common_shapes import circle_shape import rlberry diff --git a/rlberry/wrappers/tests/test_basewrapper.py b/rlberry/wrappers/tests/test_basewrapper.py index f624e46f2..16279473f 100644 --- a/rlberry/wrappers/tests/test_basewrapper.py +++ b/rlberry/wrappers/tests/test_basewrapper.py @@ -1,6 +1,6 @@ from rlberry.envs.interface import Model from rlberry.envs import Wrapper -from rlberry.envs import GridWorld +from rlberry_research.envs import GridWorld import gymnasium as gym diff --git a/rlberry/wrappers/tests/test_common_wrappers.py b/rlberry/wrappers/tests/test_common_wrappers.py index 502d24c2a..eb4b51e8a 100644 --- a/rlberry/wrappers/tests/test_common_wrappers.py +++ b/rlberry/wrappers/tests/test_common_wrappers.py @@ -1,10 +1,11 @@ import numpy as np import pytest from rlberry import spaces -from rlberry.agents import RSUCBVIAgent -from rlberry.envs.classic_control import MountainCar -from rlberry.envs.finite import FiniteMDP, GridWorld -from rlberry.exploration_tools.discrete_counter import DiscreteCounter +from rlberry_research.agents import RSUCBVIAgent +from rlberry_research.envs.classic_control import MountainCar +from rlberry_research.envs.finite import GridWorld +from rlberry.envs.finite_mdp import FiniteMDP +from 
rlberry_research.exploration_tools.discrete_counter import DiscreteCounter from rlberry.seeding import Seeder from rlberry.wrappers.autoreset import AutoResetWrapper from rlberry.wrappers.discrete2onehot import DiscreteToOneHotWrapper diff --git a/rlberry/wrappers/tests/test_wrapper_seeding.py b/rlberry/wrappers/tests/test_wrapper_seeding.py index 936db0d14..25f6c9f37 100644 --- a/rlberry/wrappers/tests/test_wrapper_seeding.py +++ b/rlberry/wrappers/tests/test_wrapper_seeding.py @@ -3,10 +3,10 @@ from rlberry.seeding import Seeder from copy import deepcopy -from rlberry.envs.classic_control import MountainCar, Acrobot -from rlberry.envs.finite import Chain -from rlberry.envs.finite import GridWorld -from rlberry.envs.benchmarks.ball_exploration import PBall2D, SimplePBallND +from rlberry_research.envs.classic_control import MountainCar, Acrobot +from rlberry_research.envs.finite import Chain +from rlberry_research.envs.finite import GridWorld +from rlberry_research.envs.benchmarks.ball_exploration import PBall2D, SimplePBallND from rlberry.envs import Wrapper from rlberry.wrappers import RescaleRewardWrapper diff --git a/rlberry/wrappers/tests/test_writer_utils.py b/rlberry/wrappers/tests/test_writer_utils.py index da8edad70..4b3c5da78 100644 --- a/rlberry/wrappers/tests/test_writer_utils.py +++ b/rlberry/wrappers/tests/test_writer_utils.py @@ -1,9 +1,9 @@ import pytest from rlberry.wrappers import WriterWrapper -from rlberry.envs import GridWorld +from rlberry_research.envs import GridWorld -from rlberry.agents import UCBVIAgent +from rlberry_scool.agents import UCBVIAgent @pytest.mark.parametrize("write_scalar", ["action", "reward", "action_and_reward"]) diff --git a/rlberry/wrappers/vis2d.py b/rlberry/wrappers/vis2d.py index 808b64bb8..e0db99bba 100644 --- a/rlberry/wrappers/vis2d.py +++ b/rlberry/wrappers/vis2d.py @@ -1,7 +1,7 @@ from rlberry.envs import Wrapper -from rlberry.exploration_tools.discrete_counter import DiscreteCounter +from rlberry_research.exploration_tools.discrete_counter import DiscreteCounter from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas -from rlberry.rendering.utils import video_write +from rlberry_research.rendering.utils import video_write import gymnasium.spaces as spaces import matplotlib.pyplot as plt diff --git a/scripts/fetch_contributors.py b/scripts/fetch_contributors.py index 5421a85f0..e89d347f7 100644 --- a/scripts/fetch_contributors.py +++ b/scripts/fetch_contributors.py @@ -15,7 +15,6 @@ MEMBERS = [ - "sauxpa", "TimotheeMathieu", "omardrwch", "xuedong", @@ -23,6 +22,7 @@ "yfletberliac", "mmcenta", "menardprr", + "sauxpa", "riccardodv", "AleShi94", "KohlerHECTOR", @@ -30,6 +30,7 @@ "riiswa", "brahimdriss", "RemyDegenne", + "YannBerthelot", ]
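
For downstream users, the import moves in the hunks above amount to pulling agents from `rlberry_scool` / `rlberry_research` while `ExperimentManager` stays in `rlberry` itself. Below is a minimal usage sketch, assuming the `rlberry-scool` and `rlberry-research` packages are installed; the specific agent/environment pairing is illustrative and not taken from this patch.

```python
# Minimal sketch of the post-refactor import layout (illustrative, not part of the patch):
# tabular agents now live in rlberry-scool, environments such as GridWorld in
# rlberry-research, while ExperimentManager remains in rlberry itself.
from rlberry_scool.agents import ValueIterationAgent
from rlberry_research.envs import GridWorld
from rlberry.manager import ExperimentManager

manager = ExperimentManager(
    ValueIterationAgent,   # agent class under test
    (GridWorld, {}),       # environment constructor and its kwargs
    fit_budget=10,
    n_fit=1,
    seed=42,
)
manager.fit()

# Query a fitted agent instance, mirroring the pattern used in the tests above.
env = GridWorld()
observation, info = env.reset()
action = manager.agent_handlers[0].policy(observation)
```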