Commit

Merge branch 'vwxyzjn:master' into master
noahfarr authored Nov 17, 2024
2 parents bc29059 + e648ee2 commit 52e99b3
Showing 21 changed files with 4,156 additions and 4 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/tests.yaml
@@ -12,7 +12,7 @@ jobs:
matrix:
python-version: ["3.8", "3.9", "3.10"]
poetry-version: ["1.7"]
os: [ubuntu-22.04, macos-latest, windows-latest]
os: [ubuntu-22.04, windows-latest]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v2
@@ -55,7 +55,7 @@ jobs:
matrix:
python-version: ["3.8", "3.9", "3.10"]
poetry-version: ["1.7"]
os: [ubuntu-22.04, macos-latest, windows-latest]
os: [ubuntu-22.04, windows-latest]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v2
@@ -91,7 +91,7 @@ jobs:
matrix:
python-version: ["3.8", "3.9", "3.10"]
poetry-version: ["1.7"]
os: [ubuntu-22.04, macos-latest, windows-latest]
os: [ubuntu-22.04, windows-latest]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v2
8 changes: 7 additions & 1 deletion README.md
@@ -28,7 +28,11 @@ CleanRL is a Deep Reinforcement Learning library that provides high-quality sing

You can read more about CleanRL in our [JMLR paper](https://www.jmlr.org/papers/volume23/21-1342/21-1342.pdf) and [documentation](https://docs.cleanrl.dev/).

CleanRL only contains implementations of **online** deep reinforcement learning algorithms. If you are looking for **offline** algorithms, please check out [corl-team/CORL](https://github.com/corl-team/CORL), which shares a similar design philosophy as CleanRL.
Notable CleanRL-related projects:

* [corl-team/CORL](https://github.com/corl-team/CORL): Offline RL algorithms implemented in CleanRL style
* [pytorch-labs/LeanRL](https://github.com/pytorch-labs/LeanRL): Fast, optimized PyTorch implementations of CleanRL algorithms using CUDAGraphs.


> ℹ️ **Support for Gymnasium**: [Farama-Foundation/Gymnasium](https://github.com/Farama-Foundation/Gymnasium) is the next generation of [`openai/gym`](https://github.com/openai/gym) that will continue to be maintained and introduce new features. Please see their [announcement](https://farama.org/Announcing-The-Farama-Foundation) for further detail. We are migrating to `gymnasium` and the progress can be tracked in [vwxyzjn/cleanrl#277](https://github.com/vwxyzjn/cleanrl/pull/277).
@@ -85,6 +89,7 @@ pip install -r requirements/requirements-pettingzoo.txt
pip install -r requirements/requirements-jax.txt
pip install -r requirements/requirements-docs.txt
pip install -r requirements/requirements-cloud.txt
pip install -r requirements/requirements-memory_gym.txt
```

To run training scripts in other games:
@@ -140,6 +145,7 @@ You may also use a prebuilt development environment hosted in Gitpod:
| | [`ppo_atari_multigpu.py`](https://github.com/vwxyzjn/cleanrl/blob/master/cleanrl/ppo_atari_multigpu.py), [docs](https://docs.cleanrl.dev/rl-algorithms/ppo/#ppo_atari_multigpupy)
| | [`ppo_pettingzoo_ma_atari.py`](https://github.com/vwxyzjn/cleanrl/blob/master/cleanrl/ppo_pettingzoo_ma_atari.py), [docs](https://docs.cleanrl.dev/rl-algorithms/ppo/#ppo_pettingzoo_ma_ataripy)
| | [`ppo_continuous_action_isaacgym.py`](https://github.com/vwxyzjn/cleanrl/blob/master/cleanrl/ppo_continuous_action_isaacgym/ppo_continuous_action_isaacgym.py), [docs](https://docs.cleanrl.dev/rl-algorithms/ppo/#ppo_continuous_action_isaacgympy)
| | [`ppo_trxl.py`](https://github.com/vwxyzjn/cleanrl/blob/master/cleanrl/ppo_trxl/ppo_trxl.py), [docs](https://docs.cleanrl.dev/rl-algorithms/ppo-trxl/)
|[Deep Q-Learning (DQN)](https://web.stanford.edu/class/psych209/Readings/MnihEtAlHassibis15NatureControlDeepRL.pdf) | [`dqn.py`](https://github.com/vwxyzjn/cleanrl/blob/master/cleanrl/dqn.py), [docs](https://docs.cleanrl.dev/rl-algorithms/dqn/#dqnpy) |
| | [`dqn_atari.py`](https://github.com/vwxyzjn/cleanrl/blob/master/cleanrl/dqn_atari.py), [docs](https://docs.cleanrl.dev/rl-algorithms/dqn/#dqn_ataripy) |
| | [`dqn_jax.py`](https://github.com/vwxyzjn/cleanrl/blob/master/cleanrl/dqn_jax.py), [docs](https://docs.cleanrl.dev/rl-algorithms/dqn/#dqn_jaxpy) |
52 changes: 52 additions & 0 deletions benchmark/ppo_trxl.sh
@@ -0,0 +1,52 @@
# export WANDB_ENTITY=openrlbenchmark

cd cleanrl/ppo_trxl
poetry install
OMP_NUM_THREADS=4 poetry run python -m cleanrl_utils.benchmark \
--env-ids MortarMayhem-Grid-v0 \
--command "python ./cleanrl/ppo_trxl/ppo_trxl.py --track --norm_adv --trxl_memory_length 119 --total_timesteps 100000000" \
--num-seeds 3 \
--workers 32 \
--slurm-template-path benchmark/cleanrl_1gpu.slurm_template

OMP_NUM_THREADS=4 poetry run python -m cleanrl_utils.benchmark \
--env-ids MortarMayhem-v0 \
--command "python ./cleanrl/ppo_trxl/ppo_trxl.py --track --reconstruction_coef 0.1 --trxl_memory_length 275" \
--num-seeds 3 \
--workers 32 \
--slurm-template-path benchmark/cleanrl_1gpu.slurm_template

OMP_NUM_THREADS=4 poetry run python -m cleanrl_utils.benchmark \
--env-ids MysteryPath-Grid-v0 \
--command "python ./cleanrl/ppo_trxl/ppo_trxl.py --track --trxl_memory_length 96 --total_timesteps 100000000" \
--num-seeds 3 \
--workers 32 \
--slurm-template-path benchmark/cleanrl_1gpu.slurm_template

OMP_NUM_THREADS=4 poetry run python -m cleanrl_utils.benchmark \
--env-ids MysteryPath-v0 \
--command "python ./cleanrl/ppo_trxl/ppo_trxl.py --track --trxl_memory_length 256" \
--num-seeds 3 \
--workers 32 \
--slurm-template-path benchmark/cleanrl_1gpu.slurm_template

OMP_NUM_THREADS=4 poetry run python -m cleanrl_utils.benchmark \
--env-ids SearingSpotlights-v0 \
--command "python ./cleanrl/ppo_trxl/ppo_trxl.py --track --reconstruction_coef 0.1 --trxl_memory_length 256" \
--num-seeds 3 \
--workers 32 \
--slurm-template-path benchmark/cleanrl_1gpu.slurm_template

OMP_NUM_THREADS=4 poetry run python -m cleanrl_utils.benchmark \
--env-ids Endless-SearingSpotlights-v0 \
--command "python ./cleanrl/ppo_trxl/ppo_trxl.py --track --reconstruction_coef 0.1 --trxl_memory_length 256 --total_timesteps 350000000" \
--num-seeds 3 \
--workers 32 \
--slurm-template-path benchmark/cleanrl_1gpu.slurm_template

OMP_NUM_THREADS=4 poetry run python -m cleanrl_utils.benchmark \
--env-ids Endless-MortarMayhem-v0 Endless-MysteryPath-v0 \
--command "python ./cleanrl/ppo_trxl/ppo_trxl.py --track --trxl_memory_length 256 --total_timesteps 350000000" \
--num-seeds 3 \
--workers 32 \
--slurm-template-path benchmark/cleanrl_1gpu.slurm_template
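
For a quick local sanity check outside the Slurm benchmark harness above, a single-seed run might look like the following sketch. It assumes `ppo_trxl.py` exposes an `--env-id` flag (the benchmark utility substitutes the environment ID into the command this way) and that the `cleanrl/ppo_trxl` poetry project is installed; any flag not shown in the benchmark commands above is an assumption.

```bash
# Hypothetical single-seed local run of the Transformer-XL PPO script on one Memory Gym env.
# --env-id is assumed here; --trxl_memory_length and --total_timesteps mirror the benchmark command above.
cd cleanrl/ppo_trxl
poetry install
poetry run python ppo_trxl.py --env-id MortarMayhem-Grid-v0 --trxl_memory_length 119 --total_timesteps 100000000
```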
32 changes: 32 additions & 0 deletions benchmark/pqn.sh
@@ -0,0 +1,32 @@
poetry install
OMP_NUM_THREADS=1 xvfb-run -a poetry run python -m cleanrl_utils.benchmark \
--env-ids CartPole-v1 Acrobot-v1 MountainCar-v0 \
--command "poetry run python cleanrl/pqn.py --no_cuda --track" \
--num-seeds 3 \
--workers 9 \
--slurm-gpus-per-task 1 \
--slurm-ntasks 1 \
--slurm-total-cpus 10 \
--slurm-template-path benchmark/cleanrl_1gpu.slurm_template

poetry install -E envpool
poetry run python -m cleanrl_utils.benchmark \
--env-ids Breakout-v5 SpaceInvaders-v5 BeamRider-v5 Pong-v5 MsPacman-v5 \
--command "poetry run python cleanrl/pqn_atari_envpool.py --track" \
--num-seeds 3 \
--workers 9 \
--slurm-gpus-per-task 1 \
--slurm-ntasks 1 \
--slurm-total-cpus 10 \
--slurm-template-path benchmark/cleanrl_1gpu.slurm_template

poetry install -E envpool
poetry run python -m cleanrl_utils.benchmark \
--env-ids Breakout-v5 SpaceInvaders-v5 BeamRider-v5 Pong-v5 MsPacman-v5 \
--command "poetry run python cleanrl/pqn_atari_envpool_lstm.py --track" \
--num-seeds 3 \
--workers 9 \
--slurm-gpus-per-task 1 \
--slurm-ntasks 1 \
--slurm-total-cpus 10 \
--slurm-template-path benchmark/cleanrl_1gpu.slurm_template
50 changes: 50 additions & 0 deletions benchmark/pqn_plot.sh
@@ -0,0 +1,50 @@

python -m openrlbenchmark.rlops \
--filters '?we=rogercreus&wpn=cleanRL&ceik=env_id&cen=exp_name&metric=charts/episodic_return' \
'pqn?tag=pr-472&cl=CleanRL PQN' \
--env-ids CartPole-v1 Acrobot-v1 MountainCar-v0 \
--no-check-empty-runs \
--pc.ncols 3 \
--pc.ncols-legend 2 \
--output-filename benchmark/cleanrl/pqn \
--scan-history

python -m openrlbenchmark.rlops \
--filters '?we=rogercreus&wpn=cleanRL&ceik=env_id&cen=exp_name&metric=charts/episodic_return' \
'pqn_atari_envpool?tag=pr-472&cl=CleanRL PQN' \
--env-ids Breakout-v5 SpaceInvaders-v5 BeamRider-v5 Pong-v5 MsPacman-v5 \
--no-check-empty-runs \
--pc.ncols 3 \
--pc.ncols-legend 3 \
--rliable \
--rc.score_normalization_method maxmin \
--rc.normalized_score_threshold 1.0 \
--rc.sample_efficiency_plots \
--rc.sample_efficiency_and_walltime_efficiency_method Median \
--rc.performance_profile_plots \
--rc.aggregate_metrics_plots \
--rc.sample_efficiency_num_bootstrap_reps 10 \
--rc.performance_profile_num_bootstrap_reps 10 \
--rc.interval_estimates_num_bootstrap_reps 10 \
--output-filename static/0compare \
--scan-history

python -m openrlbenchmark.rlops \
--filters '?we=rogercreus&wpn=cleanRL&ceik=env_id&cen=exp_name&metric=charts/episodic_return' \
'pqn_atari_envpool_lstm?tag=pr-472&cl=CleanRL PQN' \
--env-ids Breakout-v5 SpaceInvaders-v5 BeamRider-v5 MsPacman-v5 \
--no-check-empty-runs \
--pc.ncols 3 \
--pc.ncols-legend 3 \
--rliable \
--rc.score_normalization_method maxmin \
--rc.normalized_score_threshold 1.0 \
--rc.sample_efficiency_plots \
--rc.sample_efficiency_and_walltime_efficiency_method Median \
--rc.performance_profile_plots \
--rc.aggregate_metrics_plots \
--rc.sample_efficiency_num_bootstrap_reps 10 \
--rc.performance_profile_num_bootstrap_reps 10 \
--rc.interval_estimates_num_bootstrap_reps 10 \
--output-filename static/0compare \
--scan-history
91 changes: 91 additions & 0 deletions cleanrl/ppo_trxl/enjoy.py
@@ -0,0 +1,91 @@
from dataclasses import dataclass

import gymnasium as gym
import torch
import tyro
from ppo_trxl import Agent, make_env


@dataclass
class Args:
hub: bool = False
"""whether to load the model from the huggingface hub or from the local disk"""
name: str = "Endless-MortarMayhem-v0_12.nn"
"""path to the model file"""


if __name__ == "__main__":
# Parse command line arguments and retrieve model path
cli_args = tyro.cli(Args)
if cli_args.hub:
try:
from huggingface_hub import hf_hub_download

path = hf_hub_download(repo_id="LilHairdy/cleanrl_memory_gym", filename=cli_args.name)
        except Exception as e:
            raise RuntimeError(
                "Cannot load the model from the Hugging Face Hub. Please install the huggingface_hub PyPI package and verify the model name. You can also download the model from the Hub manually and load it from disk."
            ) from e
else:
path = cli_args.name

# Load the pre-trained model and the original args used to train it
checkpoint = torch.load(path)
args = checkpoint["args"]
args = type("Args", (), args)

# Init environment and reset
env = make_env(args.env_id, 0, False, "", "human")()
obs, _ = env.reset()
env.render()

# Determine maximum episode steps
max_episode_steps = env.spec.max_episode_steps
if not max_episode_steps:
max_episode_steps = env.max_episode_steps
if max_episode_steps <= 0:
max_episode_steps = 1024 # Memory Gym envs have max_episode_steps set to -1
        # Max episode steps affects the positional encoding, so make sure to set it accordingly

# Setup agent and load its model parameters
action_space_shape = (
(env.action_space.n,) if isinstance(env.action_space, gym.spaces.Discrete) else tuple(env.action_space.nvec)
)
agent = Agent(args, env.observation_space, action_space_shape, max_episode_steps)
agent.load_state_dict(checkpoint["model_weights"])

# Setup Transformer-XL memory, mask and indices
memory = torch.zeros((1, max_episode_steps, args.trxl_num_layers, args.trxl_dim), dtype=torch.float32)
memory_mask = torch.tril(torch.ones((args.trxl_memory_length, args.trxl_memory_length)), diagonal=-1)
repetitions = torch.repeat_interleave(
torch.arange(0, args.trxl_memory_length).unsqueeze(0), args.trxl_memory_length - 1, dim=0
).long()
memory_indices = torch.stack(
[torch.arange(i, i + args.trxl_memory_length) for i in range(max_episode_steps - args.trxl_memory_length + 1)]
).long()
memory_indices = torch.cat((repetitions, memory_indices))
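    # memory_indices[t] lists the episode steps whose cached activations form the TrXL attention window
    # at step t: the first trxl_memory_length steps all see the initial window [0, trxl_memory_length),
    # after which the window slides forward by one step per timestep.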

# Run episode
done = False
t = 0
while not done:
# Prepare observation and memory
obs = torch.Tensor(obs).unsqueeze(0)
memory_window = memory[0, memory_indices[t].unsqueeze(0)]
t_ = max(0, min(t, args.trxl_memory_length - 1))
mask = memory_mask[t_].unsqueeze(0)
indices = memory_indices[t].unsqueeze(0)
# Forward agent
action, _, _, _, new_memory = agent.get_action_and_value(obs, memory_window, mask, indices)
memory[:, t] = new_memory
# Step
obs, reward, termination, truncation, info = env.step(action.cpu().squeeze().numpy())
env.render()
done = termination or truncation
t += 1

if "r" in info["episode"].keys():
print(f"Episode return: {info['episode']['r'][0]}, Episode length: {info['episode']['l'][0]}")
else:
print(f"Episode return: {info['reward']}, Episode length: {info['length']}")
env.close()
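
As a usage reference for the new script, a hedged invocation sketch follows. It assumes the `cleanrl/ppo_trxl` poetry project is installed and, for `--hub`, the optional `huggingface_hub` package; the `--hub` and `--name` flags come from the `Args` dataclass above (tyro derives the flag names from the dataclass fields).

```bash
cd cleanrl/ppo_trxl
poetry install
poetry run pip install huggingface_hub  # only needed when passing --hub

# Download the default checkpoint from the LilHairdy/cleanrl_memory_gym repo and watch one episode
poetry run python enjoy.py --hub --name Endless-MortarMayhem-v0_12.nn

# Or load a checkpoint that already exists on local disk
poetry run python enjoy.py --name ./Endless-MortarMayhem-v0_12.nn
```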
