Make it run on ray 2.7 #3

Open · wants to merge 2 commits into base: master
15 changes: 8 additions & 7 deletions environment.py
@@ -1,7 +1,8 @@
 import numpy as np
-import gym
-from gym import spaces
-from gym.utils import seeding, EzPickle
+import gymnasium as gym
+from gymnasium import spaces
+from gymnasium.utils import seeding, EzPickle
+from typing import Any, Dict, Optional, Tuple

 X = 1
 Y = 0
@@ -63,7 +64,7 @@ def __init__(self, *args, **kwargs):

     def reset(self):
         self.pose = self.random_state.uniform((0, 0), self.world_shape)
-        self.goal = self.random_state.randint((0, 0), self.world_shape)
+        self.goal = self.random_state.integers((0, 0), self.world_shape)
         self.reached_goal = False
         return [0, 0]

@@ -127,11 +128,11 @@ def seed(self, seed=None):
         self.random_state, seed = seeding.np_random(seed)
         return [seed]

-    def reset(self):
+    def reset(self, seed: Optional[int] = None, options: Optional[Dict[str, Any]] = None) -> Tuple[Dict[str, np.ndarray], Dict[str, Any]]:
         reset_actions = [agent.reset() for agent in self.agents]
         self.goal_poses = [agent.goal for agent in self.agents]
         self.timestep = 0
-        return self.step(reset_actions)[0]
+        return self.step(reset_actions)[0], {}

     def step(self, actions):
         self.timestep += 1
@@ -165,7 +166,7 @@ def step(self, actions):
         info = {"rewards": rewards}
         all_rewards = sum(rewards.values())

-        return obs, all_rewards, done, info
+        return obs, all_rewards, done, False, info

     def render(self, mode="human"):
         top_bot_margin = " " + "-" * self.cfg["world_shape"][Y] * 2 + "\n"
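
For context, the environment changes follow the Gymnasium API: seeding.np_random() now returns a numpy.random.Generator (hence integers() instead of randint()), reset() accepts seed/options and returns an (observation, info) pair, and step() returns a five-tuple with separate terminated and truncated flags. A minimal sketch of that expected shape, using a hypothetical DemoEnv class that is not part of this PR:

# Sketch only: illustrates the Gymnasium reset/step signatures the edited
# environment now targets. DemoEnv and its spaces are placeholders.
import gymnasium as gym
from gymnasium import spaces
from gymnasium.utils import seeding


class DemoEnv(gym.Env):
    def __init__(self):
        self.observation_space = spaces.Box(low=0.0, high=1.0, shape=(2,))
        self.action_space = spaces.Discrete(5)
        self.random_state, _ = seeding.np_random(None)  # np.random.Generator

    def reset(self, seed=None, options=None):
        if seed is not None:
            self.random_state, _ = seeding.np_random(seed)
        obs = self.observation_space.sample()
        return obs, {}  # Gymnasium: (observation, info)

    def step(self, action):
        obs = self.observation_space.sample()
        reward, terminated, truncated = 0.0, False, False
        return obs, reward, terminated, truncated, {}  # five-tuple
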
2 changes: 1 addition & 1 deletion multi_action_dist.py
@@ -1,4 +1,4 @@
-import gym
+import gymnasium as gym
 import numpy as np
 import tree
 from ray.rllib.models.torch.torch_action_dist import (
22 changes: 11 additions & 11 deletions multi_trainer.py
@@ -11,10 +11,10 @@
 from typing import List, Optional, Union
 from typing import Type

-import gym
+import gymnasium as gym
 import numpy as np
 import ray
-from ray.rllib.agents.ppo import PPOTrainer
+from ray.rllib.algorithms.ppo import PPO
 from ray.rllib.algorithms.algorithm import Algorithm
 from ray.rllib.algorithms.ppo import PPOTorchPolicy
 from ray.rllib.algorithms.ppo.ppo_tf_policy import validate_config
@@ -157,10 +157,10 @@ def compute_gae_for_sample_batch(

     # sample_batch[SampleBatch.INFOS] = list of len ROLLOUT_SIZE of which every element is
     # {'rewards': {0: -0.077463925, 1: -0.0029145998, 2: -0.08233316}} if there are 3 agents
-
+    # Note(cmarschner): rewards key missing for first entry for some reason
     samplebatch_infos_rewards = concat_samples(
         [
-            SampleBatch({str(k): [np.float32(v)] for k, v in s["rewards"].items()})
+            SampleBatch({str(k): [np.float32(v)] for k, v in s.get("rewards", {0: 0, 1: 0, 2: 0}).items()})
             for s in sample_batch[SampleBatch.INFOS]
             # s = {'rewards': {0: -0.077463925, 1: -0.0029145998, 2: -0.08233316}} if there are 3 agents
         ]
@@ -172,7 +172,7 @@
     if not isinstance(policy.action_space, gym.spaces.tuple.Tuple):
         raise InvalidActionSpace("Expect tuple action space")

-    keys_to_overwirte = [
+    keys_to_overwrite = [
         SampleBatch.REWARDS,
         SampleBatch.VF_PREDS,
         Postprocessing.ADVANTAGES,
@@ -182,7 +182,7 @@
     original_batch = sample_batch.copy()

     # We prepare the sample batch to contain the agent batches
-    for k in keys_to_overwirte:
+    for k in keys_to_overwrite:
         sample_batch[k] = np.zeros((len(original_batch), n_agents), dtype=np.float32)

     if original_batch[SampleBatch.DONES][-1]:
@@ -223,7 +223,7 @@ def compute_gae_for_sample_batch(
             use_critic=policy.config.get("use_critic", True),
         )

-        for k in keys_to_overwirte:
+        for k in keys_to_overwrite:
             sample_batch[k][:, agent_index] = sample_batch_agent[k]

     return sample_batch
@@ -496,15 +496,15 @@ def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
         )


-class MultiPPOTrainer(PPOTrainer, ABC):
-    @override(PPOTrainer)
+class MultiPPOTrainer(PPO, ABC):
+    @override(PPO)
     def get_default_policy_class(self, config):
         return MultiPPOTorchPolicy

-    @override(PPOTrainer)
+    @override(PPO)
     def training_step(self) -> ResultDict:
         # Collect SampleBatches from sample workers until we have a full batch.
-        if self._by_agent_steps:
+        if self.config.count_steps_by == "agent_steps":
             assert False
             train_batch = synchronous_parallel_sample(
                 worker_set=self.workers, max_agent_steps=self.config["train_batch_size"]
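
The trainer changes are mostly renames (ray.rllib.agents.ppo.PPOTrainer becomes ray.rllib.algorithms.ppo.PPO in Ray 2.x) plus a defensive lookup for the occasionally missing "rewards" key in the INFOS column. A small sketch, not part of the PR, of how that lookup flattens per-agent rewards into one SampleBatch column per agent; the toy infos list and the three-agent {0: 0, 1: 0, 2: 0} default are assumptions mirroring the fallback above:

# Sketch only: reproduces the INFOS -> per-agent reward columns step with toy data.
import numpy as np
from ray.rllib.policy.sample_batch import SampleBatch, concat_samples

infos = [
    {},  # the first entry may lack the "rewards" key
    {"rewards": {0: -0.077, 1: -0.003, 2: -0.082}},
]

samplebatch_infos_rewards = concat_samples(
    [
        SampleBatch(
            {str(k): [np.float32(v)]
             for k, v in s.get("rewards", {0: 0, 1: 0, 2: 0}).items()}
        )
        for s in infos
    ]
)
print(samplebatch_infos_rewards["0"])  # one reward per timestep for agent 0
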
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,2 +1,2 @@
-ray[rllib]==2.1.0
+ray[rllib]==2.7.0
 torch==1.13.0
14 changes: 11 additions & 3 deletions train.py
@@ -13,7 +13,7 @@


 def train(
-    share_observations=True, use_beta=True, action_space="discrete", goal_shift=1
+    share_observations=True, use_beta=True, action_space="discrete", goal_shift=1, num_gpus=1
 ):
     ray.init()

@@ -35,6 +35,8 @@ def train(
         # )],
         stop={"training_iteration": 30},
         config={
+            "_enable_learner_api": False,
+            "_enable_rl_module_api": False,
             "framework": "torch",
             "env": "demo_env",
             "kl_coeff": 0.0,
@@ -45,7 +47,7 @@ def train(
             "rollout_fragment_length": 1250,
             "sgd_minibatch_size": 2048,
             "num_sgd_iter": 16,
-            "num_gpus": 1,
+            "num_gpus": num_gpus,
             "num_workers": 8,
             "num_envs_per_worker": 1,
             "lr": 5e-4,
@@ -73,7 +75,6 @@ def train(
         },
     )

-
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="RLLib multi-agent with shared NN demo."
@@ -103,11 +104,18 @@ def train(
         choices=range(0, 2),
         help="Goal shift offset (0 means that each agent moves to its own goal, 1 to its neighbor, etc.)",
     )
+    parser.add_argument(
+        "--num_gpus",
+        type=int,
+        default=1,
+        help="Number of GPUs to use.",
+    )

     args = parser.parse_args()
     train(
         share_observations=not args.disable_sharing,
         use_beta=not args.disable_beta,
         action_space=args.action_space,
         goal_shift=args.goal_shift,
+        num_gpus=args.num_gpus,
     )
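
The two private flags added to the config, _enable_learner_api and _enable_rl_module_api, appear to opt out of the newer RLModule/Learner stack so the custom policy and model classes keep working on Ray 2.7, while the new num_gpus argument simply makes the GPU count configurable from the command line. A hypothetical CPU-only invocation of the updated entry point (sketch, not part of the PR; assumes train.py is importable from the repo root):

# Sketch only: run the demo without a GPU via the new num_gpus parameter.
# Roughly equivalent to: python train.py --num_gpus 0
from train import train

train(
    share_observations=True,
    use_beta=True,
    action_space="discrete",
    goal_shift=1,
    num_gpus=0,  # added by this PR; passed through to the RLlib "num_gpus" setting
)
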