Make it run on ray 2.7 #3

Open · wants to merge 2 commits into base: master
15 changes: 8 additions & 7 deletions environment.py
@@ -1,7 +1,8 @@
 import numpy as np
-import gym
-from gym import spaces
-from gym.utils import seeding, EzPickle
+import gymnasium as gym
+from gymnasium import spaces
+from gymnasium.utils import seeding, EzPickle
+from typing import Any, Dict, Optional, Tuple

 X = 1
 Y = 0
@@ -63,7 +64,7 @@ def __init__(self, *args, **kwargs):

     def reset(self):
         self.pose = self.random_state.uniform((0, 0), self.world_shape)
-        self.goal = self.random_state.randint((0, 0), self.world_shape)
+        self.goal = self.random_state.integers((0, 0), self.world_shape)
         self.reached_goal = False
         return [0, 0]

@@ -127,11 +128,11 @@ def seed(self, seed=None):
         self.random_state, seed = seeding.np_random(seed)
         return [seed]

-    def reset(self):
+    def reset(self, seed: Optional[int] = None, options: Optional[Dict[str, Any]] = None) -> Tuple[Dict[str, np.ndarray], Dict[str, Any]]:
         reset_actions = [agent.reset() for agent in self.agents]
         self.goal_poses = [agent.goal for agent in self.agents]
         self.timestep = 0
-        return self.step(reset_actions)[0]
+        return self.step(reset_actions)[0], {}

     def step(self, actions):
         self.timestep += 1
@@ -165,7 +166,7 @@ def step(self, actions):
         info = {"rewards": rewards}
         all_rewards = sum(rewards.values())

-        return obs, all_rewards, done, info
+        return obs, all_rewards, done, False, info

     def render(self, mode="human"):
         top_bot_margin = " " + "-" * self.cfg["world_shape"][Y] * 2 + "\n"
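
For context, the environment changes follow the Gymnasium API: seeding.np_random() now returns a numpy.random.Generator (hence integers() instead of randint()), reset() accepts seed/options and returns an (observation, info) pair, and step() returns a five-tuple with separate terminated and truncated flags. A minimal sketch of that expected shape, using a hypothetical DemoEnv class that is not part of this PR:

# Sketch only: illustrates the Gymnasium reset/step signatures the edited
# environment now targets. DemoEnv and its spaces are placeholders.
import gymnasium as gym
from gymnasium import spaces
from gymnasium.utils import seeding


class DemoEnv(gym.Env):
    def __init__(self):
        self.observation_space = spaces.Box(low=0.0, high=1.0, shape=(2,))
        self.action_space = spaces.Discrete(5)
        self.random_state, _ = seeding.np_random(None)  # np.random.Generator

    def reset(self, seed=None, options=None):
        if seed is not None:
            self.random_state, _ = seeding.np_random(seed)
        obs = self.observation_space.sample()
        return obs, {}  # Gymnasium: (observation, info)

    def step(self, action):
        obs = self.observation_space.sample()
        reward, terminated, truncated = 0.0, False, False
        return obs, reward, terminated, truncated, {}  # five-tuple
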
2 changes: 1 addition & 1 deletion multi_action_dist.py
@@ -1,4 +1,4 @@
-import gym
+import gymnasium as gym
 import numpy as np
 import tree
 from ray.rllib.models.torch.torch_action_dist import (
22 changes: 11 additions & 11 deletions multi_trainer.py
@@ -11,10 +11,10 @@
 from typing import List, Optional, Union
 from typing import Type

-import gym
+import gymnasium as gym
 import numpy as np
 import ray
-from ray.rllib.agents.ppo import PPOTrainer
+from ray.rllib.algorithms.ppo import PPO
 from ray.rllib.algorithms.algorithm import Algorithm
 from ray.rllib.algorithms.ppo import PPOTorchPolicy
 from ray.rllib.algorithms.ppo.ppo_tf_policy import validate_config
@@ -157,10 +157,10 @@ def compute_gae_for_sample_batch(

     # sample_batch[SampleBatch.INFOS] = list of len ROLLOUT_SIZE of which every element is
     # {'rewards': {0: -0.077463925, 1: -0.0029145998, 2: -0.08233316}} if there are 3 agents
-
+    # Note(cmarschner): rewards key missing for first entry for some reason
     samplebatch_infos_rewards = concat_samples(
         [
-            SampleBatch({str(k): [np.float32(v)] for k, v in s["rewards"].items()})
+            SampleBatch({str(k): [np.float32(v)] for k, v in s.get("rewards", {0: 0, 1: 0, 2: 0}).items()})
             for s in sample_batch[SampleBatch.INFOS]
             # s = {'rewards': {0: -0.077463925, 1: -0.0029145998, 2: -0.08233316}} if there are 3 agents
         ]
@@ -172,7 +172,7 @@
     if not isinstance(policy.action_space, gym.spaces.tuple.Tuple):
         raise InvalidActionSpace("Expect tuple action space")

-    keys_to_overwirte = [
+    keys_to_overwrite = [
         SampleBatch.REWARDS,
         SampleBatch.VF_PREDS,
         Postprocessing.ADVANTAGES,
@@ -182,7 +182,7 @@
     original_batch = sample_batch.copy()

     # We prepare the sample batch to contain the agent batches
-    for k in keys_to_overwirte:
+    for k in keys_to_overwrite:
         sample_batch[k] = np.zeros((len(original_batch), n_agents), dtype=np.float32)

     if original_batch[SampleBatch.DONES][-1]:
@@ -223,7 +223,7 @@ def compute_gae_for_sample_batch(
             use_critic=policy.config.get("use_critic", True),
         )

-        for k in keys_to_overwirte:
+        for k in keys_to_overwrite:
             sample_batch[k][:, agent_index] = sample_batch_agent[k]

     return sample_batch
@@ -496,15 +496,15 @@ def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
         )


-class MultiPPOTrainer(PPOTrainer, ABC):
-    @override(PPOTrainer)
+class MultiPPOTrainer(PPO, ABC):
+    @override(PPO)
     def get_default_policy_class(self, config):
         return MultiPPOTorchPolicy

-    @override(PPOTrainer)
+    @override(PPO)
     def training_step(self) -> ResultDict:
         # Collect SampleBatches from sample workers until we have a full batch.
-        if self._by_agent_steps:
+        if self.config.count_steps_by == "agent_steps":
             assert False
             train_batch = synchronous_parallel_sample(
                 worker_set=self.workers, max_agent_steps=self.config["train_batch_size"]
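
The trainer changes are mostly renames (ray.rllib.agents.ppo.PPOTrainer becomes ray.rllib.algorithms.ppo.PPO in Ray 2.x) plus a defensive lookup for the occasionally missing "rewards" key in the INFOS column. A small sketch, not part of the PR, of how that lookup flattens per-agent rewards into one SampleBatch column per agent; the toy infos list and the three-agent {0: 0, 1: 0, 2: 0} default are assumptions mirroring the fallback above:

# Sketch only: reproduces the INFOS -> per-agent reward columns step with toy data.
import numpy as np
from ray.rllib.policy.sample_batch import SampleBatch, concat_samples

infos = [
    {},  # the first entry may lack the "rewards" key
    {"rewards": {0: -0.077, 1: -0.003, 2: -0.082}},
]

samplebatch_infos_rewards = concat_samples(
    [
        SampleBatch(
            {str(k): [np.float32(v)]
             for k, v in s.get("rewards", {0: 0, 1: 0, 2: 0}).items()}
        )
        for s in infos
    ]
)
print(samplebatch_infos_rewards["0"])  # one reward per timestep for agent 0
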
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,2 +1,2 @@
-ray[rllib]==2.1.0
+ray[rllib]==2.7.0
 torch==1.13.0
14 changes: 11 additions & 3 deletions train.py
@@ -13,7 +13,7 @@


 def train(
-    share_observations=True, use_beta=True, action_space="discrete", goal_shift=1
+    share_observations=True, use_beta=True, action_space="discrete", goal_shift=1, num_gpus=1
 ):
     ray.init()

@@ -35,6 +35,8 @@ def train(
         # )],
         stop={"training_iteration": 30},
         config={
+            "_enable_learner_api": False,
+            "_enable_rl_module_api": False,
             "framework": "torch",
             "env": "demo_env",
             "kl_coeff": 0.0,
@@ -45,7 +47,7 @@ def train(
             "rollout_fragment_length": 1250,
             "sgd_minibatch_size": 2048,
             "num_sgd_iter": 16,
-            "num_gpus": 1,
+            "num_gpus": num_gpus,
             "num_workers": 8,
             "num_envs_per_worker": 1,
             "lr": 5e-4,
@@ -73,7 +75,6 @@ def train(
         },
     )

-
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="RLLib multi-agent with shared NN demo."
@@ -103,11 +104,18 @@ def train(
         choices=range(0, 2),
         help="Goal shift offset (0 means that each agent moves to its own goal, 1 to its neighbor, etc.)",
     )
+    parser.add_argument(
+        "--num_gpus",
+        type=int,
+        default=1,
+        help="Number of GPUs to use.",
+    )

     args = parser.parse_args()
     train(
         share_observations=not args.disable_sharing,
         use_beta=not args.disable_beta,
         action_space=args.action_space,
         goal_shift=args.goal_shift,
+        num_gpus=args.num_gpus,
     )
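
The two private flags added to the config, _enable_learner_api and _enable_rl_module_api, appear to opt out of the newer RLModule/Learner stack so the custom policy and model classes keep working on Ray 2.7, while the new num_gpus argument simply makes the GPU count configurable from the command line. A hypothetical CPU-only invocation of the updated entry point (sketch, not part of the PR; assumes train.py is importable from the repo root):

# Sketch only: run the demo without a GPU via the new num_gpus parameter.
# Roughly equivalent to: python train.py --num_gpus 0
from train import train

train(
    share_observations=True,
    use_beta=True,
    action_space="discrete",
    goal_shift=1,
    num_gpus=0,  # added by this PR; passed through to the RLlib "num_gpus" setting
)
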