43 changes: 16 additions & 27 deletions oai_agents/agents/base_agent.py
@@ -1,8 +1,8 @@
from oai_agents.agents.agent_utils import load_agent, CustomAgent
from oai_agents.agents.agent_utils import load_agent
from oai_agents.common.arguments import get_args_to_save, set_args_from_load
from oai_agents.common.state_encodings import ENCODING_SCHEMES
from oai_agents.common.subtasks import get_doable_subtasks, Subtasks
from oai_agents.common.tags import AgentPerformance, KeyCheckpoints
from oai_agents.common.tags import AgentPerformance, KeyCheckpoints, TeammatesCollection
from oai_agents.common.checked_model_name_handler import CheckedModelNameHandler
# from oai_agents.gym_environments.base_overcooked_env import USEABLE_COUNTERS

@@ -318,6 +318,7 @@ def predict(self, obs, state=None, episode_start=None, deterministic=False):
# Updated to include action masking
self.policy.set_training_mode(False)
obs, vectorized_env = self.policy.obs_to_tensor(obs)

with th.no_grad():
if 'subtask_mask' in obs and np.prod(obs['subtask_mask'].shape) == np.prod(self.policy.action_space.n):
dist = self.policy.get_distribution(obs, action_masks=obs['subtask_mask'])
@@ -378,10 +379,6 @@ def __init__(self, name, args, seed=None):
if th.cuda.is_available():
th.cuda.manual_seed_all(seed)
th.backends.cudnn.deterministic = True

self.eval_teammates_collection = {}
self.teammates_collection = {}

# For environment splits while training
self.n_layouts = len(self.args.layout_names)
self.splits = []
@@ -422,16 +419,13 @@ def evaluate(self, eval_agent, num_eps_per_layout_per_tm=5, visualize=False, tim
selected_p_indexes = random.sample(range(self.args.num_players), min(3, self.args.num_players))

for _, env in enumerate(self.eval_envs):

rew_per_layout_per_teamtype[env.layout_name] = {
teamtype: [] for teamtype in self.eval_teammates_collection[env.layout_name]
teamtype: [] for teamtype in env.teammates_collection[TeammatesCollection.EVAL][env.layout_name]
}
rew_per_layout[env.layout_name] = 0

teamtypes_population = self.eval_teammates_collection[env.layout_name]

for teamtype in teamtypes_population:
teammates = teamtypes_population[teamtype][np.random.randint(len(teamtypes_population[teamtype]))]
env.set_teammates(teammates)
for teamtype in env.teammates_collection[TeammatesCollection.EVAL][env.layout_name]:
env.set_teammates(teamtype=teamtype)

for p_idx in selected_p_indexes:
env.set_reset_p_idx(p_idx)
@@ -455,21 +449,16 @@ def evaluate(self, eval_agent, num_eps_per_layout_per_tm=5, visualize=False, tim
wandb.log({'eval_mean_reward': np.mean(tot_mean_reward), 'timestep': timestep})
return np.mean(tot_mean_reward), rew_per_layout, rew_per_layout_per_teamtype

def set_new_teammates(self, curriculum):
def set_new_teammates(self):
"""
The logic for selecting teammates has been moved to `base_overcooked_env` to support
running environments with the SubProcEnv flag enabled.
`teammates_collection` and `curriculum` are now managed within the environment.
The `set_teammates` method in `base_overcooked_env` selects an appropriate teammate
based on the current curriculum settings.
"""
for i in range(self.args.n_envs):
layout_name = self.env.env_method('get_layout_name', indices=i)[0]
population_teamtypes = self.teammates_collection[layout_name]

teammates = curriculum.select_teammates_for_layout(population_teamtypes=population_teamtypes,
layout=layout_name)

assert len(teammates) == self.args.teammates_len
assert type(teammates) == list

for teammate in teammates:
assert type(teammate) in [SB3Wrapper, CustomAgent]

self.env.env_method('set_teammates', teammates, indices=i)
self.env.env_method('set_teammates', indices=i)


def get_agents(self) -> List[OAIAgent]:
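
To make the refactor above concrete: set_new_teammates no longer samples teammates itself, it only calls env_method('set_teammates', indices=i) and lets each environment do the selection. Below is a minimal, hypothetical sketch of what that environment-side logic might look like; the class name, attribute names, and the 'train'/'eval' keys are placeholders invented for illustration and are not the actual base_overcooked_env implementation.

# Hypothetical sketch of the environment-side teammate selection that
# base_agent.set_new_teammates now delegates to via env_method('set_teammates', ...).
# Class name, attribute names, and the 'train'/'eval' keys are placeholders,
# not the actual base_overcooked_env code.
import numpy as np

class EnvTeammateSelectionSketch:
    def __init__(self, teammates_collection, curriculum, layout_name, teammates_len):
        # teammates_collection: {'train': {layout: {team_type: [[agent, ...], ...]}},
        #                        'eval':  {layout: {team_type: [[agent, ...], ...]}}}
        self.teammates_collection = teammates_collection
        self.curriculum = curriculum
        self.layout_name = layout_name
        self.teammates_len = teammates_len
        self.teammates = None

    def set_teammates(self, teamtype=None):
        if teamtype is not None:
            # Evaluation path: sample one team of the requested type for this layout.
            teams = self.teammates_collection['eval'][self.layout_name][teamtype]
            self.teammates = teams[np.random.randint(len(teams))]
        else:
            # Training path: let the curriculum pick a team for the current layout.
            population = self.teammates_collection['train'][self.layout_name]
            self.teammates = self.curriculum.select_teammates_for_layout(
                population_teamtypes=population, layout=self.layout_name)
        assert len(self.teammates) == self.teammates_len

Keeping the collection and curriculum inside the env is what makes this compatible with SubprocVecEnv, since each worker process holds its own picklable copy of that state.
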
4 changes: 2 additions & 2 deletions oai_agents/agents/hrl.py
@@ -1,9 +1,9 @@
from oai_agents.agents.base_agent import OAIAgent, PolicyClone
from oai_agents.agents.rl import RLAgentTrainer, VEC_ENV_CLS
from oai_agents.agents.rl import RLAgentTrainer
from oai_agents.common.arguments import get_args_to_save, set_args_from_load
from oai_agents.common.subtasks import Subtasks
# from oai_agents.gym_environments.worker_env import OvercookedSubtaskGymEnv
from oai_agents.gym_environments.manager_env import OvercookedManagerGymEnv
# from oai_agents.gym_environments.manager_env import OvercookedManagerGymEnv

from overcooked_ai_py.mdp.overcooked_mdp import Action

6 changes: 3 additions & 3 deletions oai_agents/agents/mep_population_manager.py
@@ -19,7 +19,7 @@ def __init__(self, population_size, args):
self.epoch_timesteps = args.epoch_timesteps # Number of timesteps per training episode
seeds, h_dims = generate_hdim_and_seed(
for_evaluation=args.gen_pop_for_eval,
total_ego_agents=population_size
total_sp_agents=population_size
)

self.population: List[RLAgentTrainer] = []
@@ -204,9 +204,9 @@ def train_population(self, total_timesteps: int, num_of_ckpoints: int, eval_inte

set_input(args=args)

args.total_ego_agents = 4
args.total_sp_agents = 4

manager = MEPPopulationManager(population_size=args.total_ego_agents, args=args)
manager = MEPPopulationManager(population_size=args.total_sp_agents, args=args)
manager.train_population(
total_timesteps=args.pop_total_training_timesteps,
num_of_ckpoints=args.num_of_ckpoints,
106 changes: 28 additions & 78 deletions oai_agents/agents/rl.py
@@ -2,22 +2,21 @@
from oai_agents.agents.base_agent import SB3Wrapper, SB3LSTMWrapper, OAITrainer, OAIAgent
from oai_agents.common.networks import OAISinglePlayerFeatureExtractor
from oai_agents.common.state_encodings import ENCODING_SCHEMES
from oai_agents.common.tags import AgentPerformance, TeamType, TeammatesCollection, KeyCheckpoints
from oai_agents.common.tags import AgentPerformance, TeammatesCollection, KeyCheckpoints
from oai_agents.agents.agent_utils import CustomAgent
from oai_agents.gym_environments.base_overcooked_env import OvercookedGymEnv
from oai_agents.common.checked_model_name_handler import CheckedModelNameHandler
from oai_agents.gym_environments.base_overcooked_env import OvercookedGymEnv


import numpy as np
from stable_baselines3 import PPO, DQN
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from sb3_contrib import RecurrentPPO
import wandb
import os
from typing import Literal

VEC_ENV_CLS = DummyVecEnv #

class RLAgentTrainer(OAITrainer):
''' Train an RL agent to play with a teammates_collection of agents.'''
def __init__(
@@ -64,19 +63,19 @@ def __init__(
self.use_policy_clone = use_policy_clone

self.learner_type = learner_type
self.env, self.eval_envs = self.get_envs(env, eval_envs, deterministic, learner_type, start_timestep)

# teammates_collection and curriculum are passed to the environment instead.
self.env, self.eval_envs = self.get_envs(_env=env, _eval_envs=eval_envs,
deterministic=deterministic, learner_type=learner_type,
start_timestep=start_timestep, teammates_collection=teammates_collection,
curriculum=self.curriculum)

# Episode to start training from (usually 0 unless restarted)
self.start_step = start_step
self.steps = self.start_step
# Cumm. timestep to start training from (usually 0 unless restarted)
self.start_timestep = start_timestep
self.learning_agent, self.agents = self.get_learning_agent(agent)
self.teammates_collection, self.eval_teammates_collection = self.get_teammates_collection(
_tms_clctn = teammates_collection,
learning_agent = self.learning_agent,
train_types = train_types,
eval_types = eval_types
)
self.best_score, self.best_training_rew = -1, float('-inf')

@classmethod
@@ -125,64 +124,6 @@ def get_learning_agent(self, agent):
agents = [learning_agent]
return learning_agent, agents


def get_teammates_collection(self, _tms_clctn, learning_agent, train_types: Optional[List]=None, eval_types:Optional[List]=None):
'''
Returns a dictionary of teammates_collection for training and evaluation
dict
teammates_collection = {
'layout_name': {
'TeamType.HIGH_FIRST': [[agent1, agent2], ...],
'TeamType.MEDIUM_FIRST': [[agent3, agent4], ...],
'TeamType.LOW_FIRST': [[agent5, agent6], ..],
'TeamType.RANDOM': [[agent7, agent8], ...],
},
}
'''
train_types = train_types if train_types is not None else []
eval_types = eval_types if eval_types is not None else []
if _tms_clctn == {}:
_tms_clctn = {
TeammatesCollection.TRAIN: {
layout_name:
{TeamType.SELF_PLAY: [[learning_agent for _ in range(self.teammates_len)]]}
for layout_name in self.args.layout_names
},
TeammatesCollection.EVAL: {
layout_name:
{TeamType.SELF_PLAY: [[learning_agent for _ in range(self.teammates_len)]]}
for layout_name in self.args.layout_names
}
}

else:
for layout in self.args.layout_names:
for tt in _tms_clctn[TeammatesCollection.TRAIN][layout]:
if tt == TeamType.SELF_PLAY:
_tms_clctn[TeammatesCollection.TRAIN][layout][TeamType.SELF_PLAY] = [[learning_agent for _ in range(self.teammates_len)]]
for tt in _tms_clctn[TeammatesCollection.EVAL][layout]:
if tt == TeamType.SELF_PLAY:
_tms_clctn[TeammatesCollection.EVAL][layout][TeamType.SELF_PLAY] = [[learning_agent for _ in range(self.teammates_len)]]

train_teammates_collection = _tms_clctn[TeammatesCollection.TRAIN]
eval_teammates_collection = _tms_clctn[TeammatesCollection.EVAL]

if train_types:
train_teammates_collection = {
layout: {team_type: train_teammates_collection[layout][team_type] for team_type in train_types}
for layout in train_teammates_collection
}
if eval_types:
eval_teammates_collection = {
layout: {team_type: eval_teammates_collection[layout][team_type] for team_type in eval_types}
for layout in eval_teammates_collection
}

self.check_teammates_collection_structure(train_teammates_collection)
self.check_teammates_collection_structure(eval_teammates_collection)
return train_teammates_collection, eval_teammates_collection


def print_tc_helper(self, teammates_collection, message=None):
print("-------------------")
if message:
@@ -193,18 +134,27 @@ def print_tc_helper(self, teammates_collection, message=None):
teammates_c = teammates_collection[layout_name][tag]
for teammates in teammates_c:
for agent in teammates:
print(f'\t{agent.name}, score for layout {layout_name} is: {agent.layout_scores[layout_name]}, start_pos: {agent.get_start_position(layout_name, 0)}, len: {len(teammates)}')
print(f'\t{agent.name}, score for layout {layout_name} is:, start_pos: {agent.get_start_position(layout_name, 0)}, len: {len(teammates)}')
print("-------------------")


def get_envs(self, _env, _eval_envs, deterministic, learner_type, start_timestep: int = 0):
def get_envs(self, _env, _eval_envs, deterministic, learner_type, teammates_collection, curriculum, start_timestep: int = 0):
if self.args.use_multipleprocesses:
VEC_ENV_CLS = SubprocVecEnv
else:
VEC_ENV_CLS = DummyVecEnv

if _env is None:
env_kwargs = {'shape_rewards': True, 'full_init': False, 'stack_frames': self.use_frame_stack,
'deterministic': deterministic,'args': self.args, 'learner_type': learner_type, 'start_timestep': start_timestep}
'deterministic': deterministic,'args': self.args, 'learner_type': learner_type, 'start_timestep': start_timestep,
'teammates_collection': teammates_collection, 'curriculum': curriculum
}
env = make_vec_env(OvercookedGymEnv, n_envs=self.args.n_envs, seed=self.seed, vec_env_cls=VEC_ENV_CLS, env_kwargs=env_kwargs)

eval_envs_kwargs = {'is_eval_env': True, 'horizon': 400, 'stack_frames': self.use_frame_stack,
'deterministic': deterministic, 'args': self.args, 'learner_type': learner_type}
'deterministic': deterministic, 'args': self.args, 'learner_type': learner_type,
'teammates_collection': teammates_collection, 'curriculum': curriculum
}
eval_envs = [OvercookedGymEnv(**{'env_index': i, **eval_envs_kwargs, 'unique_env_idx':self.args.n_envs+i}) for i in range(self.n_layouts)]
else:
env = _env
@@ -310,10 +260,9 @@ def should_evaluate(self, steps):

def log_details(self, experiment_name, total_train_timesteps):
print("Training agent: " + self.name + ", for experiment: " + experiment_name)
self.print_tc_helper(self.teammates_collection, "Train TC")
self.print_tc_helper(self.eval_teammates_collection, "Eval TC")
self.print_tc_helper(self.eval_envs[0].teammates_collection[TeammatesCollection.EVAL], "Eval TC")

[Review comment] Are the teammate collections always going to be uniform?

self.print_tc_helper(self.eval_envs[0].teammates_collection[TeammatesCollection.TRAIN], "Train TC")
self.curriculum.print_curriculum()
print("How Long: ", self.args.how_long)
print(f"Epoch timesteps: {self.epoch_timesteps}")
print(f"Total training timesteps: {total_train_timesteps}")
print(f"Number of environments: {self.n_envs}")
@@ -373,10 +322,11 @@ def train_agents(self, total_train_timesteps, tag_for_returning_agent, resume_ck

while self.learning_agent.num_timesteps < total_train_timesteps:
self.curriculum.update(current_step=self.steps)
self.set_new_teammates(curriculum=self.curriculum)
self.set_new_teammates()

# In each iteration the agent collects n_envs * n_steps experiences. This continues until self.learning_agent.num_timesteps > epoch_timesteps is reached.
self.learning_agent.learn(self.epoch_timesteps)

self.steps += 1

if self.should_evaluate(steps=self.steps):
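
The get_envs change in this file switches the vectorized-env backend on the new --use-multipleprocesses flag and forwards teammates_collection and curriculum through env_kwargs. Here is a standalone sketch of that pattern, assuming a recent Stable-Baselines3/Gymnasium stack; ToyEnv and build_vec_env are placeholders for illustration, not the project's OvercookedGymEnv or get_envs.

# Sketch: pick SubprocVecEnv vs DummyVecEnv from a flag and pass per-env state
# (here, teammates_collection and curriculum) through env_kwargs, as rl.get_envs does.
# ToyEnv is a placeholder environment, not OvercookedGymEnv.
import gymnasium as gym
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv

class ToyEnv(gym.Env):
    def __init__(self, teammates_collection=None, curriculum=None):
        self.observation_space = gym.spaces.Discrete(1)
        self.action_space = gym.spaces.Discrete(2)
        # Per-env state lives inside the env so it survives the move into worker processes.
        self.teammates_collection = teammates_collection
        self.curriculum = curriculum

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        return 0, {}

    def step(self, action):
        return 0, 0.0, True, False, {}

def build_vec_env(use_multipleprocesses: bool, n_envs: int, seed: int, env_kwargs: dict):
    # SubprocVecEnv runs every env in its own process (faster rollouts, but all
    # kwargs must be picklable); DummyVecEnv keeps them in-process for easy debugging.
    vec_env_cls = SubprocVecEnv if use_multipleprocesses else DummyVecEnv
    return make_vec_env(ToyEnv, n_envs=n_envs, seed=seed,
                        vec_env_cls=vec_env_cls, env_kwargs=env_kwargs)

if __name__ == '__main__':
    venv = build_vec_env(False, n_envs=2, seed=0,
                         env_kwargs={'teammates_collection': {}, 'curriculum': None})
    venv.reset()
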
22 changes: 19 additions & 3 deletions oai_agents/common/arguments.py
@@ -14,7 +14,7 @@ def get_arguments(additional_args: Optional[List] = None):
additional_args = additional_args if additional_args is not None else []

parser = argparse.ArgumentParser(description='PyTorch Soft Actor-Critic Args')
parser.add_argument('--layout-names', help='Overcooked maps to use')
parser.add_argument('--layout-names', help='Overcooked maps to use', default='default')
parser.add_argument('--horizon', type=int, default=400, help='Max timesteps in a rollout')
parser.add_argument('--num_stack', type=int, default=3, help='Number of frame stacks to use in training if frame stacks are being used')
parser.add_argument('--encoding-fn', type=str, default='OAI_egocentric',
@@ -98,15 +98,31 @@ def get_arguments(additional_args: Optional[List] = None):
parser.add_argument("--custom-agent-ck-rate-generation", type=int)

parser.add_argument('--gen-pop-for-eval', type=str2bool, default=False, help="Specifies whether to generate a population of agents for evaluation purposes. Currently, this functionality is limited to self-play agents, as support for other methods has not yet been implemented..)")
parser.add_argument("--total-ego-agents", type=int, default=4)
parser.add_argument('--use-cuda', type=str2bool, help="Specifies whether to use cuda for training.")
parser.add_argument('--use-multipleprocesses', type=str2bool, help="SubprocVecEnv vs DummyVecEnv")

parser.add_argument("--total-sp-agents", type=int, default=4)
parser.add_argument("--ck-list-offset", type=int, default=0)

# The next three args are only to run the ultimate baseline exp, I will clean it later
parser.add_argument('--low-perfs', help='code to run ult baseline exp', default='default')
parser.add_argument('--med-perfs', help='code to run ult baseline exp', default='default')
parser.add_argument('--high-perfs', help='code to run ult baseline exp', default='default')


for parser_arg, parser_kwargs in additional_args:
parser.add_argument(parser_arg, **parser_kwargs)

args = parser.parse_args()
args.base_dir = Path(args.base_dir)
args.device = th.device('cuda' if th.cuda.is_available() else 'cpu')

args.device = th.device('cuda' if args.use_cuda and th.cuda.is_available() else 'cpu')

args.layout_names = args.layout_names.split(',')
args.low_perfs = args.low_perfs.split(',')
args.med_perfs = args.med_perfs.split(',')
args.high_perfs = args.high_perfs.split(',')


if isinstance(args.layout_names, str):
args.layout_names = args.layout_names.split(',')
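
The new arguments follow two small patterns: boolean flags go through a str2bool converter, and list-valued options arrive as comma-separated strings that are split after parse_args(), with the device set to CUDA only when both --use-cuda and the hardware allow it. A self-contained sketch of that pattern is below; the str2bool helper shown is a generic implementation and may differ from the repository's.

# Sketch of the parsing pattern used in oai_agents/common/arguments.py.
# str2bool here is a generic implementation, not necessarily the repo's.
import argparse
import torch as th

def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')

parser = argparse.ArgumentParser()
parser.add_argument('--layout-names', default='default', help='Comma-separated Overcooked maps')
parser.add_argument('--use-cuda', type=str2bool, default=False, help='Use CUDA if available')
parser.add_argument('--use-multipleprocesses', type=str2bool, default=False,
                    help='SubprocVecEnv vs DummyVecEnv')

args = parser.parse_args(['--layout-names', 'cramped_room,asymmetric_advantages',
                          '--use-cuda', 'true'])
# Split comma-separated strings into lists, as the PR does after parse_args().
args.layout_names = args.layout_names.split(',')
# Only fall back to CUDA when both the flag and the hardware allow it.
args.device = th.device('cuda' if args.use_cuda and th.cuda.is_available() else 'cpu')
print(args.layout_names, args.device)
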
1 change: 1 addition & 0 deletions oai_agents/common/curriculum.py
@@ -103,6 +103,7 @@ def select_teammates_for_layout(self, population_teamtypes, layout):
population = [population_teamtypes[t] for t in population_teamtypes.keys()]
teammates_per_type = population[np.random.randint(len(population))]
teammates = teammates_per_type[np.random.randint(len(teammates_per_type))]

elif self.prioritized_sampling:
teammates = self.select_teammates_prioritized_sampling(population_teamtypes, layout)
else:
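
The non-prioritized branch shown above picks a team type and then a team uniformly at random. The prioritized path is only referenced by name here; one common way to implement such sampling is to weight candidate teams inversely by their recent score so the learner faces its weakest partners more often. The sketch below illustrates that generic idea only and is not the repository's select_teammates_prioritized_sampling.

# Generic prioritized-sampling sketch: teams with lower recent scores on a layout
# are sampled more often. This is an illustration, not the repo's implementation.
import numpy as np

def select_teammates_prioritized(population_teamtypes, layout_scores, temperature=1.0):
    """population_teamtypes: {team_type: [[agent, ...], ...]}
    layout_scores: parallel dict {team_type: [score_per_team, ...]} for the current layout."""
    teams, scores = [], []
    for team_type, team_list in population_teamtypes.items():
        for team, score in zip(team_list, layout_scores[team_type]):
            teams.append(team)
            scores.append(score)
    scores = np.asarray(scores, dtype=np.float64)
    # Lower score -> higher priority; softmax-like weights over negated, scaled scores.
    priorities = np.exp(-(scores - scores.min()) / max(temperature, 1e-8))
    probs = priorities / priorities.sum()
    return teams[np.random.choice(len(teams), p=probs)]
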