KeyError: «преимущества» в PPO MARL с использованием Ray RLLib

KeyError: «преимущества» в PPO MARL с использованием Ray RLLib ⇐ Python

Ответить

1 сообщение • Страница 1 из 1

Anonymous

KeyError: «преимущества» в PPO MARL с использованием Ray RLLib

Цитата

Сообщение Anonymous » 04 ноя 2025, 18:35

Я использую ray 2.50.1 для реализации модели MARL с использованием PPO.
Однако я столкнулся со следующей проблемой:

Код: Выделить всё

'advantages'
KeyError: 'advantages'

During handling of the above exception, another exception occurred:

File "/home/tangjintong/multi_center_1020/main.py", line 267, in 
result = algo.train()
^^^^^^^^^^^^
KeyError: 'advantages'

Других сообщений об ошибках в IDE не отображается. Вот скриншот:

Я размещаю здесь свой код, чтобы вы могли легко воспроизвести ошибку:

Код: Выделить всё

import numpy as np
import matplotlib.pyplot as plt
from torch import nn
import os
from gymnasium import spaces
import ray
from ray import tune
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.core.rl_module.torch import TorchRLModule
from ray.rllib.utils.typing import TensorType
from ray.rllib.core.rl_module.rl_module import RLModuleSpec
from ray.rllib.core import Columns
from ray.rllib.utils.annotations import override
from ray.rllib.core.rl_module.apis.value_function_api import ValueFunctionAPI

class MaskedRLModule(TorchRLModule):
def setup(self):
super().setup()
input_dim = self.observation_space['obs'].n
hidden_dim = self.model_config["hidden_dim"]
output_dim = self.action_space.n
self.policy_net = nn.Sequential(
nn.Linear(input_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, output_dim)
)
self.value_net = nn.Sequential(
nn.Linear(input_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, 1)
)

def _forward(self, batch: TensorType, **kwargs) -> TensorType:
# batch["obs"] shape: [B, obs_size]
logits = self.policy_net(batch["obs"]["obs"].float())
# Handle action masking
if "action_mask" in batch["obs"]:
mask = batch["obs"]["action_mask"]
# Set logits of invalid actions to -inf
logits = logits.masked_fill(mask == 0, -1e9)
return {Columns.ACTION_DIST_INPUTS: logits}

@override(ValueFunctionAPI)
def compute_values(self, batch, **kwargs):
return self.value_net(batch["obs"]["obs"].float())

class Grid9x9MultiAgentEnv(MultiAgentEnv):
"""9x9 discrete grid multi-agent environment (2 homogeneous agents)."""

def __init__(self, env_config=None):
super().__init__()
env_config = env_config or {}
self._num_agents = env_config.get("num_agents")         # Use private variable for agent count to avoid errors
self.agents = self.possible_agents = [f"agent_{i}"  for i in range(self._num_agents)]
self.render_step_num = env_config.get("render_step_num")
self.truncation_step_num = env_config.get("truncation_step_num")
self.size = env_config.get("size")

self.grid = np.zeros((self.size, self.size), dtype=np.int8)  # 0=empty, 1=occupied
self.agent_positions = {agent: None for agent in self.agents}
self._update_masks()

self.step_in_episode = 0
self.current_total_step = 0
# Both action and observation spaces are discrete grids of size 9*9
self.action_space = spaces.Dict({
f"agent_{i}": spaces.Discrete(self.size * self.size)
for i in range(self._num_agents)
})

self.observation_space = spaces.Dict({
f"agent_{i}": spaces.Dict({
"obs": spaces.Discrete(self.size * self.size),
"action_mask": spaces.Discrete(self.size * self.size),
})
for i in range(self._num_agents)
})

coords = np.array([(i, j) for i in range(self.size) for j in range(self.size)])  # 81×2, each row is (row, col)
# Calculate Euclidean distance matrix
diff = coords[:, None, :] - coords[None, :, :]  # 81×81×2
self.distance_matrix = np.sqrt((diff ** 2).sum(-1))  # 81×81

def reset(self, *, seed=None, options=None):
super().reset(seed=seed)
print(f"Environment reset at step {self.current_total_step}.")
self.grid = np.zeros((self.size, self.size), dtype=np.int8)  # 0=empty, 1=occupied
self.agent_positions = {agent: None for agent in self.agents}
self._update_masks()
self.step_in_episode = 0

obs = {agent: self._get_obs(agent) for agent in self.agents}
return obs, {}

def _update_masks(self):
"""Update action masks: cannot select occupied cells."""
mask = 1 - self.grid.flatten()  # 1 indicates available positions, 0 indicates unavailable positions
self.current_masks = {agent: mask.copy() for agent in self.agents}

# If both agents have chosen positions, mutually prohibit selecting the same position
for agent, pos in self.agent_positions.items():
if pos is not None:
for other in self.agents:
if other != agent:
self.current_masks[other][pos] = 0

def _get_obs(self, agent):
return {
"obs": self.grid.flatten().astype(np.float32),
"action_mask": self.current_masks[agent].astype(np.float32),
}

def step(self, actions):
"""actions is a dict: {agent_0: act0, agent_1: act1}"""
rewards = {agent: 0.0 for agent in self.agents}
terminations = {agent: False for agent in self.agents}
truncations = {agent: False for agent in self.agents}
infos = {agent: {} for agent in self.agents}

# Check for action conflicts and update grid and agent_positions
chosen_positions = set()
for agent, act in actions.items():
if self.current_masks[agent][act] == 0:
rewards[agent] = -1.0
else:
if act in chosen_positions:
# Conflicting position, keep agent_position[agent] unchanged
rewards[agent] = -1.0
else:
if self.agent_positions[agent] is not None:
row, col = divmod(self.agent_positions[agent], self.size)
self.grid[row, col] = 0  # Release previous position
row, col = divmod(act, self.size)
self.grid[row, col] = 1  # Occupy new position
self.agent_positions[agent] = act
chosen_positions.add(act)

rewards = self.reward()

self._update_masks()
obs = {agent: self._get_obs(agent) for agent in self.agents}

self.step_in_episode += 1
self.current_total_step += 1

# When any agent terminates, e.g., the entire episode terminates:
if self.step_in_episode >= self.truncation_step_num:
for agent in self.agents:
terminations[agent] = True
truncations[agent] = True
self.visualize()

# "__all__"  must exist and be accurate
terminations["__all__"] = all(terminations[a] for a in self.agents)
truncations["__all__"] = all(truncations[a] for a in self.agents)

return obs, rewards, terminations, truncations, infos

def reward(self):
"""
Reward function: The reward for a merchant's chosen cell is the total number of customers served * product price.
Customer cost is transportation cost (related to distance) + product price, so customers only choose the merchant that minimizes their cost.
Since merchants have the same product price, customers choose the nearest merchant.
Therefore, each merchant wants their chosen cell to cover more customers.
Simplified here: reward equals the number of customers covered by that merchant.
"""
positions = list(self.agent_positions.values())
# Get covered customers (i.e., customers closer to this merchant)
customer_agent = np.argmin(self.distance_matrix[positions], axis=0)
# Count the number of customers corresponding to each agent as reward
values, counts = np.unique(customer_agent, return_counts=True)
return {f"agent_{v}": counts[i] for i, v in enumerate(values)}

def visualize(self):
n = self.size
fig, ax = plt.subplots(figsize=(6, 6))

# Draw grid lines
for x in range(n + 1):
ax.axhline(x, color='k', lw=1)
ax.axvline(x, color='k', lw=1)

# Draw occupied positions
for pos in self.agent_positions.values():
row, col = divmod(pos, n)
ax.add_patch(plt.Rectangle((col, n - 1 - row), 1, 1, color='lightgray'))

# Draw agents
colors = ["red", "blue"]
for i, (agent, pos) in enumerate(self.agent_positions.items()):
row, col = divmod(pos, n)
ax.scatter(col + 0.5, n - 1 - row + 0.5, c=colors[i], s=200, label=agent)

ax.set_xlim(0, n)
ax.set_ylim(0, n)
ax.set_xticks([])
ax.set_yticks([])
ax.set_aspect('equal')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper right')
if not os.path.exists("figures"):
os.makedirs("figures")
plt.savefig(f"figures/grid_step_{self.current_total_step}.png")
plt.close()

if __name__ == "__main__":
ray.init(ignore_reinit_error=True)

env_name = "Grid9x9MultiAgentEnv"
tune.register_env(env_name, lambda cfg: Grid9x9MultiAgentEnv(cfg))

def policy_mapping_fn(agent_id, episode, **kwargs):
# Homogeneous agents share one policy
return "shared_policy"

env_config = {
# Environment parameters can be passed here
"render_step_num": 500,
"truncation_step_num": 500,
"num_agents": 2,
"size": 9,
}

model_config = {
"hidden_dim": 128,
}

config = (
PPOConfig()
.environment(
env=env_name,
env_config=env_config
)
.multi_agent(
policies={"shared_policy"},
policy_mapping_fn=policy_mapping_fn,
)
.rl_module(
rl_module_spec=RLModuleSpec(
module_class=MaskedRLModule,
model_config=model_config,
)
)
.framework("torch")
.env_runners(
num_env_runners=1,                  # Number of parallel environments
rollout_fragment_length=50,         # Sampling fragment length
batch_mode="truncate_episodes",     # Sampling mode:  collect a complete episode as a batch
add_default_connectors_to_env_to_module_pipeline=True,
add_default_connectors_to_module_to_env_pipeline=True
)
.resources(num_gpus=1)
.training(
train_batch_size=1000,  # Minimum number of experience steps to collect before each update
minibatch_size=128,     # Number of steps per minibatch during update
lr=1e-4,  # Learning rate
use_gae=True,
use_critic=True,
)
)
algo = config.build_algo()
print("Start training...")
for i in range(5):
result = algo.train()
print(f"Iteration {i}: reward={result['episode_reward_mean']}")

Я читал несколько сообщений об этой проблеме, но ни одно из них не помогло. Буду благодарен за любую помощь!

Подробнее здесь: https://stackoverflow.com/questions/797 ... -ray-rllib

1762270516

Anonymous

Я использую ray 2.50.1 для реализации модели MARL с использованием PPO.
Однако я столкнулся со следующей проблемой:
[code]'advantages'
KeyError: 'advantages'

During handling of the above exception, another exception occurred:

File "/home/tangjintong/multi_center_1020/main.py", line 267, in 
result = algo.train()
^^^^^^^^^^^^
KeyError: 'advantages'
[/code]
Других сообщений об ошибках в IDE не отображается.  Вот скриншот:
[img]https://i.sstatic.net/DasXK2v4.png[/img]

Я размещаю здесь свой код, чтобы вы могли легко воспроизвести ошибку:
[code]import numpy as np
import matplotlib.pyplot as plt
from torch import nn
import os
from gymnasium import spaces
import ray
from ray import tune
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.core.rl_module.torch import TorchRLModule
from ray.rllib.utils.typing import TensorType
from ray.rllib.core.rl_module.rl_module import RLModuleSpec
from ray.rllib.core import Columns
from ray.rllib.utils.annotations import override
from ray.rllib.core.rl_module.apis.value_function_api import ValueFunctionAPI

class MaskedRLModule(TorchRLModule):
def setup(self):
super().setup()
input_dim = self.observation_space['obs'].n
hidden_dim = self.model_config["hidden_dim"]
output_dim = self.action_space.n
self.policy_net = nn.Sequential(
nn.Linear(input_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, output_dim)
)
self.value_net = nn.Sequential(
nn.Linear(input_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, 1)
)

def _forward(self, batch: TensorType, **kwargs) -> TensorType:
# batch["obs"] shape: [B, obs_size]
logits = self.policy_net(batch["obs"]["obs"].float())
# Handle action masking
if "action_mask" in batch["obs"]:
mask = batch["obs"]["action_mask"]
# Set logits of invalid actions to -inf
logits = logits.masked_fill(mask == 0, -1e9)
return {Columns.ACTION_DIST_INPUTS: logits}

@override(ValueFunctionAPI)
def compute_values(self, batch, **kwargs):
return self.value_net(batch["obs"]["obs"].float())

class Grid9x9MultiAgentEnv(MultiAgentEnv):
"""9x9 discrete grid multi-agent environment (2 homogeneous agents)."""

def __init__(self, env_config=None):
super().__init__()
env_config = env_config or {}
self._num_agents = env_config.get("num_agents")         # Use private variable for agent count to avoid errors
self.agents = self.possible_agents = [f"agent_{i}"  for i in range(self._num_agents)]
self.render_step_num = env_config.get("render_step_num")
self.truncation_step_num = env_config.get("truncation_step_num")
self.size = env_config.get("size")

self.grid = np.zeros((self.size, self.size), dtype=np.int8)  # 0=empty, 1=occupied
self.agent_positions = {agent: None for agent in self.agents}
self._update_masks()

self.step_in_episode = 0
self.current_total_step = 0
# Both action and observation spaces are discrete grids of size 9*9
self.action_space = spaces.Dict({
f"agent_{i}": spaces.Discrete(self.size * self.size)
for i in range(self._num_agents)
})

self.observation_space = spaces.Dict({
f"agent_{i}": spaces.Dict({
"obs": spaces.Discrete(self.size * self.size),
"action_mask": spaces.Discrete(self.size * self.size),
})
for i in range(self._num_agents)
})

coords = np.array([(i, j) for i in range(self.size) for j in range(self.size)])  # 81×2, each row is (row, col)
# Calculate Euclidean distance matrix
diff = coords[:, None, :] - coords[None, :, :]  # 81×81×2
self.distance_matrix = np.sqrt((diff ** 2).sum(-1))  # 81×81

def reset(self, *, seed=None, options=None):
super().reset(seed=seed)
print(f"Environment reset at step {self.current_total_step}.")
self.grid = np.zeros((self.size, self.size), dtype=np.int8)  # 0=empty, 1=occupied
self.agent_positions = {agent: None for agent in self.agents}
self._update_masks()
self.step_in_episode = 0

obs = {agent: self._get_obs(agent) for agent in self.agents}
return obs, {}

def _update_masks(self):
"""Update action masks: cannot select occupied cells."""
mask = 1 - self.grid.flatten()  # 1 indicates available positions, 0 indicates unavailable positions
self.current_masks = {agent: mask.copy() for agent in self.agents}

# If both agents have chosen positions, mutually prohibit selecting the same position
for agent, pos in self.agent_positions.items():
if pos is not None:
for other in self.agents:
if other != agent:
self.current_masks[other][pos] = 0

def _get_obs(self, agent):
return {
"obs": self.grid.flatten().astype(np.float32),
"action_mask": self.current_masks[agent].astype(np.float32),
}

def step(self, actions):
"""actions is a dict: {agent_0: act0, agent_1: act1}"""
rewards = {agent: 0.0 for agent in self.agents}
terminations = {agent: False for agent in self.agents}
truncations = {agent: False for agent in self.agents}
infos = {agent: {} for agent in self.agents}

# Check for action conflicts and update grid and agent_positions
chosen_positions = set()
for agent, act in actions.items():
if self.current_masks[agent][act] == 0:
rewards[agent] = -1.0
else:
if act in chosen_positions:
# Conflicting position, keep agent_position[agent] unchanged
rewards[agent] = -1.0
else:
if self.agent_positions[agent] is not None:
row, col = divmod(self.agent_positions[agent], self.size)
self.grid[row, col] = 0  # Release previous position
row, col = divmod(act, self.size)
self.grid[row, col] = 1  # Occupy new position
self.agent_positions[agent] = act
chosen_positions.add(act)

rewards = self.reward()

self._update_masks()
obs = {agent: self._get_obs(agent) for agent in self.agents}

self.step_in_episode += 1
self.current_total_step += 1

# When any agent terminates, e.g., the entire episode terminates:
if self.step_in_episode >= self.truncation_step_num:
for agent in self.agents:
terminations[agent] = True
truncations[agent] = True
self.visualize()

# "__all__"  must exist and be accurate
terminations["__all__"] = all(terminations[a] for a in self.agents)
truncations["__all__"] = all(truncations[a] for a in self.agents)

return obs, rewards, terminations, truncations, infos

def reward(self):
"""
Reward function: The reward for a merchant's chosen cell is the total number of customers served * product price.
Customer cost is transportation cost (related to distance) + product price, so customers only choose the merchant that minimizes their cost.
Since merchants have the same product price, customers choose the nearest merchant.
Therefore, each merchant wants their chosen cell to cover more customers.
Simplified here: reward equals the number of customers covered by that merchant.
"""
positions = list(self.agent_positions.values())
# Get covered customers (i.e., customers closer to this merchant)
customer_agent = np.argmin(self.distance_matrix[positions], axis=0)
# Count the number of customers corresponding to each agent as reward
values, counts = np.unique(customer_agent, return_counts=True)
return {f"agent_{v}": counts[i] for i, v in enumerate(values)}

def visualize(self):
n = self.size
fig, ax = plt.subplots(figsize=(6, 6))

# Draw grid lines
for x in range(n + 1):
ax.axhline(x, color='k', lw=1)
ax.axvline(x, color='k', lw=1)

# Draw occupied positions
for pos in self.agent_positions.values():
row, col = divmod(pos, n)
ax.add_patch(plt.Rectangle((col, n - 1 - row), 1, 1, color='lightgray'))

# Draw agents
colors = ["red", "blue"]
for i, (agent, pos) in enumerate(self.agent_positions.items()):
row, col = divmod(pos, n)
ax.scatter(col + 0.5, n - 1 - row + 0.5, c=colors[i], s=200, label=agent)

ax.set_xlim(0, n)
ax.set_ylim(0, n)
ax.set_xticks([])
ax.set_yticks([])
ax.set_aspect('equal')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper right')
if not os.path.exists("figures"):
os.makedirs("figures")
plt.savefig(f"figures/grid_step_{self.current_total_step}.png")
plt.close()

if __name__ == "__main__":
ray.init(ignore_reinit_error=True)

env_name = "Grid9x9MultiAgentEnv"
tune.register_env(env_name, lambda cfg: Grid9x9MultiAgentEnv(cfg))

def policy_mapping_fn(agent_id, episode, **kwargs):
# Homogeneous agents share one policy
return "shared_policy"

env_config = {
# Environment parameters can be passed here
"render_step_num": 500,
"truncation_step_num": 500,
"num_agents": 2,
"size": 9,
}

model_config = {
"hidden_dim": 128,
}

config = (
PPOConfig()
.environment(
env=env_name,
env_config=env_config
)
.multi_agent(
policies={"shared_policy"},
policy_mapping_fn=policy_mapping_fn,
)
.rl_module(
rl_module_spec=RLModuleSpec(
module_class=MaskedRLModule,
model_config=model_config,
)
)
.framework("torch")
.env_runners(
num_env_runners=1,                  # Number of parallel environments
rollout_fragment_length=50,         # Sampling fragment length
batch_mode="truncate_episodes",     # Sampling mode:  collect a complete episode as a batch
add_default_connectors_to_env_to_module_pipeline=True,
add_default_connectors_to_module_to_env_pipeline=True
)
.resources(num_gpus=1)
.training(
train_batch_size=1000,  # Minimum number of experience steps to collect before each update
minibatch_size=128,     # Number of steps per minibatch during update
lr=1e-4,  # Learning rate
use_gae=True,
use_critic=True,
)
)
algo = config.build_algo()
print("Start training...")
for i in range(5):
result = algo.train()
print(f"Iteration {i}: reward={result['episode_reward_mean']}")
[/code]
Я читал несколько сообщений об этой проблеме, но ни одно из них не помогло. Буду благодарен за любую помощь!
 

Подробнее здесь: [url]https://stackoverflow.com/questions/79796380/keyerror-advantages-in-ppo-marl-using-ray-rllib[/url]