I'm trying to implement Proximal Policy Optimization with multiple actor heads: the agent can perform several actions, so I need one head to choose which action(s) to perform, and then several other heads to determine the parameters of the chosen action(s).
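To make the structure easier to see, here is a stripped-down, runnable sketch of the idea (illustrative only; it borrows a couple of head names from my setup but is not my actual code, which follows below):

import torch
from torch import nn
from torch.distributions.categorical import Categorical

class TinyMultiHead(nn.Module):
    def __init__(self, obs_size, param_sizes):
        super().__init__()
        self.shared = nn.Sequential(nn.Linear(obs_size, 64), nn.ReLU())
        # one head chooses which action to take
        self.choice_head = nn.Linear(64, len(param_sizes))
        # one parameter head per action (ModuleDict used here just for the sketch)
        self.param_heads = nn.ModuleDict(
            {name: nn.Linear(64, size) for name, size in param_sizes.items()})
        self.names = list(param_sizes)

    def forward(self, obs):
        z = self.shared(obs)
        choice_dist = Categorical(logits=self.choice_head(z))
        choice = choice_dist.sample()                              # which action to take
        params = self.param_heads[self.names[choice.item()]](z)    # that action's parameters
        return choice, choice_dist.log_prob(choice), params

net = TinyMultiHead(8, {'gp_new': 4, 'publish': 2})
choice, log_prob, param_logits = net(torch.rand(8))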
import numpy as np
from collections import OrderedDict
import torch
from torch import nn
from torch import optim
from torch.distributions.categorical import Categorical
from gymnasium.spaces import Discrete
from icecream import ic
# Since I'm using dictionaries to store outputs based on which
# action head they relate to, I need functions to convert
# dicts of tensors to tensors
def logprobdict_2_tensor(logprob_dict):
    for k, v in logprob_dict.items():
        if len(v.shape) == 2 and v.shape[0] == 1:
            logprob_dict[k] = v[0]
    return torch.concatenate(list(logprob_dict.values()))

def tensorise_dict(act_dict, device):
    return OrderedDict({k: t if isinstance(t, torch.Tensor) else torch.tensor(t, device=device) for k, t in act_dict.items()})
# Policy and value model
class ActorCriticNetwork(nn.Module):
    def __init__(self, obs_space_size, action_space_sizes, device: str = "cpu"):
        super().__init__()  # mandatory
        self.policy_layers: dict[str, nn.Sequential] = {}
        torch.autograd.set_detect_anomaly(True)
        # self.last_policy_networks = [] # ??? XXX
        self.action_choices = [
            ['gp_new'],
            ['gp_continue'],
            ['use_mem', 'gp_new'],  # note, actions will be performed in list order, if more than one is specified
            ['use_mem', 'gp_continue'],
            ['store_mem'],
            ['publish'],
            ['read']
        ]
        self.action_choice_space = Discrete(len(self.action_choices))
        self.device = device
        # shared layers for all heads, including critic
        self.shared_layers = nn.Sequential(
            nn.Linear(obs_space_size, 64).double(),
            nn.ReLU().double(),
            nn.Linear(64, 64).double(),
            nn.ReLU().double())
        # action choice head
        self.policy_layers['choice'] = nn.Sequential(
            nn.Linear(64, 64).double(),
            nn.ReLU().double(),
            nn.Linear(64, self.action_choice_space.n).double())
        # dict of heads for specific action parameters
        for name, size in action_space_sizes.items():
            self.policy_layers[name] = nn.Sequential(
                nn.Linear(64, 64).double(),
                nn.ReLU().double(),
                nn.Linear(64, size).double())
        # critic
        self.value_layers = nn.Sequential(
            nn.Linear(64, 64).double(),
            nn.ReLU().double(),
            nn.Linear(64, 1).double())
    def value(self, obs):
        z = self.shared_layers(obs)
        value = self.value_layers(z)
        return value

    def choose(self, obs):
        return self.policy(obs, 'choice')

    def policy(self, obs, choice):
        z = self.shared_layers(obs)
        action_logits = self.policy_layers[choice](z)
        return action_logits

    def forward(self, obs):  # mandatory
        torch.autograd.set_detect_anomaly(True)
        # obs = obs.clone().detach().requires_grad_(True)
        z = self.shared_layers(obs)
        # print(z)
        action_logits = {}
        action_logits['choice'] = self.policy_layers['choice'](z)
        choice_distribution = Categorical(logits=action_logits['choice'])
        choice = choice_distribution.sample()
        choice_log_prob = choice_distribution.log_prob(choice).item()
        for action in self.action_choices[choice.item()]:
            action_logits[action] = self.policy_layers[action](z)
        value = self.value_layers(z)
        return choice, choice_log_prob, action_logits, value
class PPOTrainer():
    def __init__(self,
                 actor_critic: ActorCriticNetwork,
                 ppo_clip_val=0.2,
                 target_kl_div=0.01,
                 max_policy_train_iters=80,
                 value_train_iters=80,
                 policy_lr=3e-4,
                 value_lr=1e-2):
        self.actor_critic = actor_critic
        self.ppo_clip_val = ppo_clip_val
        self.target_kl_div = target_kl_div
        self.max_policy_train_iters = max_policy_train_iters
        self.value_train_iters = value_train_iters
        self.policy_params = {}
        self.policy_optims = {}
        value_params = list(self.actor_critic.shared_layers.parameters()) + \
            list(self.actor_critic.value_layers.parameters())
        self.value_optim = optim.Adam(value_params, lr=value_lr)
        for k, layers in self.actor_critic.policy_layers.items():
            self.policy_params[k] = list(
                self.actor_critic.shared_layers.parameters()
            ) + list(layers.parameters())
            self.policy_optims[k] = optim.Adam(self.policy_params[k], lr=policy_lr)

    @property
    def device(self):
        return self.actor_critic.device
    def train_policy(self, obs, acts, old_log_probs, gaes, which, actions, mask=None):
        # If a network is used at every step (like 'choice'), no mask needed
        obs, gaes = (obs[mask], gaes[mask]) if mask is not None else (obs, gaes)
        for _ in range(self.max_policy_train_iters):
            self.policy_optims[which].zero_grad()
            # Here, we calculate the new log probs, which is different
            # (simpler) for 'choice' than everything else
            if which == 'choice':
                new_logits = self.actor_critic.choose(obs)
                new_distro = Categorical(logits=new_logits)
                new_log_probs = new_distro.log_prob(acts)
            else:
                new_logprobs_list = []
                for ob1, ac1 in zip(obs, acts):
                    new_logits = self.actor_critic.policy(ob1, which)
                    new_distros = actions[which].logits_2_distros(new_logits)
                    act_part_log_probs = tensorise_dict(OrderedDict({
                        k: d.log_prob(ac1[k]) for k, d in new_distros.items()
                    }), device=self.device)
                    new_logprobs_list.append(logprobdict_2_tensor(act_part_log_probs))
                new_log_probs = torch.stack(
                    new_logprobs_list,
                    dim=0
                ).to(self.device, dtype=torch.float64)
            policy_ratio = torch.exp(new_log_probs - old_log_probs)
            clipped_ratio = policy_ratio.clamp(
                1 - self.ppo_clip_val, 1 + self.ppo_clip_val)
            if len(clipped_ratio.shape) > 1:
                gaes = gaes.expand(1, gaes.shape[0]).permute((1, 0))
            clipped_loss = clipped_ratio * gaes
            full_loss = policy_ratio * gaes
            policy_loss = -torch.min(full_loss, clipped_loss).mean()
            policy_loss.backward()
            self.policy_optims[which].step()
            # If the new probability distro diverges too far from the old,
            # stop training and get new training data
            kl_div = (old_log_probs - new_log_probs).mean()
            if kl_div >= self.target_kl_div:
                break
    def train_value(self, obs, returns):
        for _ in range(self.value_train_iters):
            self.value_optim.zero_grad()
            values = self.actor_critic.value(obs)
            value_loss = (returns - values) ** 2
            value_loss = value_loss.mean()
            value_loss.backward()
            self.value_optim.step()
def discount_rewards(rewards, gamma=0.99):
    """
    Return discounted rewards based on the given rewards and gamma param.
    """
    new_rewards = [float(rewards[len(rewards)-1])]
    for i in reversed(range(len(rewards)-1)):
        new_rewards.append(float(rewards[i]) + gamma * new_rewards[-1])
    return np.array(new_rewards[::-1])

def calculate_gaes(rewards, values, gamma=0.99, decay=0.97):
    """
    Return the General Advantage Estimates from the given rewards and values.
    Paper: https://arxiv.org/pdf/1506.02438.pdf
    """
    next_values = np.concatenate([values[1:], [0]])
    deltas = [rew + gamma * next_val - val for rew, val, next_val in zip(rewards, values, next_values)]
    gaes = [deltas[-1]]
    for i in reversed(range(len(deltas)-1)):
        gaes.append(deltas[i] + decay * gamma * gaes[-1])
    return np.array(gaes[::-1])
Here is the error traceback:
/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/site-packages/torch/autograd/__init__.py:266: UserWarning: Error detected in AddmmBackward0. Traceback of forward call that caused the error:
File "/Users/xan/CodeToJoy/philoso-py/philoso_py.py", line 192, in
model.run(100, 25)
File "/Users/xan/CodeToJoy/philoso-py/philoso_py.py", line 54, in run
asyncio.run(self.day(steps_per_day))
File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/asyncio/runners.py", line 194, in run
return runner.run(main)
File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/asyncio/runners.py", line 118, in run
return self._loop.run_until_complete(task)
File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/asyncio/base_events.py", line 672, in run_until_complete
self.run_forever()
File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/asyncio/base_events.py", line 639, in run_forever
self._run_once()
File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/asyncio/base_events.py", line 1985, in _run_once
handle._run()
File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/asyncio/events.py", line 88, in _run
self._context.run(self._callback, *self._args)
File "/Users/xan/CodeToJoy/philoso-py/agent.py", line 91, in day_step
choice, choice_log_prob, action_logits, val = self.nn(obs)
File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/Users/xan/CodeToJoy/philoso-py/ppo.py", line 157, in forward
z = self.shared_layers(obs)
File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/site-packages/torch/nn/modules/container.py", line 217, in forward
input = module(input)
File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/site-packages/torch/nn/modules/linear.py", line 116, in forward
return F.linear(input, self.weight, self.bias)
(Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/autograd/python_anomaly_mode.cpp:118.)
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
Traceback (most recent call last):
File "/Users/xan/CodeToJoy/philoso-py/philoso_py.py", line 192, in
model.run(100, 25)
File "/Users/xan/CodeToJoy/philoso-py/philoso_py.py", line 55, in run
self.night()
File "/Users/xan/CodeToJoy/philoso-py/philoso_py.py", line 78, in night
agent.night()
File "/Users/xan/CodeToJoy/philoso-py/agent.py", line 276, in night
self.trainer.train_policy(
File "/Users/xan/CodeToJoy/philoso-py/ppo.py", line 256, in train_policy
policy_loss.backward()
File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/site-packages/torch/_tensor.py", line 522, in backward
torch.autograd.backward(
File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/site-packages/torch/autograd/__init__.py", line 266, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.DoubleTensor [64, 64]], which is output 0 of AsStridedBackward0, is at version 81; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
I'm new to PyTorch and haven't run into this problem before.
I enabled anomaly detection, and according to it the problem seems to involve one of the linear layers in my shared layers. That only confuses me further, because I can't think of anywhere in my code that modifies it directly. Based on a suggestion elsewhere, I tried cloning obs at the start of the forward call, but that didn't help. Can anyone see what I've missed? I'm using PyTorch 2.2.2.
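For reference, the cloning suggestion amounted to the (now commented-out) first line of forward in the code above; a minimal stand-alone version of what I tried:

import torch

obs = torch.rand(10, dtype=torch.double)  # stand-in for a real observation
# the suggested workaround, applied at the top of forward(); it made no difference
obs = obs.clone().detach().requires_grad_(True)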