PyTorch: can't find the in-place operation that's preventing my network from computing gradients (Python)


Post by Anonymous »

I'm trying to implement proximal policy optimization with multiple actor heads: the agent can perform several actions, so I need one head to choose which action(s) to perform, and then several other heads to determine the parameters of the chosen action.
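
Schematically, the architecture looks something like the sketch below (illustrative only: names and layer sizes are placeholders, and the heads are kept in an nn.ModuleDict here purely for brevity, whereas my actual code further down keeps them in a plain dict).

Code:

import torch
from torch import nn

# Simplified sketch of the multi-head actor-critic described above
# (illustrative only: layer sizes and head names are placeholders).
class MultiHeadActorCritic(nn.Module):
    def __init__(self, obs_size: int, param_head_sizes: dict[str, int], n_choices: int):
        super().__init__()
        self.shared = nn.Sequential(nn.Linear(obs_size, 64), nn.ReLU())  # shared trunk
        self.choice_head = nn.Linear(64, n_choices)                      # which action(s) to take
        self.param_heads = nn.ModuleDict(                                # parameters of each action
            {name: nn.Linear(64, size) for name, size in param_head_sizes.items()}
        )
        self.value_head = nn.Linear(64, 1)                               # critic

    def forward(self, obs):
        z = self.shared(obs)
        choice_logits = self.choice_head(z)
        param_logits = {name: head(z) for name, head in self.param_heads.items()}
        return choice_logits, param_logits, self.value_head(z)

Here is the full code I'm actually running: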

Code:

import numpy as np
from collections import OrderedDict

import torch
from torch import nn
from torch import optim
from torch.distributions.categorical import Categorical
from gymnasium.spaces import Discrete
from icecream import ic

# Since I'm using dictionaries to store outputs based on which
# action head they relate to, I need functions to convert
# dicts of tensors to tensors

def logprobdict_2_tensor(logprob_dict):
    for k, v in logprob_dict.items():
        if len(v.shape) == 2 and v.shape[0] == 1:
            logprob_dict[k] = v[0]
    return torch.concatenate(list(logprob_dict.values()))

def tensorise_dict(act_dict, device):
    return OrderedDict({k: t if isinstance(t, torch.Tensor) else torch.tensor(t, device=device) for k, t in act_dict.items()})

# Policy and value model
class ActorCriticNetwork(nn.Module):
    def __init__(self, obs_space_size, action_space_sizes, device: str = "cpu"):
        super().__init__()  # mandatory
        self.policy_layers: dict[str, nn.Sequential] = {}

        torch.autograd.set_detect_anomaly(True)

        # self.last_policy_networks = [] # ??? XXX
        self.action_choices = [
            ['gp_new'],
            ['gp_continue'],
            ['use_mem', 'gp_new'],  # note, actions will be performed in list order, if more than one is specified
            ['use_mem', 'gp_continue'],
            ['store_mem'],
            ['publish'],
            ['read']
        ]
        self.action_choice_space = Discrete(len(self.action_choices))
        self.device = device

        # shared layers for all heads, including critic
        self.shared_layers = nn.Sequential(
            nn.Linear(obs_space_size, 64).double(),
            nn.ReLU().double(),
            nn.Linear(64, 64).double(),
            nn.ReLU().double())

        # action choice head
        self.policy_layers['choice'] = nn.Sequential(
            nn.Linear(64, 64).double(),
            nn.ReLU().double(),
            nn.Linear(64, self.action_choice_space.n).double())

        # dict of heads for specific action parameters
        for name, size in action_space_sizes.items():
            self.policy_layers[name] = nn.Sequential(
                nn.Linear(64, 64).double(),
                nn.ReLU().double(),
                nn.Linear(64, size).double())

        # critic
        self.value_layers = nn.Sequential(
            nn.Linear(64, 64).double(),
            nn.ReLU().double(),
            nn.Linear(64, 1).double())

    def value(self, obs):
        z = self.shared_layers(obs)
        value = self.value_layers(z)
        return value

    def choose(self, obs):
        return self.policy(obs, 'choice')

    def policy(self, obs, choice):
        z = self.shared_layers(obs)
        action_logits = self.policy_layers[choice](z)
        return action_logits

    def forward(self, obs):  # mandatory

        torch.autograd.set_detect_anomaly(True)

        # obs = obs.clone().detach().requires_grad_(True)
        z = self.shared_layers(obs)
        # print(z)
        action_logits = {}
        action_logits['choice'] = self.policy_layers['choice'](z)
        choice_distribution = Categorical(logits=action_logits['choice'])
        choice = choice_distribution.sample()
        choice_log_prob = choice_distribution.log_prob(choice).item()
        for action in self.action_choices[choice.item()]:
            action_logits[action] = self.policy_layers[action](z)
        value = self.value_layers(z)
        return choice, choice_log_prob, action_logits, value

class PPOTrainer():
    def __init__(self,
                 actor_critic: ActorCriticNetwork,
                 ppo_clip_val=0.2,
                 target_kl_div=0.01,
                 max_policy_train_iters=80,
                 value_train_iters=80,
                 policy_lr=3e-4,
                 value_lr=1e-2):
        self.actor_critic = actor_critic
        self.ppo_clip_val = ppo_clip_val
        self.target_kl_div = target_kl_div
        self.max_policy_train_iters = max_policy_train_iters
        self.value_train_iters = value_train_iters
        self.policy_params = {}
        self.policy_optims = {}

        value_params = list(self.actor_critic.shared_layers.parameters()) + \
            list(self.actor_critic.value_layers.parameters())
        self.value_optim = optim.Adam(value_params, lr=value_lr)

        for k, layers in self.actor_critic.policy_layers.items():
            self.policy_params[k] = list(
                self.actor_critic.shared_layers.parameters()
            ) + list(layers.parameters())
            self.policy_optims[k] = optim.Adam(self.policy_params[k], lr=policy_lr)

    @property
    def device(self):
        self.actor_critic.device

    def train_policy(self, obs, acts, old_log_probs, gaes, which, actions, mask=None):
        for _ in range(self.max_policy_train_iters):
            # If a network is used at every step (like 'choice'), no mask needed
            obs, gaes = (obs[mask], gaes[mask]) if mask is not None else (obs, gaes)

            self.policy_optims[which].zero_grad()

            # Here, we calculate the new log probs, which is different
            # (simpler) for 'choice' than everything else
            if which == 'choice':
                new_logits = self.actor_critic.choose(obs)
                new_distro = Categorical(logits=new_logits)
                new_log_probs = new_distro.log_prob(acts)
            else:
                new_logprobs_list = []
                for ob1, ac1 in zip(obs, acts):
                    new_logits = self.actor_critic.policy(ob1, which)
                    new_distros = actions[which].logits_2_distros(new_logits)
                    act_part_log_probs = tensorise_dict(OrderedDict({
                        k: d.log_prob(ac1[k]) for k, d in new_distros.items()
                    }), device=self.device)
                    new_logprobs_list.append(logprobdict_2_tensor(act_part_log_probs))
                new_log_probs = torch.stack(
                    new_logprobs_list,
                    dim=0
                ).to(self.device, dtype=torch.float64)

            policy_ratio = torch.exp(new_log_probs - old_log_probs)
            clipped_ratio = policy_ratio.clamp(
                1 - self.ppo_clip_val, 1 + self.ppo_clip_val)

            if len(clipped_ratio.shape) > 1:
                gaes = gaes.expand(1, gaes.shape[0]).permute((1, 0))
            clipped_loss = clipped_ratio * gaes
            full_loss = policy_ratio * gaes
            policy_loss = -torch.min(full_loss, clipped_loss).mean()

            policy_loss.backward()
            self.policy_optims[which].step()

            # If the new probability distro diverges too far from the old,
            # stop training and get new training data
            kl_div = (old_log_probs - new_log_probs).mean()
            if kl_div >= self.target_kl_div:
                break

    def train_value(self, obs, returns):
        for _ in range(self.value_train_iters):
            self.value_optim.zero_grad()

            values = self.actor_critic.value(obs)
            value_loss = (returns - values) ** 2
            value_loss = value_loss.mean()

            value_loss.backward()
            self.value_optim.step()

def discount_rewards(rewards, gamma=0.99):
    """
    Return discounted rewards based on the given rewards and gamma param.
    """
    new_rewards = [float(rewards[len(rewards)-1])]
    for i in reversed(range(len(rewards)-1)):
        new_rewards.append(float(rewards[i]) + gamma * new_rewards[-1])
    return np.array(new_rewards[::-1])

def calculate_gaes(rewards, values, gamma=0.99, decay=0.97):
    """
    Return the General Advantage Estimates from the given rewards and values.
    Paper: https://arxiv.org/pdf/1506.02438.pdf
    """
    next_values = np.concatenate([values[1:], [0]])
    deltas = [rew + gamma * next_val - val for rew, val, next_val in zip(rewards, values, next_values)]

    gaes = [deltas[-1]]
    for i in reversed(range(len(deltas)-1)):
        gaes.append(deltas[i] + decay * gamma * gaes[-1])

    return np.array(gaes[::-1])
Here is the error traceback:

Code:

/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/site-packages/torch/autograd/__init__.py:266: UserWarning: Error detected in AddmmBackward0.  Traceback of forward call that caused the error:
  File "/Users/xan/CodeToJoy/philoso-py/philoso_py.py", line 192, in <module>
    model.run(100, 25)
  File "/Users/xan/CodeToJoy/philoso-py/philoso_py.py", line 54, in run
    asyncio.run(self.day(steps_per_day))
  File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/asyncio/runners.py", line 194, in run
    return runner.run(main)
  File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/asyncio/runners.py", line 118, in run
    return self._loop.run_until_complete(task)
  File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/asyncio/base_events.py", line 672, in run_until_complete
    self.run_forever()
  File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/asyncio/base_events.py", line 639, in run_forever
    self._run_once()
  File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/asyncio/base_events.py", line 1985, in _run_once
    handle._run()
  File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/asyncio/events.py", line 88, in _run
    self._context.run(self._callback, *self._args)
  File "/Users/xan/CodeToJoy/philoso-py/agent.py", line 91, in day_step
    choice, choice_log_prob, action_logits, val = self.nn(obs)
  File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/Users/xan/CodeToJoy/philoso-py/ppo.py", line 157, in forward
    z = self.shared_layers(obs)
  File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/site-packages/torch/nn/modules/container.py", line 217, in forward
    input = module(input)
  File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/site-packages/torch/nn/modules/linear.py", line 116, in forward
    return F.linear(input, self.weight, self.bias)
(Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/autograd/python_anomaly_mode.cpp:118.)
  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
Traceback (most recent call last):
  File "/Users/xan/CodeToJoy/philoso-py/philoso_py.py", line 192, in <module>
    model.run(100, 25)
  File "/Users/xan/CodeToJoy/philoso-py/philoso_py.py", line 55, in run
    self.night()
  File "/Users/xan/CodeToJoy/philoso-py/philoso_py.py", line 78, in night
    agent.night()
  File "/Users/xan/CodeToJoy/philoso-py/agent.py", line 276, in night
    self.trainer.train_policy(
  File "/Users/xan/CodeToJoy/philoso-py/ppo.py", line 256, in train_policy
    policy_loss.backward()
  File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/site-packages/torch/_tensor.py", line 522, in backward
    torch.autograd.backward(
  File "/Users/xan/.pyenv/versions/3.12.2/lib/python3.12/site-packages/torch/autograd/__init__.py", line 266, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.DoubleTensor [64, 64]], which is output 0 of AsStridedBackward0, is at version 81; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
I'm new to PyTorch and haven't run into this problem before.
I enabled anomaly detection mode, which suggests the problem is connected to one of the linear layers in my shared layers. That makes it even more confusing, because I can't think of anywhere in my code that directly modifies it. I tried cloning obs at the start of the forward call, based on a suggestion I found elsewhere, but it didn't help. Can anyone see what I've missed? I'm using PyTorch 2.2.2.
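
For reference, here is a minimal, self-contained sketch (nothing to do with my project's code, purely an illustration) that produces the same class of error: backpropagating through a graph that was recorded before an optimizer.step(), which updates the layer weights in place.

Code:

import torch
from torch import nn, optim

# Tiny network and optimizer, doubles to match the DoubleTensor in the error message
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1)).double()
opt = optim.Adam(net.parameters(), lr=1e-2)

x = torch.randn(3, 4, dtype=torch.float64)
stale_out = net(x)          # this graph saves the current versions of the weights

net(x).sum().backward()     # fresh graph: populate gradients
opt.step()                  # in-place update of the weights (version counters bump)

stale_out.sum().backward()  # RuntimeError: one of the variables needed for gradient
                            # computation has been modified by an inplace operation

As I understand it, the version numbers in the message (version 81 vs. expected 1) count how many times the saved tensor has been modified in place since the graph recorded it.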

More details here: https://stackoverflow.com/questions/786 ... -from-calc