Я не могу понять, почему я получаю нулевые градиенты в моем RNN в pytorch.

Я не могу понять, почему я получаю нулевые градиенты в моем RNN в pytorch. ⇐ Python

Ответить

1 сообщение • Страница 1 из 1

Anonymous

Я не могу понять, почему я получаю нулевые градиенты в моем RNN в pytorch.

Цитата

Сообщение Anonymous » 31 окт 2025, 20:48

Моя модель:

Код: Выделить всё

class RandomRNN(nn.Module):
def __init__(self, width, distribution, recurrent_activation = nn.Tanh, out_activation = nn.Sigmoid):
super().__init__()

self.width = width

self.hidden_weights = nn.Parameter(distribution(width, width))
self.input_weights = nn.Parameter(distribution(1, width))
self.output_weights = nn.Parameter(distribution(width, 1))

self.bias = nn.Parameter(distribution(1, width))
self.recurrent_activation = recurrent_activation()
self.out_activation = out_activation()

def forward(self, x_t, hidden_activations=None):
if hidden_activations is None:
hidden_activations = torch.zeros((1, self.width)).float().to(device)

hidden_activations = self.recurrent_activation((hidden_activations@self.hidden_weights)+(x_t@self.input_weights)+self.bias)
y_t = self.out_activation(hidden_activations@self.output_weights)

return y_t, hidden_activations

Где распространение буквально просто:

Код: Выделить всё

def uniform_distribution(x,y):
return torch.rand((x,y)).to(torch.float32)

И мой цикл обучения:

Код: Выделить всё

def train(model, dataset, labels_fn, batch_size = 2, lr = 0.001, optimizer = torch.optim.Adam, num_epochs=20, loss = nn.MSELoss()):
dataloader = DataLoader(dataset, batch_size, shuffle = True)
criterion = loss
optimizer = optimizer(model.parameters(), lr=lr)

model.train()
train_accuracies = []
for epoch in range(num_epochs):
total_loss = 0
total_correct = 0
total = 0
i = 0
for pulses in tqdm(dataloader, f'epochs {epoch}'):
optimizer.zero_grad()

pulses = pulses.to(device)

# forward pass
hidden = None
for timestep in range(pulses.shape[1]):
output, hidden = model(pulses[:, timestep:timestep+1], hidden)

label = labels_fn(pulses).float().to(device)

preds = torch.round(output)

total_correct += sum(preds.squeeze(1) == label)
total +=len(preds.squeeze(1))

loss = criterion(output.squeeze(1), label.float())  # mseloss

total_loss += loss.item()
loss.backward()

optimizer.step()

print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss / len(dataloader)}, Train Accuracy: {total_correct/total}')
train_accuracies.append(total_correct/total)
return train_accuracies

В настоящее время я пытаюсь переобучить одну двоичную серию длиной 5 (1 или 0) с одной классификацией в конце, и получаю ровно нулевые векторы. Я понимаю, что некоторые части моей модели могут быть нестабильными, но мне нужен совет о том, что может вызывать постоянные нулевые градиенты при переоснащении одного образца. Я также дважды проверил dtype градиентов, и они оказались плавающими.

Подробнее здесь: https://stackoverflow.com/questions/798 ... in-pytorch

1761932930

Anonymous

Моя модель:
[code]class RandomRNN(nn.Module):
def __init__(self, width, distribution, recurrent_activation = nn.Tanh, out_activation = nn.Sigmoid):
super().__init__()

self.width = width

self.hidden_weights = nn.Parameter(distribution(width, width))
self.input_weights = nn.Parameter(distribution(1, width))
self.output_weights = nn.Parameter(distribution(width, 1))

self.bias = nn.Parameter(distribution(1, width))
self.recurrent_activation = recurrent_activation()
self.out_activation = out_activation()

def forward(self, x_t, hidden_activations=None):
if hidden_activations is None:
hidden_activations = torch.zeros((1, self.width)).float().to(device)

hidden_activations = self.recurrent_activation((hidden_activations@self.hidden_weights)+(x_t@self.input_weights)+self.bias)
y_t = self.out_activation(hidden_activations@self.output_weights)

return y_t, hidden_activations
[/code]
Где распространение буквально просто:
[code]def uniform_distribution(x,y):
return torch.rand((x,y)).to(torch.float32)
[/code]
И мой цикл обучения:
[code]def train(model, dataset, labels_fn, batch_size = 2, lr = 0.001, optimizer = torch.optim.Adam, num_epochs=20, loss = nn.MSELoss()):
dataloader = DataLoader(dataset, batch_size, shuffle = True)
criterion = loss
optimizer = optimizer(model.parameters(), lr=lr)

model.train()
train_accuracies = []
for epoch in range(num_epochs):
total_loss = 0
total_correct = 0
total = 0
i = 0
for pulses in tqdm(dataloader, f'epochs {epoch}'):
optimizer.zero_grad()

pulses = pulses.to(device)

# forward pass
hidden = None
for timestep in range(pulses.shape[1]):
output, hidden = model(pulses[:, timestep:timestep+1], hidden)

label = labels_fn(pulses).float().to(device)

preds = torch.round(output)

total_correct += sum(preds.squeeze(1) == label)
total +=len(preds.squeeze(1))

loss = criterion(output.squeeze(1), label.float())  # mseloss

total_loss += loss.item()
loss.backward()

optimizer.step()

print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss / len(dataloader)}, Train Accuracy: {total_correct/total}')
train_accuracies.append(total_correct/total)
return train_accuracies
[/code]
В настоящее время я пытаюсь [b]переобучить[/b] одну двоичную серию длиной 5 (1 или 0) с одной классификацией в конце, и получаю [b]ровно[/b] нулевые векторы. Я понимаю, что некоторые части моей модели могут быть нестабильными, но мне нужен совет о том, что может вызывать постоянные нулевые градиенты при переоснащении одного образца. Я также дважды проверил dtype градиентов, и они оказались плавающими. 

Подробнее здесь: [url]https://stackoverflow.com/questions/79806075/i-cannot-figure-out-why-im-getting-zero-gradients-in-my-rnn-in-pytorch[/url]