I am currently building my own chatbot with a Transformer-based model for a personal project. Even though I have tried different hyperparameters, adjusted the number of epochs, and increased the dataset size, I keep running into problems where the model fails to generate valid responses from my datasets.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
import pandas as pd
import math
from transformers import BertTokenizer

# Define the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the conversational data from a CSV file
def load_conversational_data(file_path):
    data = pd.read_csv(file_path)
    print("Loaded data:")
    print(data.head())
    return data['input'].tolist(), data['response'].tolist()

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=50):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.1):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0
        self.d_head = d_model // num_heads
        self.num_heads = num_heads
        self.linear_q = nn.Linear(d_model, d_model)
        self.linear_k = nn.Linear(d_model, d_model)
        self.linear_v = nn.Linear(d_model, d_model)
        self.linear_out = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)
        seq_length = query.size(1)
        q = self.linear_q(query)
        k = self.linear_k(key)
        v = self.linear_v(value)
        q = q.view(batch_size, seq_length, self.num_heads, self.d_head).transpose(1, 2)
        k = k.view(batch_size, seq_length, self.num_heads, self.d_head).transpose(1, 2)
        v = v.view(batch_size, seq_length, self.num_heads, self.d_head).transpose(1, 2)
        scores = torch.matmul(q, k.transpose(2, 3)) / math.sqrt(self.d_head)
        if mask is not None:
            mask = mask.unsqueeze(1).unsqueeze(1)
            scores = scores.masked_fill(mask == 0, float('-inf'))
        attn_weights = torch.softmax(scores, dim=-1)
        attn_weights = self.dropout(attn_weights)
        context = torch.matmul(attn_weights, v)
        context = context.transpose(1, 2).contiguous().view(batch_size, seq_length, self.num_heads * self.d_head)
        output = self.linear_out(context)
        return output

class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        x = nn.functional.relu(self.linear1(x))
        x = self.dropout(x)
        x = self.linear2(x)
        return x

class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.ffn = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, src, src_mask=None):
        src2 = self.norm1(src)
        src2 = self.self_attn(src2, src2, src2, src_mask)
        src = src + self.dropout1(src2)
        src2 = self.norm2(src)
        src2 = self.ffn(src2)
        src = src + self.dropout2(src2)
        return src

class TransformerEncoder(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, d_ff, dropout=0.1):
        super(TransformerEncoder, self).__init__()
        self.layers = nn.ModuleList(
            [TransformerEncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)]
        )

    def forward(self, src, src_mask=None):
        output = src
        for layer in self.layers:
            output = layer(output, src_mask)
        return output

class CustomTransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model=128, num_heads=4, num_layers=6, d_ff=256, dropout=0.1):
        super(CustomTransformerModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.transformer_encoder = TransformerEncoder(num_layers, d_model, num_heads, d_ff, dropout)
        self.fc = nn.Linear(d_model, vocab_size)
        self.init_weights()

    def init_weights(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, src, src_mask=None):
        x = self.embedding(src)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x, src_mask)
        x = self.fc(x)
        return x

# Custom Dataset class
class ConversationDataset(Dataset):
    def __init__(self, inputs, responses, tokenizer, max_length=50):
        self.inputs = inputs
        self.responses = responses
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_text = self.inputs[idx]
        response_text = self.responses[idx]
        input_tokens = self.tokenizer.encode(input_text, add_special_tokens=True, max_length=self.max_length, truncation=True, padding='max_length')
        response_tokens = self.tokenizer.encode(response_text, add_special_tokens=True, max_length=self.max_length, truncation=True, padding='max_length')
        return torch.tensor(input_tokens), torch.tensor(response_tokens)

# Load and preprocess your real-world dataset here
inputs, responses = load_conversational_data('conversations.csv')
dataset = ConversationDataset(inputs, responses, tokenizer)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Data loaders
batch_size = 14
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Instantiate the model
vocab_size = tokenizer.vocab_size
model = CustomTransformerModel(vocab_size)

# Loss and optimizer
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.AdamW(model.parameters(), lr=0.001)

# Function to generate responses
def generate_response(model, tokenizer, input_text, max_length=50):
    model.eval()
    try:
        # Tokenize the input text
        tokens = tokenizer.encode(input_text, add_special_tokens=True, max_length=max_length, truncation=True, padding='max_length')
        input_data = torch.tensor([tokens])
        input_mask = (input_data != tokenizer.pad_token_id).long()
        # Log the tokenized input
        print(f"Tokenized input: {tokens}")
        with torch.no_grad():
            # Forward pass through the model
            output = model(input_data)
            # Check for NaNs in the output
            if torch.sum(torch.isnan(output)) > 0:
                print("Error: Model output contains NaNs.")
                return "I'm sorry, there was an error in generating the response."
            # Get the predicted tokens
            output_tokens = torch.argmax(output, dim=-1).squeeze().tolist()
            print(f"Output tokens: {output_tokens}")
            # Decode the tokens into a string
            response = tokenizer.decode(output_tokens, skip_special_tokens=True)
            print(f"Generated response: {response}")
            if response.strip() == "":
                print("Error: Generated response is empty.")
                return "I'm sorry, I couldn't generate a valid response."
            return response
    except Exception as e:
        print(f"Exception during response generation: {str(e)}")
        return "I'm sorry, an unexpected error occurred."

# Training and validation loop
num_epochs = 10
for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0.0
    for input_data, target in train_loader:
        optimizer.zero_grad()
        input_mask = (input_data != tokenizer.pad_token_id).long()
        output = model(input_data, input_mask)
        loss = loss_fn(output.view(-1, vocab_size), target.view(-1))
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    avg_train_loss = train_loss / len(train_loader)

    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for input_data, target in val_loader:
            input_mask = (input_data != tokenizer.pad_token_id).long()
            output = model(input_data, input_mask)
            loss = loss_fn(output.view(-1, vocab_size), target.view(-1))
            val_loss += loss.item()
    avg_val_loss = val_loss / len(val_loader)

    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')

    # Log a sample response from the model
    sample_input = "Hello, how are you?"
    sample_response = generate_response(model, tokenizer, sample_input)
    print(f"Sample input: {sample_input}\nSample response: {sample_response}")

# Test the model interactively
while True:
    test_input = input("you: ")
    if test_input.lower() == 'exit':
        break
    response = generate_response(model, tokenizer, test_input)
    print(f"Input: {test_input}\nResponse: {response}")
I have experimented with different hyperparameters, such as the learning rate, batch size, and the number of training epochs. I expected these adjustments to improve the quality of the responses generated by my Transformer-based model. However, despite increasing the dataset size and tuning these parameters, the model keeps producing either repeated tokens or completely empty responses during both training and inference (a small diagnostic sketch follows the log below).
Here is the output:
python ai.py
Loaded data:
input response
0 Hello, how are you? I'm good, thank you!
1 What are you up to today? Just relaxing at home. How about you?
2 Have you seen any good movies lately? Yes, I watched a great thriller last night.
3 Can you recommend any good books? Sure! Have you read 'To Kill a Mockingbird'?
4 Tell me about your hobbies. I enjoy painting and hiking.
Epoch [1/10], Train Loss: 9.8999, Val Loss: 8.5966
Tokenized input: [101, 7592, 1010, 2129, 2024, 2017, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Output tokens: [1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045]
Generated response: i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i
Sample input: Hello, how are you?
Sample response: i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i
Epoch [2/10], Train Loss: 6.4885, Val Loss: 8.5841
Tokenized input: [101, 7592, 1010, 2129, 2024, 2017, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Output tokens: [1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045]
Generated response: i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i
Sample input: Hello, how are you?
Sample response: i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i
Epoch [3/10], Train Loss: 5.1883, Val Loss: 10.1303
Tokenized input: [101, 7592, 1010, 2129, 2024, 2017, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Output tokens: [1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012]
Generated response: ..................................................
Sample input: Hello, how are you?
Sample response: ..................................................
Epoch [4/10], Train Loss: 4.8297, Val Loss: 10.9283
Tokenized input: [101, 7592, 1010, 2129, 2024, 2017, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Output tokens: [1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045]
Generated response: i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i
Sample input: Hello, how are you?
Sample response: i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i
Epoch [5/10], Train Loss: 4.8646, Val Loss: 11.6196
Tokenized input: [101, 7592, 1010, 2129, 2024, 2017, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Output tokens: [1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045]
Generated response: i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i
Sample input: Hello, how are you?
Sample response: i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i
Epoch [6/10], Train Loss: 4.8166, Val Loss: 11.6514
Tokenized input: [101, 7592, 1010, 2129, 2024, 2017, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Output tokens: [1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012]
Generated response: ..................................................
Sample input: Hello, how are you?
Sample response: ..................................................
Epoch [7/10], Train Loss: 4.7412, Val Loss: 11.3170
Tokenized input: [101, 7592, 1010, 2129, 2024, 2017, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Output tokens: [1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012]
Generated response: ..................................................
Sample input: Hello, how are you?
Sample response: ..................................................
Epoch [8/10], Train Loss: 4.7109, Val Loss: 10.7475
Tokenized input: [101, 7592, 1010, 2129, 2024, 2017, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Output tokens: [102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102]
Generated response:
Error: Generated response is empty.
Sample input: Hello, how are you?
Sample response: I'm sorry, I couldn't generate a valid response.
Epoch [9/10], Train Loss: 4.6831, Val Loss: 10.1369
Tokenized input: [101, 7592, 1010, 2129, 2024, 2017, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Output tokens: [1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045]
Generated response: i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i
Sample input: Hello, how are you?
Sample response: i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i
Epoch [10/10], Train Loss: 4.6578, Val Loss: 9.8423
Tokenized input: [101, 7592, 1010, 2129, 2024, 2017, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Output tokens: [1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045]
Generated response: i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i
Sample input: Hello, how are you?
Sample response: i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i
you:
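As the log shows, every position collapses onto a single token id (e.g. 1045 → "i", 1012 → ".", or 102 → [SEP]). A quick way to confirm this collapse is to count the model's greedy predictions over the validation set. A minimal sketch, assuming the model, tokenizer, and val_loader objects from the script above:

# Minimal diagnostic sketch (assumes model, tokenizer and val_loader from the
# script above): count which token ids the model predicts most often, to see
# whether it has collapsed onto a single frequent token such as "i" or ".".
from collections import Counter
import torch

model.eval()
pred_counts = Counter()
with torch.no_grad():
    for input_data, target in val_loader:
        input_mask = (input_data != tokenizer.pad_token_id).long()
        logits = model(input_data, input_mask)   # (batch, seq_len, vocab_size)
        preds = logits.argmax(dim=-1)            # greedy prediction per position
        pred_counts.update(preds.flatten().tolist())

for token_id, freq in pred_counts.most_common(5):
    print(f"{token_id} -> {tokenizer.decode([token_id])!r}: {freq}")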