I am currently building my own chatbot with a Transformer-based model for a personal project. Even though I have tried different hyperparameters, adjusted the number of epochs, and increased the dataset size, I keep running into problems where the model fails to generate valid responses from my datasets.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
import pandas as pd
import math
from transformers import BertTokenizer

# Define the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the conversational data from a CSV file
def load_conversational_data(file_path):
    data = pd.read_csv(file_path)
    print("Loaded data:")
    print(data.head())
    return data['input'].tolist(), data['response'].tolist()

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=50):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.1):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0
        self.d_head = d_model // num_heads
        self.num_heads = num_heads
        self.linear_q = nn.Linear(d_model, d_model)
        self.linear_k = nn.Linear(d_model, d_model)
        self.linear_v = nn.Linear(d_model, d_model)
        self.linear_out = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)
        seq_length = query.size(1)
        q = self.linear_q(query)
        k = self.linear_k(key)
        v = self.linear_v(value)
        q = q.view(batch_size, seq_length, self.num_heads, self.d_head).transpose(1, 2)
        k = k.view(batch_size, seq_length, self.num_heads, self.d_head).transpose(1, 2)
        v = v.view(batch_size, seq_length, self.num_heads, self.d_head).transpose(1, 2)
        scores = torch.matmul(q, k.transpose(2, 3)) / math.sqrt(self.d_head)
        if mask is not None:
            mask = mask.unsqueeze(1).unsqueeze(1)
            scores = scores.masked_fill(mask == 0, float('-inf'))
        attn_weights = torch.softmax(scores, dim=-1)
        attn_weights = self.dropout(attn_weights)
        context = torch.matmul(attn_weights, v)
        context = context.transpose(1, 2).contiguous().view(batch_size, seq_length, self.num_heads * self.d_head)
        output = self.linear_out(context)
        return output

class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        x = nn.functional.relu(self.linear1(x))
        x = self.dropout(x)
        x = self.linear2(x)
        return x

class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.ffn = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, src, src_mask=None):
        src2 = self.norm1(src)
        src2 = self.self_attn(src2, src2, src2, src_mask)
        src = src + self.dropout1(src2)
        src2 = self.norm2(src)
        src2 = self.ffn(src2)
        src = src + self.dropout2(src2)
        return src

class TransformerEncoder(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, d_ff, dropout=0.1):
        super(TransformerEncoder, self).__init__()
        self.layers = nn.ModuleList(
            [TransformerEncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)]
        )

    def forward(self, src, src_mask=None):
        output = src
        for layer in self.layers:
            output = layer(output, src_mask)
        return output

class CustomTransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model=128, num_heads=4, num_layers=6, d_ff=256, dropout=0.1):
        super(CustomTransformerModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.transformer_encoder = TransformerEncoder(num_layers, d_model, num_heads, d_ff, dropout)
        self.fc = nn.Linear(d_model, vocab_size)
        self.init_weights()

    def init_weights(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, src, src_mask=None):
        x = self.embedding(src)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x, src_mask)
        x = self.fc(x)
        return x

# Custom Dataset class
class ConversationDataset(Dataset):
    def __init__(self, inputs, responses, tokenizer, max_length=50):
        self.inputs = inputs
        self.responses = responses
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_text = self.inputs[idx]
        response_text = self.responses[idx]
        input_tokens = self.tokenizer.encode(input_text, add_special_tokens=True, max_length=self.max_length, truncation=True, padding='max_length')
        response_tokens = self.tokenizer.encode(response_text, add_special_tokens=True, max_length=self.max_length, truncation=True, padding='max_length')
        return torch.tensor(input_tokens), torch.tensor(response_tokens)

# Load and preprocess your real-world dataset here
inputs, responses = load_conversational_data('conversations.csv')
dataset = ConversationDataset(inputs, responses, tokenizer)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Data loaders
batch_size = 14
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Instantiate the model
vocab_size = tokenizer.vocab_size
model = CustomTransformerModel(vocab_size)

# Loss and optimizer
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.AdamW(model.parameters(), lr=0.001)

# Function to generate responses
def generate_response(model, tokenizer, input_text, max_length=50):
    model.eval()
    try:
        # Tokenize the input text
        tokens = tokenizer.encode(input_text, add_special_tokens=True, max_length=max_length, truncation=True, padding='max_length')
        input_data = torch.tensor([tokens])
        input_mask = (input_data != tokenizer.pad_token_id).long()
        # Log the tokenized input
        print(f"Tokenized input: {tokens}")
        with torch.no_grad():
            # Forward pass through the model
            output = model(input_data)
            # Check for NaNs in the output
            if torch.sum(torch.isnan(output)) > 0:
                print("Error: Model output contains NaNs.")
                return "I'm sorry, there was an error in generating the response."
            # Get the predicted tokens
            output_tokens = torch.argmax(output, dim=-1).squeeze().tolist()
            print(f"Output tokens: {output_tokens}")
            # Decode the tokens into a string
            response = tokenizer.decode(output_tokens, skip_special_tokens=True)
            print(f"Generated response: {response}")
            if response.strip() == "":
                print("Error: Generated response is empty.")
                return "I'm sorry, I couldn't generate a valid response."
            return response
    except Exception as e:
        print(f"Exception during response generation: {str(e)}")
        return "I'm sorry, an unexpected error occurred."

# Training and validation loop
num_epochs = 10
for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0.0
    for input_data, target in train_loader:
        optimizer.zero_grad()
        input_mask = (input_data != tokenizer.pad_token_id).long()
        output = model(input_data, input_mask)
        loss = loss_fn(output.view(-1, vocab_size), target.view(-1))
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    avg_train_loss = train_loss / len(train_loader)

    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for input_data, target in val_loader:
            input_mask = (input_data != tokenizer.pad_token_id).long()
            output = model(input_data, input_mask)
            loss = loss_fn(output.view(-1, vocab_size), target.view(-1))
            val_loss += loss.item()
    avg_val_loss = val_loss / len(val_loader)

    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')

    # Log a sample response from the model
    sample_input = "Hello, how are you?"
    sample_response = generate_response(model, tokenizer, sample_input)
    print(f"Sample input: {sample_input}\nSample response: {sample_response}")

# Test the model interactively
while True:
    test_input = input("you: ")
    if test_input.lower() == 'exit':
        break
    response = generate_response(model, tokenizer, test_input)
    print(f"Input: {test_input}\nResponse: {response}")
I have experimented with different hyperparameters, such as the learning rate, batch size, and the number of training epochs. I expected these adjustments to improve the quality of the responses generated by my Transformer-based model. However, despite increasing the dataset size and tuning these parameters, the model keeps producing either repeated tokens or completely empty responses during both training and inference (a small diagnostic sketch follows the log below).
Here is the output:
python ai.py
Loaded data:
input response
0 Hello, how are you? I'm good, thank you!
1 What are you up to today? Just relaxing at home. How about you?
2 Have you seen any good movies lately? Yes, I watched a great thriller last night.
3 Can you recommend any good books? Sure! Have you read 'To Kill a Mockingbird'?
4 Tell me about your hobbies. I enjoy painting and hiking.
Epoch [1/10], Train Loss: 9.8999, Val Loss: 8.5966
Tokenized input: [101, 7592, 1010, 2129, 2024, 2017, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Output tokens: [1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045]
Generated response: i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i
Sample input: Hello, how are you?
Sample response: i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i
Epoch [2/10], Train Loss: 6.4885, Val Loss: 8.5841
Tokenized input: [101, 7592, 1010, 2129, 2024, 2017, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Output tokens: [1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045]
Generated response: i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i
Sample input: Hello, how are you?
Sample response: i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i
Epoch [3/10], Train Loss: 5.1883, Val Loss: 10.1303
Tokenized input: [101, 7592, 1010, 2129, 2024, 2017, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Output tokens: [1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012]
Generated response: ..................................................
Sample input: Hello, how are you?
Sample response: ..................................................
Epoch [4/10], Train Loss: 4.8297, Val Loss: 10.9283
Tokenized input: [101, 7592, 1010, 2129, 2024, 2017, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Output tokens: [1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045]
Generated response: i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i
Sample input: Hello, how are you?
Sample response: i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i
Epoch [5/10], Train Loss: 4.8646, Val Loss: 11.6196
Tokenized input: [101, 7592, 1010, 2129, 2024, 2017, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Output tokens: [1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045]
Generated response: i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i
Sample input: Hello, how are you?
Sample response: i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i
Epoch [6/10], Train Loss: 4.8166, Val Loss: 11.6514
Tokenized input: [101, 7592, 1010, 2129, 2024, 2017, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Output tokens: [1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012]
Generated response: ..................................................
Sample input: Hello, how are you?
Sample response: ..................................................
Epoch [7/10], Train Loss: 4.7412, Val Loss: 11.3170
Tokenized input: [101, 7592, 1010, 2129, 2024, 2017, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Output tokens: [1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012]
Generated response: ..................................................
Sample input: Hello, how are you?
Sample response: ..................................................
Epoch [8/10], Train Loss: 4.7109, Val Loss: 10.7475
Tokenized input: [101, 7592, 1010, 2129, 2024, 2017, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Output tokens: [102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102]
Generated response:
Error: Generated response is empty.
Sample input: Hello, how are you?
Sample response: I'm sorry, I couldn't generate a valid response.
Epoch [9/10], Train Loss: 4.6831, Val Loss: 10.1369
Tokenized input: [101, 7592, 1010, 2129, 2024, 2017, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Output tokens: [1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045]
Generated response: i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i
Sample input: Hello, how are you?
Sample response: i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i
Epoch [10/10], Train Loss: 4.6578, Val Loss: 9.8423
Tokenized input: [101, 7592, 1010, 2129, 2024, 2017, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Output tokens: [1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045, 1045]
Generated response: i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i
Sample input: Hello, how are you?
Sample response: i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i
you:
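As the log shows, every position collapses onto a single token id (e.g. 1045 → "i", 1012 → ".", or 102 → [SEP]). A quick way to confirm this collapse is to count the model's greedy predictions over the validation set. A minimal sketch, assuming the model, tokenizer, and val_loader objects from the script above:

# Minimal diagnostic sketch (assumes model, tokenizer and val_loader from the
# script above): count which token ids the model predicts most often, to see
# whether it has collapsed onto a single frequent token such as "i" or ".".
from collections import Counter
import torch

model.eval()
pred_counts = Counter()
with torch.no_grad():
    for input_data, target in val_loader:
        input_mask = (input_data != tokenizer.pad_token_id).long()
        logits = model(input_data, input_mask)   # (batch, seq_len, vocab_size)
        preds = logits.argmax(dim=-1)            # greedy prediction per position
        pred_counts.update(preds.flatten().tolist())

for token_id, freq in pred_counts.most_common(5):
    print(f"{token_id} -> {tokenizer.decode([token_id])!r}: {freq}")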