Я создаю преобразователь только для декодера, используя Pytorch, и я выбрал полный набор данных на английском языке из Kaggle Plaintext Wikipedia (полный английский).
Проблема в том, что выходные данные моей модели повторяются:
print(generate(model=model, prompt="what is anarchism", tokenizer=tokenizer, max_new_tokens=100, temperature=1.0, device=DEVICE))
для этого фиктивного поколения выходные данные:
what is anarchism anarchism anarchism anarch anarch anarchism anarchism anarchismismismismismismismismismismismismismismism anarchism anarchismismismismismismismismismismismismismismismismismismismismismismism anarchismismism anarchismismism anarchism anarchismismismismismismismism anarchismismismismism anarch anarchism anarchismismism anarchism anarchism anarchismismismismism anarchism anarchism anarchism
Поскольку я новичок в трансформаторах, я сам не могу разобраться в проблеме. Поэтому, несмотря на то, что публиковать весь код не идеально, я прошу прощения за это.
Вот мой код
import os
import math
from pathlib import Path
import random
import bisect
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tokenizers import ByteLevelBPETokenizer
from tqdm import tqdm
"""#Config"""
DATA_DIR = './data'
TOKENIZER_DIR = './tokenizer'
OUT_DIR = './out'
MODEL_DIR = './models'
MODEL_PATH = './models/wiki_chatbot.pth'
BLOCK_SIZE = 256
APPROX_SAMPLE = 100000
BATCH_SIZE = 8
EPOCHS = 2
LR = 3e-4
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
D_MODEL = 256
N_HEADS = 4
N_LAYERS = 4
DROPOUT = 0.1
SEED = 42
torch.manual_seed(SEED)
if DEVICE=='cuda': torch.cuda.manual_seed_all(SEED)
"""#Tokenizer"""
if not Path(TOKENIZER_DIR).exists():
Path(TOKENIZER_DIR).mkdir()
files = [str(p) for p in Path(DATA_DIR).glob('**/*') if p.is_file() and not p.name.startswith('.')]
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=files, vocab_size=30000, min_frequency=2, special_tokens=["", "", "", "", ""])
tokenizer.save_model(TOKENIZER_DIR)
else:
tokenizer = ByteLevelBPETokenizer.from_file(f"{TOKENIZER_DIR}/vocab.json", f"{TOKENIZER_DIR}/merges.txt")
VOCAB_SIZE = tokenizer.get_vocab_size()
"""#Dataset & Dataloader"""
class RandomWindowWikiDataset(Dataset):
def __init__(self, data_dir, tokenizer, block_size, approx_sample):
self.files = sorted([p for p in Path(DATA_DIR).glob('**/*') if p.is_file() and not p.name.startswith('.')])
if len(self.files) == 0:
raise FileNotFoundError('No files found in data_dir')
self.tokenizer = tokenizer
self.block_size = block_size
self.approx_sample = approx_sample
self.sizes = [os.path.getsize(p) for p in self.files]
self.cum_sizes = []
total = 0
for s in self.sizes:
total += s
self.cum_sizes.append(total)
self.total = total
def __len__(self):
return self.approx_sample
def __getitem__(self, idx):
offset = random.randint(0, self.total - 1)
file_idx = bisect.bisect_right(self.cum_sizes, offset)
if file_idx == 0:
local_offset = offset
else:
local_offset = offset - self.cum_sizes[file_idx - 1]
fp = self.files[file_idx]
with open(fp, 'rb') as f:
f.seek(local_offset)
chunk = f.read(self.block_size * 4)
text = chunk.decode('utf-8', errors='ignore')
ids = self.tokenizer.encode(text).ids
if len(ids) < self.block_size + 1:
pad_id = self.tokenizer.token_to_id('') or 0
ids += [pad_id] * (self.block_size + 1 - len(ids))
x = ids[:self.block_size]
y = ids[1:self.block_size+1]
return torch.tensor(x), torch.tensor(y)
dataset = RandomWindowWikiDataset(data_dir=DATA_DIR, tokenizer=tokenizer, block_size=BLOCK_SIZE, approx_sample=APPROX_SAMPLE)
dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=True)
"""#DecoderModel"""
class DecoderOnlyModel(nn.Module):
def __init__(self, vocab_size, block_size, d_model, n_heads, n_layers, dropout):
super().__init__()
self.tok_emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)
self.pos_emb = nn.Parameter(torch.zeros(1, block_size, d_model))
self.dropout = nn.Dropout(dropout)
dec_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=n_heads, dim_feedforward=4*d_model, dropout=dropout, activation='gelu', batch_first=True)
self.decoder = nn.TransformerDecoder(decoder_layer=dec_layer, num_layers=n_layers)
self.ln = nn.LayerNorm(d_model)
self.head = nn.Linear(in_features=d_model, out_features=vocab_size, bias=False)
self.block_size = block_size
self._init_weights()
def _init_weights(self):
nn.init.normal_(tensor=self.tok_emb.weight, mean=0.0, std=0.2)
nn.init.normal_(tensor=self.head.weight, mean=0.0, std=0.2)
nn.init.normal_(tensor=self.pos_emb, mean=0.0, std=0.2)
def forward(self, x):
b, t = x.size()
tok = self.tok_emb(x) * math.sqrt(self.tok_emb.embedding_dim)
pos = self.pos_emb[:, :t, :]
x = self.dropout(tok + pos)
mask = torch.triu(torch.ones(t, t, device=x.device) * float('-inf'), diagonal=1)
out = self.decoder(tgt=x, memory=x, tgt_mask=mask)
out = self.ln(out)
logits = self.head(out)
return logits
"""#Train function"""
model = DecoderOnlyModel(vocab_size=VOCAB_SIZE, block_size=BLOCK_SIZE, d_model=D_MODEL, n_heads=N_HEADS, n_layers=N_LAYERS, dropout=DROPOUT)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()
def train(model, optimizer, criterion, epochs, device, vocab_size, dataloader):
model.to(device)
model.train()
for epoch in range(epochs):
epoch_loss = 0.0
for xb, yb in tqdm(dataloader):
xb, yb = xb.to(device), yb.to(device)
optimizer.zero_grad()
logits = model(xb)
loss = criterion(logits.view(-1, vocab_size), yb.view(-1))
loss.backward()
optimizer.step()
epoch_loss += loss.item()
print(f"Epoch: {epoch+1}/{epochs} | Loss: {epoch_loss/len(dataloader):.4f}")
if not os.path.exists(MODEL_PATH):
if not Path(MODEL_DIR).exists():
Path(MODEL_DIR).mkdir()
train(model=model, optimizer=optimizer, criterion=criterion, epochs=EPOCHS, device=DEVICE, vocab_size=VOCAB_SIZE, dataloader=dataloader)
torch.save(obj=model.state_dict(), f=MODEL_PATH)
else:
model.load_state_dict(torch.load(MODEL_PATH))
"""#Generate function"""
@torch.inference_mode()
def generate(model, prompt, tokenizer, max_new_tokens, temperature, top_k, device):
model.to(device)
model.eval()
input_ids = torch.tensor([tokenizer.encode(prompt).ids], dtype=torch.long).to(device)
for _ in range(max_new_tokens):
logits = model(input_ids)[:, -1, :] / temperature
probs = torch.softmax(logits, dim=-1)
next_token = torch.multinomial(probs, num_samples=1)
input_ids = torch.cat([input_ids, next_token], dim=1)
text = tokenizer.decode(input_ids[0].tolist())
return text
print(generate(model=model, prompt="what is anarchism", tokenizer=tokenizer, max_new_tokens=100, temperature=1.0, device=DEVICE))
Подробнее здесь: https://stackoverflow.com/questions/798 ... -responses
Декодер моделирует только ИИ, дающий повторяющиеся ответы. ⇐ Python
Программы на Python
-
Anonymous
1761836347
Anonymous
Я создаю преобразователь только для декодера, используя Pytorch, и я выбрал полный набор данных на английском языке из Kaggle Plaintext Wikipedia (полный английский).
Проблема в том, что выходные данные моей модели повторяются:
print(generate(model=model, prompt="what is anarchism", tokenizer=tokenizer, max_new_tokens=100, temperature=1.0, device=DEVICE))
для этого фиктивного поколения выходные данные:
what is anarchism anarchism anarchism anarch anarch anarchism anarchism anarchismismismismismismismismismismismismismismism anarchism anarchismismismismismismismismismismismismismismismismismismismismismismism anarchismismism anarchismismism anarchism anarchismismismismismismismism anarchismismismismism anarch anarchism anarchismismism anarchism anarchism anarchismismismismism anarchism anarchism anarchism
Поскольку я новичок в трансформаторах, я сам не могу разобраться в проблеме. Поэтому, несмотря на то, что публиковать весь код не идеально, я прошу прощения за это.
Вот мой код
import os
import math
from pathlib import Path
import random
import bisect
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tokenizers import ByteLevelBPETokenizer
from tqdm import tqdm
"""#Config"""
DATA_DIR = './data'
TOKENIZER_DIR = './tokenizer'
OUT_DIR = './out'
MODEL_DIR = './models'
MODEL_PATH = './models/wiki_chatbot.pth'
BLOCK_SIZE = 256
APPROX_SAMPLE = 100000
BATCH_SIZE = 8
EPOCHS = 2
LR = 3e-4
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
D_MODEL = 256
N_HEADS = 4
N_LAYERS = 4
DROPOUT = 0.1
SEED = 42
torch.manual_seed(SEED)
if DEVICE=='cuda': torch.cuda.manual_seed_all(SEED)
"""#Tokenizer"""
if not Path(TOKENIZER_DIR).exists():
Path(TOKENIZER_DIR).mkdir()
files = [str(p) for p in Path(DATA_DIR).glob('**/*') if p.is_file() and not p.name.startswith('.')]
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=files, vocab_size=30000, min_frequency=2, special_tokens=["", "", "", "", ""])
tokenizer.save_model(TOKENIZER_DIR)
else:
tokenizer = ByteLevelBPETokenizer.from_file(f"{TOKENIZER_DIR}/vocab.json", f"{TOKENIZER_DIR}/merges.txt")
VOCAB_SIZE = tokenizer.get_vocab_size()
"""#Dataset & Dataloader"""
class RandomWindowWikiDataset(Dataset):
def __init__(self, data_dir, tokenizer, block_size, approx_sample):
self.files = sorted([p for p in Path(DATA_DIR).glob('**/*') if p.is_file() and not p.name.startswith('.')])
if len(self.files) == 0:
raise FileNotFoundError('No files found in data_dir')
self.tokenizer = tokenizer
self.block_size = block_size
self.approx_sample = approx_sample
self.sizes = [os.path.getsize(p) for p in self.files]
self.cum_sizes = []
total = 0
for s in self.sizes:
total += s
self.cum_sizes.append(total)
self.total = total
def __len__(self):
return self.approx_sample
def __getitem__(self, idx):
offset = random.randint(0, self.total - 1)
file_idx = bisect.bisect_right(self.cum_sizes, offset)
if file_idx == 0:
local_offset = offset
else:
local_offset = offset - self.cum_sizes[file_idx - 1]
fp = self.files[file_idx]
with open(fp, 'rb') as f:
f.seek(local_offset)
chunk = f.read(self.block_size * 4)
text = chunk.decode('utf-8', errors='ignore')
ids = self.tokenizer.encode(text).ids
if len(ids) < self.block_size + 1:
pad_id = self.tokenizer.token_to_id('') or 0
ids += [pad_id] * (self.block_size + 1 - len(ids))
x = ids[:self.block_size]
y = ids[1:self.block_size+1]
return torch.tensor(x), torch.tensor(y)
dataset = RandomWindowWikiDataset(data_dir=DATA_DIR, tokenizer=tokenizer, block_size=BLOCK_SIZE, approx_sample=APPROX_SAMPLE)
dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=True)
"""#DecoderModel"""
class DecoderOnlyModel(nn.Module):
def __init__(self, vocab_size, block_size, d_model, n_heads, n_layers, dropout):
super().__init__()
self.tok_emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)
self.pos_emb = nn.Parameter(torch.zeros(1, block_size, d_model))
self.dropout = nn.Dropout(dropout)
dec_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=n_heads, dim_feedforward=4*d_model, dropout=dropout, activation='gelu', batch_first=True)
self.decoder = nn.TransformerDecoder(decoder_layer=dec_layer, num_layers=n_layers)
self.ln = nn.LayerNorm(d_model)
self.head = nn.Linear(in_features=d_model, out_features=vocab_size, bias=False)
self.block_size = block_size
self._init_weights()
def _init_weights(self):
nn.init.normal_(tensor=self.tok_emb.weight, mean=0.0, std=0.2)
nn.init.normal_(tensor=self.head.weight, mean=0.0, std=0.2)
nn.init.normal_(tensor=self.pos_emb, mean=0.0, std=0.2)
def forward(self, x):
b, t = x.size()
tok = self.tok_emb(x) * math.sqrt(self.tok_emb.embedding_dim)
pos = self.pos_emb[:, :t, :]
x = self.dropout(tok + pos)
mask = torch.triu(torch.ones(t, t, device=x.device) * float('-inf'), diagonal=1)
out = self.decoder(tgt=x, memory=x, tgt_mask=mask)
out = self.ln(out)
logits = self.head(out)
return logits
"""#Train function"""
model = DecoderOnlyModel(vocab_size=VOCAB_SIZE, block_size=BLOCK_SIZE, d_model=D_MODEL, n_heads=N_HEADS, n_layers=N_LAYERS, dropout=DROPOUT)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()
def train(model, optimizer, criterion, epochs, device, vocab_size, dataloader):
model.to(device)
model.train()
for epoch in range(epochs):
epoch_loss = 0.0
for xb, yb in tqdm(dataloader):
xb, yb = xb.to(device), yb.to(device)
optimizer.zero_grad()
logits = model(xb)
loss = criterion(logits.view(-1, vocab_size), yb.view(-1))
loss.backward()
optimizer.step()
epoch_loss += loss.item()
print(f"Epoch: {epoch+1}/{epochs} | Loss: {epoch_loss/len(dataloader):.4f}")
if not os.path.exists(MODEL_PATH):
if not Path(MODEL_DIR).exists():
Path(MODEL_DIR).mkdir()
train(model=model, optimizer=optimizer, criterion=criterion, epochs=EPOCHS, device=DEVICE, vocab_size=VOCAB_SIZE, dataloader=dataloader)
torch.save(obj=model.state_dict(), f=MODEL_PATH)
else:
model.load_state_dict(torch.load(MODEL_PATH))
"""#Generate function"""
@torch.inference_mode()
def generate(model, prompt, tokenizer, max_new_tokens, temperature, top_k, device):
model.to(device)
model.eval()
input_ids = torch.tensor([tokenizer.encode(prompt).ids], dtype=torch.long).to(device)
for _ in range(max_new_tokens):
logits = model(input_ids)[:, -1, :] / temperature
probs = torch.softmax(logits, dim=-1)
next_token = torch.multinomial(probs, num_samples=1)
input_ids = torch.cat([input_ids, next_token], dim=1)
text = tokenizer.decode(input_ids[0].tolist())
return text
print(generate(model=model, prompt="what is anarchism", tokenizer=tokenizer, max_new_tokens=100, temperature=1.0, device=DEVICE))
Подробнее здесь: [url]https://stackoverflow.com/questions/79803908/decoder-only-model-ai-making-repetitive-responses[/url]
Ответить
1 сообщение
• Страница 1 из 1
Перейти
- Кемерово-IT
- ↳ Javascript
- ↳ C#
- ↳ JAVA
- ↳ Elasticsearch aggregation
- ↳ Python
- ↳ Php
- ↳ Android
- ↳ Html
- ↳ Jquery
- ↳ C++
- ↳ IOS
- ↳ CSS
- ↳ Excel
- ↳ Linux
- ↳ Apache
- ↳ MySql
- Детский мир
- Для души
- ↳ Музыкальные инструменты даром
- ↳ Печатная продукция даром
- Внешняя красота и здоровье
- ↳ Одежда и обувь для взрослых даром
- ↳ Товары для здоровья
- ↳ Физкультура и спорт
- Техника - даром!
- ↳ Автомобилистам
- ↳ Компьютерная техника
- ↳ Плиты: газовые и электрические
- ↳ Холодильники
- ↳ Стиральные машины
- ↳ Телевизоры
- ↳ Телефоны, смартфоны, плашеты
- ↳ Швейные машинки
- ↳ Прочая электроника и техника
- ↳ Фототехника
- Ремонт и интерьер
- ↳ Стройматериалы, инструмент
- ↳ Мебель и предметы интерьера даром
- ↳ Cантехника
- Другие темы
- ↳ Разное даром
- ↳ Давай меняться!
- ↳ Отдам\возьму за копеечку
- ↳ Работа и подработка в Кемерове
- ↳ Давай с тобой поговорим...
Мобильная версия