I have been working on fine-tuning a Transformer decoder model in TensorFlow for text generation. The base model works fine and produces coherent output based on the training data, but the problem appears during fine-tuning.
The fine-tuning process runs without errors, and the model trains for the expected number of epochs. However, sentences from my fine-tuning corpus never show up in the model's output. Instead of improving, the model's accuracy drops during fine-tuning, and rather than generating meaningful text it emits a sequence of junk words.
I have also tried different learning rates, a learning-rate scheduler, and stopping early when the loss does not improve for a certain number of epochs.
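For reference, a minimal sketch of how the lower fine-tuning learning rate could be applied before the fine-tuning fit. It reuses model, FINETUNE_X, FINETUNE_Y, FINETUNE_EPOCHS and FINETUNE_LEARNING_RATE from the listing below and assumes the same Adam optimizer and sparse categorical cross-entropy loss as the base training; the recompile step itself is not part of the posted script.
# Sketch: recompile with the lower learning rate before fine-tuning.
# Assumes the same optimizer/loss configuration as base training.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=FINETUNE_LEARNING_RATE),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)
model.fit(FINETUNE_X, FINETUNE_Y, epochs=FINETUNE_EPOCHS, batch_size=1)
The full script follows: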
import os
import re
import numpy as np
import tensorflow as tf
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, LambdaCallback
from tensorflow.keras import layers
import pickle
import shutil
import keras
import pandas as pd
import matplotlib.pyplot as plt
# Configure eager execution
tf.config.run_functions_eagerly(True)
tf.data.experimental.enable_debug_mode()
# Hyperparameters
MAX_LEN = 5
EMBED_DIM = 2048
NUM_HEADS = 7
FF_DIM = 2048
LEARNING_RATE = 1e-2
FINETUNE_LEARNING_RATE = 1e-5
EPOCHS = 1000
FINETUNE_EPOCHS = 100
BATCH_SIZE = 16
CORPUS = """
The orange cat with an insatiable love for lasagna is perhaps one of the most well-known and beloved characters
in popular culture. His life revolves around a few simple pleasures: sleeping eating and avoiding any form of
effort. Whether lounging on the couch or sneaking a snack from the fridge his daily routine is one of absolute
laziness much to the amusement of fans worldwide. He manages to make doing nothing seem like an art form and
in a way people admire that simplicity.
This feline has a special talent for sarcasm often making sharp-witted comments about the world around him. His
dry sense of humor and unbothered attitude offer a satirical lens on everyday life. From his disdain for Mondays
to his creative excuses for skipping responsibilities garfield expresses thoughts many of us secretly harbor but are
too polite to say out loud. His sharp tongue is as iconic as his orange fur.
Despite being surrounded by humans and other animals garfield often seems to view the world with mild disinterest as
though everything is beneath his effort. His owner Jon frequently finds himself on the receiving end of the
cat's apathy as every attempt to engage or discipline him is met with complete indifference. The cat's superiority
complex is one of his defining characteristics as he believes the world exists solely to serve his needs.
Interestingly even though he is often portrayed as selfish and aloof there's an undeniable charm in his attitude.
His laziness which would be a flaw in any other character is what makes him so endearing. In many ways this cat
embodies the kind of carefree existence we all wish we could have. His laid-back nature coupled with his constant
search for comfort makes him one of the most relatable characters ever created.
Ultimately it’s this mix of humor indifference and relatable flaws that have solidified his place in our hearts.
Whether plotting to steal Jon’s food or simply stretching out for a long nap he continues to capture the imagination
of audiences around the world with his witty sardonic charm.
"""
FINETUNE_CORPUS = [
    'excuses for skipping responsibilities garfield expresses thoughts many of',
    'by humans and other animals garfield often seems to view the'
]
def normalize(corpus):
    # Strip basic punctuation and line breaks, collapse whitespace, lowercase.
    corpus = re.sub(r'[(),.;\n\r]', ' ', corpus)
    corpus = re.sub(r'\s+', ' ', corpus)
    return corpus.strip().lower()
def load_or_create_tokenizer(paragraph, masked_paragraph, tokenizer_path='model/tokenizer.pickle'):
    if os.path.exists(tokenizer_path):
        with open(tokenizer_path, 'rb') as handle:
            tokenizer = pickle.load(handle)
    else:
        tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token="[OOV]")
        tokenizer.fit_on_texts([masked_paragraph, paragraph])
        with open(tokenizer_path, 'wb') as handle:
            pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return tokenizer
def prepare_data(paragraph, masked_paragraph, max_len):
    tokenizer = load_or_create_tokenizer(paragraph, masked_paragraph)
    vocab_size = len(tokenizer.word_index) + 1
    tokenized_seq = tokenizer.texts_to_sequences([masked_paragraph])[0]
    X, Y = [], []
    for i in range(len(tokenized_seq) - max_len - 1):
        X.append(tokenized_seq[i:i + max_len])
        Y.append(tokenized_seq[i + max_len])
    X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen=max_len, padding="post")
    return np.array(X), np.array(Y), vocab_size, tokenizer
def create_callbacks(checkpoint_dir, model_name, patience):
    return [
        EarlyStopping(monitor='loss', patience=patience, restore_best_weights=True, mode='min'),
        ModelCheckpoint(
            filepath=f'{checkpoint_dir}/model.{model_name}.keras',
            monitor='loss', verbose=1, save_best_only=True, save_freq="epoch", mode="min",
        ),
        ReduceLROnPlateau(monitor="loss", patience=10, factor=0.9, min_delta=0, min_lr=1e-4, mode='min')
    ]
@keras.saving.register_keras_serializable()
class TransformerDecoder(tf.keras.Model):
    def __init__(self, vocab_size, embed_dim, num_heads, ff_dim, max_len):
        super(TransformerDecoder, self).__init__()
        self.embedding = layers.Embedding(vocab_size, embed_dim)
        self.pos_encoding = layers.Embedding(input_dim=max_len, output_dim=embed_dim)
        self.decoder = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([layers.Dense(ff_dim), layers.Dense(embed_dim)])
        self.layernorm1 = layers.LayerNormalization()
        self.layernorm2 = layers.LayerNormalization()
        self.final_layer = layers.Dense(vocab_size)

    def call(self, inputs, training=True, mask=None):
        seq_len = tf.shape(inputs)[1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        x = self.embedding(inputs) + self.pos_encoding(positions)
        attn_output = self.decoder(x, x)
        out1 = self.layernorm1(x + attn_output)
        ffn_output = self.ffn(out1)
        out2 = self.layernorm2(out1 + ffn_output)
        return self.final_layer(out2)[:, -1, :]
def train_model(X, Y, vocab_size, embed_dim, num_heads, ff_dim, max_len, learning_rate, epochs, batch_size, checkpoint_dir, model_name):
    model = TransformerDecoder(vocab_size, embed_dim, num_heads, ff_dim, max_len)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    )
    callbacks = create_callbacks(checkpoint_dir, model_name, patience=100)
    history = model.fit(X, Y, epochs=epochs, batch_size=batch_size, callbacks=callbacks, shuffle=True)
    # Return the trained model along with its history so both can be used below
    return model, history
def generate(input_tokens, model, tokenizer, output_len: int):
    sequence = []
    for _ in range(output_len):
        predictions = model(tf.convert_to_tensor([input_tokens]), training=False)[-1]
        predicted_token_id = tf.argmax(predictions, axis=-1).numpy()
        predicted_token = tokenizer.sequences_to_texts([[predicted_token_id]])
        sequence.append(predicted_token[0])
        input_tokens.append(predicted_token_id)
        del input_tokens[0]
    return ' '.join(sequence)
def plot_history(history):
    plt.plot(history.history['loss'])
    plt.title('Model Loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.show()
checkpoint_dir = './model/base/checkpoint/'
for dpath in ['./model/base/keras/', checkpoint_dir,
              './model/finetune/keras/', './model/finetune/checkpoint']:
    os.makedirs(dpath, exist_ok=True)
# Normalize the corpus
CORPUS = normalize(CORPUS).replace('garfield', '[MASK]')
FINETUNE_CORPUS = [normalize(corpus) for corpus in FINETUNE_CORPUS]
# Prepare data
X, Y, vocab_size, tokenizer = prepare_data('garfield', CORPUS, MAX_LEN)
# Train and save model
model, history = train_model(X, Y, vocab_size, EMBED_DIM, NUM_HEADS, FF_DIM, MAX_LEN, LEARNING_RATE, EPOCHS, BATCH_SIZE, checkpoint_dir, 'base')
plot_history(history)
# Base generation
base_generate = generate(tokenizer.texts_to_sequences(['from his disdain for mondays'])[0], model, tokenizer, 50)
print('Base model story:\n', base_generate)
# Fine-tune the model
for finetune_input in FINETUNE_CORPUS:
    FINETUNE_X, FINETUNE_Y, _, _ = prepare_data(None, finetune_input, MAX_LEN)
    model.fit(FINETUNE_X, FINETUNE_Y, epochs=FINETUNE_EPOCHS, batch_size=1, callbacks=create_callbacks('model/finetune/checkpoint', 'finetune', patience=20))
# Fine-tuned generation
finetune_generate = generate(tokenizer.texts_to_sequences(['from his disdain for mondays'])[0], model, tokenizer, 50)
print('\nFine-tuned model story:\n', finetune_generate)
More details here: https://stackoverflow.com/questions/790 ... erformance