I am working on fine-tuning the SpeechT5 TTS model. I initially collected some items for the dataset by hand and started preprocessing them. After preprocessing, I saved them in ".npy" format. But I ran into the error "RuntimeError: mat1 and mat2 must have the same dtype, but got Long and Float" with the code below:
import numpy as np
import torch
from transformers import SpeechT5ForTextToSpeech, SpeechT5Processor
from torch.utils.data import DataLoader, Dataset
# Load processed audio data and transcripts
processed_audios = np.load('processed_audios.npy', allow_pickle=True)
transcripts = np.load('transcripts.npy', allow_pickle=True)
# Parameters
target_length = 16000 # Adjust according to your needs
# Function to preprocess audio data
def preprocess_audios(audios, target_length):
    processed = []
    for audio in audios:
        if len(audio) < target_length:
            padded_audio = np.pad(audio, (0, target_length - len(audio)), mode='constant')
            processed.append(padded_audio)
        else:
            processed.append(audio[:target_length])
    return np.array(processed)
# Preprocess audio clips
audio_tensors = preprocess_audios(processed_audios, target_length)
audio_tensors = torch.tensor(audio_tensors, dtype=torch.float32)
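# Note: these waveform tensors are stored as 'input_values' in the dataset below,
# but the training loop never uses them; only the transcripts are consumed.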
# Define the dataset class
class AudioTextDataset(Dataset):
    def __init__(self, audio_tensors, transcripts):
        self.audio_tensors = audio_tensors
        self.transcripts = transcripts

    def __len__(self):
        return len(self.transcripts)

    def __getitem__(self, idx):
        return {
            'input_values': self.audio_tensors[idx],  # Audio tensor
            'text': str(self.transcripts[idx])  # Ensure transcript is a string
        }
# Create a dataset and DataLoader
dataset = AudioTextDataset(audio_tensors, transcripts)
data_loader = DataLoader(dataset, batch_size=8, shuffle=True)
# Load SpeechT5 model and processor
model_name = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(model_name)
model = SpeechT5ForTextToSpeech.from_pretrained(model_name)
# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
# Fine-tuning loop
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in data_loader:
        # Get the text input
        text_inputs = batch['text']
        # Ensure text_inputs are standard Python strings
        text_inputs = [str(text) for text in text_inputs]
        # Debugging: Print the contents of the batch
        print("Text inputs:", text_inputs)
        # Prepare text inputs for the processor
        text_inputs = processor(text=text_inputs, return_tensors='pt', padding=True, truncation=True).input_ids.to(device)
        # Forward pass (pass tokenized text as input, no audio)
        outputs = model(input_ids=text_inputs, labels=text_inputs) # Use labels as text inputs
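        # NOTE (added for clarity): processor(...).input_ids is a torch.long tensor,
        # while SpeechT5ForTextToSpeech expects `labels` to be float mel-spectrogram
        # frames. Passing the Long token IDs as labels triggers the dtype error below.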
        loss = outputs.loss
        total_loss += loss.item()
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    avg_loss = total_loss / len(data_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")
# Save the fine-tuned model
model.save_pretrained("fine_tuned_speecht5")
processor.save_pretrained("fine_tuned_speecht5")
print("Fine-tuning completed and model saved.")
Full output of the code:

Text inputs: ['TTS models are used for generating natural-sounding speech', 'RDS is a managed relational database service that simplifies database setup and scaling.', 'BGP is the protocol used to exchange routing information between different networks on the internet.', 'We implemented MFA to add an extra layer of protection to user accounts."', 'A higher area under the ROC curve indicates better classification performance.', 'We configured QoS to prioritize video conferencing traffic during remote meetings.', 'LSTMs are a type of RNN designed to capture long-term dependencies in sequential data.', 'RBAC restricts access to systems based on the roles assigned to users.']
Traceback (most recent call last):
  File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\Scripts\make.py", line 78, in <module>
    outputs = model(input_ids=text_inputs, labels=text_inputs) # Use labels as text inputs
  File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\models\speecht5\modeling_speecht5.py", line 2712, in forward
    outputs = self.speecht5(
  File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\models\speecht5\modeling_speecht5.py", line 2210, in forward
    decoder_outputs = self.decoder(
  File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\models\speecht5\modeling_speecht5.py", line 1731, in forward
    decoder_hidden_states = self.prenet(input_values, speaker_embeddings)
  File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\models\speecht5\modeling_speecht5.py", line 690, in forward
    inputs_embeds = nn.functional.relu(layer(inputs_embeds))
  File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\linear.py", line 125, in forward
    return F.linear(input, self.weight, self.bias)
RuntimeError: mat1 and mat2 must have the same dtype, but got Long and Float
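From the traceback, the crash happens in the decoder prenet's first Linear layer, which receives the Long tensor I passed as labels. As far as I understand the SpeechT5ForTextToSpeech docs, labels should be float log-mel spectrogram frames of shape (batch_size, sequence_length, num_mel_bins), not token IDs. Here is a minimal sketch of how I think a single batch would have to be built instead, assuming 16 kHz waveforms and the processor's audio_target argument (untested; waveforms and texts are placeholder names):

# Sketch (untested): build float spectrogram labels with the processor
waveforms = [audio_tensors[i].numpy() for i in range(8)]  # 1-D float32 arrays at 16 kHz
texts = [str(t) for t in transcripts[:8]]

batch = processor(
    text=texts,
    audio_target=waveforms,   # converted to log-mel spectrogram targets
    sampling_rate=16000,
    return_tensors='pt',
    padding=True,
)
input_ids = batch['input_ids'].to(device)  # torch.long token IDs (model input)
labels = batch['labels'].to(device)        # float log-mel frames (training target)

outputs = model(input_ids=input_ids, labels=labels)
loss = outputs.loss

If that is the right direction, I assume the Dataset/DataLoader would have to hand the raw waveforms to the processor instead of only the text, and that the pretrained checkpoint may also want 512-dimensional x-vector speaker embeddings passed through the speaker_embeddings argument, but I am not sure about those details.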
Please help me resolve this error and get the training to run.