RuntimeError: mat1 и mat2 должны иметь один и тот же тип dtype, но иметь Long и Float

RuntimeError: mat1 и mat2 должны иметь один и тот же тип dtype, но иметь Long и Float ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

RuntimeError: mat1 и mat2 должны иметь один и тот же тип dtype, но иметь Long и Float

Цитата

Сообщение Anonymous » 23 окт 2024, 12:43

Я работаю над тонкой настройкой модели SpeechT5 TTS. Первоначально я вручную собрал некоторые элементы для набора данных и начал их предварительную обработку. После предварительной обработки я сохранил их в формате «.npy». Но я столкнулся с ошибкой RuntimeError: mat1 и mat2 должны иметь один и тот же тип dtype, но иметь Long и Float для приведенного ниже кода:

Код: Выделить всё

import numpy as np
import torch
from transformers import SpeechT5ForTextToSpeech, SpeechT5Processor
from torch.utils.data import DataLoader, Dataset

# Load processed audio data and transcripts
processed_audios = np.load('processed_audios.npy', allow_pickle=True)
transcripts = np.load('transcripts.npy', allow_pickle=True)

# Parameters
target_length = 16000  # Adjust according to your needs

# Function to preprocess audio data
def preprocess_audios(audios, target_length):
processed = []
for audio in audios:
if len(audio) < target_length:
padded_audio = np.pad(audio, (0, target_length - len(audio)), mode='constant')
processed.append(padded_audio)
else:
processed.append(audio[:target_length])
return np.array(processed)

# Preprocess audio clips
audio_tensors = preprocess_audios(processed_audios, target_length)
audio_tensors = torch.tensor(audio_tensors, dtype=torch.float32)

# Define the dataset class
class AudioTextDataset(Dataset):
def __init__(self, audio_tensors, transcripts):
self.audio_tensors = audio_tensors
self.transcripts = transcripts

def __len__(self):
return len(self.transcripts)

def __getitem__(self, idx):
return {
'input_values': self.audio_tensors[idx],  # Audio tensor
'text': str(self.transcripts[idx])          # Ensure transcript is a string
}

# Create a dataset and DataLoader
dataset = AudioTextDataset(audio_tensors, transcripts)
data_loader = DataLoader(dataset, batch_size=8, shuffle=True)

# Load SpeechT5 model and processor
model_name = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(model_name)
model = SpeechT5ForTextToSpeech.from_pretrained(model_name)

# Move model to GPU if available
device = torch.device("cuda"  if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

# Fine-tuning loop
num_epochs = 5
for epoch in range(num_epochs):
model.train()
total_loss = 0
for batch in data_loader:
# Get the text input
text_inputs = batch['text']

# Ensure text_inputs are standard Python strings
text_inputs = [str(text) for text in text_inputs]

# Debugging: Print the contents of the batch
print("Text inputs:", text_inputs)

# Prepare text inputs for the processor
text_inputs = processor(text=text_inputs, return_tensors='pt', padding=True, truncation=True).input_ids.to(device)

# Forward pass (pass tokenized text as input, no audio)
outputs = model(input_ids=text_inputs, labels=text_inputs)  # Use labels as text inputs
loss = outputs.loss
total_loss += loss.item()

# Backward pass and optimization
optimizer.zero_grad()
loss.backward()
optimizer.step()

avg_loss = total_loss / len(data_loader)
print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")

# Save the fine-tuned model
model.save_pretrained("fine_tuned_speacht5")
processor.save_pretrained("fine_tuned_speacht5")

print("Fine-tuning completed and model saved.")

Полный набор вывода кода:

Код: Выделить всё

Text inputs:  ['TTS models are used for generating natural-sounding speech', 'RDS is a managed relational database service that simplifies database setup and scaling.', 'BGP is the protocol used to exchange routing information between different networks on the internet.', 'We implemented MFA to add an extra layer of protection to user accounts."', 'A higher area under the ROC curve indicates better classification performance.', 'We configured QoS to prioritize video conferencing traffic during remote meetings.', 'LSTMs are a type of RNN designed to capture long-term dependencies in sequential data.', 'RBAC restricts access to systems based on the roles assigned to users.']
Traceback (most recent call last):
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\Scripts\make.py", line 78, in 
outputs = model(input_ids=text_inputs, labels=text_inputs)  # Use labels as text inputs
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\models\speecht5\modeling_speecht5.py", line 2712, in forward
outputs = self.speecht5(
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\models\speecht5\modeling_speecht5.py", line 2210, in forward
decoder_outputs = self.decoder(
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\models\speecht5\modeling_speecht5.py", line 1731, in forward
decoder_hidden_states = self.prenet(input_values, speaker_embeddings)
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\models\speecht5\modeling_speecht5.py", line 690, in forward
inputs_embeds = nn.functional.relu(layer(inputs_embeds))
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\linear.py", line 125, in forward
return F.linear(input, self.weight, self.bias)
RuntimeError: mat1 and mat2 must have the same dtype, but got Long and Float

Помогите мне решить эту проблему и получить результат.

Подробнее здесь: https://stackoverflow.com/questions/791 ... -and-float

1729676628

Anonymous

Я работаю над тонкой настройкой модели SpeechT5 TTS. Первоначально я вручную собрал некоторые элементы для набора данных и начал их предварительную обработку. После предварительной обработки я сохранил их в формате «.npy».  Но я столкнулся с ошибкой RuntimeError: mat1 и mat2 должны иметь один и тот же тип dtype, но иметь Long и Float для приведенного ниже кода:
[code]import numpy as np
import torch
from transformers import SpeechT5ForTextToSpeech, SpeechT5Processor
from torch.utils.data import DataLoader, Dataset

# Load processed audio data and transcripts
processed_audios = np.load('processed_audios.npy', allow_pickle=True)
transcripts = np.load('transcripts.npy', allow_pickle=True)

# Parameters
target_length = 16000  # Adjust according to your needs

# Function to preprocess audio data
def preprocess_audios(audios, target_length):
processed = []
for audio in audios:
if len(audio) < target_length:
padded_audio = np.pad(audio, (0, target_length - len(audio)), mode='constant')
processed.append(padded_audio)
else:
processed.append(audio[:target_length])
return np.array(processed)

# Preprocess audio clips
audio_tensors = preprocess_audios(processed_audios, target_length)
audio_tensors = torch.tensor(audio_tensors, dtype=torch.float32)

# Define the dataset class
class AudioTextDataset(Dataset):
def __init__(self, audio_tensors, transcripts):
self.audio_tensors = audio_tensors
self.transcripts = transcripts

def __len__(self):
return len(self.transcripts)

def __getitem__(self, idx):
return {
'input_values': self.audio_tensors[idx],  # Audio tensor
'text': str(self.transcripts[idx])          # Ensure transcript is a string
}

# Create a dataset and DataLoader
dataset = AudioTextDataset(audio_tensors, transcripts)
data_loader = DataLoader(dataset, batch_size=8, shuffle=True)

# Load SpeechT5 model and processor
model_name = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(model_name)
model = SpeechT5ForTextToSpeech.from_pretrained(model_name)

# Move model to GPU if available
device = torch.device("cuda"  if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

# Fine-tuning loop
num_epochs = 5
for epoch in range(num_epochs):
model.train()
total_loss = 0
for batch in data_loader:
# Get the text input
text_inputs = batch['text']

# Ensure text_inputs are standard Python strings
text_inputs = [str(text) for text in text_inputs]

# Debugging: Print the contents of the batch
print("Text inputs:", text_inputs)

# Prepare text inputs for the processor
text_inputs = processor(text=text_inputs, return_tensors='pt', padding=True, truncation=True).input_ids.to(device)

# Forward pass (pass tokenized text as input, no audio)
outputs = model(input_ids=text_inputs, labels=text_inputs)  # Use labels as text inputs
loss = outputs.loss
total_loss += loss.item()

# Backward pass and optimization
optimizer.zero_grad()
loss.backward()
optimizer.step()

avg_loss = total_loss / len(data_loader)
print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")

# Save the fine-tuned model
model.save_pretrained("fine_tuned_speacht5")
processor.save_pretrained("fine_tuned_speacht5")

print("Fine-tuning completed and model saved.")

[/code]
Полный набор вывода кода:
[code]Text inputs:  ['TTS models are used for generating natural-sounding speech', 'RDS is a managed relational database service that simplifies database setup and scaling.', 'BGP is the protocol used to exchange routing information between different networks on the internet.', 'We implemented MFA to add an extra layer of protection to user accounts."', 'A higher area under the ROC curve indicates better classification performance.', 'We configured QoS to prioritize video conferencing traffic during remote meetings.', 'LSTMs are a type of RNN designed to capture long-term dependencies in sequential data.', 'RBAC restricts access to systems based on the roles assigned to users.']
Traceback (most recent call last):
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\Scripts\make.py", line 78, in 
outputs = model(input_ids=text_inputs, labels=text_inputs)  # Use labels as text inputs
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\models\speecht5\modeling_speecht5.py", line 2712, in forward
outputs = self.speecht5(
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\models\speecht5\modeling_speecht5.py", line 2210, in forward
decoder_outputs = self.decoder(
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\models\speecht5\modeling_speecht5.py", line 1731, in forward
decoder_hidden_states = self.prenet(input_values, speaker_embeddings)
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\models\speecht5\modeling_speecht5.py", line 690, in forward
inputs_embeds = nn.functional.relu(layer(inputs_embeds))
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "C:\Users\nihaj\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\nn\modules\linear.py", line 125, in forward
return F.linear(input, self.weight, self.bias)
RuntimeError: mat1 and mat2 must have the same dtype, but got Long and Float

[/code]
Помогите мне решить эту проблему и получить результат. 

Подробнее здесь: [url]https://stackoverflow.com/questions/79116516/runtimeerror-mat1-and-mat2-must-have-the-same-dtype-but-got-long-and-float[/url]