Ошибка выполнения: r.nvmlDeviceGetNvLinkRemoteDeviceType_ INTERNAL ASSERT FAILED в Python

Программы на Python
Ответить
Anonymous
 Ошибка выполнения: r.nvmlDeviceGetNvLinkRemoteDeviceType_ INTERNAL ASSERT FAILED в Python

Сообщение Anonymous »

Я пишу код Python, который обучает классификатор классифицировать образцы (10 предложений на образец). Я использую Sentence_Transformer с дополнительными слоями и запускаю обучение модели на Linux-сервере. Код приведен ниже. Самая важная часть — это последняя часть кода, особенно при подгонке модели.
import math
import logging
from datetime import datetime
import pandas as pd
import numpy as np
import sys
import os
import csv
from sentence_transformers import models, losses
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from torch.utils.data import DataLoader
from collections import Counter
from LabelAccuracyEvaluator import *
from SoftmaxLoss import *
from layers import Dense, MultiHeadAttention
from sklearn.utils import resample
import torch
import random
import json

# CLI: optional HuggingFace model name; defaults to distilroberta-base.
model_name = sys.argv[1] if len(sys.argv) > 1 else 'distilroberta-base'

train_batch_size = 8

# Slashes in the model name would create subdirectories, so replace them.
model_save_path = 'Slashdot/output/gascom_hate_attention_' + model_name.replace("/", "-") # this is the line for saving the model you need for random walks

# Token-level transformer encoder.
word_embedding_model = models.Transformer(model_name)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
pooling_mode_mean_tokens=True,
pooling_mode_cls_token=False,
pooling_mode_max_tokens=False)

# Classification head over concatenated [u, v, u-v] (3 x 760) -> 6 classes.
# NOTE(review): 760 differs from distilroberta's 768 word-embedding dim; the
# linear projections below map get_word_embedding_dimension() -> 760, so the
# attention/dense stack runs at 760 — confirm this value is intentional and
# not a typo for 768.
dense_model = Dense.Dense(in_features=3*760, out_features=6) #called last , u, v, u-v
multihead_attn = MultiHeadAttention.MultiHeadAttention(760, 5, batch_first=True)

# idea is every attention head should be learning something new and that is why you need different q,k, and v. Now I understand!
linear_proj_q = Dense.Dense(word_embedding_model.get_word_embedding_dimension(), 760)
linear_proj_k = Dense.Dense(word_embedding_model.get_word_embedding_dimension(), 760)
linear_proj_v = Dense.Dense(word_embedding_model.get_word_embedding_dimension(), 760)
linear_proj_node = Dense.Dense(word_embedding_model.get_word_embedding_dimension(), 760) #760 to 760

# Full custom attention model, plus a plain mean-pooled model for u/v embeddings.
model = SentenceTransformer(modules=[word_embedding_model, multihead_attn, dense_model, linear_proj_q, linear_proj_k, linear_proj_v, linear_proj_node])
model_uv = SentenceTransformer(modules=[word_embedding_model, pooling_model])# w?

train_samples = []
test_samples = []

# Load and clean training dataset
trainset = pd.read_csv('Slashdot/random-walks/S_train_simil_random_walk.csv')
trainset = trainset.fillna('')

# Create a label mapping: Map each unique string label to an integer
unique_labels = trainset['label'].unique()
label_mapping = {label: idx for idx, label in enumerate(unique_labels)}

# Process train set and convert string labels to integer labels using the mapping.
# Each sample is 10 sentences ('sent1'..'sent10') plus one string label.
for i in range(len(trainset)):
    texts = []
    for j in range(1, 11):
        # BUG FIX: .iloc takes integer positions, so trainset.iloc['sent'+str(j)]
        # raises a TypeError. Select the row by position first, then the column
        # by name.
        texts.append(trainset.iloc[i]['sent' + str(j)])
    # Convert string label to integer using the mapping
    label = label_mapping[trainset.iloc[i]['label']]
    train_samples.append(InputExample(texts=texts, label=label))

# Split into train and dev sets (80/20 split)
dev_samples = train_samples[math.ceil(0.8 * len(train_samples)):]
train_samples = train_samples[:math.ceil(0.8 * len(train_samples))]

# Load and clean test dataset
testset = pd.read_csv('Slashdot/random-walks/S_test_simil_random_walk.csv')
testset = testset.fillna('')

# Convert string labels to integer labels using the same mapping for the test set.
# NOTE(review): a test label absent from the training set would raise KeyError
# here — confirm both splits share the same label vocabulary.
for i in range(len(testset)):
    texts = []
    for j in range(1, 11):
        # BUG FIX: .iloc takes integer positions, not column names — select the
        # row by position, then the column by label.
        texts.append(testset.iloc[i]['sent' + str(j)])
    # Convert string label to integer using the same mapping
    label = label_mapping[testset.iloc[i]['label']]
    test_samples.append(InputExample(texts=texts, label=label))

# Count the number of samples for each numerical category (label)
train_labels = [example.label for example in train_samples]
dev_labels =[example.label for example in dev_samples]
test_labels = [example.label for example in test_samples]

# Count occurrences of each label in the train, valid, and test sets.
# These pre-balancing Counters are reused below by oversample_to_balance().
train_label_count = Counter(train_labels)
dev_label_count = Counter(dev_labels)
test_label_count = Counter(test_labels)

# Print the counts for each label
print("Label mapping (string to integer):", label_mapping)
print("Initial Train set label distribution:", train_label_count)
print("Initial Valid set label distribution:", dev_label_count)
print("Initial Test set label distribution:", test_label_count)

print('length of train samples=', len(train_samples))
print('length of dev samples=', len(dev_samples))
print('length of test samples=', len(test_samples))

#BALANCING DATASET-------------------------------------------------BALANCING DATASET----------------------------------------------------
# Load the synonym dictionary from the JSON file.
# FIX: the with-block body had lost its indentation in the paste; restored.
# Presumably maps lowercase word -> list of synonym strings — see get_synonyms().
with open('Slashdot/synonym_dic.json', 'r') as f:
    synonym_dict = json.load(f)

def get_synonyms(word):
    """Return the synonym list for *word* (case-insensitive), or [] if unknown."""
    key = word.lower()
    return synonym_dict.get(key, [])

def replace_with_synonyms(sentence, num_replacements=2):
    """Replace words in *sentence* with random synonyms from the hardcoded dictionary.

    Leading/trailing punctuation ('.', ',', '!', '?') is detached before the
    dictionary lookup and reattached afterwards, and the capitalisation of the
    first letter is preserved. Words shorter than four characters are never
    replaced.

    NOTE(review): ``num_replacements`` is currently unused — every word with a
    dictionary entry is replaced. Kept for interface compatibility.
    """
    words = sentence.split()
    new_words = []

    for word in words:
        # Capture punctuation to reattach it after replacement
        prefix = ""
        suffix = ""

        # Check and remove leading punctuation (accumulated in original order).
        while word and word[0] in '.,!?':
            prefix += word[0]
            word = word[1:]

        # Check and remove trailing punctuation.
        # BUG FIX: the original used `suffix += word[-1]`, which reattached
        # multi-character trailing punctuation reversed ("word?!" -> "word!?").
        # Prepending preserves the original order.
        while word and word[-1] in '.,!?':
            suffix = word[-1] + suffix
            word = word[:-1]

        clean_word = word  # word without punctuation

        # Too short to have a good replacement: keep unchanged.
        if len(clean_word) < 4:
            new_words.append(prefix + clean_word + suffix)
            continue

        # Get synonyms using the dictionary
        synonyms = get_synonyms(clean_word)

        if synonyms:
            # Replace the word with a random synonym
            replacement = random.choice(synonyms)
            # Maintain the original case
            if clean_word[0].isupper():
                replacement = replacement.capitalize()
            new_words.append(prefix + replacement + suffix)
        else:
            new_words.append(prefix + clean_word + suffix)

    return ' '.join(new_words)

def augment_sample(sample, num_augments=1):
    """Return *num_augments* copies of *sample* with synonym-substituted texts.

    Each copy keeps the original label; every sentence is run through
    replace_with_synonyms() independently.
    """
    return [
        InputExample(
            texts=[replace_with_synonyms(sentence) for sentence in sample.texts],
            label=sample.label,
        )
        for _ in range(num_augments)
    ]

def oversample_to_balance(label_count, samples, dataset_name):
    """Oversample minority classes up to the majority-class size.

    For each under-represented label, random samples are drawn (with sklearn's
    resample) and synonym-augmented until the class matches the largest class.
    All original samples are always kept.
    """
    print('Balancing', dataset_name, 'data:')
    target = max(label_count.values())
    balanced = []
    for label, count in label_count.items():
        class_samples = [s for s in samples if s.label == label]
        if count < target:
            print('balancing', label, 'from', count, 'to', target, '...')
            deficit = target - count
            drawn = resample(class_samples, n_samples=deficit)
            balanced.extend(augment_sample(s)[0] for s in drawn)
            print('balanced')
        balanced.extend(class_samples)
    return balanced

# Update the samples with the balanced set.
# NOTE(review): balancing the dev and test sets changes the evaluation
# distribution — confirm accuracies are meant to be reported on balanced data.
train_samples = oversample_to_balance(train_label_count,train_samples,'Train')
dev_samples = oversample_to_balance(dev_label_count,dev_samples,'Dev')
test_samples = oversample_to_balance(test_label_count,test_samples,'Test')

# Recompute the distributions after balancing (should now be uniform).
train_label_count = Counter([sample.label for sample in train_samples])
dev_label_count = Counter([sample.label for sample in dev_samples])
test_label_count = Counter([sample.label for sample in test_samples])

print("Balanced Train set label distribution:", train_label_count)
print("Balanced Dev set label distribution:", dev_label_count)
print("Balanced Test set label distribution:", test_label_count)

print('length of train samples=', len(train_samples))
print('length of dev samples=', len(dev_samples))
print('length of test samples=', len(test_samples))
#----------------------------------------------------------------------------------------------------------------------------------------

# DataLoaders over InputExample lists; model.fit installs its own collate_fn.
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
dev_dataloader = DataLoader(dev_samples, shuffle=True, batch_size=train_batch_size)
test_dataloader = DataLoader(test_samples, shuffle=True, batch_size=train_batch_size)

# Ensure that CUDA is available and report the device in use.
# FIX: the if-block bodies had lost their indentation in the paste; restored.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('CUDA Available:', torch.cuda.is_available())
if torch.cuda.is_available():
    print('GPU in use:', torch.cuda.get_device_name(0))

# You can check memory usage like this:
if torch.cuda.is_available():
    print(f"Allocated GPU Memory: {torch.cuda.memory_allocated()} bytes")
    print(f"Cached GPU Memory: {torch.cuda.memory_reserved()} bytes")

#############################################GPU Check########################################################

print(f"Total training samples: {len(train_samples)}")

# NOTE(review): the reported crash ("nvmlDeviceGetNvLinkRemoteDeviceType ...
# INTERNAL ASSERT FAILED") comes from torch's NVML NVLink query when
# sentence-transformers falls back to DataParallel on a multi-GPU host with a
# mismatched torch/NVIDIA-driver combination. Pinning one GPU, e.g.
#   os.environ["CUDA_VISIBLE_DEVICES"] = "0"   # before importing torch
# or upgrading torch / the driver avoids it.
# FIX: the loop body below had lost its indentation in the paste; restored.
for i in range(1):  # runs exactly once; looks like a leftover fold/restart loop
    train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
    # Custom softmax classification head over the attention model.
    train_loss = SoftmaxLoss(model=model, model_uv=model_uv, multihead_attn=multihead_attn, linear_proj_q=linear_proj_q,
                             linear_proj_k=linear_proj_k, linear_proj_v=linear_proj_v, linear_proj_node=linear_proj_node,
                             sentence_embedding_dimension=pooling_model.get_sentence_embedding_dimension(),
                             num_labels=6)

    dev_evaluator = LabelAccuracyEvaluator(dev_dataloader, name='sts-dev', softmax_model=train_loss)

    num_epochs = 3
    warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up, weight initialised randomly I can check that

    print('fitting...')
    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=dev_evaluator,
              epochs=num_epochs,
              evaluation_steps=1000,  # after 1000 examples the evaluation will happen on the validation set (development).
              warmup_steps=warmup_steps,
              output_path=model_save_path
              )

# Final evaluation on the (balanced) test set, written to the same output path.
test_evaluator = LabelAccuracyEvaluator(test_dataloader, name='sts-test', softmax_model=train_loss)
test_evaluator(model, output_path=model_save_path)

Когда я запускаю код, я получаю следующую ошибку:
fitting...
Currently using DataParallel (DP) for multi-gpu training, while DistributedDataParallel (DDP) is recommended for faster training. See https://sbert.net/docs/sentence_transfo ... buted.html for more information.
0%| | 0/19638 [00:00

Подробнее здесь: https://stackoverflow.com/questions/791 ... -failed-at
Ответить

Быстрый ответ

Изменение регистра текста: 
Смайлики
:) :( :oops: :roll: :wink: :muza: :clever: :sorry: :angel: :read: *x)
Ещё смайлики…
   
К этому ответу прикреплено по крайней мере одно вложение.

Если вы не хотите добавлять вложения, оставьте поля пустыми.

Максимально разрешённый размер вложения: 15 МБ.

Вернуться в «Python»