Вот исходный код:
Код: Выделить всё
from datasets import load_dataset
from transformers import *
import torch
import numpy as np
from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
# Read the train/validation CSV splits into one dataset dictionary.
# Point these paths elsewhere if the files are stored in another location.
CSV_SPLITS = {
    "train": "data/train_split.csv",
    "validation": "data/validation_split.csv",
}
dataset = load_dataset("csv", data_files=CSV_SPLITS)
# Target columns of the Jigsaw multi-label setup; their order here is the
# order of the entries in every label vector built further down.
LABEL_COLUMNS = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Checkpoint shared by the tokenizer and the model.
MODEL_NAME = "cardiffnlp/twitter-roberta-base-offensive"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Configure the classification head: one output per label column, and the
# multi-label problem type.
label_count = len(LABEL_COLUMNS)
config = AutoConfig.from_pretrained(
    MODEL_NAME,
    problem_type="multi_label_classification",
    num_labels=label_count,
)

# NOTE(review): ignore_mismatched_sizes=True is presumably needed because the
# checkpoint's classification head was trained with a different number of
# labels than LABEL_COLUMNS has -- confirm against the checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    ignore_mismatched_sizes=True,
    config=config,
)
# Preprocessing function
def preprocess(example):
    """Tokenize one example's comment text and attach its label vector.

    Args:
        example: Mapping that provides a "comment_text" entry and one entry
            for every name in LABEL_COLUMNS.

    Returns:
        The tokenizer output for the comment (truncation enabled, padded to
        a fixed length of 128) with an extra "labels" entry: a float tensor
        holding the example's LABEL_COLUMNS values in LABEL_COLUMNS order.
    """
    features = tokenizer(
        example["comment_text"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )

    # Collect the per-column targets in the canonical column order.
    label_values = []
    for column in LABEL_COLUMNS:
        label_values.append(example[column])

    features["labels"] = torch.tensor(label_values, dtype=torch.float)
    return features
# Run preprocess over every example of every split.
encoded_dataset = dataset.map(function=preprocess)
# Training configuration
# NOTE(review): the original listing was truncated inside this call -- the
# text between the `evaluation_strategy` argument and the tail of the
# thresholding line of the metric function was lost. Only the arguments that
# survived are passed here; any others that were in the original (learning
# rate, batch size, epochs, ...) must be restored by hand.
# NOTE(review): recent transformers releases name this argument
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    evaluation_strategy="epoch",
)


def compute_metrics(eval_pred):
    """Compute macro F1 and accuracy for multi-label predictions.

    NOTE(review): the header and the first lines of this function were lost
    in the source listing. Only the `> 0.5).astype(int)` fragment and the
    three metric lines survived; the unpacking of ``eval_pred`` and the
    sigmoid step are a reconstruction -- confirm against the original.

    Args:
        eval_pred: Pair ``(logits, labels)``; presumably both are arrays of
            shape (n_examples, n_labels) -- TODO confirm.

    Returns:
        Dict with "f1" (macro-averaged F1) and "accuracy" (as computed by
        ``accuracy_score`` on the binarized predictions).
    """
    logits, labels = eval_pred
    # Map raw scores to probabilities, then binarize at 0.5.
    probs = 1.0 / (1.0 + np.exp(-np.asarray(logits, dtype=np.float64)))
    preds = (probs > 0.5).astype(int)
    f1 = f1_score(labels, preds, average="macro")
    acc = accuracy_score(labels, preds)
    return {"f1": f1, "accuracy": acc}
# Assemble the Trainer from the model, the training arguments, the two
# encoded splits, the tokenizer and the metric callback.
train_split = encoded_dataset["train"]
validation_split = encoded_dataset["validation"]
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=validation_split,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Start training.
trainer.train()
Подробнее здесь: https://stackoverflow.com/questions/796 ... rs-library