Мне нужно обучить многоклассовую модель, но у меня небольшой набор данных [закрыто]

Мне нужно обучить многоклассовую модель, но у меня небольшой набор данных [закрыто] ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Мне нужно обучить многоклассовую модель, но у меня небольшой набор данных [закрыто]

Цитата

Сообщение Anonymous » 14 янв 2025, 21:52

У меня есть файл Excel с двумя столбцами: в одном — короткие тексты, похожие на фразы, в другом — класс от «CS1» до «CS8». Пример текста:

Код: Выделить всё

"NE PAGTO PROVENTOS APOSENTADORIA ESPECIAL SERVIDORES SAÚDE, NOV/2024. REF. FATURA 033/2024. INCLUI REFORMA DE ESCOLAS."

Очистку текста я уже вынес в отдельный файл; всего в наборе 72 текста, df.shape = (72, 2).
Точность остается ниже 50%. Но мне нужно, чтобы она была выше.
файл clean_text.py:

Код: Выделить всё

import re

def clean_text(text):
    """Normalize a raw description string before vectorization.

    Steps, in order: remove ``N/YYYY``-style references (e.g. ``033/2024``),
    remove any remaining digit runs, turn punctuation into spaces, lower-case
    the text and collapse runs of whitespace.

    Args:
        text: Raw description string.

    Returns:
        The cleaned, lower-cased string with single spaces between words.
    """
    text = re.sub(r'\d{1,4}/\d{4}', '', text)
    text = re.sub(r'\d+', '', text)
    # Substitute a space (not an empty string) for punctuation: deleting it
    # glued neighbouring words together, e.g. "SAÚDE,NOV" -> "saúdenov".
    text = re.sub(r'[^\w\s]', ' ', text)
    # split()/join drops the leading, trailing and doubled spaces that the
    # substitutions above leave behind.
    return ' '.join(text.lower().split())

файл main.py:

Код: Выделить всё

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from transformers import TFAutoModel, AutoTokenizer
import joblib
import pandas as pd
from nltk.corpus import stopwords
import re
from clean_text import clean_text

# Training script: TF-IDF features fed into a small dense softmax classifier.
# The fitted vectorizer, label encoder and model are written to disk at the end.

TARGET_ACCURACY = 0.90
# Upper bound on retraining rounds; the original unbounded `while` loop never
# terminated when the target accuracy was out of reach.
MAX_TRAINING_ROUNDS = 10

df = pd.read_excel("DADOS PARA CLASSIFICAÇÃO MULTICLASSE.xlsx", sheet_name="TREINAMENTO")
# Rows missing the text or the label cannot be cleaned or encoded.
df = df.dropna(subset=['EMPENHO', 'CLASSE SINTETICA'])
df['EMPENHO'] = df['EMPENHO'].apply(clean_text)
descriptions = df['EMPENHO'].tolist()
labels = df['CLASSE SINTETICA'].tolist()

print(f"Amostras: {df.shape}")

label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# Split the raw texts BEFORE vectorizing so the TF-IDF vocabulary and IDF
# weights are learned from the training portion only (no test-set leakage).
try:
    # Stratify so each class is represented on both sides of the split.
    docs_train, docs_test, y_train, y_test = train_test_split(
        descriptions, labels_encoded, test_size=0.2, random_state=42,
        stratify=labels_encoded,
    )
except ValueError:
    # scikit-learn rejects stratification when a class is too small for the
    # requested split; fall back to the plain random split.
    docs_train, docs_test, y_train, y_test = train_test_split(
        descriptions, labels_encoded, test_size=0.2, random_state=42,
    )

vect = TfidfVectorizer()
X_train = vect.fit_transform(docs_train).toarray()
X_test = vect.transform(docs_test).toarray()

model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(len(label_encoder.classes_), activation='softmax'),
])

model.compile(optimizer=Adam(learning_rate=1e-4), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# NOTE(review): stopping on test-set accuracy makes the reported figure
# optimistic; a separate validation split would be a cleaner stop criterion.
accuracy = 0
for _ in range(MAX_TRAINING_ROUNDS):
    print("Treinando modelo...")
    # Each call continues training the same model for another 100 epochs.
    emp_train = model.fit(X_train, y_train, epochs=100, batch_size=16, validation_split=0.2, verbose=0)

    y_pred = np.argmax(model.predict(X_test), axis=-1)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Acurácia: {accuracy * 100:.2f}%")
    if accuracy >= TARGET_ACCURACY:
        break

joblib.dump(vect, "vectorizer.pkl")
joblib.dump(label_encoder, "label_encoder.pkl")
model.save("empenho_model.keras")

print("Modelo treinado e salvo com sucesso!")

Я пробовал использовать BERT и PyTorch, но мне так удобнее.

Подробнее здесь: https://stackoverflow.com/questions/793 ... ll-dataset

1736880720

Anonymous

У меня есть файл Excel с двумя столбцами: в одном — короткие тексты, похожие на фразы, в другом — класс от «CS1» до «CS8». Пример текста:
[code]"NE PAGTO PROVENTOS APOSENTADORIA ESPECIAL SERVIDORES SAÚDE, NOV/2024. REF. FATURA 033/2024. INCLUI REFORMA DE ESCOLAS."
[/code]
Очистку текста я уже вынес в отдельный файл; всего в наборе 72 текста, df.shape = (72, 2).
Точность остается ниже 50%. Но мне нужно, чтобы она была выше.
файл clean_text.py:
[code]import re

def clean_text(text):
    """Normalize a raw description string before vectorization.

    Steps, in order: remove ``N/YYYY``-style references (e.g. ``033/2024``),
    remove any remaining digit runs, turn punctuation into spaces, lower-case
    the text and collapse runs of whitespace.

    Args:
        text: Raw description string.

    Returns:
        The cleaned, lower-cased string with single spaces between words.
    """
    text = re.sub(r'\d{1,4}/\d{4}', '', text)
    text = re.sub(r'\d+', '', text)
    # Substitute a space (not an empty string) for punctuation: deleting it
    # glued neighbouring words together, e.g. "SAÚDE,NOV" -> "saúdenov".
    text = re.sub(r'[^\w\s]', ' ', text)
    # split()/join drops the leading, trailing and doubled spaces that the
    # substitutions above leave behind.
    return ' '.join(text.lower().split())
[/code]
файл main.py:
[code]import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from transformers import TFAutoModel, AutoTokenizer
import joblib
import pandas as pd
from nltk.corpus import stopwords
import re
from clean_text import clean_text

# Training script: TF-IDF features fed into a small dense softmax classifier.
# The fitted vectorizer, label encoder and model are written to disk at the end.

TARGET_ACCURACY = 0.90
# Upper bound on retraining rounds; the original unbounded `while` loop never
# terminated when the target accuracy was out of reach.
MAX_TRAINING_ROUNDS = 10

df = pd.read_excel("DADOS PARA CLASSIFICAÇÃO MULTICLASSE.xlsx", sheet_name="TREINAMENTO")
# Rows missing the text or the label cannot be cleaned or encoded.
df = df.dropna(subset=['EMPENHO', 'CLASSE SINTETICA'])
df['EMPENHO'] = df['EMPENHO'].apply(clean_text)
descriptions = df['EMPENHO'].tolist()
labels = df['CLASSE SINTETICA'].tolist()

print(f"Amostras: {df.shape}")

label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# Split the raw texts BEFORE vectorizing so the TF-IDF vocabulary and IDF
# weights are learned from the training portion only (no test-set leakage).
try:
    # Stratify so each class is represented on both sides of the split.
    docs_train, docs_test, y_train, y_test = train_test_split(
        descriptions, labels_encoded, test_size=0.2, random_state=42,
        stratify=labels_encoded,
    )
except ValueError:
    # scikit-learn rejects stratification when a class is too small for the
    # requested split; fall back to the plain random split.
    docs_train, docs_test, y_train, y_test = train_test_split(
        descriptions, labels_encoded, test_size=0.2, random_state=42,
    )

vect = TfidfVectorizer()
X_train = vect.fit_transform(docs_train).toarray()
X_test = vect.transform(docs_test).toarray()

model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(len(label_encoder.classes_), activation='softmax'),
])

model.compile(optimizer=Adam(learning_rate=1e-4), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# NOTE(review): stopping on test-set accuracy makes the reported figure
# optimistic; a separate validation split would be a cleaner stop criterion.
accuracy = 0
for _ in range(MAX_TRAINING_ROUNDS):
    print("Treinando modelo...")
    # Each call continues training the same model for another 100 epochs.
    emp_train = model.fit(X_train, y_train, epochs=100, batch_size=16, validation_split=0.2, verbose=0)

    y_pred = np.argmax(model.predict(X_test), axis=-1)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Acurácia: {accuracy * 100:.2f}%")
    if accuracy >= TARGET_ACCURACY:
        break

joblib.dump(vect, "vectorizer.pkl")
joblib.dump(label_encoder, "label_encoder.pkl")
model.save("empenho_model.keras")

print("Modelo treinado e salvo com sucesso!")
[/code]
Я пробовал использовать BERT и PyTorch, но мне так удобнее. 

Подробнее здесь: [url]https://stackoverflow.com/questions/79355992/i-need-to-train-a-multiclass-model-but-i-have-a-small-dataset[/url]