Низкая точность при использовании видеотрансформаторов для классификации изображений.Python

Программы на Python
Ответить Пред. темаСлед. тема
Anonymous
 Низкая точность при использовании видеотрансформаторов для классификации изображений.

Сообщение Anonymous »

Я изучал работу преобразователя Vision, но сначала не смог его запустить (собирал ViT с нуля). Но каким-то образом мне удалось зашифровать код, который показывает очень низкую точность (3%).
Пытался разобраться в нем, но, похоже, не смог понять.
Я использовал ту же процедуру, что и для ранее построенный Vit на другом наборе данных (fashionMnist), который дал очень хорошую точность, и попытался сопоставить его с этим набором данных.

Код: Выделить всё

from google.colab import files
files.upload()

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d mahdavi1202/skin-cancer

import zipfile
import os

with zipfile.ZipFile('skin-cancer.zip', 'r') as zip_ref:
zip_ref.extractall('skin-cancer')

os.listdir('skin-cancer')
import pandas as pd
import shutil
os.mkdir("/content/all_images")
source_dir="skin-cancer/imgs_part_1/imgs_part_1"
file_names = os.listdir(source_dir)
for file_name in file_names:
shutil.move(os.path.join(source_dir, file_name), "/content/all_images")

source_dir="skin-cancer/imgs_part_2/imgs_part_2"
file_names = os.listdir(source_dir)
for file_name in file_names:
shutil.move(os.path.join(source_dir, file_name), "/content/all_images")

source_dir="skin-cancer/imgs_part_3/imgs_part_3"
file_names = os.listdir(source_dir)
for file_name in file_names:
shutil.move(os.path.join(source_dir, file_name), "/content/all_images")

data = pd.read_csv("skin-cancer/metadata.csv")
data['full_link'] = '/kaggle/working/full_images/' + data['img_id']
data.info()
import matplotlib.pyplot as plt
fig, ax1 = plt.subplots(1, 1, figsize= (10, 5))
data['diagnostic'].value_counts().plot(kind='bar', ax=ax1)

diagnostic_classes = {0:'BCC', 1 : 'ACK', 2 : 'NEV', 3 : 'SEK', 4 : 'SCC', 5: 'MEL'}

# a function for encoding classes
def create_class(X):
if X == 'BCC':
return 0
elif X =='ACK':
return 1
elif X == 'NEV':
return 2
elif X == 'SEK':
return 3
elif X == 'SCC':
return 4
elif X == 'MEL':
return 5
else:
print('error class')
data['encoded_class'] = data['diagnostic'].apply(create_class)
data.drop(['diagnostic'], axis = 1, inplace = True)
data.sort_values(by ='patient_id', ascending = True, inplace = True, ignore_index = True)
data.info()
data.drop([ 'biopsed','patient_id','img_id','lesion_id','smoke', 'drink', 'background_father', 'background_mother', 'pesticide', 'gender', 'skin_cancer_history',
'cancer_history', 'has_piped_water', 'has_sewage_system', 'fitspatrick', 'diameter_1', 'diameter_2'], axis = 1, inplace = True)
data.info()

from sklearn.utils import shuffle
import tensorflow as tf
import numpy as np
IMG_SIZE = 32,32
BATCH_SIZE = 256
SEED = 55
AUTO = tf.data.AUTOTUNE
train_data = data[:2000]
test_data = data[2000:]
test_data = shuffle(test_data, random_state = SEED).reset_index(drop = True)

print('train  ->', train_data.shape)
print('test  ->', test_data.shape)
counts = np.bincount(train_data['encoded_class'])

weight_for_0 = 1.0 / counts[0]
weight_for_1 = 1.0 / counts[1]
weight_for_2 = 1.0 / counts[2]
weight_for_3 = 1.0 / counts[3]
weight_for_4 = 1.0 / counts[4]
weight_for_5 = 1.0 / counts[5]

class_weight = {0: weight_for_0, 1: weight_for_1, 2: weight_for_2, 3: weight_for_3, 4: weight_for_4, 5:  weight_for_5}
class_weight
data.info()
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
import cv2
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model

class ClassToken(Layer):
def __init__(self, **kwargs):
super().__init__(**kwargs)

def build(self, input_shape):
self.cls = self.add_weight(             #adding a trainable paramter to a custom layer
name="cls",                         #name of the weight
shape=(1, 1, input_shape[-1]),
initializer="zeros",
trainable=True,
)

def call(self, x):
batch_size = tf.shape(x)[0]
cls = tf.tile(self.cls, [batch_size, 1, 1])
x = tf.concat([cls, x], axis=1)
return x

def mlp(x, cf):
x = Dense(cf["mlp_dim"], activation="gelu")(x)
x = Dropout(cf["dropout_rate"])(x)
x = Dense(cf["hidden_dim"])(x)
x = Dropout(cf["dropout_rate"])(x)
return x

def transformer_encoder(x, cf):
skip_1 = x
x = LayerNormalization()(x)
x = MultiHeadAttention(
num_heads=cf["num_heads"], key_dim=cf["hidden_dim"]
)(x, x)
x = Add()([x, skip_1])

skip_2 = x
x = LayerNormalization()(x)
x = mlp(x, cf)
x = Add()([x, skip_2])

return x

def ViT(cf):

input_shape = (cf["num_patches"], cf["patch_size"]*cf["patch_size"]*cf["num_channels"])
inputs = Input(input_shape)

patch_embed = Dense(cf["hidden_dim"])(inputs)

positions = tf.range(start=0, limit=cf["num_patches"], delta=1)
pos_embed = Embedding(input_dim=cf["num_patches"], output_dim=cf["hidden_dim"])(positions)
embed = patch_embed + pos_embed

x = ClassToken()(embed)

for _ in range(cf["num_layers"]):
x = transformer_encoder(x, cf)

x = LayerNormalization()(x)
x = x[:, 0, :]
x = Dense(cf["num_classes"], activation="softmax")(x)

model = Model(inputs, x)
return model

def preprocess_image(image, patch_size, target_size=(32, 32)): # IMG_SIZE is (32, 32)
# Decode the image within the function if it's a byte string
if isinstance(image, bytes):
image = tf.image.decode_jpeg(image, channels=3)  # Decode byte string to tensor

# Ensure image has 3 dimensions (height, width, channels) before resizing
image = tf.ensure_shape(image, [None, None, 3])

image = tf.image.resize(image, target_size)
image = tf.cast(image, tf.float32) / 255.0
# Remove tf.expand_dims(image, axis=0) to avoid extra dimension

patches = tf.image.extract_patches(
images=image,
sizes=[1, patch_size, patch_size, 1],
strides=[1, patch_size, patch_size, 1],
rates=[1, 1, 1, 1],
padding='VALID'
)
# Reshape to (num_patches, patch_size * patch_size * num_channels)
# This should match the input shape expected by your ViT model
patches = tf.reshape(patches, [-1, patch_size * patch_size * 3])
return patches

def create_dataset(df):
image_paths = df['full_link'].values
labels = df['encoded_class'].values

dataset = tf.data.Dataset.from_tensor_slices((image_paths, labels))
# Pass the raw file content to preprocess_image
dataset = dataset.map(lambda path, label: (preprocess_image(tf.io.read_file(tf.strings.as_string(path)), cf["patch_size"]), label))
dataset = dataset.batch(BATCH_SIZE)
return dataset

image_dir = "all_images"

data.info()
import tensorflow as tf

def load_and_preprocess_image(image_path, label):
"""Loads an image from a file path, preprocesses it, and returns a tuple of (image, label).

Args:
image_path: The path to the image file.
label:  The label associated with the image.

Returns:
A tuple containing the preprocessed image and label.
"""
img = tf.io.read_file(image_path)
img = tf.image.decode_jpeg(img, channels=3)
img = tf.image.resize(img, IMG_SIZE)  # Assuming IMG_SIZE is defined as (32, 32)
img = tf.cast(img, tf.float32) / 255.0  # Normalize pixel values
# Convert the image into patches here
img = preprocess_image(img, patch_size=cf["patch_size"]) # patch_size is defined in the cf dictionary
return img, label

def augment(image, label):
"""Applies data augmentation to an image.

Args:
image: The image to augment.
label: The label associated with the image.

Returns:
A tuple containing the augmented image and label.
"""
# You can add more augmentation techniques as needed
image = tf.image.random_flip_left_right(image)
image = tf.image.random_flip_up_down(image)
image = tf.image.random_brightness(image, max_delta=0.2)  # Adjust brightness
image = tf.image.random_contrast(image, lower=0.8, upper=1.2)  # Adjust contrast
# Other augmentations: random rotation, cropping, etc.
return image, label
IMG_SIZE = (32, 32)
BATCH_SIZE = 256
SEED = 55
AUTO = tf.data.AUTOTUNE

# Create file paths (assuming all images are in /content/all_images)
all_image_files = os.listdir("/content/all_images")
image_paths = ["/content/all_images/" + filename for filename in all_image_files]

# Create labels based on file order and original DataFrame order
labels = []
filenames_no_ext = [f.split('.')[0] for f in all_image_files]

# Ensure correct mapping even after shuffling filenames
for filename in filenames_no_ext:
matching_row = data[data['full_link'].str.contains(filename)] #using full_link which wasn't dropped
if not matching_row.empty:
labels.append(matching_row['encoded_class'].values[0])
else:
print(f"Warning: No matching entry found for file: {filename}")
#Handle missing images in the CSV. One option is to skip or assign a default label
#labels.append(-1) #Example of assigning -1 as a missing label

labels = np.array(labels)
image_paths = np.array(image_paths)

# Shuffle image paths and labels together
image_paths, labels = shuffle(image_paths, labels, random_state=SEED)

# Split data
train_image_paths = image_paths[:2000]
train_labels = labels[:2000]
test_image_paths = image_paths[2000:]
test_labels = labels[2000:]

train_dataset = tf.data.Dataset.from_tensor_slices((train_image_paths, train_labels))

train_dataset = (
train_dataset.map(load_and_preprocess_image, num_parallel_calls=AUTO)
.map(augment, num_parallel_calls=AUTO) # Augmentation
.shuffle(buffer_size=len(train_image_paths))
.batch(BATCH_SIZE)
.prefetch(AUTO)
)
test_dataset = tf.data.Dataset.from_tensor_slices((test_image_paths, test_labels))
test_dataset = (
test_dataset.map(load_and_preprocess_image, num_parallel_calls=AUTO)
.batch(BATCH_SIZE)
.prefetch(AUTO)
)

# ...  (Your ViT model definition remains the same) ...

cf = {
"num_patches": (IMG_SIZE[0] // 4) * (IMG_SIZE[1] // 4), #adjust patch size if needed
"patch_size": 4,
"num_channels": 3,
"hidden_dim": 64,
"mlp_dim": 128,
"num_heads": 4,
"num_layers": 4,
"dropout_rate": 0.1,
"num_classes": 6,
}

model = ViT(cf)

# Compile the model
model.compile(
loss="sparse_categorical_crossentropy",
optimizer="adam",
metrics=["accuracy"],
)

# Train the model
history = model.fit(
train_dataset,
epochs=10,  # Adjust as needed
validation_data=test_dataset,
class_weight=class_weight
)

# Evaluate the model
loss, accuracy = model.evaluate(test_dataset)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

# Prediction Example
def predict_image(image_path):
img = tf.io.read_file(image_path)
img = tf.image.decode_jpeg(img, channels=3)
img = tf.image.resize(img, IMG_SIZE)
img = tf.image.convert_image_dtype(img, dtype=tf.float32) / 255.0
img = tf.expand_dims(img, axis=0)  # Add batch dimension
prediction = model.predict(img)
predicted_class = np.argmax(prediction)
return predicted_class

# Example usage:
sample_image_path = test_image_paths[0]
predicted_class = predict_image(sample_image_path)
print(f"Predicted class for {sample_image_path}: {predicted_class}")

# Save the model
model.save('skin_cancer_vit_model.h5')

make all the changes preferably after the definition of the vit model to make the code work and return the accuracy og the classificaiton of the model
Сопоставил ранее созданный ViT с набором данных Fashion Mnist для классификации изображений, получил низкую точность. Ожидается точность не менее 50% за 10 эпох. Я ожидаю, что большинство изменений будут внесены в предварительную обработку и увеличение изображения, поскольку моя модель отлично работает с другими наборами данных.
ViT(fashionMnist)

Код: Выделить всё

import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras.datasets import fashion_mnist
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model

class ClassToken(Layer):
def __init__(self, **kwargs):
super().__init__(**kwargs)

def build(self, input_shape):
self.cls = self.add_weight(             #adding a trainable paramter to a custom layer
name="cls",                         #name of the weight
shape=(1, 1, input_shape[-1]),
initializer="zeros",
trainable=True,
)

def call(self, x):
batch_size = tf.shape(x)[0]
cls = tf.tile(self.cls, [batch_size, 1, 1])
x = tf.concat([cls, x], axis=1)
return x

def mlp(x, cf):
x = Dense(cf["mlp_dim"], activation="gelu")(x)
x = Dropout(cf["dropout_rate"])(x)
x = Dense(cf["hidden_dim"])(x)
x = Dropout(cf["dropout_rate"])(x)
return x

def transformer_encoder(x, cf):
skip_1 = x
x = LayerNormalization()(x)
x = MultiHeadAttention(
num_heads=cf["num_heads"], key_dim=cf["hidden_dim"]
)(x, x)
x = Add()([x, skip_1])

skip_2 = x
x = LayerNormalization()(x)
x = mlp(x, cf)
x = Add()([x, skip_2])

return x

def ViT(cf):

input_shape = (cf["num_patches"], cf["patch_size"]*cf["patch_size"]*cf["num_channels"])
inputs = Input(input_shape)

patch_embed = Dense(cf["hidden_dim"])(inputs)

positions = tf.range(start=0, limit=cf["num_patches"], delta=1)
pos_embed = Embedding(input_dim=cf["num_patches"], output_dim=cf["hidden_dim"])(positions)
embed = patch_embed + pos_embed

x = ClassToken()(embed)

for _ in range(cf["num_layers"]):
x = transformer_encoder(x, cf)

x = LayerNormalization()(x)
x = x[:, 0, :]
x = Dense(cf["num_classes"], activation="softmax")(x)

model = Model(inputs, x)
return model

def preprocess_image(image, patch_size):
image = tf.reshape(image, (28, 28, 1))
image = tf.image.resize(image, (28, 28))
image = tf.expand_dims(image, axis=0)
image = tf.cast(image, tf.float32) / 255.0
patches = tf.image.extract_patches(
images=image,
sizes=[1, patch_size, patch_size, 1],
strides=[1, patch_size, patch_size, 1],
rates=[1, 1, 1, 1],
padding='VALID'
)
patches = tf.reshape(patches, (patches.shape[0], -1, patches.shape[-1]))
return patches

if __name__ == "__main__":
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

subset_size = 5000
x_train, y_train = x_train[:subset_size], y_train[:subset_size]
x_test, y_test = x_test[:1000], y_test[:1000]

x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0

patch_size = 14
num_patches = (28 // patch_size) ** 2

x_train = tf.image.extract_patches(
images=tf.expand_dims(x_train, axis=-1),
sizes=[1, patch_size, patch_size, 1],
strides=[1, patch_size, patch_size, 1],
rates=[1, 1, 1, 1],
padding='VALID'
)
x_train = tf.reshape(x_train, (x_train.shape[0], num_patches, -1))

x_test = tf.image.extract_patches(
images=tf.expand_dims(x_test, axis=-1),
sizes=[1, patch_size, patch_size, 1],
strides=[1, patch_size, patch_size, 1],
rates=[1, 1, 1, 1],
padding='VALID'
)
x_test = tf.reshape(x_test, (x_test.shape[0], num_patches, -1))

config = {
"num_layers": 4,
"hidden_dim": 128,
"mlp_dim": 256,
"num_heads": 4,
"dropout_rate": 0.1,
"num_patches": num_patches,
"patch_size": patch_size,
"num_channels": 1,
"num_classes":  10
}

model = ViT(config)
model.compile(optimizer='adam',
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])

model.fit(x_train, y_train, epochs=5, batch_size=32, validation_data=(x_test, y_test))

model.save("vit_fashion_mnist_cls.h5")

print("Model trained. Ready to classify a new image.")

y_pred = model.predict(x_test)
y_pred_classes = tf.argmax(y_pred, axis=1).numpy()

print("Evaluation Metrics on Test Set:")
print(classification_report(y_test, y_pred_classes))

new_image = x_test[19]

prediction = model.predict(tf.expand_dims(new_image,axis=0))
predicted_class = tf.argmax(prediction, axis=1).numpy()[0]
print(f"Predicted Class: {predicted_class}")
пытаемся сопоставить эту реализацию с набором данных PAD-UFES-20.

Подробнее здесь: https://stackoverflow.com/questions/793 ... sification
Реклама
Ответить Пред. темаСлед. тема

Быстрый ответ

Изменение регистра текста: 
Смайлики
:) :( :oops: :roll: :wink: :muza: :clever: :sorry: :angel: :read: *x)
Ещё смайлики…
   
К этому ответу прикреплено по крайней мере одно вложение.

Если вы не хотите добавлять вложения, оставьте поля пустыми.

Максимально разрешённый размер вложения: 15 МБ.

  • Похожие темы
    Ответы
    Просмотры
    Последнее сообщение

Вернуться в «Python»