Я изучал работу преобразователя Vision, но сначала не смог его запустить (собирал ViT с нуля). Но каким-то образом мне удалось зашифровать код, который показывает очень низкую точность (3%).
Пытался разобраться в нем, но, похоже, не смог понять.
Я использовал ту же процедуру, что и для ранее построенный Vit на другом наборе данных (fashionMnist), который дал очень хорошую точность, и попытался сопоставить его с этим набором данных.
from google.colab import files
files.upload()
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d mahdavi1202/skin-cancer
import zipfile
import os
with zipfile.ZipFile('skin-cancer.zip', 'r') as zip_ref:
zip_ref.extractall('skin-cancer')
os.listdir('skin-cancer')
import pandas as pd
import shutil
os.mkdir("/content/all_images")
source_dir="skin-cancer/imgs_part_1/imgs_part_1"
file_names = os.listdir(source_dir)
for file_name in file_names:
shutil.move(os.path.join(source_dir, file_name), "/content/all_images")
source_dir="skin-cancer/imgs_part_2/imgs_part_2"
file_names = os.listdir(source_dir)
for file_name in file_names:
shutil.move(os.path.join(source_dir, file_name), "/content/all_images")
source_dir="skin-cancer/imgs_part_3/imgs_part_3"
file_names = os.listdir(source_dir)
for file_name in file_names:
shutil.move(os.path.join(source_dir, file_name), "/content/all_images")
data = pd.read_csv("skin-cancer/metadata.csv")
data['full_link'] = '/kaggle/working/full_images/' + data['img_id']
data.info()
import matplotlib.pyplot as plt
fig, ax1 = plt.subplots(1, 1, figsize= (10, 5))
data['diagnostic'].value_counts().plot(kind='bar', ax=ax1)
diagnostic_classes = {0:'BCC', 1 : 'ACK', 2 : 'NEV', 3 : 'SEK', 4 : 'SCC', 5: 'MEL'}
# a function for encoding classes
def create_class(X):
if X == 'BCC':
return 0
elif X =='ACK':
return 1
elif X == 'NEV':
return 2
elif X == 'SEK':
return 3
elif X == 'SCC':
return 4
elif X == 'MEL':
return 5
else:
print('error class')
data['encoded_class'] = data['diagnostic'].apply(create_class)
data.drop(['diagnostic'], axis = 1, inplace = True)
data.sort_values(by ='patient_id', ascending = True, inplace = True, ignore_index = True)
data.info()
data.drop([ 'biopsed','patient_id','img_id','lesion_id','smoke', 'drink', 'background_father', 'background_mother', 'pesticide', 'gender', 'skin_cancer_history',
'cancer_history', 'has_piped_water', 'has_sewage_system', 'fitspatrick', 'diameter_1', 'diameter_2'], axis = 1, inplace = True)
data.info()
from sklearn.utils import shuffle
import tensorflow as tf
import numpy as np
IMG_SIZE = 32,32
BATCH_SIZE = 256
SEED = 55
AUTO = tf.data.AUTOTUNE
train_data = data[:2000]
test_data = data[2000:]
test_data = shuffle(test_data, random_state = SEED).reset_index(drop = True)
print('train ->', train_data.shape)
print('test ->', test_data.shape)
counts = np.bincount(train_data['encoded_class'])
weight_for_0 = 1.0 / counts[0]
weight_for_1 = 1.0 / counts[1]
weight_for_2 = 1.0 / counts[2]
weight_for_3 = 1.0 / counts[3]
weight_for_4 = 1.0 / counts[4]
weight_for_5 = 1.0 / counts[5]
class_weight = {0: weight_for_0, 1: weight_for_1, 2: weight_for_2, 3: weight_for_3, 4: weight_for_4, 5: weight_for_5}
class_weight
data.info()
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
import cv2
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
class ClassToken(Layer):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def build(self, input_shape):
self.cls = self.add_weight( #adding a trainable paramter to a custom layer
name="cls", #name of the weight
shape=(1, 1, input_shape[-1]),
initializer="zeros",
trainable=True,
)
def call(self, x):
batch_size = tf.shape(x)[0]
cls = tf.tile(self.cls, [batch_size, 1, 1])
x = tf.concat([cls, x], axis=1)
return x
def mlp(x, cf):
x = Dense(cf["mlp_dim"], activation="gelu")(x)
x = Dropout(cf["dropout_rate"])(x)
x = Dense(cf["hidden_dim"])(x)
x = Dropout(cf["dropout_rate"])(x)
return x
def transformer_encoder(x, cf):
skip_1 = x
x = LayerNormalization()(x)
x = MultiHeadAttention(
num_heads=cf["num_heads"], key_dim=cf["hidden_dim"]
)(x, x)
x = Add()([x, skip_1])
skip_2 = x
x = LayerNormalization()(x)
x = mlp(x, cf)
x = Add()([x, skip_2])
return x
def ViT(cf):
input_shape = (cf["num_patches"], cf["patch_size"]*cf["patch_size"]*cf["num_channels"])
inputs = Input(input_shape)
patch_embed = Dense(cf["hidden_dim"])(inputs)
positions = tf.range(start=0, limit=cf["num_patches"], delta=1)
pos_embed = Embedding(input_dim=cf["num_patches"], output_dim=cf["hidden_dim"])(positions)
embed = patch_embed + pos_embed
x = ClassToken()(embed)
for _ in range(cf["num_layers"]):
x = transformer_encoder(x, cf)
x = LayerNormalization()(x)
x = x[:, 0, :]
x = Dense(cf["num_classes"], activation="softmax")(x)
model = Model(inputs, x)
return model
def preprocess_image(image, patch_size, target_size=(32, 32)): # IMG_SIZE is (32, 32)
# Decode the image within the function if it's a byte string
if isinstance(image, bytes):
image = tf.image.decode_jpeg(image, channels=3) # Decode byte string to tensor
# Ensure image has 3 dimensions (height, width, channels) before resizing
image = tf.ensure_shape(image, [None, None, 3])
image = tf.image.resize(image, target_size)
image = tf.cast(image, tf.float32) / 255.0
# Remove tf.expand_dims(image, axis=0) to avoid extra dimension
patches = tf.image.extract_patches(
images=image,
sizes=[1, patch_size, patch_size, 1],
strides=[1, patch_size, patch_size, 1],
rates=[1, 1, 1, 1],
padding='VALID'
)
# Reshape to (num_patches, patch_size * patch_size * num_channels)
# This should match the input shape expected by your ViT model
patches = tf.reshape(patches, [-1, patch_size * patch_size * 3])
return patches
def create_dataset(df):
image_paths = df['full_link'].values
labels = df['encoded_class'].values
dataset = tf.data.Dataset.from_tensor_slices((image_paths, labels))
# Pass the raw file content to preprocess_image
dataset = dataset.map(lambda path, label: (preprocess_image(tf.io.read_file(tf.strings.as_string(path)), cf["patch_size"]), label))
dataset = dataset.batch(BATCH_SIZE)
return dataset
image_dir = "all_images"
data.info()
import tensorflow as tf
def load_and_preprocess_image(image_path, label):
"""Loads an image from a file path, preprocesses it, and returns a tuple of (image, label).
Args:
image_path: The path to the image file.
label: The label associated with the image.
Returns:
A tuple containing the preprocessed image and label.
"""
img = tf.io.read_file(image_path)
img = tf.image.decode_jpeg(img, channels=3)
img = tf.image.resize(img, IMG_SIZE) # Assuming IMG_SIZE is defined as (32, 32)
img = tf.cast(img, tf.float32) / 255.0 # Normalize pixel values
# Convert the image into patches here
img = preprocess_image(img, patch_size=cf["patch_size"]) # patch_size is defined in the cf dictionary
return img, label
def augment(image, label):
"""Applies data augmentation to an image.
Args:
image: The image to augment.
label: The label associated with the image.
Returns:
A tuple containing the augmented image and label.
"""
# You can add more augmentation techniques as needed
image = tf.image.random_flip_left_right(image)
image = tf.image.random_flip_up_down(image)
image = tf.image.random_brightness(image, max_delta=0.2) # Adjust brightness
image = tf.image.random_contrast(image, lower=0.8, upper=1.2) # Adjust contrast
# Other augmentations: random rotation, cropping, etc.
return image, label
IMG_SIZE = (32, 32)
BATCH_SIZE = 256
SEED = 55
AUTO = tf.data.AUTOTUNE
# Create file paths (assuming all images are in /content/all_images)
all_image_files = os.listdir("/content/all_images")
image_paths = ["/content/all_images/" + filename for filename in all_image_files]
# Create labels based on file order and original DataFrame order
labels = []
filenames_no_ext = [f.split('.')[0] for f in all_image_files]
# Ensure correct mapping even after shuffling filenames
for filename in filenames_no_ext:
matching_row = data[data['full_link'].str.contains(filename)] #using full_link which wasn't dropped
if not matching_row.empty:
labels.append(matching_row['encoded_class'].values[0])
else:
print(f"Warning: No matching entry found for file: {filename}")
#Handle missing images in the CSV. One option is to skip or assign a default label
#labels.append(-1) #Example of assigning -1 as a missing label
labels = np.array(labels)
image_paths = np.array(image_paths)
# Shuffle image paths and labels together
image_paths, labels = shuffle(image_paths, labels, random_state=SEED)
# Split data
train_image_paths = image_paths[:2000]
train_labels = labels[:2000]
test_image_paths = image_paths[2000:]
test_labels = labels[2000:]
train_dataset = tf.data.Dataset.from_tensor_slices((train_image_paths, train_labels))
train_dataset = (
train_dataset.map(load_and_preprocess_image, num_parallel_calls=AUTO)
.map(augment, num_parallel_calls=AUTO) # Augmentation
.shuffle(buffer_size=len(train_image_paths))
.batch(BATCH_SIZE)
.prefetch(AUTO)
)
test_dataset = tf.data.Dataset.from_tensor_slices((test_image_paths, test_labels))
test_dataset = (
test_dataset.map(load_and_preprocess_image, num_parallel_calls=AUTO)
.batch(BATCH_SIZE)
.prefetch(AUTO)
)
# ... (Your ViT model definition remains the same) ...
cf = {
"num_patches": (IMG_SIZE[0] // 4) * (IMG_SIZE[1] // 4), #adjust patch size if needed
"patch_size": 4,
"num_channels": 3,
"hidden_dim": 64,
"mlp_dim": 128,
"num_heads": 4,
"num_layers": 4,
"dropout_rate": 0.1,
"num_classes": 6,
}
model = ViT(cf)
# Compile the model
model.compile(
loss="sparse_categorical_crossentropy",
optimizer="adam",
metrics=["accuracy"],
)
# Train the model
history = model.fit(
train_dataset,
epochs=10, # Adjust as needed
validation_data=test_dataset,
class_weight=class_weight
)
# Evaluate the model
loss, accuracy = model.evaluate(test_dataset)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")
# Prediction Example
def predict_image(image_path):
img = tf.io.read_file(image_path)
img = tf.image.decode_jpeg(img, channels=3)
img = tf.image.resize(img, IMG_SIZE)
img = tf.image.convert_image_dtype(img, dtype=tf.float32) / 255.0
img = tf.expand_dims(img, axis=0) # Add batch dimension
prediction = model.predict(img)
predicted_class = np.argmax(prediction)
return predicted_class
# Example usage:
sample_image_path = test_image_paths[0]
predicted_class = predict_image(sample_image_path)
print(f"Predicted class for {sample_image_path}: {predicted_class}")
# Save the model
model.save('skin_cancer_vit_model.h5')
make all the changes preferably after the definition of the vit model to make the code work and return the accuracy og the classificaiton of the model
Сопоставил ранее созданный ViT с набором данных Fashion Mnist для классификации изображений, получил низкую точность. Ожидается точность не менее 50% за 10 эпох. Я ожидаю, что большинство изменений будут внесены в предварительную обработку и увеличение изображения, поскольку моя модель отлично работает с другими наборами данных.
ViT(fashionMnist)
Я изучал работу преобразователя Vision, но сначала не смог его запустить (собирал ViT с нуля). Но каким-то образом мне удалось зашифровать код, который показывает очень низкую точность (3%). Пытался разобраться в нем, но, похоже, не смог понять. Я использовал ту же процедуру, что и для ранее построенный Vit на другом наборе данных (fashionMnist), который дал очень хорошую точность, и попытался сопоставить его с этим набором данных. [code]from google.colab import files files.upload()
class_weight = {0: weight_for_0, 1: weight_for_1, 2: weight_for_2, 3: weight_for_3, 4: weight_for_4, 5: weight_for_5} class_weight data.info() import os os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" import cv2 import pandas as pd from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder from sklearn.metrics import classification_report from tensorflow.keras.layers import * from tensorflow.keras.models import Model
class ClassToken(Layer): def __init__(self, **kwargs): super().__init__(**kwargs)
def build(self, input_shape): self.cls = self.add_weight( #adding a trainable paramter to a custom layer name="cls", #name of the weight shape=(1, 1, input_shape[-1]), initializer="zeros", trainable=True, )
def call(self, x): batch_size = tf.shape(x)[0] cls = tf.tile(self.cls, [batch_size, 1, 1]) x = tf.concat([cls, x], axis=1) return x
def mlp(x, cf): x = Dense(cf["mlp_dim"], activation="gelu")(x) x = Dropout(cf["dropout_rate"])(x) x = Dense(cf["hidden_dim"])(x) x = Dropout(cf["dropout_rate"])(x) return x
def transformer_encoder(x, cf): skip_1 = x x = LayerNormalization()(x) x = MultiHeadAttention( num_heads=cf["num_heads"], key_dim=cf["hidden_dim"] )(x, x) x = Add()([x, skip_1])
skip_2 = x x = LayerNormalization()(x) x = mlp(x, cf) x = Add()([x, skip_2])
for _ in range(cf["num_layers"]): x = transformer_encoder(x, cf)
x = LayerNormalization()(x) x = x[:, 0, :] x = Dense(cf["num_classes"], activation="softmax")(x)
model = Model(inputs, x) return model
def preprocess_image(image, patch_size, target_size=(32, 32)): # IMG_SIZE is (32, 32) # Decode the image within the function if it's a byte string if isinstance(image, bytes): image = tf.image.decode_jpeg(image, channels=3) # Decode byte string to tensor
# Ensure image has 3 dimensions (height, width, channels) before resizing image = tf.ensure_shape(image, [None, None, 3])
image = tf.image.resize(image, target_size) image = tf.cast(image, tf.float32) / 255.0 # Remove tf.expand_dims(image, axis=0) to avoid extra dimension
patches = tf.image.extract_patches( images=image, sizes=[1, patch_size, patch_size, 1], strides=[1, patch_size, patch_size, 1], rates=[1, 1, 1, 1], padding='VALID' ) # Reshape to (num_patches, patch_size * patch_size * num_channels) # This should match the input shape expected by your ViT model patches = tf.reshape(patches, [-1, patch_size * patch_size * 3]) return patches
dataset = tf.data.Dataset.from_tensor_slices((image_paths, labels)) # Pass the raw file content to preprocess_image dataset = dataset.map(lambda path, label: (preprocess_image(tf.io.read_file(tf.strings.as_string(path)), cf["patch_size"]), label)) dataset = dataset.batch(BATCH_SIZE) return dataset
image_dir = "all_images"
data.info() import tensorflow as tf
def load_and_preprocess_image(image_path, label): """Loads an image from a file path, preprocesses it, and returns a tuple of (image, label).
Args: image_path: The path to the image file. label: The label associated with the image.
Returns: A tuple containing the preprocessed image and label. """ img = tf.io.read_file(image_path) img = tf.image.decode_jpeg(img, channels=3) img = tf.image.resize(img, IMG_SIZE) # Assuming IMG_SIZE is defined as (32, 32) img = tf.cast(img, tf.float32) / 255.0 # Normalize pixel values # Convert the image into patches here img = preprocess_image(img, patch_size=cf["patch_size"]) # patch_size is defined in the cf dictionary return img, label
def augment(image, label): """Applies data augmentation to an image.
Args: image: The image to augment. label: The label associated with the image.
Returns: A tuple containing the augmented image and label. """ # You can add more augmentation techniques as needed image = tf.image.random_flip_left_right(image) image = tf.image.random_flip_up_down(image) image = tf.image.random_brightness(image, max_delta=0.2) # Adjust brightness image = tf.image.random_contrast(image, lower=0.8, upper=1.2) # Adjust contrast # Other augmentations: random rotation, cropping, etc. return image, label IMG_SIZE = (32, 32) BATCH_SIZE = 256 SEED = 55 AUTO = tf.data.AUTOTUNE
# Create file paths (assuming all images are in /content/all_images) all_image_files = os.listdir("/content/all_images") image_paths = ["/content/all_images/" + filename for filename in all_image_files]
# Create labels based on file order and original DataFrame order labels = [] filenames_no_ext = [f.split('.')[0] for f in all_image_files]
# Ensure correct mapping even after shuffling filenames for filename in filenames_no_ext: matching_row = data[data['full_link'].str.contains(filename)] #using full_link which wasn't dropped if not matching_row.empty: labels.append(matching_row['encoded_class'].values[0]) else: print(f"Warning: No matching entry found for file: {filename}") #Handle missing images in the CSV. One option is to skip or assign a default label #labels.append(-1) #Example of assigning -1 as a missing label
# Example usage: sample_image_path = test_image_paths[0] predicted_class = predict_image(sample_image_path) print(f"Predicted class for {sample_image_path}: {predicted_class}")
# Save the model model.save('skin_cancer_vit_model.h5')
make all the changes preferably after the definition of the vit model to make the code work and return the accuracy og the classificaiton of the model [/code] Сопоставил ранее созданный ViT с набором данных Fashion Mnist для классификации изображений, получил низкую точность. Ожидается точность не менее 50% за 10 эпох. Я ожидаю, что большинство изменений будут внесены в предварительную обработку и увеличение изображения, поскольку моя модель отлично работает с другими наборами данных. ViT(fashionMnist) [code]import os os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" from sklearn.metrics import classification_report import tensorflow as tf from tensorflow.keras.datasets import fashion_mnist from tensorflow.keras.layers import * from tensorflow.keras.models import Model
class ClassToken(Layer): def __init__(self, **kwargs): super().__init__(**kwargs)
def build(self, input_shape): self.cls = self.add_weight( #adding a trainable paramter to a custom layer name="cls", #name of the weight shape=(1, 1, input_shape[-1]), initializer="zeros", trainable=True, )
def call(self, x): batch_size = tf.shape(x)[0] cls = tf.tile(self.cls, [batch_size, 1, 1]) x = tf.concat([cls, x], axis=1) return x
def mlp(x, cf): x = Dense(cf["mlp_dim"], activation="gelu")(x) x = Dropout(cf["dropout_rate"])(x) x = Dense(cf["hidden_dim"])(x) x = Dropout(cf["dropout_rate"])(x) return x
def transformer_encoder(x, cf): skip_1 = x x = LayerNormalization()(x) x = MultiHeadAttention( num_heads=cf["num_heads"], key_dim=cf["hidden_dim"] )(x, x) x = Add()([x, skip_1])
skip_2 = x x = LayerNormalization()(x) x = mlp(x, cf) x = Add()([x, skip_2])
Я изучал работу преобразователя Vision, но сначала не смог его запустить (собирал ViT с нуля). Но каким-то образом мне удалось зашифровать код, который показывает очень низкую точность (3%).
Пытался разобраться в нем, но, похоже, не смог понять.
Я...
function st_makeenvelope (двойная точность, двойная точность, двойная точность, двойная точность, целое число) не существует
Подсказка: Никакая функция не соответствует данным имени и типам аргументов. Вам может потребоваться добавить явные типы....
Я скопировал пример кода прямо с официального сайта Keras и отредактировал его, чтобы создать модель машинного обучения.
Я использую Google Colab для своего кода.
Ссылка: https ://keras.io/examples/vision/image_classification_from_scratch/
import...
Я обучаю многозадачную модель глубокого обучения с использованием Tensorflow/Keras для классификации автомобильных изображений по двум целям:
L2 Task: Binary classification — Car vs. No Car
L1 Task: Binary classification — among car images only,...
Матрица путаницы показывает, как реальные метки сравниваются с прогнозируемыми метками для задачи двоичной классификации.
Используя матрицу путаницы, вычислите следующее:
Точность: какая часть прогнозов оказалась верной?
Точность: какая доля...