import time
import logging
import os
import datetime
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"
os.environ["KERAS_BACKEND"] = "tensorflow" # choose any: 'tensorflow', 'torch', 'jax'
#os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
#os.environ['TF_CPP_MAX_VLOG_LEVEL'] = '0'
import tensorflow as tf
# Turn on memory growth for every GPU TensorFlow can see. Looping over an
# empty list does nothing, so no separate "any GPUs?" check is required.
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import keras
from keras import ops
from keras import layers
from keras import mixed_precision
from medicai.models import UNETRPlusPlus
from medicai.metrics import BinaryDiceMetric
from medicai.losses import BinaryDiceCELoss
from medicai.utils.inference import SlidingWindowInference
from medicai.callbacks import SlidingWindowInferenceCallback
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from src.experiment_config import ExperimentConfig
from src.data_pipeline.data_loader import data_loader
import pandas as pd
import numpy as np
os.environ["KERAS_BACKEND"] = "tensorflow"
class TFCheckpointCallback(keras.callbacks.Callback):
    """Keras callback that writes a TF checkpoint after every epoch.

    Args:
        ckpt: Checkpoint object exposing an ``epoch`` variable that supports
            ``assign_add`` and ``numpy``.
        ckpt_manager: Manager object whose ``save()`` writes the checkpoint
            and returns the path it was written to.
    """

    def __init__(self, ckpt, ckpt_manager):
        super().__init__()
        self.ckpt_manager = ckpt_manager
        self.ckpt = ckpt

    def on_epoch_end(self, epoch, logs=None):
        """Advance the stored epoch counter, save, and print the save path."""
        epoch_counter = self.ckpt.epoch
        # The counter lives inside the checkpoint, so bump it before saving.
        epoch_counter.assign_add(1)
        save_path = self.ckpt_manager.save()
        finished_epochs = int(epoch_counter.numpy())
        print(f"Saved checkpoint: {save_path} (epoch {finished_epochs})")
def get_model():
    """Create the UNETR++ model and compile it for training.

    The model is compiled with AdamW, a combined Dice + cross-entropy loss on
    logits, and four Dice metrics: one without ``target_class_ids`` (``dice``)
    and one per class id 0, 1, 2 (``dice_tc``, ``dice_wt``, ``dice_et``).
    All hyper-parameters are read from ``ExperimentConfig``.

    Returns:
        The compiled model.
    """
    num_classes = ExperimentConfig.num_classes

    def make_dice(name, **extra_kwargs):
        # Every Dice metric shares these settings; only the name and the
        # optional class selection differ.
        return BinaryDiceMetric(
            from_logits=True,
            ignore_empty=True,
            num_classes=num_classes,
            name=name,
            **extra_kwargs,
        )

    model = UNETRPlusPlus(
        encoder_name="unetr_plusplus_encoder",
        input_shape=ExperimentConfig.input_shape,
        num_classes=num_classes,
        classifier_activation=None,
    )

    optimizer = keras.optimizers.AdamW(
        learning_rate=ExperimentConfig.lr,
        weight_decay=ExperimentConfig.weight_decay,
    )
    loss = BinaryDiceCELoss(
        from_logits=True,
        dice_weight=1.0,
        ce_weight=1.0,
        reduction="mean",
        num_classes=num_classes,
    )

    metrics = [make_dice('dice')]
    for class_id, metric_name in enumerate(('dice_tc', 'dice_wt', 'dice_et')):
        metrics.append(make_dice(metric_name, target_class_ids=[class_id]))

    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
    return model
def get_inference_metric():
    """Return the Dice metric (named ``val_dice``) used by the sliding-window callback."""
    return BinaryDiceMetric(
        name='val_dice',
        num_classes=ExperimentConfig.num_classes,
        from_logits=True,
        ignore_empty=True,
    )
def run_sliding_window_inference_per_class_average(model, ds, roi_size, sw_batch_size, overlap, metrics_list):
    """
    Run sliding window inference on a dataset and compute all metrics (average + per class).

    Args:
        model: Model handed to ``SlidingWindowInference``.
        ds: Iterable yielding ``(x, y)`` pairs.
        roi_size: Window size forwarded to ``SlidingWindowInference``.
        sw_batch_size: Sliding-window batch size forwarded to ``SlidingWindowInference``.
        overlap: Window overlap forwarded to ``SlidingWindowInference``.
        metrics_list: Non-empty sequence of metric objects. The first one must
            expose ``num_classes``; each must provide ``update_state``,
            ``result`` and ``name``.

    Returns:
        dict mapping each metric's ``name`` to its final result as a float.
    """
    for metric in metrics_list:
        # Keras 3 metrics provide `reset_state()`; the legacy `reset_states()`
        # spelling is not part of the Keras 3 Metric API. Keep honouring a
        # metric-specific `reset_states()` when one exists, otherwise use the
        # Keras 3 name instead of failing with AttributeError.
        reset = getattr(metric, "reset_states", None) or metric.reset_state
        reset()

    swi = SlidingWindowInference(
        model,
        num_classes=metrics_list[0].num_classes,
        roi_size=roi_size,
        sw_batch_size=sw_batch_size,
        overlap=overlap,
    )

    for x, y in ds:
        # Convert once per batch instead of once per metric.
        y_true = ops.convert_to_tensor(y)
        y_pred = ops.convert_to_tensor(swi(x))
        for metric in metrics_list:
            metric.update_state(y_true, y_pred)

    # Gather results
    return {
        metric.name: float(ops.convert_to_numpy(metric.result()))
        for metric in metrics_list
    }
def _save_curve(values, label, ylabel, title, out_file):
    """Plot one per-epoch series and save the figure to ``out_file``."""
    plt.figure(figsize=(10, 5))
    plt.plot(values, label=label)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid()
    plt.savefig(out_file)
    plt.close()


def main():
    """Train the model under ``tf.distribute.MirroredStrategy`` and save artifacts.

    Steps: create the output folders, build the training/validation/test
    datasets, build the model and TF checkpoint objects inside the strategy
    scope, train with the sliding-window-inference and checkpoint callbacks,
    then write the training time, the history CSV and the loss/Dice plots.
    """
    print(
        f"keras backend: {keras.config.backend()}\n"
        f"keras version: {keras.version()}\n"
        f"tensorflow version: {tf.__version__}\n"
    )

    # get keras backend
    keras_backend = keras.config.backend()
    strategy = tf.distribute.MirroredStrategy()
    total_device = strategy.num_replicas_in_sync
    print('Keras backend ', keras_backend)
    print('Total device found ', total_device)

    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    base_save_path = os.path.join(project_root, "experiments", "msd_brain")
    # NOTE(review): the folder is called "SwinUnetr" although the model built
    # by get_model() is UNETRPlusPlus — confirm whether this is intentional.
    unetrplusplus_path = os.path.join(base_save_path, "SwinUnetr")
    os.makedirs(unetrplusplus_path, exist_ok=True)

    # Subfolders
    logs_path = os.path.join(unetrplusplus_path, "logs")
    history_path = os.path.join(unetrplusplus_path, "history")
    plots_path = os.path.join(unetrplusplus_path, "plots")
    checkpoint_dir = os.path.join(unetrplusplus_path, "checkpoints")
    for folder in (logs_path, history_path, plots_path, checkpoint_dir):
        os.makedirs(folder, exist_ok=True)

    # Timestamp
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

    # Save path for best model weights
    save_path = os.path.join(unetrplusplus_path, f"best_model_weights_{timestamp}.weights.h5")

    # Load datasets
    tfrecord_pattern = os.path.join(project_root, "data", "msd_brain", "tfrecords", "{}_shard_*.tfrec")

    # batch size for training (per-replica batch size times number of replicas)
    train_batch = ExperimentConfig.batch_size_train * total_device
    train_ds = data_loader(
        tfrecord_pattern.format("training"),
        batch_size=train_batch,
        shuffle=True
    )
    # With several replicas, tf.distribute gives zero-sized dummy batches to
    # replicas that get no data from a trailing partial batch. That is the
    # likely trigger of the "Floating point exception (core dumped)" seen at
    # the end of the first epoch on multi-GPU runs. Re-batching with
    # drop_remainder=True guarantees every replica always gets a full,
    # equally sized batch. Single-device runs are left untouched.
    if total_device > 1 and isinstance(train_ds, tf.data.Dataset):
        train_ds = (
            train_ds.unbatch()
            .batch(train_batch, drop_remainder=True)
            .prefetch(tf.data.AUTOTUNE)
        )

    val_ds = data_loader(
        tfrecord_pattern.format("validation"),
        batch_size=ExperimentConfig.batch_size_val,
        shuffle=False
    )
    # Not consumed in the visible part of this function; kept so the pipeline
    # construction (and any error it raises) stays as before.
    test_ds = data_loader(
        tfrecord_pattern.format("test"),
        batch_size=ExperimentConfig.batch_size_val,
        shuffle=False
    )

    # The model and the checkpoint's epoch variable are created inside the
    # strategy scope.
    with strategy.scope():
        model = get_model()
        ckpt = tf.train.Checkpoint(
            epoch=tf.Variable(0),       # epoch counter — saved as part of checkpoint
            optimizer=model.optimizer,  # optimizer state
            model=model                 # model weights
        )
    ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_dir, max_to_keep=3)

    # Validation with sliding window callback
    swi_callback_metric = get_inference_metric()

    # Create checkpoint callback
    tf_ckpt_callback = TFCheckpointCallback(ckpt, ckpt_manager)

    # Create SWI callback
    swi_callback = SlidingWindowInferenceCallback(
        model,
        dataset=val_ds,
        metrics=swi_callback_metric,
        num_classes=ExperimentConfig.num_classes,
        interval=ExperimentConfig.sliding_window_interval,
        overlap=ExperimentConfig.sliding_window_overlap,
        roi_size=(
            ExperimentConfig.input_shape[0],
            ExperimentConfig.input_shape[1],
            ExperimentConfig.input_shape[2],
        ),
        sw_batch_size=ExperimentConfig.sw_batch_size * total_device,
        save_path=save_path
    )

    print(f"Model size: {model.count_params() / 1e6:.2f} M")

    start_time = time.time()
    history = model.fit(
        train_ds,
        epochs=ExperimentConfig.epochs,
        callbacks=[
            swi_callback,
            tf_ckpt_callback
        ])
    end_time = time.time()
    training_time = end_time - start_time
    print(f"Total training time (seconds): {training_time:.2f}")

    # Save training time to a file
    time_file = os.path.join(logs_path, f"training_time_{timestamp}.txt")
    with open(time_file, "w", encoding="utf-8") as f:
        f.write(f"Total training time (seconds): {training_time:.2f}\n")

    # Save history to CSV
    history_file = os.path.join(history_path, f"training_history_{timestamp}.csv")
    pd.DataFrame(history.history).to_csv(history_file, index=False)

    # Plot training loss
    _save_curve(
        history.history['loss'],
        label='train_loss',
        ylabel="Loss",
        title="Training Loss",
        out_file=os.path.join(plots_path, f"loss_curve_{timestamp}.png"),
    )

    # Plot average Dice
    if 'dice' in history.history:
        _save_curve(
            history.history['dice'],
            label='train_dice',
            ylabel="Average Dice",
            title="Training Average Dice",
            out_file=os.path.join(plots_path, f"dice_curve_{timestamp}.png"),
        )

    print("Training and saving plots finished successfully.")
Когда я обучаю модель на одном графическом процессоре, обучение всегда завершается успешно. Однако при обучении на нескольких графических процессорах (пробовал 2, 3 и 4) процесс всегда аварийно завершается с ошибкой «Floating point exception (core dumped)» (исключение с плавающей запятой, сброс ядра):
2025-12-17 12:28:31.666487: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
keras backend: tensorflow
keras version: 3.12.0
tensorflow version: 2.20.0
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
I0000 00:00:1765970958.724694 160312 gpu_process_state.cc:208] Using CUDA malloc Async allocator for GPU: 0
I0000 00:00:1765970958.725422 160312 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38670 MB memory: -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:87:00.0, compute capability: 8.0
I0000 00:00:1765970958.725885 160312 gpu_process_state.cc:208] Using CUDA malloc Async allocator for GPU: 1
I0000 00:00:1765970958.726444 160312 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 38670 MB memory: -> device: 1, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:c7:00.0, compute capability: 8.0
Keras backend tensorflow
Total device found 2
WARNING:tensorflow:From /net/pr2/projects/plgrid/plggneural/3d_seg_project/3D-Medical-Image-Segmentation/venv/lib/python3.10/site-packages/tensorflow/python/util/deprecation.py:660: calling map_fn_v2 (from tensorflow.python.ops.map_fn) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Use fn_output_signature instead
Model size: 42.66 M
2025-12-17 12:29:26.512602: I tensorflow/core/kernels/data/tf_record_dataset_op.cc:390] TFRecordDataset `buffer_size` is unspecified, default to 262144
2025-12-17 12:29:43.247925: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: CANCELLED: GetNextFromShard was cancelled
[[{{node MultiDeviceIteratorGetNextFromShard}}]]
2025-12-17 12:29:43.248224: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: CANCELLED: GetNextFromShard was cancelled
[[{{node MultiDeviceIteratorGetNextFromShard}}]]
[[RemoteCall]]
2025-12-17 12:29:43.249328: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: CANCELLED: GetNextFromShard was cancelled
[[{{node MultiDeviceIteratorGetNextFromShard}}]]
[[RemoteCall]]
Epoch 1/2
2025-12-17 12:31:05.058981: I external/local_xla/xla/service/service.cc:163] XLA service 0x146328c01540 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-12-17 12:31:05.059020: I external/local_xla/xla/service/service.cc:171] StreamExecutor device (0): NVIDIA A100-SXM4-40GB, Compute Capability 8.0
2025-12-17 12:31:05.059283: I external/local_xla/xla/service/service.cc:171] StreamExecutor device (1): NVIDIA A100-SXM4-40GB, Compute Capability 8.0
2025-12-17 12:31:05.137238: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91700
2025-12-17 12:31:07.623407: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:07.805392: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
I0000 00:00:1765971068.804270 160357 device_compiler.h:196] Compiled cluster using XLA! This line is logged at most once for the lifetime of the process.
2025-12-17 12:31:09.734860: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:09.912544: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:32.318966: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=0,k13=2,k14=3,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:31:32.344476: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 1.025559828s
Trying algorithm eng23{k2=0,k13=2,k14=3,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:31:34.156184: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:31:35.306164: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 2.15004513s
Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:31:36.744870: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:36.927667: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:37.081083: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:37.253482: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:37.430320: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:42.279943: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:31:43.257783: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 1.977915871s
Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:31:46.830807: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91700
48/Unknown 450s 6s/step - dice: 0.0777 - dice_et: 0.0201 - dice_tc: 0.0344 - dice_wt: 0.1787 - loss: 3.03802025-12-17 12:37:14.868497: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
[[{{node MultiDeviceIteratorGetNextFromShard}}]]
[[RemoteCall]]
2025-12-17 12:37:15.919125: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:37:16.093522: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:37:17.873295: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:37:18.032921: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:37:27.808009: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[3,4,128,128,128]{4,3,2,1,0}, f32[3,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:37:28.394435: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 1.586189512s
Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[3,4,128,128,128]{4,3,2,1,0}, f32[3,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:37:28.948297: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:37:29.110080: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:37:32.859598: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[3,4,128,128,128]{4,3,2,1,0}, f32[3,16,128,128,128]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:37:33.324957: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 1.465430048s
Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[3,4,128,128,128]{4,3,2,1,0}, f32[3,16,128,128,128]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
Floating point exception (core dumped)
Я попробовал другое значение ExperimentConfig.batch_size_train, но проблема осталась та же.
Вот версии платформы:
TensorFlow version: 2.20.0
Keras version: 3.12.0
CUDA version: 12.5.1
cuDNN version: 9
Итак, мои вопросы:
[*]Почему я всегда получаю исключение с плавающей запятой (сброс ядра) при использовании нескольких графических процессоров?
[*]Я всегда получаю предупреждение UserWarning: «На входе закончились данные; обучение прервано.
Убедитесь, что ваш набор данных или генератор может сгенерировать не менее
`steps_per_epoch * epochs` пакетов.
Возможно, вам придётся использовать функцию `.repeat()` при построении набора данных».
Я не указывал `steps_per_epoch` и не использовал `.repeat()`, поэтому можно ли игнорировать это предупреждение? Я хочу, чтобы в каждой эпохе модель обучалась до тех пор, пока набор данных не будет исчерпан.
[*]Я впервые использую распределённое обучение в TensorFlow. Правильно ли я его использую? Я использовал `with strategy.scope():`
только при определении модели, swi_callback_metric, ckpt (tf.train.Checkpoint) и ckpt_manager, но не использовал его при создании tf_ckpt_callback и swi_callback. См. model.py
Я пытаюсь обучить модель сегментации 3D-медицинских изображений с помощью Tensorflow и Keras: Model.py: [code]import time import logging import os import datetime os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async" os.environ["KERAS_BACKEND"] = "tensorflow" # choose any: 'tensorflow', 'torch', 'jax' #os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' #os.environ['TF_CPP_MAX_VLOG_LEVEL'] = '0' import tensorflow as tf gpus = tf.config.list_physical_devices('GPU') if gpus: for gpu in gpus: tf.config.experimental.set_memory_growth(gpu, True) import numpy as np import pandas as pd from matplotlib import pyplot as plt import keras from keras import ops from keras import layers from keras import mixed_precision from medicai.models import UNETRPlusPlus from medicai.metrics import BinaryDiceMetric from medicai.losses import BinaryDiceCELoss from medicai.utils.inference import SlidingWindowInference from medicai.callbacks import SlidingWindowInferenceCallback import sys sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) from src.experiment_config import ExperimentConfig from src.data_pipeline.data_loader import data_loader import pandas as pd import numpy as np os.environ["KERAS_BACKEND"] = "tensorflow"
class TFCheckpointCallback(keras.callbacks.Callback): """Save model + optimizer + epoch using TF checkpointing.""" def __init__(self, ckpt, ckpt_manager): super().__init__() self.ckpt = ckpt self.ckpt_manager = ckpt_manager
def run_sliding_window_inference_per_class_average(model, ds, roi_size, sw_batch_size, overlap, metrics_list): """ Run sliding window inference on a dataset and compute all metrics (average + per class) """ for metric in metrics_list: metric.reset_states()
with strategy.scope(): ckpt = tf.train.Checkpoint( epoch=tf.Variable(0), # epoch counter — saved as part of checkpoint optimizer=model.optimizer, # optimizer state model=model # model weights )
history = model.fit( train_ds, epochs=ExperimentConfig.epochs, callbacks=[ swi_callback, tf_ckpt_callback ])
end_time = time.time() training_time = end_time - start_time print(f"Total training time (seconds): {training_time:.2f}")
# Save training time to a file with open(os.path.join(logs_path, f"training_time_{timestamp}.txt"), "w") as f: f.write(f"Total training time (seconds): {training_time:.2f}\n")
# Save history to CSV history_file = os.path.join(history_path, f"training_history_{timestamp}.csv") pd.DataFrame(history.history).to_csv(history_file, index=False)
# Plot training loss plt.figure(figsize=(10, 5)) plt.plot(history.history['loss'], label='train_loss') plt.xlabel("Epoch") plt.ylabel("Loss") plt.title("Training Loss") plt.legend() plt.grid() plt.savefig(os.path.join(plots_path, f"loss_curve_{timestamp}.png")) plt.close()
# Plot average Dice if 'dice' in history.history: plt.figure(figsize=(10, 5)) plt.plot(history.history['dice'], label='train_dice') plt.xlabel("Epoch") plt.ylabel("Average Dice") plt.title("Training Average Dice") plt.legend() plt.grid() plt.savefig(os.path.join(plots_path, f"dice_curve_{timestamp}.png")) plt.close()
print("Training and saving plots finished successfully.") [/code] Когда я тренируюсь на одном графическом процессоре, обучение всегда завершается успешно. Однако, когда я тренируюсь на нескольких графических процессорах (пробовал использовать 2, 3 и 4 графических процессора), я всегда получаю исключение с плавающей запятой (сброс ядра) [code]2025-12-17 12:28:31.666487: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. keras backend: tensorflow keras version: 3.12.0 tensorflow version: 2.20.0
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR I0000 00:00:1765970958.724694 160312 gpu_process_state.cc:208] Using CUDA malloc Async allocator for GPU: 0 I0000 00:00:1765970958.725422 160312 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38670 MB memory: -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:87:00.0, compute capability: 8.0 I0000 00:00:1765970958.725885 160312 gpu_process_state.cc:208] Using CUDA malloc Async allocator for GPU: 1 I0000 00:00:1765970958.726444 160312 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 38670 MB memory: -> device: 1, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:c7:00.0, compute capability: 8.0 Keras backend tensorflow Total device found 2 WARNING:tensorflow:From /net/pr2/projects/plgrid/plggneural/3d_seg_project/3D-Medical-Image-Segmentation/venv/lib/python3.10/site-packages/tensorflow/python/util/deprecation.py:660: calling map_fn_v2 (from tensorflow.python.ops.map_fn) with dtype is deprecated and will be removed in a future version. 
Instructions for updating: Use fn_output_signature instead Model size: 42.66 M 2025-12-17 12:29:26.512602: I tensorflow/core/kernels/data/tf_record_dataset_op.cc:390] TFRecordDataset `buffer_size` is unspecified, default to 262144 2025-12-17 12:29:43.247925: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: CANCELLED: GetNextFromShard was cancelled [[{{node MultiDeviceIteratorGetNextFromShard}}]] 2025-12-17 12:29:43.248224: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: CANCELLED: GetNextFromShard was cancelled [[{{node MultiDeviceIteratorGetNextFromShard}}]] [[RemoteCall]] 2025-12-17 12:29:43.249328: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: CANCELLED: GetNextFromShard was cancelled [[{{node MultiDeviceIteratorGetNextFromShard}}]] [[RemoteCall]] Epoch 1/2 2025-12-17 12:31:05.058981: I external/local_xla/xla/service/service.cc:163] XLA service 0x146328c01540 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices: 2025-12-17 12:31:05.059020: I external/local_xla/xla/service/service.cc:171] StreamExecutor device (0): NVIDIA A100-SXM4-40GB, Compute Capability 8.0 2025-12-17 12:31:05.059283: I external/local_xla/xla/service/service.cc:171] StreamExecutor device (1): NVIDIA A100-SXM4-40GB, Compute Capability 8.0 2025-12-17 12:31:05.137238: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91700 2025-12-17 12:31:07.623407: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems. 2025-12-17 12:31:07.805392: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems. 
I0000 00:00:1765971068.804270 160357 device_compiler.h:196] Compiled cluster using XLA! This line is logged at most once for the lifetime of the process. 2025-12-17 12:31:09.734860: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems. 2025-12-17 12:31:09.912544: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems. 2025-12-17 12:31:32.318966: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=0,k13=2,k14=3,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while... 2025-12-17 12:31:32.344476: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 1.025559828s Trying algorithm eng23{k2=0,k13=2,k14=3,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while... 
2025-12-17 12:31:34.156184: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while... 2025-12-17 12:31:35.306164: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 2.15004513s Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while... 2025-12-17 12:31:36.744870: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems. 2025-12-17 12:31:36.927667: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems. 2025-12-17 12:31:37.081083: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. 
There may be a missing warmup execution, please investigate in Nsight Systems. 2025-12-17 12:31:37.253482: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems. 2025-12-17 12:31:37.430320: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems. 2025-12-17 12:31:42.279943: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while... 2025-12-17 12:31:43.257783: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 1.977915871s Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while... 
2025-12-17 12:31:46.830807: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91700 48/Unknown 450s 6s/step - dice: 0.0777 - dice_et: 0.0201 - dice_tc: 0.0344 - dice_wt: 0.1787 - loss: 3.03802025-12-17 12:37:14.868497: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence [[{{node MultiDeviceIteratorGetNextFromShard}}]] [[RemoteCall]] 2025-12-17 12:37:15.919125: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems. 2025-12-17 12:37:16.093522: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems. 2025-12-17 12:37:17.873295: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems. 2025-12-17 12:37:18.032921: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems. 
2025-12-17 12:37:27.808009: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[3,4,128,128,128]{4,3,2,1,0}, f32[3,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while... 2025-12-17 12:37:28.394435: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 1.586189512s Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[3,4,128,128,128]{4,3,2,1,0}, f32[3,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while... 2025-12-17 12:37:28.948297: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems. 2025-12-17 12:37:29.110080: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems. 
2025-12-17 12:37:32.859598: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[3,4,128,128,128]{4,3,2,1,0}, f32[3,16,128,128,128]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while... 2025-12-17 12:37:33.324957: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 1.465430048s Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[3,4,128,128,128]{4,3,2,1,0}, f32[3,16,128,128,128]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while... Floating point exception (core dumped) [/code] Я попробовал другое значение ExperimentConfig.batch_size_train, но проблема осталась та же. Вот версии платформы: [code]TensorFlow version: 2.20.0 Keras version: 3.12.0 CUDA version: 12.5.1 cuDNN version: 9 [/code] Итак, мои вопросы:
[*]Почему при использовании нескольких графических процессоров я всегда получаю ошибку «Floating point exception (core dumped)» (исключение операции с плавающей запятой с аварийным завершением и дампом памяти)?
[*]Я всегда получаю предупреждение UserWarning: входные данные закончились; обучение прерывается. Убедитесь, что ваш набор данных или генератор может сгенерировать не менее [code]steps_per_epoch * epochs[/code] пакетов. Возможно, вам придется использовать функцию .repeat() при построении набора данных. Я не указывал steps_per_epoch и не использовал repeat(), поэтому можно ли игнорировать это предупреждение? Я хочу, чтобы в каждой эпохе модель обучалась до тех пор, пока набор данных не будет исчерпан.
[*]Я впервые использую распределённое обучение в TensorFlow. Правильно ли я его использую? Я использовал [code]with strategy.scope():[/code] только при определении модели, swi_callback_metric, ckpt (tf.train.Checkpoint) и ckpt_manager, но не использовал его при создании tf_ckpt_callback и swi_callback. См. model.py.