Ошибка исключения с плавающей запятой (дамп ядра) при обучении модели с использованием нескольких графических процессоров

Ошибка исключения с плавающей запятой (дамп ядра) при обучении модели с использованием нескольких графических процессоров ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Ошибка исключения с плавающей запятой (дамп ядра) при обучении модели с использованием нескольких графических процессоров

Цитата

Сообщение Anonymous » 17 дек 2025, 14:53

Я пытаюсь обучить модель сегментации 3D-медицинских изображений с помощью Tensorflow и Keras:
Model.py:

Код: Выделить всё

import time
import logging
import os
import datetime
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"
os.environ["KERAS_BACKEND"] = "tensorflow"  # choose any: 'tensorflow', 'torch', 'jax'
#os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
#os.environ['TF_CPP_MAX_VLOG_LEVEL'] = '0'
import tensorflow as tf
# Turn on memory growth for every GPU TensorFlow can see.
# When no GPU is present the list is empty and the loop does nothing.
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import keras
from keras import ops
from keras import layers
from keras import mixed_precision
from medicai.models import UNETRPlusPlus
from medicai.metrics import BinaryDiceMetric
from medicai.losses import BinaryDiceCELoss
from medicai.utils.inference import SlidingWindowInference
from medicai.callbacks import SlidingWindowInferenceCallback
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from src.experiment_config import ExperimentConfig
from src.data_pipeline.data_loader import data_loader
import pandas as pd
import numpy as np
os.environ["KERAS_BACKEND"] = "tensorflow"

class TFCheckpointCallback(keras.callbacks.Callback):
    """Keras callback that writes a TensorFlow checkpoint after every epoch.

    Args:
        ckpt: Checkpoint object. It must expose an ``epoch`` variable that
            supports ``assign_add`` and ``numpy``.
        ckpt_manager: Manager whose ``save()`` writes the checkpoint and
            returns the path it wrote.
    """

    def __init__(self, ckpt, ckpt_manager):
        super().__init__()
        self.ckpt = ckpt
        self.ckpt_manager = ckpt_manager

    def on_epoch_end(self, epoch, logs=None):
        """Increment the stored epoch counter, save, and print the path.

        The ``epoch`` and ``logs`` arguments supplied by Keras are not used;
        the epoch number that gets printed comes from the checkpoint's own
        counter.
        """
        # The counter is part of the checkpoint, so it is bumped *before*
        # saving: the saved state then records the number of finished epochs.
        epoch_counter = self.ckpt.epoch
        epoch_counter.assign_add(1)
        written_to = self.ckpt_manager.save()
        finished_epochs = int(epoch_counter.numpy())
        print(f"Saved checkpoint:  {written_to} (epoch {finished_epochs})")

def get_model():
    """Build a UNETR++ model and compile it for training.

    The input shape, class count, learning rate and weight decay are read
    from ``ExperimentConfig``. The model is created with
    ``classifier_activation=None`` and the loss and every metric use
    ``from_logits=True`` (presumably the network emits raw logits).

    Returns:
        The compiled model. Its metrics are, in order: ``dice`` (no class
        restriction) followed by ``dice_tc``, ``dice_wt`` and ``dice_et``
        restricted to class ids 0, 1 and 2 respectively.
    """
    n_classes = ExperimentConfig.num_classes

    def dice_metric(name, **extra):
        # Every Dice metric shares the same logits / empty-mask settings;
        # only the name and the optional class restriction differ.
        return BinaryDiceMetric(
            from_logits=True,
            ignore_empty=True,
            num_classes=n_classes,
            name=name,
            **extra,
        )

    network = UNETRPlusPlus(
        encoder_name="unetr_plusplus_encoder",
        input_shape=ExperimentConfig.input_shape,
        num_classes=n_classes,
        classifier_activation=None,
    )

    optimizer = keras.optimizers.AdamW(
        learning_rate=ExperimentConfig.lr,
        weight_decay=ExperimentConfig.weight_decay,
    )
    # Dice and cross-entropy terms are weighted equally.
    loss = BinaryDiceCELoss(
        from_logits=True,
        dice_weight=1.0,
        ce_weight=1.0,
        reduction="mean",
        num_classes=n_classes,
    )
    # NOTE(review): the tc / wt / et suffixes presumably stand for tumour
    # core, whole tumour and enhancing tumour - confirm the channel order.
    tracked = [
        dice_metric('dice'),
        dice_metric('dice_tc', target_class_ids=[0]),
        dice_metric('dice_wt', target_class_ids=[1]),
        dice_metric('dice_et', target_class_ids=[2]),
    ]

    network.compile(optimizer=optimizer, loss=loss, metrics=tracked)
    return network

def get_inference_metric():
    """Create the Dice metric named ``val_dice``.

    It is configured like the training Dice metrics (logit inputs, empty
    masks ignored, class count from ``ExperimentConfig``) but carries no
    class restriction.

    Returns:
        A new ``BinaryDiceMetric`` instance.
    """
    return BinaryDiceMetric(
        from_logits=True,
        ignore_empty=True,
        num_classes=ExperimentConfig.num_classes,
        name='val_dice',
    )

def run_sliding_window_inference_per_class_average(model, ds, roi_size, sw_batch_size, overlap, metrics_list):
    """Evaluate ``model`` on ``ds`` with sliding-window inference.

    Every metric in ``metrics_list`` is reset, updated with each
    ``(x, y)`` element of ``ds`` and finally read out.

    Args:
        model: Model handed to ``SlidingWindowInference``.
        ds: Iterable yielding ``(x, y)`` pairs.
        roi_size: Window size forwarded to ``SlidingWindowInference``.
        sw_batch_size: Window batch size forwarded to
            ``SlidingWindowInference``.
        overlap: Window overlap forwarded to ``SlidingWindowInference``.
        metrics_list: Non-empty sequence of metric objects. The first one
            must expose ``num_classes``; all must expose ``name``.

    Returns:
        dict mapping each metric's ``name`` to its result as a ``float``.

    Raises:
        IndexError: If ``metrics_list`` is empty.
    """
    # Keras 3 metrics expose ``reset_state()``; the plural
    # ``reset_states()`` spelling is not part of the Keras 3 Metric API.
    for metric in metrics_list:
        metric.reset_state()

    swi = SlidingWindowInference(
        model,
        num_classes=metrics_list[0].num_classes,
        roi_size=roi_size,
        sw_batch_size=sw_batch_size,
        overlap=overlap,
    )

    for x, y in ds:
        # Convert once per batch instead of once per metric.
        y_true = ops.convert_to_tensor(y)
        y_pred = ops.convert_to_tensor(swi(x))
        for metric in metrics_list:
            metric.update_state(y_true, y_pred)

    # Gather results as plain Python floats.
    return {
        metric.name: float(ops.convert_to_numpy(metric.result()))
        for metric in metrics_list
    }

def main():
    """Train the model under ``tf.distribute.MirroredStrategy`` and save artefacts.

    Steps, in order:
      1. Print version information and the replica count.
      2. Create the output folders (logs, history, plots, checkpoints).
      3. Build the training, validation and test datasets.
      4. Create the model and the checkpoint object inside the strategy scope.
      5. Fit with the sliding-window validation callback and the checkpoint
         callback.
      6. Write the training time, the history CSV and the loss / Dice plots.
    """
    banner = (
        f"keras backend: {keras.config.backend()}\n"
        f"keras version: {keras.version()}\n"
        f"tensorflow version:  {tf.__version__}\n"
    )
    print(banner)

    backend_name = keras.config.backend()

    strategy = tf.distribute.MirroredStrategy()
    n_replicas = strategy.num_replicas_in_sync

    print('Keras backend ', backend_name)
    print('Total device found ', n_replicas)

    # ---- output locations -------------------------------------------------
    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    # NOTE(review): the folder is literally named "SwinUnetr" although the
    # model built here is UNETR++ - confirm this is intended.
    run_dir = os.path.join(project_root, "experiments", "msd_brain", "SwinUnetr")
    os.makedirs(run_dir, exist_ok=True)

    logs_path = os.path.join(run_dir, "logs")
    history_path = os.path.join(run_dir, "history")
    plots_path = os.path.join(run_dir, "plots")
    for folder in (logs_path, history_path, plots_path):
        os.makedirs(folder, exist_ok=True)

    # One timestamp tags every artefact written by this run.
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    save_path = os.path.join(run_dir, f"best_model_weights_{timestamp}.weights.h5")

    # ---- datasets ---------------------------------------------------------
    tfrecord_pattern = os.path.join(
        project_root, "data", "msd_brain", "tfrecords", "{}_shard_*.tfrec"
    )

    # The configured training batch size is multiplied by the replica count.
    train_batch = ExperimentConfig.batch_size_train * n_replicas

    train_ds = data_loader(
        tfrecord_pattern.format("training"),
        batch_size=train_batch,
        shuffle=True,
    )
    val_ds = data_loader(
        tfrecord_pattern.format("validation"),
        batch_size=ExperimentConfig.batch_size_val,
        shuffle=False,
    )
    # Built but not consumed anywhere in this function.
    test_ds = data_loader(
        tfrecord_pattern.format("test"),
        batch_size=ExperimentConfig.batch_size_val,
        shuffle=False,
    )

    # ---- model and checkpointing -------------------------------------------
    with strategy.scope():
        model = get_model()

    checkpoint_dir = os.path.join(run_dir, "checkpoints")
    os.makedirs(checkpoint_dir, exist_ok=True)

    with strategy.scope():
        # The checkpoint bundles an epoch counter, the optimizer state and
        # the model weights.
        ckpt = tf.train.Checkpoint(
            epoch=tf.Variable(0),
            optimizer=model.optimizer,
            model=model,
        )

    ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_dir, max_to_keep=3)

    # The metric and both callbacks are created outside the strategy scope.
    swi_callback_metric = get_inference_metric()
    tf_ckpt_callback = TFCheckpointCallback(ckpt, ckpt_manager)

    shape = ExperimentConfig.input_shape
    swi_callback = SlidingWindowInferenceCallback(
        model,
        dataset=val_ds,
        metrics=swi_callback_metric,
        num_classes=ExperimentConfig.num_classes,
        interval=ExperimentConfig.sliding_window_interval,
        overlap=ExperimentConfig.sliding_window_overlap,
        # First three entries of the input shape; the fourth is left out.
        roi_size=(shape[0], shape[1], shape[2]),
        sw_batch_size=ExperimentConfig.sw_batch_size * n_replicas,
        save_path=save_path,
    )

    print(f"Model size: {model.count_params() / 1e6:.2f} M")

    # ---- training ------------------------------------------------------------
    started = time.time()
    history = model.fit(
        train_ds,
        epochs=ExperimentConfig.epochs,
        callbacks=[swi_callback, tf_ckpt_callback],
    )
    training_time = time.time() - started
    print(f"Total training time (seconds): {training_time:.2f}")

    time_file = os.path.join(logs_path, f"training_time_{timestamp}.txt")
    with open(time_file, "w") as f:
        f.write(f"Total training time (seconds):  {training_time:.2f}\n")

    history_file = os.path.join(history_path, f"training_history_{timestamp}.csv")
    pd.DataFrame(history.history).to_csv(history_file, index=False)

    # ---- plots -----------------------------------------------------------------
    def save_curve(key, label, y_label, title, file_name):
        # Draw one per-epoch curve from the history and write it as a PNG.
        plt.figure(figsize=(10, 5))
        plt.plot(history.history[key], label=label)
        plt.xlabel("Epoch")
        plt.ylabel(y_label)
        plt.title(title)
        plt.legend()
        plt.grid()
        plt.savefig(os.path.join(plots_path, file_name))
        plt.close()

    save_curve('loss', 'train_loss', "Loss", "Training Loss",
               f"loss_curve_{timestamp}.png")

    # The Dice curve is only drawn when the history recorded that metric.
    if 'dice' in history.history:
        save_curve('dice', 'train_dice', "Average Dice", "Training Average Dice",
                   f"dice_curve_{timestamp}.png")

    print("Training and saving plots finished successfully.")

data_loader.py:

Код: Выделить всё

import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from src.deserialization.full_deserialization import parse_tfrecord_fn
from src.data_pipeline.transformations import rearrange_shape
from src.data_pipeline.transformations import train_transformation, val_transformation
import tensorflow as tf

def data_loader(tfrecord_pattern, batch_size=1, shuffle=True, *,
                shuffle_buffer_size=16, drop_remainder=False):
    """Build a ``tf.data.Dataset`` pipeline over TFRecord files.

    The pipeline reads every file matching ``tfrecord_pattern``, optionally
    shuffles the serialized records, parses them, rearranges their shape,
    applies the training or validation transformation, then batches and
    prefetches.

    Args:
        tfrecord_pattern (str): Glob pattern matching one or more TFRecord
            files.
        batch_size (int, optional): Number of samples per batch.
            Defaults to 1.
        shuffle (bool, optional): Doubles as the train/validation switch.
            When True the records are shuffled and ``train_transformation``
            is applied; when False the order is kept and
            ``val_transformation`` is applied. Defaults to True.
        shuffle_buffer_size (int, optional): Buffer size used when
            ``shuffle`` is True. Defaults to 16.
        drop_remainder (bool, optional): Forwarded to ``Dataset.batch``.
            When True a final batch with fewer than ``batch_size`` samples
            is dropped, so every batch has the same leading dimension.
            Defaults to False, which keeps the short final batch.

    Returns:
        tf.data.Dataset: A batched and prefetched dataset. The element
        structure is defined by the transformation functions, which are not
        visible here; the training code consumes elements as
        ``(inputs, labels)`` pairs - TODO confirm.
    """
    autotune = tf.data.AUTOTUNE

    # Each element is one serialized tf.train.Example.
    dataset = tf.data.TFRecordDataset(tf.io.gfile.glob(tfrecord_pattern))

    # Shuffle the serialized records before they are parsed.
    if shuffle:
        dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)

    dataset = dataset.map(parse_tfrecord_fn, num_parallel_calls=autotune)
    dataset = dataset.map(rearrange_shape, num_parallel_calls=autotune)

    transformation = train_transformation if shuffle else val_transformation
    dataset = dataset.map(transformation, num_parallel_calls=autotune)

    dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
    return dataset.prefetch(autotune)

experiment_config.py:

Код: Выделить всё

class ExperimentConfig:
    """Hard-coded hyperparameters used for training and validation.

    All values are class attributes, so they can be read straight from the
    class without creating an instance.
    """

    # Trailing values in the comments are the alternatives the author noted
    # next to each setting (presumably the full-run values - confirm).
    batch_size_train = 8            # noted alternative: 8
    batch_size_val = 1              # noted alternative: 2
    epochs = 2                      # noted alternative: 1000
    lr = 1e-4
    weight_decay = 1e-4
    sliding_window_interval = 2     # noted alternative: 20
    sliding_window_overlap = 0.5
    input_shape = (128, 128, 128, 4)
    num_classes = 3
    PROB = 0.5
    sw_batch_size = 2               # noted alternative: 2

    def __repr__(self):
        """Return ``ExperimentConfig(name=value, ...)`` for every setting."""
        # Listed explicitly so the output order is fixed.
        field_names = (
            "batch_size_train",
            "batch_size_val",
            "epochs",
            "lr",
            "weight_decay",
            "sliding_window_interval",
            "sliding_window_overlap",
            "input_shape",
            "num_classes",
            "PROB",
            "sw_batch_size",
        )
        rendered = ", ".join(
            f"{name}={getattr(self, name)}" for name in field_names
        )
        return f"ExperimentConfig({rendered})"

Когда я обучаю модель на одном графическом процессоре, обучение всегда завершается успешно. Однако при обучении на нескольких графических процессорах (я пробовал 2, 3 и 4) процесс всегда падает с ошибкой «Floating point exception (core dumped)» (исключение с плавающей запятой, дамп ядра):

Код: Выделить всё

2025-12-17 12:28:31.666487: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
keras backend: tensorflow
keras version: 3.12.0
tensorflow version: 2.20.0

WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
I0000 00:00:1765970958.724694  160312 gpu_process_state.cc:208] Using CUDA malloc Async allocator for GPU: 0
I0000 00:00:1765970958.725422  160312 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38670 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:87:00.0, compute capability: 8.0
I0000 00:00:1765970958.725885  160312 gpu_process_state.cc:208] Using CUDA malloc Async allocator for GPU: 1
I0000 00:00:1765970958.726444  160312 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 38670 MB memory:  -> device: 1, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:c7:00.0, compute capability: 8.0
Keras backend  tensorflow
Total device found  2
WARNING:tensorflow:From /net/pr2/projects/plgrid/plggneural/3d_seg_project/3D-Medical-Image-Segmentation/venv/lib/python3.10/site-packages/tensorflow/python/util/deprecation.py:660: calling map_fn_v2 (from tensorflow.python.ops.map_fn) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Use fn_output_signature instead
Model size: 42.66 M
2025-12-17 12:29:26.512602: I tensorflow/core/kernels/data/tf_record_dataset_op.cc:390] TFRecordDataset `buffer_size` is unspecified, default to 262144
2025-12-17 12:29:43.247925: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: CANCELLED: GetNextFromShard was cancelled
[[{{node MultiDeviceIteratorGetNextFromShard}}]]
2025-12-17 12:29:43.248224: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: CANCELLED: GetNextFromShard was cancelled
[[{{node MultiDeviceIteratorGetNextFromShard}}]]
[[RemoteCall]]
2025-12-17 12:29:43.249328: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: CANCELLED: GetNextFromShard was cancelled
[[{{node MultiDeviceIteratorGetNextFromShard}}]]
[[RemoteCall]]
Epoch 1/2
2025-12-17 12:31:05.058981: I external/local_xla/xla/service/service.cc:163] XLA service 0x146328c01540 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-12-17 12:31:05.059020: I external/local_xla/xla/service/service.cc:171]   StreamExecutor device (0): NVIDIA A100-SXM4-40GB, Compute Capability 8.0
2025-12-17 12:31:05.059283: I external/local_xla/xla/service/service.cc:171]   StreamExecutor device (1): NVIDIA A100-SXM4-40GB, Compute Capability 8.0
2025-12-17 12:31:05.137238: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91700
2025-12-17 12:31:07.623407: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy.  There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:07.805392: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
I0000 00:00:1765971068.804270  160357 device_compiler.h:196] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
2025-12-17 12:31:09.734860: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:09.912544: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:32.318966: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=0,k13=2,k14=3,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:31:32.344476: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 1.025559828s
Trying algorithm eng23{k2=0,k13=2,k14=3,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:31:34.156184: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:31:35.306164: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 2.15004513s
Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:31:36.744870: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:36.927667: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:37.081083: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:37.253482: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy.  There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:37.430320: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:42.279943: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:31:43.257783: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 1.977915871s
Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:31:46.830807: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91700
48/Unknown 450s 6s/step - dice: 0.0777 - dice_et: 0.0201 - dice_tc: 0.0344 - dice_wt: 0.1787 - loss: 3.03802025-12-17 12:37:14.868497: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
[[{{node MultiDeviceIteratorGetNextFromShard}}]]
[[RemoteCall]]
2025-12-17 12:37:15.919125: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:37:16.093522: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:37:17.873295: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:37:18.032921: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy.  There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:37:27.808009: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[3,4,128,128,128]{4,3,2,1,0}, f32[3,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:37:28.394435: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 1.586189512s
Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[3,4,128,128,128]{4,3,2,1,0}, f32[3,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:37:28.948297: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:37:29.110080: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:37:32.859598: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[3,4,128,128,128]{4,3,2,1,0}, f32[3,16,128,128,128]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:37:33.324957: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 1.465430048s
Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[3,4,128,128,128]{4,3,2,1,0}, f32[3,16,128,128,128]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
Floating point exception (core dumped)

Я попробовал другое значение ExperimentConfig.batch_size_train, но проблема осталась та же.
Вот версии платформы:

Код: Выделить всё

TensorFlow version: 2.20.0
Keras version: 3.12.0
CUDA version: 12.5.1
cuDNN version: 9

Итак, мои вопросы:

[*]Почему я всегда получаю исключение с плавающей запятой (сброс ядра) при использовании нескольких графических процессоров?

[*]Я всегда получаю предупреждение пользователя: на входе закончились данные; прерывание обучения.
Убедитесь, что ваш набор данных или генератор может генерировать не менее

Код: Выделить всё

steps_per_epoch * epochs

пакетов.
Возможно, вам придётся использовать функцию .repeat() при построении набора данных.
Я не указывал steps_per_epoch и не использовал .repeat(), поэтому можно ли игнорировать это предупреждение? Я хочу, чтобы в каждой эпохе модель обучалась до тех пор, пока набор данных не будет исчерпан.

[*]Я впервые использую распределенное обучение с помощью Tensorflow. Правильно ли я его использую? Я использовал

Код: Выделить всё

with strategy.scope():

только при определении модели, swi_callback_metric, ckpt (tf.train.Checkpoint) и ckpt_manager, но не использовал его при создании tf_ckpt_callback и swi_callback. См. model.py

Подробнее здесь: https://stackoverflow.com/questions/798 ... using-mult

1765972415

Anonymous

Я пытаюсь обучить модель сегментации 3D-медицинских изображений с помощью Tensorflow и Keras:
Model.py:
[code]import time
import logging
import os
import datetime
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"
os.environ["KERAS_BACKEND"] = "tensorflow"  # choose any: 'tensorflow', 'torch', 'jax'
#os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
#os.environ['TF_CPP_MAX_VLOG_LEVEL'] = '0'
import tensorflow as tf
# Turn on memory growth for every GPU TensorFlow can see.
# When no GPU is present the list is empty and the loop does nothing.
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import keras
from keras import ops
from keras import layers
from keras import mixed_precision
from medicai.models import UNETRPlusPlus
from medicai.metrics import BinaryDiceMetric
from medicai.losses import BinaryDiceCELoss
from medicai.utils.inference import SlidingWindowInference
from medicai.callbacks import SlidingWindowInferenceCallback
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from src.experiment_config import ExperimentConfig
from src.data_pipeline.data_loader import data_loader
import pandas as pd
import numpy as np
os.environ["KERAS_BACKEND"] = "tensorflow"

class TFCheckpointCallback(keras.callbacks.Callback):
    """Keras callback that writes a TensorFlow checkpoint after every epoch.

    Args:
        ckpt: Checkpoint object. It must expose an ``epoch`` variable that
            supports ``assign_add`` and ``numpy``.
        ckpt_manager: Manager whose ``save()`` writes the checkpoint and
            returns the path it wrote.
    """

    def __init__(self, ckpt, ckpt_manager):
        super().__init__()
        self.ckpt = ckpt
        self.ckpt_manager = ckpt_manager

    def on_epoch_end(self, epoch, logs=None):
        """Increment the stored epoch counter, save, and print the path.

        The ``epoch`` and ``logs`` arguments supplied by Keras are not used;
        the epoch number that gets printed comes from the checkpoint's own
        counter.
        """
        # The counter is part of the checkpoint, so it is bumped *before*
        # saving: the saved state then records the number of finished epochs.
        epoch_counter = self.ckpt.epoch
        epoch_counter.assign_add(1)
        written_to = self.ckpt_manager.save()
        finished_epochs = int(epoch_counter.numpy())
        print(f"Saved checkpoint:  {written_to} (epoch {finished_epochs})")

def get_model():
    """Build a UNETR++ model and compile it for training.

    The input shape, class count, learning rate and weight decay are read
    from ``ExperimentConfig``. The model is created with
    ``classifier_activation=None`` and the loss and every metric use
    ``from_logits=True`` (presumably the network emits raw logits).

    Returns:
        The compiled model. Its metrics are, in order: ``dice`` (no class
        restriction) followed by ``dice_tc``, ``dice_wt`` and ``dice_et``
        restricted to class ids 0, 1 and 2 respectively.
    """
    n_classes = ExperimentConfig.num_classes

    def dice_metric(name, **extra):
        # Every Dice metric shares the same logits / empty-mask settings;
        # only the name and the optional class restriction differ.
        return BinaryDiceMetric(
            from_logits=True,
            ignore_empty=True,
            num_classes=n_classes,
            name=name,
            **extra,
        )

    network = UNETRPlusPlus(
        encoder_name="unetr_plusplus_encoder",
        input_shape=ExperimentConfig.input_shape,
        num_classes=n_classes,
        classifier_activation=None,
    )

    optimizer = keras.optimizers.AdamW(
        learning_rate=ExperimentConfig.lr,
        weight_decay=ExperimentConfig.weight_decay,
    )
    # Dice and cross-entropy terms are weighted equally.
    loss = BinaryDiceCELoss(
        from_logits=True,
        dice_weight=1.0,
        ce_weight=1.0,
        reduction="mean",
        num_classes=n_classes,
    )
    # NOTE(review): the tc / wt / et suffixes presumably stand for tumour
    # core, whole tumour and enhancing tumour - confirm the channel order.
    tracked = [
        dice_metric('dice'),
        dice_metric('dice_tc', target_class_ids=[0]),
        dice_metric('dice_wt', target_class_ids=[1]),
        dice_metric('dice_et', target_class_ids=[2]),
    ]

    network.compile(optimizer=optimizer, loss=loss, metrics=tracked)
    return network

def get_inference_metric():
    """Create the Dice metric named ``val_dice``.

    It is configured like the training Dice metrics (logit inputs, empty
    masks ignored, class count from ``ExperimentConfig``) but carries no
    class restriction.

    Returns:
        A new ``BinaryDiceMetric`` instance.
    """
    return BinaryDiceMetric(
        from_logits=True,
        ignore_empty=True,
        num_classes=ExperimentConfig.num_classes,
        name='val_dice',
    )

def run_sliding_window_inference_per_class_average(model, ds, roi_size, sw_batch_size, overlap, metrics_list):
    """Evaluate ``model`` on ``ds`` with sliding-window inference.

    Every metric in ``metrics_list`` is reset, updated with each
    ``(x, y)`` element of ``ds`` and finally read out.

    Args:
        model: Model handed to ``SlidingWindowInference``.
        ds: Iterable yielding ``(x, y)`` pairs.
        roi_size: Window size forwarded to ``SlidingWindowInference``.
        sw_batch_size: Window batch size forwarded to
            ``SlidingWindowInference``.
        overlap: Window overlap forwarded to ``SlidingWindowInference``.
        metrics_list: Non-empty sequence of metric objects. The first one
            must expose ``num_classes``; all must expose ``name``.

    Returns:
        dict mapping each metric's ``name`` to its result as a ``float``.

    Raises:
        IndexError: If ``metrics_list`` is empty.
    """
    # Keras 3 metrics expose ``reset_state()``; the plural
    # ``reset_states()`` spelling is not part of the Keras 3 Metric API.
    for metric in metrics_list:
        metric.reset_state()

    swi = SlidingWindowInference(
        model,
        num_classes=metrics_list[0].num_classes,
        roi_size=roi_size,
        sw_batch_size=sw_batch_size,
        overlap=overlap,
    )

    for x, y in ds:
        # Convert once per batch instead of once per metric.
        y_true = ops.convert_to_tensor(y)
        y_pred = ops.convert_to_tensor(swi(x))
        for metric in metrics_list:
            metric.update_state(y_true, y_pred)

    # Gather results as plain Python floats.
    return {
        metric.name: float(ops.convert_to_numpy(metric.result()))
        for metric in metrics_list
    }

def main():
    """Train the model under ``tf.distribute.MirroredStrategy`` and save artefacts.

    Steps, in order:
        1. Print backend/version information and the replica count.
        2. Create the output folders (``logs``, ``history``, ``plots``,
           ``checkpoints``) below ``experiments/msd_brain/SwinUnetr``.
        3. Build the training, validation and test datasets from TFRecords.
        4. Build the model and the checkpoint objects inside the strategy
           scope, then create the callbacks.
        5. Fit the model, then write the training time, the history CSV and
           the loss / Dice curves.
    """
    backend_name = keras.config.backend()
    print(
        f"keras backend: {backend_name}\n"
        f"keras version: {keras.version()}\n"
        f"tensorflow version:  {tf.__version__}\n"
    )

    strategy = tf.distribute.MirroredStrategy()
    n_replicas = strategy.num_replicas_in_sync

    print('Keras backend ', backend_name)
    print('Total device found ', n_replicas)

    # Output layout: <project>/experiments/msd_brain/SwinUnetr/{logs,history,plots}
    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    run_root = os.path.join(project_root, "experiments", "msd_brain", "SwinUnetr")
    os.makedirs(run_root, exist_ok=True)

    logs_dir, history_dir, plots_dir = (
        os.path.join(run_root, sub) for sub in ("logs", "history", "plots")
    )
    for directory in (logs_dir, history_dir, plots_dir):
        os.makedirs(directory, exist_ok=True)

    # One timestamp tags every artefact written by this run.
    stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

    # Target file handed to the sliding-window callback as ``save_path``.
    best_weights_file = os.path.join(run_root, f"best_model_weights_{stamp}.weights.h5")

    shard_glob = os.path.join(
        project_root, "data", "msd_brain", "tfrecords", "{}_shard_*.tfrec"
    )

    def load_split(split, batch_size, shuffle):
        """Build the dataset for one split name ('training', 'validation', 'test')."""
        return data_loader(
            shard_glob.format(split),
            batch_size=batch_size,
            shuffle=shuffle,
        )

    # The training batch is scaled by the replica count; validation/test are not.
    train_ds = load_split("training", ExperimentConfig.batch_size_train * n_replicas, True)
    val_ds = load_split("validation", ExperimentConfig.batch_size_val, False)
    # Not consumed below; still built so the pipeline construction is unchanged.
    test_ds = load_split("test", ExperimentConfig.batch_size_val, False)

    with strategy.scope():
        model = get_model()

    checkpoint_dir = os.path.join(run_root, "checkpoints")
    os.makedirs(checkpoint_dir, exist_ok=True)

    # NOTE(review): the pasted code lost its indentation; the extent of this
    # scope follows the accompanying description (checkpoint, manager and
    # validation metric inside the scope, callbacks outside) - confirm
    # against the real file.
    with strategy.scope():
        ckpt = tf.train.Checkpoint(
            epoch=tf.Variable(0),          # epoch counter stored with the checkpoint
            optimizer=model.optimizer,     # optimizer state
            model=model,                   # model weights
        )

        ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_dir, max_to_keep=3)

        # Metric evaluated by the sliding-window validation callback.
        swi_callback_metric = get_inference_metric()

    tf_ckpt_callback = TFCheckpointCallback(ckpt, ckpt_manager)

    # The window covers the first three entries of the configured input shape.
    roi = tuple(ExperimentConfig.input_shape[:3])
    swi_callback = SlidingWindowInferenceCallback(
        model,
        dataset=val_ds,
        metrics=swi_callback_metric,
        num_classes=ExperimentConfig.num_classes,
        interval=ExperimentConfig.sliding_window_interval,
        overlap=ExperimentConfig.sliding_window_overlap,
        roi_size=roi,
        sw_batch_size=ExperimentConfig.sw_batch_size * n_replicas,
        save_path=best_weights_file,
    )

    print(f"Model size: {model.count_params() / 1e6:.2f} M")

    started = time.time()
    history = model.fit(
        train_ds,
        epochs=ExperimentConfig.epochs,
        callbacks=[swi_callback, tf_ckpt_callback],
    )
    training_time = time.time() - started
    print(f"Total training time (seconds): {training_time:.2f}")

    # Persist the wall-clock training time.
    time_file = os.path.join(logs_dir, f"training_time_{stamp}.txt")
    with open(time_file, "w") as f:
        f.write(f"Total training time (seconds):  {training_time:.2f}\n")

    # Persist the per-epoch history as CSV.
    history_file = os.path.join(history_dir, f"training_history_{stamp}.csv")
    pd.DataFrame(history.history).to_csv(history_file, index=False)

    def save_curve(series_key, legend_label, y_label, title, file_name):
        """Plot one history series against the epoch index and save it as PNG."""
        plt.figure(figsize=(10, 5))
        plt.plot(history.history[series_key], label=legend_label)
        plt.xlabel("Epoch")
        plt.ylabel(y_label)
        plt.title(title)
        plt.legend()
        plt.grid()
        plt.savefig(os.path.join(plots_dir, file_name))
        plt.close()

    save_curve('loss', 'train_loss', "Loss", "Training Loss", f"loss_curve_{stamp}.png")

    # The Dice curve is only drawn when the metric was actually recorded.
    if 'dice' in history.history:
        save_curve(
            'dice',
            'train_dice',
            "Average Dice",
            "Training Average Dice",
            f"dice_curve_{stamp}.png",
        )

    print("Training and saving plots finished successfully.")
[/code]
data_loader.py:
[code]import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from src.deserialization.full_deserialization import parse_tfrecord_fn
from src.data_pipeline.transformations import rearrange_shape
from src.data_pipeline.transformations import train_transformation, val_transformation
import tensorflow as tf

def data_loader(tfrecord_pattern, batch_size=1, shuffle=True, drop_remainder=False):
    """Create the ``tf.data.Dataset`` input pipeline for one data split.

    The pipeline reads serialized examples from every TFRecord file matching
    ``tfrecord_pattern``, optionally shuffles them, then applies
    ``parse_tfrecord_fn``, ``rearrange_shape`` and a split-specific
    transformation before batching and prefetching.

    Args:
        tfrecord_pattern (str): Glob pattern matching one or more TFRecord files.
        batch_size (int, optional): Number of samples per batch. Defaults to 1.
        shuffle (bool, optional): When True the records are shuffled and
            ``train_transformation`` is applied; when False the order is kept
            and ``val_transformation`` is applied. Defaults to True.
        drop_remainder (bool, optional): Forwarded to ``Dataset.batch``. When
            True, a final batch smaller than ``batch_size`` is dropped, so
            every batch has the same size (useful when a global batch is
            split across several replicas). Defaults to False, which keeps
            the final partial batch.

    Returns:
        tf.data.Dataset: A batched and prefetched dataset. The element
        structure is whatever the transformation functions return.
        NOTE(review): callers in the training script unpack elements as
        ``(x, y)`` - confirm against the transformation functions.
    """
    autotune = tf.data.AUTOTUNE

    # Each dataset element is one serialized tf.train.Example.
    dataset = tf.data.TFRecordDataset(tf.io.gfile.glob(tfrecord_pattern))

    if shuffle:
        dataset = dataset.shuffle(buffer_size=16)

    dataset = dataset.map(parse_tfrecord_fn, num_parallel_calls=autotune)
    dataset = dataset.map(rearrange_shape, num_parallel_calls=autotune)

    # The ``shuffle`` flag doubles as the train/validation switch.
    transform = train_transformation if shuffle else val_transformation
    dataset = dataset.map(transform, num_parallel_calls=autotune)

    dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
    return dataset.prefetch(autotune)

[/code]
experiment_config.py:
[code]class ExperimentConfig:
"""Hard-coded hyperparameters shared by training and validation.

All values are plain class attributes; the training script reads them
directly from the class (e.g. ``ExperimentConfig.lr``).
"""
# Hard-coded hyperparameters
# Training batch size; the training script multiplies it by the replica count.
batch_size_train = 8  # original note: 8
# Batch size used for both the validation and the test dataset.
batch_size_val = 1  # original note: 2
# Number of epochs passed to model.fit.
epochs = 2 # original note: 1000 (presumably the full-run value - confirm)
# Learning rate and weight decay passed to the AdamW optimizer.
lr = 1e-4
weight_decay = 1e-4
# ``interval`` argument of the sliding-window inference callback.
sliding_window_interval = 2 # original note: 20
# ``overlap`` argument of the sliding-window inference callback.
sliding_window_overlap = 0.5
# Model input shape; the first three entries are also used as the sliding-window ROI size.
input_shape = (128, 128, 128, 4)
# Class count passed to the model, the loss and every Dice metric.
num_classes = 3
# NOTE(review): not referenced in the visible code; presumably an augmentation probability - confirm.
PROB = 0.5
# Sliding-window batch size; the training script multiplies it by the replica count.
sw_batch_size= 2 # original note: 2

# Human-readable dump of every hyperparameter (read through ``self``).
def __repr__(self):
return (
f"ExperimentConfig(batch_size_train={self.batch_size_train}, "
f"batch_size_val={self.batch_size_val}, "
f"epochs={self.epochs}, "
f"lr={self.lr}, "
f"weight_decay={self.weight_decay}, "
f"sliding_window_interval={self.sliding_window_interval}, "
f"sliding_window_overlap={self.sliding_window_overlap}, "
f"input_shape={self.input_shape}, "
f"num_classes={self.num_classes}, "
f"PROB={self.PROB}, "
f"sw_batch_size={self.sw_batch_size})"
)
[/code]
Когда я обучаю модель на одном графическом процессоре, обучение всегда завершается успешно. Однако при обучении на нескольких графических процессорах (я пробовал 2, 3 и 4) я всегда получаю ошибку «Floating point exception (core dumped)» (исключение с плавающей запятой, дамп ядра):
[code]2025-12-17 12:28:31.666487: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
keras backend: tensorflow
keras version: 3.12.0
tensorflow version: 2.20.0

WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
I0000 00:00:1765970958.724694  160312 gpu_process_state.cc:208] Using CUDA malloc Async allocator for GPU: 0
I0000 00:00:1765970958.725422  160312 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38670 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:87:00.0, compute capability: 8.0
I0000 00:00:1765970958.725885  160312 gpu_process_state.cc:208] Using CUDA malloc Async allocator for GPU: 1
I0000 00:00:1765970958.726444  160312 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 38670 MB memory:  -> device: 1, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:c7:00.0, compute capability: 8.0
Keras backend  tensorflow
Total device found  2
WARNING:tensorflow:From /net/pr2/projects/plgrid/plggneural/3d_seg_project/3D-Medical-Image-Segmentation/venv/lib/python3.10/site-packages/tensorflow/python/util/deprecation.py:660: calling map_fn_v2 (from tensorflow.python.ops.map_fn) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Use fn_output_signature instead
Model size: 42.66 M
2025-12-17 12:29:26.512602: I tensorflow/core/kernels/data/tf_record_dataset_op.cc:390] TFRecordDataset `buffer_size` is unspecified, default to 262144
2025-12-17 12:29:43.247925: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: CANCELLED: GetNextFromShard was cancelled
[[{{node MultiDeviceIteratorGetNextFromShard}}]]
2025-12-17 12:29:43.248224: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: CANCELLED: GetNextFromShard was cancelled
[[{{node MultiDeviceIteratorGetNextFromShard}}]]
[[RemoteCall]]
2025-12-17 12:29:43.249328: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: CANCELLED: GetNextFromShard was cancelled
[[{{node MultiDeviceIteratorGetNextFromShard}}]]
[[RemoteCall]]
Epoch 1/2
2025-12-17 12:31:05.058981: I external/local_xla/xla/service/service.cc:163] XLA service 0x146328c01540 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-12-17 12:31:05.059020: I external/local_xla/xla/service/service.cc:171]   StreamExecutor device (0): NVIDIA A100-SXM4-40GB, Compute Capability 8.0
2025-12-17 12:31:05.059283: I external/local_xla/xla/service/service.cc:171]   StreamExecutor device (1): NVIDIA A100-SXM4-40GB, Compute Capability 8.0
2025-12-17 12:31:05.137238: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91700
2025-12-17 12:31:07.623407: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy.  There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:07.805392: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
I0000 00:00:1765971068.804270  160357 device_compiler.h:196] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
2025-12-17 12:31:09.734860: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:09.912544: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:32.318966: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=0,k13=2,k14=3,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:31:32.344476: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 1.025559828s
Trying algorithm eng23{k2=0,k13=2,k14=3,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:31:34.156184: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:31:35.306164: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 2.15004513s
Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:31:36.744870: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:36.927667: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:37.081083: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:37.253482: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy.  There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:37.430320: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:42.279943: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:31:43.257783: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 1.977915871s
Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:31:46.830807: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91700
48/Unknown 450s 6s/step - dice: 0.0777 - dice_et: 0.0201 - dice_tc: 0.0344 - dice_wt: 0.1787 - loss: 3.03802025-12-17 12:37:14.868497: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
[[{{node MultiDeviceIteratorGetNextFromShard}}]]
[[RemoteCall]]
2025-12-17 12:37:15.919125: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:37:16.093522: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:37:17.873295: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:37:18.032921: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy.  There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:37:27.808009: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[3,4,128,128,128]{4,3,2,1,0}, f32[3,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:37:28.394435: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 1.586189512s
Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[3,4,128,128,128]{4,3,2,1,0}, f32[3,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:37:28.948297: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:37:29.110080: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:37:32.859598: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[3,4,128,128,128]{4,3,2,1,0}, f32[3,16,128,128,128]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:37:33.324957: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 1.465430048s
Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[3,4,128,128,128]{4,3,2,1,0}, f32[3,16,128,128,128]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
Floating point exception (core dumped)
[/code]
Я попробовал другое значение ExperimentConfig.batch_size_train, но проблема осталась та же.
Вот версии платформы:
[code]TensorFlow version: 2.20.0
Keras version: 3.12.0
CUDA version: 12.5.1
cuDNN version: 9
[/code]
Итак, мои вопросы:

[*]Почему я всегда получаю исключение с плавающей запятой (сброс ядра) при использовании нескольких графических процессоров?

[*]Я всегда получаю предупреждение UserWarning: «Входные данные закончились; обучение прерывается.
Убедитесь, что ваш набор данных или генератор может сгенерировать не менее
[code]steps_per_epoch * epochs[/code] пакетов.
Возможно, вам потребуется использовать функцию .repeat() при построении набора данных».
Я не указывал steps_per_epoch и не использовал repeat(), поэтому можно ли игнорировать это предупреждение? Я хочу, чтобы в каждой эпохе модель обучалась до тех пор, пока набор данных не будет исчерпан.

[*]Я впервые использую распределенное обучение с помощью Tensorflow.  Правильно ли я его использую?  Я использовал
[code]with strategy.scope():
[/code]
только при определении модели, swi_callback_metric, ckpt (tf.train.Checkpoint) и ckpt_manager, но не использовал его при создании tf_ckpt_callback и swi_callback. См. Model.py

 

Подробнее здесь: [url]https://stackoverflow.com/questions/79849395/floating-point-exception-core-dumped-error-when-training-the-model-using-mult[/url]