import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from src.deserialization.full_deserialization import parse_tfrecord_fn
from src.data_pipeline.transformations import rearrange_shape
from src.data_pipeline.transformations import train_transformation, val_transformation
import tensorflow as tf
def data_loader(tfrecord_pattern, batch_size=1, shuffle=True, *,
                shuffle_buffer_size=16, drop_remainder=False):
    """Build the ``tf.data.Dataset`` input pipeline for training or validation.

    The pipeline reads serialized TFRecords from every file matching
    ``tfrecord_pattern``, optionally shuffles the raw records, parses them
    with ``parse_tfrecord_fn``, reshapes them with ``rearrange_shape``,
    applies the train or validation transformation, and finally batches and
    prefetches the result.

    Note that ``shuffle`` doubles as the train/validation switch:
    ``shuffle=True`` selects ``train_transformation`` and ``shuffle=False``
    selects ``val_transformation``.

    Args:
        tfrecord_pattern (str): Glob pattern matching one or more TFRecord
            files.
        batch_size (int, optional): Number of samples per batch. When the
            dataset is passed to ``Model.fit`` under a ``tf.distribute``
            strategy this is the global batch size, which is split across
            the replicas. Defaults to 1.
        shuffle (bool, optional): Whether to shuffle the records and use the
            training transformation. Defaults to True.
        shuffle_buffer_size (int, optional): Size of the shuffle buffer, in
            records. Only used when ``shuffle`` is True. Defaults to 16.
        drop_remainder (bool, optional): Whether to drop the final batch
            when it holds fewer than ``batch_size`` samples. Under a
            multi-replica strategy a trailing partial batch is split
            unevenly across replicas (possibly leaving a replica with no
            samples), so enabling this is recommended for multi-GPU
            training. Defaults to False, which keeps every sample.

    Returns:
        tf.data.Dataset: A batched and prefetched dataset. The element
        structure is whatever the selected transformation function returns.
        NOTE(review): previously documented as (image, label, image_affine,
        label_affine, image_pixdim, label_pixdim) - confirm against
        ``src.data_pipeline.transformations``.
    """
    autotune = tf.data.AUTOTUNE

    # Each dataset element is one serialized tf.train.Example.
    filenames = tf.io.gfile.glob(tfrecord_pattern)
    dataset = tf.data.TFRecordDataset(filenames)

    # Shuffle the serialized records before parsing so the shuffle buffer
    # holds raw records rather than decoded elements.
    if shuffle:
        dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)

    dataset = dataset.map(parse_tfrecord_fn, num_parallel_calls=autotune)
    dataset = dataset.map(rearrange_shape, num_parallel_calls=autotune)

    transformation = train_transformation if shuffle else val_transformation
    dataset = dataset.map(transformation, num_parallel_calls=autotune)

    dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
    return dataset.prefetch(autotune)
Когда я обучаю модель на одном графическом процессоре, обучение всегда завершается успешно. Однако при обучении на нескольких графических процессорах (пробовал 2, 3 и 4) процесс всегда аварийно завершается с ошибкой «Floating point exception (core dumped)»:
2025-12-17 12:28:31.666487: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
keras backend: tensorflow
keras version: 3.12.0
tensorflow version: 2.20.0
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
I0000 00:00:1765970958.724694 160312 gpu_process_state.cc:208] Using CUDA malloc Async allocator for GPU: 0
I0000 00:00:1765970958.725422 160312 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38670 MB memory: -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:87:00.0, compute capability: 8.0
I0000 00:00:1765970958.725885 160312 gpu_process_state.cc:208] Using CUDA malloc Async allocator for GPU: 1
I0000 00:00:1765970958.726444 160312 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 38670 MB memory: -> device: 1, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:c7:00.0, compute capability: 8.0
Keras backend tensorflow
Total device found 2
WARNING:tensorflow:From /net/pr2/projects/plgrid/plggneural/3d_seg_project/3D-Medical-Image-Segmentation/venv/lib/python3.10/site-packages/tensorflow/python/util/deprecation.py:660: calling map_fn_v2 (from tensorflow.python.ops.map_fn) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Use fn_output_signature instead
Model size: 42.66 M
2025-12-17 12:29:26.512602: I tensorflow/core/kernels/data/tf_record_dataset_op.cc:390] TFRecordDataset `buffer_size` is unspecified, default to 262144
2025-12-17 12:29:43.247925: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: CANCELLED: GetNextFromShard was cancelled
[[{{node MultiDeviceIteratorGetNextFromShard}}]]
2025-12-17 12:29:43.248224: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: CANCELLED: GetNextFromShard was cancelled
[[{{node MultiDeviceIteratorGetNextFromShard}}]]
[[RemoteCall]]
2025-12-17 12:29:43.249328: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: CANCELLED: GetNextFromShard was cancelled
[[{{node MultiDeviceIteratorGetNextFromShard}}]]
[[RemoteCall]]
Epoch 1/2
2025-12-17 12:31:05.058981: I external/local_xla/xla/service/service.cc:163] XLA service 0x146328c01540 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-12-17 12:31:05.059020: I external/local_xla/xla/service/service.cc:171] StreamExecutor device (0): NVIDIA A100-SXM4-40GB, Compute Capability 8.0
2025-12-17 12:31:05.059283: I external/local_xla/xla/service/service.cc:171] StreamExecutor device (1): NVIDIA A100-SXM4-40GB, Compute Capability 8.0
2025-12-17 12:31:05.137238: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91700
2025-12-17 12:31:07.623407: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:07.805392: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
I0000 00:00:1765971068.804270 160357 device_compiler.h:196] Compiled cluster using XLA! This line is logged at most once for the lifetime of the process.
2025-12-17 12:31:09.734860: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:09.912544: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:32.318966: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=0,k13=2,k14=3,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:31:32.344476: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 1.025559828s
Trying algorithm eng23{k2=0,k13=2,k14=3,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:31:34.156184: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:31:35.306164: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 2.15004513s
Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:31:36.744870: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:36.927667: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:37.081083: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:37.253482: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:37.430320: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:31:42.279943: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:31:43.257783: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 1.977915871s
Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:31:46.830807: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91700
48/Unknown 450s 6s/step - dice: 0.0777 - dice_et: 0.0201 - dice_tc: 0.0344 - dice_wt: 0.1787 - loss: 3.0380
2025-12-17 12:37:14.868497: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
[[{{node MultiDeviceIteratorGetNextFromShard}}]]
[[RemoteCall]]
2025-12-17 12:37:15.919125: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:37:16.093522: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:37:17.873295: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:37:18.032921: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:37:27.808009: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[3,4,128,128,128]{4,3,2,1,0}, f32[3,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:37:28.394435: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 1.586189512s
Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[3,4,128,128,128]{4,3,2,1,0}, f32[3,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:37:28.948297: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:37:29.110080: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-17 12:37:32.859598: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[3,4,128,128,128]{4,3,2,1,0}, f32[3,16,128,128,128]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
2025-12-17 12:37:33.324957: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 1.465430048s
Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[3,4,128,128,128]{4,3,2,1,0}, f32[3,16,128,128,128]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while...
Floating point exception (core dumped)
Я пробовал другие значения ExperimentConfig.batch_size_train, но проблема осталась прежней.
Вот версии платформы:
TensorFlow version: 2.20.0
Keras version: 3.12.0
CUDA version: 12.5.1
cuDNN version: 9
Итак, мои вопросы:
[*]Почему я всегда получаю исключение с плавающей запятой (сброс ядра) при использовании нескольких графических процессоров?
[*]Я всегда получаю предупреждение UserWarning: на входе закончились данные; обучение прерывается.
Убедитесь, что ваш набор данных или генератор может сгенерировать не менее
`steps_per_epoch * epochs` пакетов.
Возможно, вам придётся использовать функцию `.repeat()` при построении набора данных.
Я не указывал `steps_per_epoch` и не использовал `.repeat()`, поэтому можно ли игнорировать это предупреждение? Я хочу, чтобы в каждой эпохе модель обучалась до тех пор, пока набор данных не будет исчерпан.
[*]Я впервые использую распределённое обучение в TensorFlow. Правильно ли я его использую? Я использовал
`strategy.scope()` только при определении модели, swi_callback_metric, ckpt (tf.train.Checkpoint) и ckpt_manager, но не использовал его при создании tf_ckpt_callback и swi_callback. См. model.py
Я пытаюсь обучить модель сегментации 3D-медицинских изображений с помощью Tensorflow и Keras: Model.py: [code]import time import logging import os import datetime os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async" os.environ["KERAS_BACKEND"] = "tensorflow" # choose any: 'tensorflow', 'torch', 'jax' #os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' #os.environ['TF_CPP_MAX_VLOG_LEVEL'] = '0' import tensorflow as tf gpus = tf.config.list_physical_devices('GPU') if gpus: for gpu in gpus: tf.config.experimental.set_memory_growth(gpu, True) import numpy as np import pandas as pd from matplotlib import pyplot as plt import keras from keras import ops from keras import layers from keras import mixed_precision from medicai.models import UNETRPlusPlus from medicai.metrics import BinaryDiceMetric from medicai.losses import BinaryDiceCELoss from medicai.utils.inference import SlidingWindowInference from medicai.callbacks import SlidingWindowInferenceCallback import sys sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) from src.experiment_config import ExperimentConfig from src.data_pipeline.data_loader import data_loader import pandas as pd import numpy as np os.environ["KERAS_BACKEND"] = "tensorflow"
class TFCheckpointCallback(keras.callbacks.Callback): """Save model + optimizer + epoch using TF checkpointing.""" def __init__(self, ckpt, ckpt_manager): super().__init__() self.ckpt = ckpt self.ckpt_manager = ckpt_manager
def run_sliding_window_inference_per_class_average(model, ds, roi_size, sw_batch_size, overlap, metrics_list): """ Run sliding window inference on a dataset and compute all metrics (average + per class) """ for metric in metrics_list: metric.reset_states()
with strategy.scope(): ckpt = tf.train.Checkpoint( epoch=tf.Variable(0), # epoch counter — saved as part of checkpoint optimizer=model.optimizer, # optimizer state model=model # model weights )
history = model.fit( train_ds, epochs=ExperimentConfig.epochs, callbacks=[ swi_callback, tf_ckpt_callback ])
end_time = time.time() training_time = end_time - start_time print(f"Total training time (seconds): {training_time:.2f}")
# Save training time to a file with open(os.path.join(logs_path, f"training_time_{timestamp}.txt"), "w") as f: f.write(f"Total training time (seconds): {training_time:.2f}\n")
# Save history to CSV history_file = os.path.join(history_path, f"training_history_{timestamp}.csv") pd.DataFrame(history.history).to_csv(history_file, index=False)
# Plot training loss plt.figure(figsize=(10, 5)) plt.plot(history.history['loss'], label='train_loss') plt.xlabel("Epoch") plt.ylabel("Loss") plt.title("Training Loss") plt.legend() plt.grid() plt.savefig(os.path.join(plots_path, f"loss_curve_{timestamp}.png")) plt.close()
# Plot average Dice if 'dice' in history.history: plt.figure(figsize=(10, 5)) plt.plot(history.history['dice'], label='train_dice') plt.xlabel("Epoch") plt.ylabel("Average Dice") plt.title("Training Average Dice") plt.legend() plt.grid() plt.savefig(os.path.join(plots_path, f"dice_curve_{timestamp}.png")) plt.close()
print("Training and saving plots finished successfully.") [/code] dataloader.py: [code]import os import sys sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) from src.deserialization.full_deserialization import parse_tfrecord_fn from src.data_pipeline.transformations import rearrange_shape from src.data_pipeline.transformations import train_transformation, val_transformation import tensorflow as tf
def data_loader (tfrecord_pattern, batch_size=1, shuffle=True): """Create the data loader. This function builds a `tf.data.Dataset` pipeline that reads serialized TFRecords from one or more files (using a glob pattern), applies parsing, reshaping, and transformations, and returns batches ready for training or validation.
Args: tfrecord_pattern (str): File path pattern matching one or more TFRecord files. batch_size (int, optional): Number of samples per batch. Defaults to 1. shuffle (bool, optional): Whether to shuffle the dataset. Defaults to True.
Returns: tf.data.Dataset: A batched and prefetched dataset yielding tuples of (image, label, image_affine, label_affine, image_pixdim, label_pixdim) after parsing, reshaping, and transformations. """
num_parallel_calls = tf.data.AUTOTUNE dataset = tf.data.TFRecordDataset(tf.io.gfile.glob(tfrecord_pattern)) # dataset is a tf.data.Dataset where each element is one serialized tf.train.Example
def __repr__(self): return ( f"ExperimentConfig(batch_size_train={self.batch_size_train}, " f"batch_size_val={self.batch_size_val}, " f"epochs={self.epochs}, " f"lr={self.lr}, " f"weight_decay={self.weight_decay}, " f"sliding_window_interval={self.sliding_window_interval}, " f"sliding_window_overlap={self.sliding_window_overlap}, " f"input_shape={self.input_shape}, " f"num_classes={self.num_classes}, " f"PROB={self.PROB}, " f"sw_batch_size={self.sw_batch_size})" ) [/code] Когда я тренируюсь на одном графическом процессоре, обучение всегда завершается успешно. Однако, когда я тренируюсь на нескольких графических процессорах (пробовал использовать 2, 3 и 4 графических процессора), я всегда получаю исключение с плавающей запятой (сброс ядра) [code]2025-12-17 12:28:31.666487: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. keras backend: tensorflow keras version: 3.12.0 tensorflow version: 2.20.0
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR I0000 00:00:1765970958.724694 160312 gpu_process_state.cc:208] Using CUDA malloc Async allocator for GPU: 0 I0000 00:00:1765970958.725422 160312 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38670 MB memory: -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:87:00.0, compute capability: 8.0 I0000 00:00:1765970958.725885 160312 gpu_process_state.cc:208] Using CUDA malloc Async allocator for GPU: 1 I0000 00:00:1765970958.726444 160312 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 38670 MB memory: -> device: 1, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:c7:00.0, compute capability: 8.0 Keras backend tensorflow Total device found 2 WARNING:tensorflow:From /net/pr2/projects/plgrid/plggneural/3d_seg_project/3D-Medical-Image-Segmentation/venv/lib/python3.10/site-packages/tensorflow/python/util/deprecation.py:660: calling map_fn_v2 (from tensorflow.python.ops.map_fn) with dtype is deprecated and will be removed in a future version. 
Instructions for updating: Use fn_output_signature instead Model size: 42.66 M 2025-12-17 12:29:26.512602: I tensorflow/core/kernels/data/tf_record_dataset_op.cc:390] TFRecordDataset `buffer_size` is unspecified, default to 262144 2025-12-17 12:29:43.247925: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: CANCELLED: GetNextFromShard was cancelled [[{{node MultiDeviceIteratorGetNextFromShard}}]] 2025-12-17 12:29:43.248224: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: CANCELLED: GetNextFromShard was cancelled [[{{node MultiDeviceIteratorGetNextFromShard}}]] [[RemoteCall]] 2025-12-17 12:29:43.249328: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: CANCELLED: GetNextFromShard was cancelled [[{{node MultiDeviceIteratorGetNextFromShard}}]] [[RemoteCall]] Epoch 1/2 2025-12-17 12:31:05.058981: I external/local_xla/xla/service/service.cc:163] XLA service 0x146328c01540 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices: 2025-12-17 12:31:05.059020: I external/local_xla/xla/service/service.cc:171] StreamExecutor device (0): NVIDIA A100-SXM4-40GB, Compute Capability 8.0 2025-12-17 12:31:05.059283: I external/local_xla/xla/service/service.cc:171] StreamExecutor device (1): NVIDIA A100-SXM4-40GB, Compute Capability 8.0 2025-12-17 12:31:05.137238: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91700 2025-12-17 12:31:07.623407: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems. 2025-12-17 12:31:07.805392: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems. 
I0000 00:00:1765971068.804270 160357 device_compiler.h:196] Compiled cluster using XLA! This line is logged at most once for the lifetime of the process. 2025-12-17 12:31:09.734860: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems. 2025-12-17 12:31:09.912544: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems. 2025-12-17 12:31:32.318966: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=0,k13=2,k14=3,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while... 2025-12-17 12:31:32.344476: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 1.025559828s Trying algorithm eng23{k2=0,k13=2,k14=3,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while... 
2025-12-17 12:31:34.156184: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while... 2025-12-17 12:31:35.306164: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 2.15004513s Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while... 2025-12-17 12:31:36.744870: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems. 2025-12-17 12:31:36.927667: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems. 2025-12-17 12:31:37.081083: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. 
There may be a missing warmup execution, please investigate in Nsight Systems. 2025-12-17 12:31:37.253482: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems. 2025-12-17 12:31:37.430320: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems. 2025-12-17 12:31:42.279943: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while... 2025-12-17 12:31:43.257783: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 1.977915871s Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[4,4,128,128,128]{4,3,2,1,0}, f32[4,16,128,128,128]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while... 
2025-12-17 12:31:46.830807: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91700 48/Unknown 450s 6s/step - dice: 0.0777 - dice_et: 0.0201 - dice_tc: 0.0344 - dice_wt: 0.1787 - loss: 3.03802025-12-17 12:37:14.868497: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence [[{{node MultiDeviceIteratorGetNextFromShard}}]] [[RemoteCall]] 2025-12-17 12:37:15.919125: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems. 2025-12-17 12:37:16.093522: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems. 2025-12-17 12:37:17.873295: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems. 2025-12-17 12:37:18.032921: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems. 
2025-12-17 12:37:27.808009: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[3,4,128,128,128]{4,3,2,1,0}, f32[3,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while... 2025-12-17 12:37:28.394435: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 1.586189512s Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,1,1,1]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[3,4,128,128,128]{4,3,2,1,0}, f32[3,16,128,128,128]{4,3,2,1,0}), window={size=1x1x1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while... 2025-12-17 12:37:28.948297: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems. 2025-12-17 12:37:29.110080: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems. 
2025-12-17 12:37:32.859598: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[3,4,128,128,128]{4,3,2,1,0}, f32[3,16,128,128,128]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while... 2025-12-17 12:37:33.324957: E external/local_xla/xla/service/slow_operation_alarm.cc:140] The operation took 1.465430048s Trying algorithm eng23{k2=6,k13=0,k14=2,k18=1,k23=0} for conv (f32[16,4,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[3,4,128,128,128]{4,3,2,1,0}, f32[3,16,128,128,128]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false,"reification_cost":[]} is taking a while... Floating point exception (core dumped) [/code] Я попробовал другое значение ExperimentConfig.batch_size_train, но проблема осталась та же. Вот версии платформы: [code]TensorFlow version: 2.20.0 Keras version: 3.12.0 CUDA version: 12.5.1 cuDNN version: 9 [/code] Итак, мои вопросы:
[*]Почему при использовании нескольких графических процессоров я всегда получаю ошибку «Floating point exception (core dumped)» (исключение операции с плавающей запятой, аварийное завершение с дампом памяти)?
[*]Я всегда получаю предупреждение (UserWarning): «На входе закончились данные; обучение прерывается. Убедитесь, что ваш набор данных или генератор может выдать не менее [code]steps_per_epoch * epochs[/code] пакетов. Возможно, при построении набора данных вам потребуется функция .repeat()». Я не задавал steps_per_epoch и не использовал repeat(); можно ли игнорировать это предупреждение? Я хочу, чтобы в каждой эпохе модель обучалась до тех пор, пока набор данных не будет исчерпан.
[*]Я впервые использую распределённое обучение в TensorFlow. Правильно ли я его применяю? Блок [code]with strategy.scope():[/code] я использовал только при определении модели, swi_callback_metric, ckpt (tf.train.Checkpoint) и ckpt_manger, но не при создании tf_ckpt_callback и swi_callback. См. model.py