Код: Выделить всё
import os
import torch
import atexit
import gc
from ultralytics import YOLO
from torch.nn import DataParallel
# Process-wide CUDA configuration, applied in a single call:
#   * CUDA_VISIBLE_DEVICES restricts the process to physical GPUs
#     0, 1, 3, 4, 5 and 6 (GPU 2 is deliberately left out), with the intent
#     of remapping them to a contiguous set.
#   * PYTORCH_CUDA_ALLOC_CONF enables expandable segments to help reduce
#     memory fragmentation.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0,1,3,4,5,6",
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
    }
)
def clear_gpu_memory() -> None:
    """Release cached CUDA memory and reset the peak-memory statistics.

    The function is a no-op when CUDA has not been initialised in this
    process: there is no allocator cache to release in that case, and
    calling ``torch.cuda.reset_peak_memory_stats()`` would otherwise try to
    initialise CUDA just to clean up (and raise on a machine without a
    usable GPU), which is undesirable inside an ``atexit`` hook.
    """
    if not torch.cuda.is_initialized():
        return
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()


# Ensure that GPU memory is cleared on exit.
atexit.register(clear_gpu_memory)
# Load the pretrained YOLOv9 segmentation model and compile it.
model = YOLO("yolov9e-seg.pt")
# NOTE(review): confirm that the installed Ultralytics trainer works with a
# torch.compile-wrapped module; compiled wrappers expose state_dict keys with
# an `_orig_mod.` prefix, which may interfere with pretrained-weight transfer.
model.model = torch.compile(model.model)

try:
    # Train the model with the specified parameters.
    # NOTE(review): CUDA_VISIBLE_DEVICES is already set earlier in this file;
    # confirm whether the installed Ultralytics version treats these `device`
    # entries as physical GPU ids or as remapped (0..N-1) ids.
    model.train(
        data='training_data/brain_data.yaml',
        epochs=2,
        imgsz=4096,
        batch=6,
        project='brain_segmentation',
        name='testrun',
        device=[0, 1, 3, 4, 5, 6],
        close_mosaic=1,
        save_period=1,
        amp=True,
        cache=False,
        overlap_mask=False,
        workers=4,
    )

    # If available, drop the optimizer reference to free memory.
    try:
        del model.optimizer
    except AttributeError:
        pass

    # Force garbage collection and clear cached GPU memory after training.
    gc.collect()
    torch.cuda.empty_cache()

    # Number of GPUs visible to this process (numbered 0..N-1 by torch).
    available_gpus = torch.cuda.device_count()
    print(f"Available GPUs (contiguous numbering): {list(range(available_gpus))}")

    # Move the trained network to the GPU and fuse it for validation/export.
    # No DataParallel wrapper is created: training is already finished at
    # this point and fuse() has to be called on the bare module anyway, so
    # wrapping it only to unwrap it again would be dead code.
    model.model.to('cuda')
    model.model = model.model.fuse(verbose=False)
    print("Model fused.")

    # Validate using memory optimizations:
    # - torch.inference_mode() to disable gradient tracking.
    # - torch.amp.autocast with device_type='cuda' for mixed-precision inference.
    # NOTE(review): a device list is passed here although the fused model is
    # expected to run on a single device -- confirm how the installed
    # Ultralytics validator handles multiple devices.
    with torch.inference_mode(), torch.amp.autocast(device_type='cuda'):
        model.val(
            device=list(range(available_gpus)),
            batch=6,
            imgsz=4096,
        )
    print("Validation complete.")

    # Export the fused model to ONNX (typically done on a single GPU).
    model.export(
        device=0,
        imgsz=4096,
        half=True,
        simplify=True,
        opset=12,
    )
except KeyboardInterrupt:
    # Ctrl-C is not an Exception subclass, so it needs its own handler.
    print("Training interrupted. Clearing GPU memory...")
    clear_gpu_memory()
    raise
except Exception as e:
    # Report, clean up, then re-raise so the failure is not hidden.
    print(f"An error occurred: {e}. Clearing GPU memory...")
    clear_gpu_memory()
    raise
# Dataset configuration for the brain-structure segmentation run
# (presumably the file the training script loads as
# training_data/brain_data.yaml -- confirm).
path: work_my/new_yolo_4096/training_data
# NOTE(review): both the image and the label directory are listed as entries
# of `train` / `val`. Ultralytics dataset files usually list image
# directories only and locate the labels from them -- confirm that the label
# entries are intended here.
train:
- images/train # Path to training images
- labels/train # Path to training annotations
val:
- images/val # Path to validation images
- labels/val # Path to validation annotations
# Number of classes; `names` below lists 25 entries.
nc: 25
names: ['Thalamus', 'Caudate nucleus', 'Putamen', 'Globus pallidus', 'Nucleus accumbens', 'Internal capsule', 'Substantia innominata', 'Fornix', 'Anterior commissure', 'Ganglionic eminence', 'Hypothalamus', 'Amygdala', 'Hippocampus', 'Choroid plexus', 'Lateral ventricle', 'Olfactory tubercle', 'Pretectum', 'Inferior colliculus', 'Superior colliculus', 'Tegmentum', 'Pons', 'Medulla', 'Cerebellum', 'Corpus callosum', 'Cerebral cortex']
</code>
Несколько замечаний:</p>
[*] Мои обучающие данные подготовлены правильно: проблем с загрузкой данных или с неправильными путями в конфигурации нет.
[*] Я хочу обучать модель именно при разрешении 4096x4096, поэтому, пожалуйста, не предлагайте уменьшать размер изображения.
Размер батча должен быть равен количеству устройств, поэтому минимум составляет 6; я оставил это значение, и уменьшить его (чтобы сэкономить память) нельзя.</li>
Все графические процессоры свободны, и никакая другая программа их не использует. Обучение прерывается на этом этапе:</p>
Starting training for 2 epochs...
Epoch GPU_mem box_loss seg_loss cls_loss dfl_loss Instances Size
1/2 81G 2.821 6.069 54.57 2.973 30 4096: 100%|██████████| 45/45 [00:56
Подробнее здесь: https://stackoverflow.com/questions/794 ... -could-but