def get_device_via_env_variables(deterministic: bool = False, verbose: bool = True) -> torch.device:
    """Pick the torch device to run on, honouring ``CUDA_VISIBLE_DEVICES``.

    ``CUDA_VISIBLE_DEVICES`` holds *physical* GPU ids, but inside the process
    the visible GPUs are re-numbered ``0 .. torch.cuda.device_count() - 1``.
    With ``CUDA_VISIBLE_DEVICES=3`` the only valid device is therefore
    ``cuda:0`` (which *is* physical GPU 3); asking for ``cuda:3`` fails with
    "Attempting to deserialize object on CUDA device 3 but
    torch.cuda.device_count() is 1". This function consequently always returns
    a *logical* index and never echoes the ids found in the variable.

    Args:
        deterministic: only matters when several GPUs are visible. If True the
            last visible GPU is chosen; otherwise one is chosen at random.
        verbose: if True, print the selected device.

    Returns:
        ``torch.device("cpu")`` when CUDA is unavailable, otherwise a
        ``cuda:<logical index>`` device that exists in this process.
    """
    device: torch.device = torch.device("cpu")
    if torch.cuda.is_available():
        # Number of GPUs this process can actually address (after any
        # CUDA_VISIBLE_DEVICES masking); valid indices are 0..num_visible-1.
        num_visible: int = torch.cuda.device_count()
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or num_visible <= 1:
            # No masking requested, or exactly one GPU exposed: it is cuda:0.
            logical_idx: int = 0
        elif deterministic:
            # Mirrors the previous "take the last listed GPU" behaviour.
            logical_idx = num_visible - 1
        else:
            # Local import: `random` is only needed on this branch.
            import random
            logical_idx = random.randint(0, num_visible - 1)
        device = torch.device(f"cuda:{logical_idx}")
    if verbose:
        print(f'{device=}')
    return device
У меня есть подозрение, что gpu_idx и значения из CUDA_VISIBLE_DEVICES на самом деле не соответствуют друг другу... Я просто хочу выбрать нужный графический процессор. Как это сделать?
ошибка:
Traceback (most recent call last):
File "/lfs/ampere1/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_experiment_analysis_sl_vs_maml_performance_comp_distance.py", line 1368, in <module>
main_data_analyis()
File "/lfs/ampere1/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_experiment_analysis_sl_vs_maml_performance_comp_distance.py", line 1163, in main_data_analyis
args: Namespace = load_args()
File "/lfs/ampere1/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_experiment_analysis_sl_vs_maml_performance_comp_distance.py", line 1152, in load_args
args.meta_learner = get_maml_meta_learner(args)
File "/afs/cs.stanford.edu/u/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/data_analysis/common.py", line 272, in get_maml_meta_learner
base_model = load_model_ckpt(args, path_to_checkpoint=args.path_2_init_maml)
File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/mains/common.py", line 265, in load_model_ckpt
base_model, _, _ = load_model_optimizer_scheduler_from_ckpt(args, path_to_checkpoint,
File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/mains/common.py", line 81, in load_model_optimizer_scheduler_from_ckpt
ckpt: dict = torch.load(path_to_checkpoint, map_location=torch.device('cuda:3'))
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 607, in load
return _load(opened_zipfile, map_location, pickle_module, **pickle_load_args)
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 882, in _load
result = unpickler.load()
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 857, in persistent_load
load_tensor(data_type, size, key, _maybe_decode_ascii(location))
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 846, in load_tensor
loaded_storages[key] = restore_location(storage, location)
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 827, in restore_location
return default_restore_location(storage, str(map_location))
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 175, in default_restore_location
result = fn(storage, location)
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 151, in _cuda_deserialize
device = validate_cuda_device(location)
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 142, in validate_cuda_device
raise RuntimeError('Attempting to deserialize object on CUDA device '
RuntimeError: Attempting to deserialize object on CUDA device 3 but torch.cuda.device_count() is 1. Please use torch.load with map_location to map your storages to an existing device.
Вопрос мотивирован тем, что я пытаюсь использовать оставшиеся 40 ГБ памяти для моей 5CNN с 256 и 512 фильтрами, но в результате возникают ошибки нехватки памяти:
Traceback (most recent call last):
File "/lfs/ampere1/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_experiment_analysis_sl_vs_maml_performance_comp_distance.py", line 1368, in <module>
main_data_analyis()
File "/lfs/ampere1/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_experiment_analysis_sl_vs_maml_performance_comp_distance.py", line 1213, in main_data_analyis
stats_analysis_with_emphasis_on_effect_size(args, hist=True)
File "/afs/cs.stanford.edu/u/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/data_analysis/stats_analysis_with_emphasis_on_effect_size.py", line 74, in stats_analysis_with_emphasis_on_effect_size
results_usl: dict = get_episodic_accs_losses_all_splits_usl(args, args.mdl_sl, loaders)
File "/afs/cs.stanford.edu/u/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/data_analysis/common.py", line 616, in get_episodic_accs_losses_all_splits_usl
losses, accs = agent.get_lists_accs_losses(data, training)
File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/meta_learners/pretrain_convergence.py", line 92, in get_lists_accs_losses
spt_embeddings_t = self.get_embedding(spt_x_t, self.base_model).detach()
File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/meta_learners/pretrain_convergence.py", line 166, in get_embedding
return get_embedding(x=x, base_model=base_model)
File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/meta_learners/pretrain_convergence.py", line 267, in get_embedding
out = base_model.model.features(x)
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/container.py", line 139, in forward
input = module(input)
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 443, in forward
return self._conv_forward(input, self.weight, self.bias)
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 439, in _conv_forward
return F.conv2d(input, weight, bias, self.stride,
RuntimeError: CUDA out of memory. Tried to allocate 174.00 MiB (GPU 0; 79.20 GiB total capacity; 54.31 GiB already allocated; 22.56 MiB free; 54.61 GiB reserved in total by PyTorch)
Я придумал этот код, но он приводит к бесконечным ошибкам: [code]def get_device_via_env_variables(deterministic: bool = False, verbose: bool = True) -> torch.device: device: torch.device = torch.device("cpu") if torch.cuda.is_available(): if 'CUDA_VISIBLE_DEVICES' not in os.environ: device: torch.device = torch.device("cuda:0") else: gpu_idx: list[str] = os.environ['CUDA_VISIBLE_DEVICES'].split(',') if len(gpu_idx) == 1: gpu_idx: str = gpu_idx[0] else: # generate random int from 0 to len(gpu_idx) with import statement import random idx: int = random.randint(0, len(gpu_idx) - 1) if not deterministic else -1 gpu_idx: str = gpu_idx[idx] device: torch.device = torch.device(f"cuda:{gpu_idx}") if verbose: print(f'{device=}') return device [/code] У меня есть подозрение, что gpu_idx и CUDA_VISIBLE_DEVICES на самом деле не совпадают... Я просто хочу загрузить правильный графический процессор. Как это сделать? ошибка: [code]Traceback (most recent call last):aded (0.000 MB deduped) File "/lfs/ampere1/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_experiment_analysis_sl_vs_maml_performance_comp_distance.py", line 1368, in main_data_analyis() File "/lfs/ampere1/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_experiment_analysis_sl_vs_maml_performance_comp_distance.py", line 1163, in main_data_analyis args: Namespace = load_args() File "/lfs/ampere1/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_experiment_analysis_sl_vs_maml_performance_comp_distance.py", line 1152, in load_args args.meta_learner = get_maml_meta_learner(args) File "/afs/cs.stanford.edu/u/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/data_analysis/common.py", line 272, in get_maml_meta_learner base_model = load_model_ckpt(args, path_to_checkpoint=args.path_2_init_maml) File 
"/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/mains/common.py", line 265, in load_model_ckpt base_model, _, _ = load_model_optimizer_scheduler_from_ckpt(args, path_to_checkpoint, File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/mains/common.py", line 81, in load_model_optimizer_scheduler_from_ckpt ckpt: dict = torch.load(path_to_checkpoint, map_location=torch.device('cuda:3')) File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 607, in load return _load(opened_zipfile, map_location, pickle_module, **pickle_load_args) File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 882, in _load result = unpickler.load() File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 857, in persistent_load load_tensor(data_type, size, key, _maybe_decode_ascii(location)) File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 846, in load_tensor loaded_storages[key] = restore_location(storage, location) File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 827, in restore_location return default_restore_location(storage, str(map_location)) File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 175, in default_restore_location result = fn(storage, location) File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 151, in _cuda_deserialize device = validate_cuda_device(location) File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 142, in validate_cuda_device raise RuntimeError('Attempting to deserialize object on CUDA device ' 
RuntimeError: Attempting to deserialize object on CUDA device 3 but torch.cuda.device_count() is 1. Please use torch.load with map_location to map your storages to an existing device. [/code] мотивирован тем фактом, что я пытаюсь использовать оставшиеся 40 ГБ моего 5CNN с фильтрами 256 и 512, но в результате возникают проблемы с памятью [code]Traceback (most recent call last): File "/lfs/ampere1/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_experiment_analysis_sl_vs_maml_performance_comp_distance.py", line 1368, in main_data_analyis() File "/lfs/ampere1/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_experiment_analysis_sl_vs_maml_performance_comp_distance.py", line 1213, in main_data_analyis stats_analysis_with_emphasis_on_effect_size(args, hist=True) File "/afs/cs.stanford.edu/u/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/data_analysis/stats_analysis_with_emphasis_on_effect_size.py", line 74, in stats_analysis_with_emphasis_on_effect_size results_usl: dict = get_episodic_accs_losses_all_splits_usl(args, args.mdl_sl, loaders) File "/afs/cs.stanford.edu/u/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/data_analysis/common.py", line 616, in get_episodic_accs_losses_all_splits_usl losses, accs = agent.get_lists_accs_losses(data, training) File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/meta_learners/pretrain_convergence.py", line 92, in get_lists_accs_losses spt_embeddings_t = self.get_embedding(spt_x_t, self.base_model).detach() File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/meta_learners/pretrain_convergence.py", line 166, in get_embedding return get_embedding(x=x, base_model=base_model) File 
"/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/meta_learners/pretrain_convergence.py", line 267, in get_embedding out = base_model.model.features(x) File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl return forward_call(*input, **kwargs) File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/container.py", line 139, in forward input = module(input) File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl return forward_call(*input, **kwargs) File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 443, in forward return self._conv_forward(input, self.weight, self.bias) File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 439, in _conv_forward return F.conv2d(input, weight, bias, self.stride, RuntimeError: CUDA out of memory. Tried to allocate 174.00 MiB (GPU 0; 79.20 GiB total capacity; 54.31 GiB already allocated; 22.56 MiB free; 54.61 GiB reserved in total by PyTorch) [/code] Я хочу использовать графический процессор 3, но последняя ошибка говорит о графическом процессоре 0. Что я делаю не так? cross: https://discuss.pytorch.org/t/how-do-you-load-a-specific-gpu-from-cuda-available-devices-in-pytorch/174044