File "/home/miranda9/miniconda3/envs/metalearning_gpu/lib/python3.9/site-packages/torch/serialization.py", line 379, in save
_save(obj, opened_zipfile, pickle_module, pickle_protocol)
File "/home/miranda9/miniconda3/envs/metalearning_gpu/lib/python3.9/site-packages/torch/serialization.py", line 499, in _save
zip_file.write_record(name, storage.data_ptr(), num_bytes)
OSError: [Errno 116] Stale file handle
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/shared/rsaas/miranda9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_dist_maml_l2l.py", line 1815, in
main()
File "/shared/rsaas/miranda9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_dist_maml_l2l.py", line 1748, in main
train(args=args)
File "/shared/rsaas/miranda9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_dist_maml_l2l.py", line 1795, in train
meta_train_iterations_ala_l2l(args, args.agent, args.opt, args.scheduler)
File "/home/miranda9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/training/meta_training.py", line 213, in meta_train_iterations_ala_l2l
log_train_val_stats(args, args.it, step_name, train_loss, train_acc, training=True)
File "/home/miranda9/ultimate-utils/ultimate-utils-proj-src/uutils/logging_uu/wandb_logging/supervised_learning.py", line 55, in log_train_val_stats
_log_train_val_stats(args=args,
File "/home/miranda9/ultimate-utils/ultimate-utils-proj-src/uutils/logging_uu/wandb_logging/supervised_learning.py", line 113, in _log_train_val_stats
save_for_supervised_learning(args, ckpt_filename='ckpt.pt')
File "/home/miranda9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/checkpointing_uu/supervised_learning.py", line 54, in save_for_supervised_learning
torch.save({'training_mode': args.training_mode,
File "/home/miranda9/miniconda3/envs/metalearning_gpu/lib/python3.9/site-packages/torch/serialization.py", line 380, in save
return
File "/home/miranda9/miniconda3/envs/metalearning_gpu/lib/python3.9/site-packages/torch/serialization.py", line 259, in __exit__
self.file_like.write_end_of_file()
RuntimeError: [enforce fail at inline_container.cc:298] . unexpected pos 2736460544 vs 2736460432
# - ckpt
# Only plain data (scalars, strings, dicts, state dicts) is written, never live objects,
# so that the checkpoint can be loaded later with no problems.
args_pickable: Namespace = uutils.make_args_pickable(args)

# Training progress plus the run arguments. Only the dict form of the args is stored
# (not the Namespace itself) so the checkpoint does not break on load; rebuild the
# Namespace from 'args_dict' when loading. Older checkpoints may lack an 'args' entry.
checkpoint = {
    'training_mode': args.training_mode,
    'it': args.it,
    'epoch_num': args.epoch_num,
    'args_dict': vars(args_pickable),
}

# Model: weights are taken from whatever get_model_from_ddp returns
# (presumably the module unwrapped from DDP -- confirm in its definition).
# 'model_str' is the string form of the model, kept for later inspection.
checkpoint.update({
    'model_state_dict': get_model_from_ddp(args.model).state_dict(),
    'model_str': str(args.model),
    'model_hps': args.model_hps,
    'model_option': args.model_option,
})

# Optimizer: state plus its string form and the hyper-parameters/option it was built from.
checkpoint.update({
    'opt_state_dict': args.opt.state_dict(),
    'opt_str': str(args.opt),
    'opt_hps': args.opt_hps,
    'opt_option': args.opt_option,
})

# Scheduler: the state dict is obtained through the try_to_get_scheduler_state_dict helper.
checkpoint.update({
    'scheduler_str': str(args.scheduler),
    'scheduler_state_dict': try_to_get_scheduler_state_dict(args.scheduler),
    'scheduler_hps': args.scheduler_hps,
    'scheduler_option': args.scheduler_option,
})

# Single write of the whole checkpoint to <log_root>/<ckpt_filename>.
torch.save(checkpoint,
           pickle_module=pickle,
           f=args.log_root / ckpt_filename)
Если это неправильный способ сохранять чекпойнты моделей Hugging Face (HF), то какой способ правильный?
Работает ли torch.save с моделями Hugging Face (я использую ViT)? Я предполагал, что да. Вот моя ошибка: [code] File "/home/miranda9/miniconda3/envs/metalearning_gpu/lib/python3.9/site-packages/torch/serialization.py", line 379, in save _save(obj, opened_zipfile, pickle_module, pickle_protocol) File "/home/miranda9/miniconda3/envs/metalearning_gpu/lib/python3.9/site-packages/torch/serialization.py", line 499, in _save zip_file.write_record(name, storage.data_ptr(), num_bytes) OSError: [Errno 116] Stale file handle During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/shared/rsaas/miranda9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_dist_maml_l2l.py", line 1815, in main() File "/shared/rsaas/miranda9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_dist_maml_l2l.py", line 1748, in main train(args=args) File "/shared/rsaas/miranda9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_dist_maml_l2l.py", line 1795, in train meta_train_iterations_ala_l2l(args, args.agent, args.opt, args.scheduler) File "/home/miranda9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/training/meta_training.py", line 213, in meta_train_iterations_ala_l2l log_train_val_stats(args, args.it, step_name, train_loss, train_acc, training=True) File "/home/miranda9/ultimate-utils/ultimate-utils-proj-src/uutils/logging_uu/wandb_logging/supervised_learning.py", line 55, in log_train_val_stats _log_train_val_stats(args=args, File "/home/miranda9/ultimate-utils/ultimate-utils-proj-src/uutils/logging_uu/wandb_logging/supervised_learning.py", line 113, in _log_train_val_stats save_for_supervised_learning(args, ckpt_filename='ckpt.pt') File "/home/miranda9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/checkpointing_uu/supervised_learning.py", line 54, in save_for_supervised_learning
torch.save({'training_mode': args.training_mode, File "/home/miranda9/miniconda3/envs/metalearning_gpu/lib/python3.9/site-packages/torch/serialization.py", line 380, in save return File "/home/miranda9/miniconda3/envs/metalearning_gpu/lib/python3.9/site-packages/torch/serialization.py", line 259, in __exit__ self.file_like.write_end_of_file() RuntimeError: [enforce fail at inline_container.cc:298] . unexpected pos 2736460544 vs 2736460432 [/code] мой код: [code] # - ckpt args_pickable: Namespace = uutils.make_args_pickable(args) # note not saving any objects, to make sure checkpoint is loadable later with no problems torch.save({'training_mode': args.training_mode, 'it': args.it, 'epoch_num': args.epoch_num,
# 'args': args_pickable, # some versions of this might not have args! # decided only to save the dict version to avoid this ckpt not working, make it args when loading 'args_dict': vars(args_pickable), # some versions of this might not have args!
'model_state_dict': get_model_from_ddp(args.model).state_dict(), 'model_str': str(args.model), # added later, to make it easier to check what optimizer was used 'model_hps': args.model_hps, 'model_option': args.model_option,
'scheduler_str': str(args.scheduler), 'scheduler_state_dict': try_to_get_scheduler_state_dict(args.scheduler), 'scheduler_hps': args.scheduler_hps, 'scheduler_option': args.scheduler_option, }, pickle_module=pickle, f=args.log_root / ckpt_filename) [/code] Если это неправильный способ сохранять чекпойнты моделей Hugging Face (HF), то какой способ правильный?