Код: Выделить всё
import os
import torch
import torch.distributed as dist
def init_distributed():
    """Join the default Gloo process group and return ``(rank, world_size)``.

    Rendezvous settings are read from the environment, falling back to the
    values this script was originally written with, so the same file works
    both when started by hand and when a launcher exports the variables:

    * ``MASTER_ADDR`` -- address of the rank-0 node (default ``10.12.27.241``)
    * ``MASTER_PORT`` -- rendezvous port on the rank-0 node (default ``29500``)
    * ``RANK``        -- rank of this process (default ``0``; ``1`` for worker)
    * ``WORLD_SIZE``  -- total number of processes (default ``2``)

    Returns:
        tuple[int, int]: ``(rank, world_size)`` of the initialized group.

    Raises:
        ValueError: if ``RANK`` or ``WORLD_SIZE`` is not an integer, if
            ``WORLD_SIZE`` is not positive, or if the rank lies outside
            ``[0, world_size)``.
    """
    # setdefault keeps any address/port already exported by the caller or a
    # launcher instead of silently overwriting it.
    os.environ.setdefault('MASTER_ADDR', "10.12.27.241")
    os.environ.setdefault('MASTER_PORT', '29500')

    node_rank = int(os.environ.get('RANK', 0))  # 1 for worker
    world_size = int(os.environ.get('WORLD_SIZE', 2))

    # Fail fast on a bad configuration: an out-of-range rank would otherwise
    # only surface as a hang or an opaque error inside the rendezvous.
    if world_size < 1:
        raise ValueError(f"WORLD_SIZE must be positive, got {world_size}")
    if not 0 <= node_rank < world_size:
        raise ValueError(
            f"RANK must be in [0, {world_size}), got {node_rank}"
        )

    # NOTE(review): a Gloo "connectFullMesh failed" error at this call usually
    # means the nodes cannot reach each other on the network interface Gloo
    # selected. Exporting GLOO_SOCKET_IFNAME=<interface> on every node (and
    # checking firewalls between them) is the usual remedy -- confirm against
    # the actual network setup.
    dist.init_process_group(
        backend='gloo',
        rank=node_rank,
        world_size=world_size,
    )
    print(f"Initialized process group: rank {node_rank} of {world_size}")
    return node_rank, world_size
def send_receive_message(rank, world_size):
    """Send a small int64 tensor from rank 0 to every other rank.

    Rank 0 sends ``[42, 43, 44]`` to each peer with blocking point-to-point
    sends; every other rank blocks until it has received the tensor from
    rank 0. The default process group must already be initialized whenever
    ``world_size`` is greater than 1.

    Args:
        rank: Rank of the calling process within the group.
        world_size: Total number of processes in the group.
    """
    if rank == 0:
        # Rank 0 sends the message to all peers. Sending only to rank 1 would
        # leave ranks >= 2 blocked in recv() forever when world_size > 2.
        message = torch.tensor([42, 43, 44], dtype=torch.int64)
        for dst in range(1, world_size):
            dist.send(message, dst=dst)
        print(f"Rank {rank} sent message: {message}")
    else:
        # Every other rank receives into a pre-allocated buffer whose shape
        # and dtype match the tensor created by rank 0.
        message = torch.zeros(3, dtype=torch.int64)
        dist.recv(message, src=0)
        print(f"Rank {rank} received message: {message}")
if __name__ == "__main__":
    # Script entry point: join the group, exchange one message, then leave.
    rank, world_size = init_distributed()
    try:
        send_receive_message(rank, world_size)
        # Barrier to ensure all processes have completed
        dist.barrier()
    finally:
        # Clean up even if the exchange or the barrier raised, so the
        # process group's resources are not left behind.
        dist.destroy_process_group()
Код: Выделить всё
[E110 05:59:45.095859745 ProcessGroupGloo.cpp:143] Gloo connectFullMesh failed with [../third_party/gloo/gloo/transport/tcp/pair.cc:144] no error
Traceback (most recent call last):
File "/data/exp/com.py", line 36, in
rank, world_size = init_distributed()
File "/data/exp/com.py", line 12, in init_distributed
dist.init_process_group(
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 83, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 97, in wrapper
func_return = func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 1527, in init_process_group
default_pg, _ = _new_process_group_helper(
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 1744, in _new_process_group_helper
backend_class = ProcessGroupGloo(
RuntimeError: Gloo connectFullMesh failed with [../third_party/gloo/gloo/transport/tcp/pair.cc:144] no error
Буду очень признателен за любые рекомендации или подсказки!
Подробнее здесь: https://stackoverflow.com/questions/793 ... nstances-u
Мобильная версия