"""Quick health check for NCCL communication in a multi-node, multi-GPU setup.

Initializes the NCCL process group and performs a single all_reduce.

Run on every node with:
    torchrun --nproc_per_node=8 nccl_test.py
"""
import os
import time

import torch
import torch.distributed as dist
def check_nccl():
    """Sanity-check NCCL by running one all_reduce across all ranks.

    Intended to be launched via torchrun (one process per GPU). Initializes
    the NCCL process group, all-reduces a 1M-element ones tensor, validates
    the result, and reports the elapsed time on rank 0.

    Returns:
        None. Progress and results are reported via stdout.
    """
    if not dist.is_available():
        print("Distributed not available")
        return
    if not torch.cuda.is_available():
        print("CUDA not available")
        return
    # Bind this process to its local GPU before init. Without this, every
    # rank on a node allocates on cuda:0, which can hang or corrupt the test.
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    torch.cuda.set_device(local_rank)
    dist.init_process_group("nccl")
    try:
        rank = dist.get_rank()
        world_size = dist.get_world_size()
        if rank == 0:
            print(f"World Size: {world_size}")
            print("Testing NCCL AllReduce...")
        tensor = torch.ones(1024 * 1024).cuda()
        # Synchronize before timing so the async allocation/fill is not
        # counted; synchronize after so the span covers the collective.
        torch.cuda.synchronize()
        start = time.time()
        dist.all_reduce(tensor)
        torch.cuda.synchronize()
        end = time.time()
        # Default op is SUM, so every element must equal world_size.
        ok = torch.allclose(tensor, torch.full_like(tensor, float(world_size)))
        if not ok:
            print(f"Rank {rank}: AllReduce result mismatch!")
        if rank == 0:
            print(f"AllReduce Time: {(end - start)*1000:.2f}ms")
            if ok:
                print("NCCL Test Passed!")
    finally:
        # Clean shutdown avoids NCCL warnings/hangs at interpreter exit.
        dist.destroy_process_group()
def _main() -> None:
    # Script entry point: run the NCCL health check.
    check_nccl()


if __name__ == "__main__":
    _main()