NVIDIA NCCL Source Code Study (15): Symmetric Memory
Background
NCCL recently introduced the Symmetric Memory feature, which is essentially the same idea as NVSHMEM's symmetric memory: users can now write communication kernels against the device-side API that NCCL exposes, much as they would with NVSHMEM.
NVSHMEM maintains the symmetric memory internally, and users obtain it through nvshmem_malloc. NCCL instead has the user allocate memory themselves and then register it as symmetric memory through an NCCL API. The usage looks like this:
void *d_sendbuff;
void *d_recvbuff;
NCCLCHECK(ncclMemAlloc(&d_sendbuff, size_bytes));
NCCLCHECK(ncclMemAlloc(&d_recvbuff, size_bytes));

ncclWindow_t send_win;
ncclWindow_t recv_win;
NCCLCHECK(ncclCommWindowRegister(comm, d_sendbuff, size_bytes, &send_win, NCCL_WIN_COLL_SYMMETRIC));
NCCLCHECK(ncclCommWindowRegister(comm, d_recvbuff, size_bytes, &recv_win, NCCL_WIN_COLL_SYMMETRIC));

NCCLCHECK(ncclAllReduce(d_sendbuff, d_recvbuff, count, ncclFloat, ncclSum, comm, stream));

NCCLCHECK(ncclCommWindowDeregister(comm, send_win));
NCCLCHECK(ncclCommWindowDeregister(comm, recv_win));
NCCLCHECK(ncclMemFree(d_sendbuff));
NCCLCHECK(ncclMemFree(d_recvbuff));
The user allocates a buffer with ncclMemAlloc and registers it with ncclCommWindowRegister; after that, an ordinary ncclAllReduce call can take the symmetric-memory path. Note that both the send and receive buffers must be registered with NCCL_WIN_COLL_SYMMETRIC for that path to be chosen, as we will see in the dispatch code below.
Overall Flow
As shown in Figure 1, take a single machine with two GPUs as an example. NCCL reserves two segments of VA space on each GPU: segment 0 is used to map GPU 0's physical memory and segment 1 to map GPU 1's, each segment being bigSize bytes. After the user calls ncclCommWindowRegister, each GPU has a block of physical memory (handle0 on GPU 0, handle1 on GPU 1), and both blocks are mapped into each GPU's own VA space via cuMemMap.
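Expressed as address arithmetic, this layout means every rank computes a peer's copy of a buffer with the same formula. Below is a small sketch of that arithmetic; the names follow the NCCL fields discussed later in this post, but the function itself is illustrative, not NCCL code:

// Sketch of the symmetric VA layout arithmetic (illustrative, not NCCL code).
// Evaluating the same formula on every GPU yields that GPU's mapping of the
// peer's memory, so one (window, offset) pair addresses all copies.
char* symAddr(char* lsaFlatBase,  // base of the reserved VA range
              size_t bigSize,     // per-rank segment size, 4 GB aligned
              int peer,           // LSA rank whose memory we address
              size_t bigOffset,   // segment offset assigned at registration
              size_t userOffset)  // offset of the pointer inside its allocation
{
  return lsaFlatBase + (size_t)peer * bigSize + bigOffset + userOffset;
}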

Implementation
ncclMemAlloc allocates device memory through the cuMem* driver APIs and configures P2P access so that other GPUs can access this memory.
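For reference, here is a simplified sketch of what such a cuMem-based allocation with peer access looks like. It is not NCCL's actual ncclMemAlloc; error handling is omitted, and cudaDev/nDevs are assumed to be provided by the caller:

// A simplified sketch (not NCCL's ncclMemAlloc) of allocating device memory
// with the cuMem* driver API and granting peer access.
#include <cuda.h>

CUdeviceptr allocWithPeerAccess(size_t size, int cudaDev, int nDevs) {
  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = cudaDev;
  // Request an exportable handle so the allocation can be shared later.
  prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;

  size_t gran;
  cuMemGetAllocationGranularity(&gran, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
  size = (size + gran - 1) / gran * gran; // round up to allocation granularity

  CUmemGenericAllocationHandle handle;
  cuMemCreate(&handle, size, &prop, 0);       // physical allocation
  CUdeviceptr va;
  cuMemAddressReserve(&va, size, gran, 0, 0); // VA reservation
  cuMemMap(va, size, 0, handle, 0);           // map physical memory into VA

  // Grant read/write access to every device so peers can load/store directly.
  CUmemAccessDesc desc = {};
  desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  for (int dev = 0; dev < nDevs; dev++) {
    desc.location.id = dev;
    cuMemSetAccess(va, size, &desc, 1);
  }
  return va;
}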
The registration entry point is ncclCommWindowRegister, where userPtr is the address the user allocated via ncclMemAlloc. Like other registration paths, it is implemented through the group mechanism: it first runs ncclDevrInitOnce, then creates a task recording the user-supplied arguments and enqueues it onto regTaskQueue.
ncclResult_t ncclCommWindowRegister(struct ncclComm* comm, void* userPtr, size_t userSize,
    struct ncclWindow_vidmem** outWinDev, int winFlags) {
  NCCLCHECKGOTO(ncclDevrInitOnce(comm), ret, fail);
  // (task allocation elided)
  task->userPtr = userPtr;
  task->userSize = userSize;
  task->winFlags = winFlags;
  task->outWinDev = outWinDev;
  ncclIntruQueueEnqueue(&comm->devrState.regTaskQueue, task);
  NCCLCHECK(ncclGroupEndInternal());
}
Now let's look at ncclDevrInitOnce. It first computes the LSA group information, where LSA stands for Load/Store Accessible: an LSA group is the set of ranks on one machine. lsaSize is the number of ranks in an LSA group (computed as the gcd of the per-node rank counts, so e.g. two nodes with 8 GPUs each give lsaSize = 8), lsaSelf is our rank within the group, and lsaRankList records all ranks of the group. It then sets bigSize, which by default is the largest device memory capacity among all ranks, say 200 GB, rounded up to a 4 GB multiple. ncclSpaceConstruct and ncclShadowPoolConstruct initialize the space and the shadow pool respectively; both are described in detail below.
ncclResult_t ncclDevrInitOnce(struct ncclComm* comm) {
  for (int r=1; r < comm->nRanks; r++) {
    if (comm->rankToNode[r] == comm->rankToNode[r-1]) {
      nodeSize += 1;
    } else {
      lsaSize = gcd(lsaSize, nodeSize);
      nodeSize = 1;
    }
  }
  lsaSize = gcd(lsaSize, nodeSize);
  devr->lsaSize = lsaSize;
  devr->lsaSelf = comm->rank % lsaSize;
  devr->lsaRankList = (int*)malloc(devr->lsaSize*sizeof(int));
  for (int i=0; i < devr->lsaSize; i++) {
    devr->lsaRankList[i] = comm->rank + (i - devr->lsaSelf);
  }
  devr->bigSize = ncclParamWinStride();
  if (-devr->bigSize <= 1) {
    devr->bigSize = 1;
    for (int r=0; r < comm->nRanks; ++r) {
      devr->bigSize = std::max<size_t>(devr->bigSize, comm->peerInfo[r].totalGlobalMem);
    }
  }
  devr->bigSize = alignUp(devr->bigSize, size_t(1)<<32);
  ncclSpaceConstruct(&devr->bigSpace);
  ncclShadowPoolConstruct(&devr->shadows);
  return ncclSuccess;
}
GroupEnd then executes ncclDevrWindowRegisterInGroup to perform the actual registration.
ncclResult_t ncclDevrWindowRegisterInGroup(struct ncclComm* comm,
    void* userPtr, size_t userSize, int winFlags, ncclWindow_t* outWinDev) {
  CUCHECKGOTO(cuMemGetAddressRange(&memAddr, &memSize, reinterpret_cast<CUdeviceptr>(userPtr)), ret, fail_locReg);
  memOffset = reinterpret_cast<CUdeviceptr>(userPtr) - memAddr;
  CUCHECKGOTO(cuMemRetainAllocationHandle(&memHandle, reinterpret_cast<void*>(memAddr)), ret, fail_locReg);
  NCCLCHECKGOTO(symMemoryObtain(comm, memHandle, (void*)memAddr, memSize, &mem), ret, fail_locReg_memHandle);
  NCCLCHECKGOTO(symWindowCreate(comm, mem, memOffset, userPtr, userSize, winFlags, localRegHandle, outWinDev, nullptr, stream), ret, fail_locReg_memHandle_mem_stream);
}
First, cuMemGetAddressRange retrieves the base address memAddr and size memSize of the allocation containing userPtr, and cuMemRetainAllocationHandle retrieves the allocation handle memHandle corresponding to memAddr.
Then the symmetric memory is created.
static ncclResult_t symMemoryObtain(struct ncclComm* comm, CUmemGenericAllocationHandle memHandle,
    void* memAddr, size_t size, struct ncclDevrMemory** outMem) {
  struct ncclDevrState* devr = &comm->devrState;
  struct ncclDevrMemory* mem = devr->memHead;
  while (mem != nullptr) {
    if (mem->memHandle == memHandle) {
      CUCHECKIGNORE(cuMemRelease(memHandle));
      goto leave;
    }
    mem = mem->next;
  }
  mem = (struct ncclDevrMemory*)malloc(sizeof(struct ncclDevrMemory));
  mem->refCount = 0;
  mem->memHandle = memHandle;
  mem->primaryAddr = memAddr;
  mem->size = size;
  ...
}
memHead is a linked list of all handles that have already been registered, so the function first walks the memHead list. If the memHandle being registered is already in the list, it simply calls cuMemRelease on memHandle to drop the reference taken by cuMemRetainAllocationHandle and returns the existing entry directly.
If the handle has not been registered before, a new ncclDevrMemory mem is allocated, recording memHandle, memAddr, and the other fields.
static ncclResult_t symMemoryObtain(...) {
  NCCLCHECKGOTO(ncclSpaceAlloc(&devr->bigSpace, devr->bigSize, size, devr->granularity, &bigOffset), ret, fail_mem);
  mem->bigOffset = bigOffset;
  NCCLCHECKGOTO(symMemoryMapLsaTeam(comm, memHandle, size, bigOffset), ret, fail_mem_space);
  if (mem->primaryAddr == nullptr) {
    mem->primaryAddr = (char*)devr->lsaFlatBase + devr->lsaSelf*devr->bigSize + mem->bigOffset;
  }
  mem->next = devr->memHead;
  devr->memHead = mem;
leave:
  mem->refCount += 1;
  *outMem = mem;
  return ret;
}
Next comes ncclSpaceAlloc. Since every GPU reserves 200 GB of virtual address space per rank, ncclSpace manages allocation and release within this 200 GB contiguous range. What it actually manages are offsets: it keeps the set of occupied intervals and, on each allocation, linearly scans for a free gap of at least size bytes, returning its start as bigOffset. symMemoryMapLsaTeam then performs the memory mapping, and mem is added to the memHead list.
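The idea condenses to a first-fit allocator over offsets. Here is a minimal sketch; the names and container choice are illustrative, not NCCL's actual ncclSpace implementation:

// A minimal first-fit offset allocator, sketching the idea behind ncclSpace.
#include <cstddef>
#include <vector>

struct Interval { size_t begin, end; }; // an occupied range [begin, end)

struct Space {
  std::vector<Interval> used; // occupied intervals, kept sorted by begin
  size_t limit;               // total size of the reserved range (bigSize)
};

// First-fit: scan the gaps between occupied intervals for one that can hold
// `size` bytes at `align` alignment; return its start offset on success.
bool spaceAlloc(Space* s, size_t size, size_t align, size_t* outOffset) {
  size_t cursor = 0;
  for (size_t i = 0; i <= s->used.size(); i++) {
    size_t gapEnd = (i < s->used.size()) ? s->used[i].begin : s->limit;
    size_t begin = (cursor + align - 1) / align * align; // align up
    if (begin + size <= gapEnd) {
      s->used.insert(s->used.begin() + i, Interval{begin, begin + size});
      *outOffset = begin;
      return true;
    }
    if (i < s->used.size()) cursor = s->used[i].end;
  }
  return false; // no gap large enough
}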
Now let's look at the mapping process. First, messages[lsaSize] is allocated to hold the memory handle of each rank in the LSA group; our own memHandle is stored at messages[devr->lsaSelf], followed by an allgather.
lsaFlatBase records the base address of the reserved virtual address space. If it is NULL, the VA space is first reserved via cuMemAddressReserve with size lsaSize * bigSize (for the two-GPU example above, 2 × 200 GB = 400 GB of VA).
static ncclResult_t symMemoryMapLsaTeam(struct ncclComm* comm, CUmemGenericAllocationHandle memHandle,
    size_t size, size_t bigOffset) {
  struct ncclDevrState* devr = &comm->devrState;
  CUmemAccessDesc accessDesc = {};
  union Message {
    CUmemGenericAllocationHandle memHandle;
    CUmemFabricHandle fabricHandle;
  };
  Message* messages = (Message*)calloc(devr->lsaSize, sizeof(Message));
  messages[devr->lsaSelf].memHandle = memHandle;
  NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, devr->lsaRankList, devr->lsaSelf, devr->lsaSize, messages, sizeof(Message)), ret, fail);
  if (devr->lsaFlatBase == nullptr) { // Create on first need.
    CUdeviceptr addr;
    CUCHECKGOTO(cuMemAddressReserve(&addr, devr->lsaSize*devr->bigSize, NCCL_MAX_PAGE_SIZE, 0, 0), ret, fail);
    devr->lsaFlatBase = reinterpret_cast<void*>(addr);
  }
  ...
}
We then iterate over the lsaSize ranks' memory blocks. Since a file descriptor is a per-process concept, for another rank's handle we first obtain a corresponding fd in the current process via UDS (a Unix domain socket), then obtain the handle through cuMemImportFromShareableHandle. The target virtual address is lsaFlatBase + r*devr->bigSize + bigOffset, and cuMemMap maps the handle there.
static ncclResult_t symMemoryMapLsaTeam() {
  ...
  accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  accessDesc.location.id = comm->cudaDev;
  accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  for (int r = 0; r < devr->lsaSize; r++) {
    CUmemGenericAllocationHandle impHandle;
    if (r == devr->lsaSelf) {
      impHandle = memHandle;
    } else {
      if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
        int fd = -1;
        NCCLCHECKGOTO(ncclProxyClientGetFdBlocking(comm, devr->lsaRankList[r], &messages[r], &fd), ret, fail);
        CUCHECKGOTO(cuMemImportFromShareableHandle(&impHandle, reinterpret_cast<void*>((uintptr_t)fd), ncclCuMemHandleType), ret, fail);
        SYSCHECKGOTO(close(fd), "close", ret, fail);
      } else {
        CUCHECKGOTO(cuMemImportFromShareableHandle(&impHandle, (void*)&messages[r].fabricHandle, ncclCuMemHandleType), ret, fail);
      }
    }
    CUdeviceptr addr = reinterpret_cast<uintptr_t>((char*)devr->lsaFlatBase + r*devr->bigSize + bigOffset);
    CUCHECKGOTO(cuMemMap(addr, size, 0, impHandle, 0), ret, fail);
    CUCHECKGOTO(cuMemSetAccess(addr, size, &accessDesc, 1), ret, fail);
    if (r != devr->lsaSelf) {
      CUCHECKGOTO(cuMemRelease(impHandle), ret, fail);
    }
  }
}
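The fd transfer itself relies on SCM_RIGHTS ancillary data over a Unix domain socket, which asks the kernel to duplicate the descriptor into the receiving process. A generic sketch of the sender side (this illustrates the OS mechanism, not NCCL's ncclProxyClientGetFdBlocking code):

// Passing a file descriptor between processes via SCM_RIGHTS.
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

static int sendFd(int sock, int fd) {
  char data = 0;
  struct iovec iov = { &data, 1 };          // must send at least one data byte
  char ctrl[CMSG_SPACE(sizeof(int))];
  struct msghdr msg = {};
  msg.msg_iov = &iov;
  msg.msg_iovlen = 1;
  msg.msg_control = ctrl;
  msg.msg_controllen = sizeof(ctrl);
  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
  cmsg->cmsg_level = SOL_SOCKET;
  cmsg->cmsg_type = SCM_RIGHTS;             // kernel duplicates fd at receiver
  cmsg->cmsg_len = CMSG_LEN(sizeof(int));
  memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
  return sendmsg(sock, &msg, 0) < 0 ? -1 : 0;
}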
Memory registration is now complete, but two questions remain. First, when executing an allreduce, how does NCCL know whether an address has been registered? Second, if users want to write their own kernels, how do they find the addresses registered by other GPUs? To answer these questions, NCCL introduces the concept of a window.
In ncclDevrWindowRegisterInGroup, after symMemoryObtain completes the memory registration, symWindowCreate creates the window.
ncclResult_t ncclDevrWindowRegisterInGroup() {
  ...
  NCCLCHECKGOTO(symWindowCreate(comm, mem, memOffset, userPtr, userSize, winFlags, localRegHandle, outWinDev, nullptr, stream), ret, fail_locReg_memHandle_mem_stream);
  ...
}
static ncclResult_t symWindowCreate(struct ncclComm* comm, struct ncclDevrMemory* mem, size_t memOffset,
    void* userPtr, size_t userSize, int winFlags, void* localReg,
    struct ncclWindow_vidmem** outWinDev, struct ncclDevrWindow** outWin,
    cudaStream_t stream) {
  uintptr_t userAddr = reinterpret_cast<uintptr_t>(userPtr);
  struct ncclDevrState* devr = &comm->devrState;
  struct ncclDevrWindow* win;
  win = (struct ncclDevrWindow*)malloc(sizeof(struct ncclDevrWindow));
  memset(win, 0, sizeof(*win));
  win->memory = mem;
  win->size = userSize;
  win->bigOffset = mem->bigOffset + memOffset;
  win->winFlags = winFlags;
  win->localRegHandle = localReg;
  win->userPtr = userPtr;
  ...
}
An ncclDevrWindow win is allocated, recording the corresponding mem, bigOffset, userPtr, and so on.
Data structures like win are accessed from both the host and the device, so to manage this correspondence NCCL introduces the concept of a shadow. A ShadowPool is essentially a hashmap whose stored objects are ncclShadowObject. As shown in Figure 2, an ncclShadowObject holds two pointers, devObj and hostObj, pointing to the device-side and host-side copies of the object. When a shadow object is allocated, host memory of size sizeof(ncclShadowObject) + sizeof(hostObj) is allocated, so hostObj is placed right after the ShadowObject; the ShadowObject is then inserted into the ShadowPool, keyed by the address of devObj.
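A minimal sketch of this shadow-pool idea (illustrative only, not NCCL's actual implementation): the host mirror is co-allocated right after a small header, and a map keyed by the device pointer lets the host recover the mirror later.

// Sketch of a shadow pool: device object + co-allocated host mirror,
// indexed by the device pointer.
#include <cuda_runtime.h>
#include <unordered_map>
#include <cstdlib>

struct ShadowObject {
  void* devObj;  // device-side copy
  void* hostObj; // host-side mirror, stored right after this header
};

struct ShadowPool {
  std::unordered_map<void*, ShadowObject*> byDevPtr; // key: devObj address
};

// Allocate a device object of `size` bytes plus its host mirror.
cudaError_t shadowAlloc(ShadowPool* pool, size_t size, void** outDev, void** outHost) {
  ShadowObject* shadow = (ShadowObject*)malloc(sizeof(ShadowObject) + size);
  shadow->hostObj = (void*)(shadow + 1);       // mirror lives after the header
  cudaError_t err = cudaMalloc(&shadow->devObj, size);
  if (err != cudaSuccess) { free(shadow); return err; }
  pool->byDevPtr[shadow->devObj] = shadow;     // look up later by device address
  *outDev = shadow->devObj;
  *outHost = shadow->hostObj;
  return cudaSuccess;
}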

Next, the window that the device side will use is allocated: ncclShadowPoolAlloc allocates the shadow object, then the hostObj fields are filled in and copied to devObj.
static ncclResult_t symWindowCreate() {
  ...
  struct ncclWindow_vidmem* winDev;
  struct ncclWindow_vidmem* winDevHost;
  NCCLCHECK(ncclShadowPoolAlloc(&devr->shadows, &winDev, &winDevHost, stream));
  win->vidmem = winDev;
  winDevHost->lsaFlatBase = (char*)devr->lsaFlatBase + win->bigOffset;
  winDevHost->mcOffset4K = win->bigOffset>>12;
  winDevHost->stride4G = devr->bigSize>>32;
  winDevHost->lsaRank = devr->lsaSelf;
  winDevHost->worldRank = comm->rank;
  winDevHost->winHost = (void*)win;
  winDevHost->ginOffset4K = memOffset>>12;
  CUDACHECK(cudaMemcpyAsync(winDev, winDevHost, sizeof(struct ncclWindow_vidmem), cudaMemcpyHostToDevice, stream));
  ...
}
Finally, the window is inserted into devr->winSorted, a list kept sorted by userAddr (an array in practice, with the insertion point found by binary search via listFindSortedLub).
static ncclResult_t symWindowCreate() {
  ...
  { // insert into winSorted[]
    int i = listFindSortedLub(&ncclDevrWindowSorted::userAddr, devr->winSorted, devr->winSortedCount, userAddr);
    struct ncclDevrWindowSorted winSort;
    winSort.userAddr = userAddr;
    winSort.size = userSize;
    winSort.win = win;
    listInsert(&devr->winSorted, &devr->winSortedCapacity, &devr->winSortedCount, i, winSort);
  }
  ...
}
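listFindSortedLub is a "least upper bound" binary search. A sketch of the idea (the real helper is templated over a member pointer; this illustrative version works on a plain key array):

// Return the index of the first element whose key is strictly greater than
// `key`, so elems[i-1] (if i > 0) is the candidate window containing `key`.
#include <cstdint>

int findSortedLub(const uintptr_t* keys, int count, uintptr_t key) {
  int lo = 0, hi = count; // search in [lo, hi)
  while (lo < hi) {
    int mid = lo + (hi - lo) / 2;
    if (keys[mid] <= key) lo = mid + 1; // keys[mid] is not an upper bound
    else hi = mid;
  }
  return lo; // first index with keys[i] > key; count if none exists
}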
This completes the construction of the symmetric memory. Now let's revisit the two questions raised above: when executing an allreduce, how does NCCL know whether an address has been registered, and if users write their own kernels, how do they find the addresses registered by other GPUs?
When allreduce is executed, the usual group flow runs. If symmetric memory is supported, ncclPrepareTasks calls ncclMakeSymmetricTaskList to check whether any task in the queue can use the symmetric path, as shown below:
ncclResult_t ncclMakeSymmetricTaskList(struct ncclComm* comm, struct ncclTaskColl* task,
    struct ncclIntruQueue<struct ncclTaskColl, &ncclTaskColl::next>* symTaskQueue,
    struct ncclTaskColl** remainTasksHead) {
  ...
  while (task != nullptr) {
    int index = ((int)task->func*ncclNumDevRedOps + (int)task->opDev.op)*ncclNumTypes + (int)task->datatype;
    struct ncclTaskColl* next = task->next;
    NCCLCHECK(ncclDevrFindWindow(comm, task->sendbuff, &task->sendWin));
    NCCLCHECK(ncclDevrFindWindow(comm, task->recvbuff, &task->recvWin));
    bool symAvailable = ncclSymkAvailable(comm, task->func, task->opDev.op, task->datatype, task->count);
    if (task->sendWin && task->recvWin
        && (task->sendWin->winFlags & task->recvWin->winFlags & NCCL_WIN_COLL_SYMMETRIC)
        && symAvailable) {
      ...
    }
    ...
  }
}

ncclResult_t ncclDevrFindWindow(struct ncclComm* comm, void const* userPtr, struct ncclDevrWindow** outWin) {
  struct ncclDevrState* devr = &comm->devrState;
  uintptr_t userAddr = reinterpret_cast<uintptr_t>(userPtr);
  int i = listFindSortedLub(&ncclDevrWindowSorted::userAddr, devr->winSorted, devr->winSortedCount, userAddr);
  if (0 < i && (userAddr - devr->winSorted[i-1].userAddr < devr->winSorted[i-1].size)) {
    *outWin = devr->winSorted[i-1].win;
  } else {
    *outWin = nullptr;
  }
  return ncclSuccess;
}
This is where the window described earlier comes in: ncclDevrFindWindow searches the sorted window list to check whether a task's buffer falls inside a registered window. If it does, the window is attached to the task, and the task is then dispatched to the symmetric version of the allreduce implementation.
Finally, let's look at ncclSymPtr, i.e., how the device side uses symmetric memory. Taking NCCL's own kernels as an example, the task's window and offset are recorded at enqueue time, as follows:
ncclResult_t ncclSymkMakeDevWork(struct ncclComm* comm, struct ncclTaskColl* task, struct ncclSymkDevWork* outDevWork) {
  ...
  outDevWork->inputWin = task->sendWin->vidmem;
  outDevWork->inputOff = (uint8_t*)task->sendbuff - (uint8_t*)task->sendWin->userPtr;
  ...
}
An ncclSymPtr can then be constructed from the window and the offset. The local address is obtained via localPtr(): lsaFlatBase is the base of the reserved VA range (already shifted by bigOffset when the window was created, as seen in symWindowCreate above), stride4G is bigSize expressed in 4 GB units (add4G advances a pointer in 4 GB steps), and lsaRank is our local rank within the LSA group. The local address is therefore the address at offset within the lsaRank-th bigSize segment starting at lsaFlatBase.
NCCL_DEVICE_INLINE void* ncclGetLocalPointer(ncclWindow_t w, size_t offset) {
  char* base = nccl::utility::loadConst(&w->lsaFlatBase);
  uint32_t stride4G = nccl::utility::loadConst(&w->stride4G);
  int i = nccl::utility::loadConst(&w->lsaRank);
  return (void*)(nccl::utility::add4G(base, i*stride4G) + offset);
}
For the address of another rank in the LSA group, ncclGetLsaPointer is used: it returns the address at offset within the peer-th bigSize segment of lsaFlatBase.
NCCL_DEVICE_INLINE void* ncclGetLsaPointer(ncclWindow_t w, size_t offset, int peer) {
  char* base = nccl::utility::loadConst(&w->lsaFlatBase);
  uint32_t stride4G = nccl::utility::loadConst(&w->stride4G);
  int i = peer;
  return (void*)(nccl::utility::add4G(base, i*stride4G) + offset);
}
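To make the device-side usage concrete, here is a toy kernel sketch that sums the same element from every LSA peer through the window. Only ncclGetLocalPointer and ncclGetLsaPointer come from NCCL's device API; the float data type, the nRanks parameter, the offset handling, and the absence of cross-GPU synchronization are illustrative assumptions:

// Toy sketch of a user-written kernel over symmetric memory (requires NCCL's
// device headers for ncclWindow_t and the pointer helpers). Each thread reads
// element i from every peer's copy of the buffer and accumulates into ours.
__global__ void naiveLsaSum(ncclWindow_t win, size_t offset, size_t count, int nRanks) {
  size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
  if (i >= count) return;
  float sum = 0.f;
  for (int peer = 0; peer < nRanks; peer++) {
    // Same window offset, peer-th bigSize segment: the symmetric property.
    float* remote = (float*)ncclGetLsaPointer(win, offset, peer);
    sum += remote[i];
  }
  // Write the result into our own copy of the buffer. A real kernel would
  // also need cross-GPU synchronization before and after the reads.
  float* local = (float*)ncclGetLocalPointer(win, offset);
  local[i] = sum;
}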