Skip to content

Commit 159cb6f

Browse files
committed
Added new unit tests for AllReduce with Bias API
1 parent 46b032b commit 159cb6f

File tree

9 files changed

+673
-33
lines changed

9 files changed

+673
-33
lines changed

test/AllReduceTests.cpp

Lines changed: 505 additions & 0 deletions
Large diffs are not rendered by default.

test/common/CallCollectiveForked.cpp

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -40,22 +40,18 @@ void callCollective(ncclUniqueId id, int collID, int rank, int nranks, const std
4040
default:
4141
ERROR("This collective is not implemented for callCollective routine");
4242
}
43-
43+
4444
HIPCALL(hipSetDevice(rank));
4545
hipStream_t stream;
4646
HIPCALL(hipStreamCreate(&stream));
4747
ncclComm_t comm;
48-
49-
5048

5149
NCCLCHECK(ncclCommInitRank(&comm, nranks, id, rank));
5250
int *sendbuff;
5351
int *recvbuff;
5452
void *sendRegHandle;
5553
void *recvRegHandle;
56-
5754

58-
5955
size_t sendSize = 0;
6056
size_t recvSize = 0;
6157

@@ -78,13 +74,12 @@ void callCollective(ncclUniqueId id, int collID, int rank, int nranks, const std
7874
else{
7975
HIPCALL(hipMallocManaged((void **)&sendbuff, sendSize * sizeof(int)));
8076
HIPCALL(hipMallocManaged((void **)&recvbuff, recvSize * sizeof(int)));
81-
}
82-
77+
}
78+
8379
NCCLCHECK(ncclCommRegister(comm, sendbuff, sendSize * sizeof(int), &sendRegHandle));
8480
NCCLCHECK(ncclCommRegister(comm, recvbuff, recvSize * sizeof(int), &recvRegHandle));
8581

8682
HIPCALL(hipMemcpy(sendbuff, send.data(), sizeof(int) * sendSize, hipMemcpyHostToDevice));
87-
HIPCALL(hipMemcpy(recvbuff, recv.data(), sizeof(int) *recvSize, hipMemcpyHostToDevice));
8883

8984
switch(collID){
9085
case ncclCollAllReduce:
@@ -98,12 +93,13 @@ void callCollective(ncclUniqueId id, int collID, int rank, int nranks, const std
9893

9994
HIPCALL(hipStreamSynchronize(stream));
10095
HIPCALL(hipMemcpy(recv.data(), recvbuff, sizeof(int) * recvSize, hipMemcpyDeviceToHost));
101-
96+
10297
NCCLCHECK(ncclCommDeregister(comm, sendRegHandle));
10398
NCCLCHECK(ncclCommDeregister(comm, recvRegHandle));
10499

105100
HIPCALL(hipFree(sendbuff));
106101
HIPCALL(hipFree(recvbuff));
102+
107103
ncclCommDestroy(comm);
108104
}
109105

@@ -115,7 +111,7 @@ void callCollectiveForked(int nranks, int collID, const std::vector<int>& sendB
115111
for(int r = 0; r < nranks; ++r){
116112
if(pipe(childPipes[r].data()) == -1)
117113
ERROR("child %i pipe Failed\n", r);
118-
}
114+
}
119115

120116
auto createNCCLid = [&](int rank){
121117
ncclGetUniqueId(&id);
@@ -162,9 +158,10 @@ void callCollectiveForked(int nranks, int collID, const std::vector<int>& sendB
162158
}
163159

164160
getAndDistributeNCCLid(nranks);
165-
161+
166162
for(int r = 0; r < nranks; ++r)
167163
wait(NULL); // Wait for all children
168164
}
169165

170166
}
167+

test/common/CallCollectiveForked.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,5 @@ namespace RcclUnitTesting
1515
}
1616

1717
#endif
18+
19+

test/common/CollectiveArgs.cpp

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,11 @@ namespace RcclUnitTesting
3939
this->streamIdx = streamIdx;
4040
this->options = optionalColArgs;
4141

42+
// Initialize bias fields
43+
this->numBiasElements = 0;
44+
this->numBiasBytesAllocated = 0;
45+
this->biasRegHandle = nullptr;
46+
4247
if (this->options.scalarMode != -1)
4348
{
4449
size_t const numBytes = DataTypeToBytes(dataType);
@@ -102,6 +107,23 @@ namespace RcclUnitTesting
102107
CHECK_CALL(this->expected.AllocateCpuMem(this->numOutputBytesAllocated));
103108
}
104109
CHECK_CALL(this->outputCpu.AllocateCpuMem(this->numOutputBytesAllocated));
110+
111+
// Allocate bias buffers if bias is enabled
112+
if (this->options.useBias)
113+
{
114+
this->numBiasElements = this->options.biasNumElements;
115+
this->numBiasBytesAllocated = this->numBiasElements * DataTypeToBytes(this->dataType);
116+
CHECK_CALL(this->biasGpu.AllocateGpuMem(this->numBiasBytesAllocated, useManagedMem, userRegistered));
117+
CHECK_CALL(this->biasCpu.AllocateCpuMem(this->numBiasBytesAllocated));
118+
this->biasRegHandle = nullptr;
119+
}
120+
else
121+
{
122+
this->numBiasElements = 0;
123+
this->numBiasBytesAllocated = 0;
124+
this->biasRegHandle = nullptr;
125+
}
126+
105127
return TEST_SUCCESS;
106128
}
107129

@@ -155,6 +177,15 @@ namespace RcclUnitTesting
155177
if (this->options.scalarMode == 1) CHECK_HIP(hipHostFree(this->localScalar.ptr));
156178
this->localScalar.Attach(nullptr);
157179
}
180+
181+
// Deallocate bias buffers if they were allocated
182+
if (this->options.useBias && this->numBiasBytesAllocated > 0)
183+
{
184+
this->biasGpu.FreeGpuMem(this->userRegistered);
185+
this->biasCpu.FreeCpuMem();
186+
this->biasRegHandle = nullptr;
187+
}
188+
158189
return TEST_SUCCESS;
159190
}
160191

test/common/CollectiveArgs.hpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,15 @@ namespace RcclUnitTesting
8282
ScalarTransport scalarTransport; // Used for custom reduction operators
8383
int scalarMode = -1; // -1 if scalar not used
8484

85+
// Bias support for fused AllReduce+Bias operations
86+
bool useBias = false; // Enable bias addition
87+
void* biasPtr = nullptr; // Pointer to bias buffer (GPU memory)
88+
size_t biasNumElements = 0; // Number of elements in bias buffer
89+
int biasConstantValue = -1; // If >= 0, use constant value for all bias elements (instead of incremental pattern)
90+
91+
// Input data pattern control (useful for ncclProd to avoid overflow at high rank counts)
92+
int inputConstantValue = -1; // If >= 0, use constant value for all input elements (instead of rank-based pattern)
93+
8594
// allToAllv args
8695
size_t sendcounts[MAX_RANKS*MAX_RANKS];
8796
size_t sdispls[MAX_RANKS*MAX_RANKS];
@@ -122,6 +131,13 @@ namespace RcclUnitTesting
122131
size_t numInputElementsAllocated;
123132
size_t numOutputElementsAllocated;
124133

134+
// Bias data for fused AllReduce+Bias operations
135+
PtrUnion biasGpu; // Bias buffer on GPU
136+
PtrUnion biasCpu; // Bias buffer on CPU (for initialization/validation)
137+
void* biasRegHandle; // Handle for registered bias buffer
138+
size_t numBiasElements; // Number of elements in bias buffer
139+
size_t numBiasBytesAllocated; // Number of bytes allocated for bias
140+
125141
// Set collective arguments
126142
ErrCode SetArgs(int const globalRank,
127143
int const totalRanks,

test/common/PrepDataFuncs.cpp

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,22 @@ namespace RcclUnitTesting
4747
collArgs.numOutputElements, collArgs.numOutputElementsAllocated);
4848
return TEST_FAIL;
4949
}
50+
51+
// Check bias allocation if bias is enabled
52+
if (collArgs.options.useBias)
53+
{
54+
if (collArgs.numBiasElements == 0 || collArgs.numBiasBytesAllocated == 0)
55+
{
56+
ERROR("Bias is enabled but bias buffers are not allocated\n");
57+
return TEST_FAIL;
58+
}
59+
if (collArgs.numBiasElements != collArgs.numOutputElements)
60+
{
61+
ERROR("Number of bias elements (%lu) must match number of output elements (%lu)\n",
62+
collArgs.numBiasElements, collArgs.numOutputElements);
63+
return TEST_FAIL;
64+
}
65+
}
5066
return TEST_SUCCESS;
5167
}
5268

@@ -108,7 +124,22 @@ namespace RcclUnitTesting
108124
for (int rank = 0; rank < collArgs.totalRanks; ++rank)
109125
{
110126
// Generate temporary input for this rank
111-
CHECK_CALL(tempInputCpu.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false));
127+
if (collArgs.options.inputConstantValue >= 0)
128+
{
129+
// Use constant value for all input elements across all ranks
130+
// This is useful for ncclProd at high rank counts to avoid factorial overflow
131+
for (size_t i = 0; i < collArgs.numInputElements; i++)
132+
{
133+
CHECK_CALL(tempInputCpu.Set(collArgs.dataType, i,
134+
collArgs.options.inputConstantValue,
135+
(double)collArgs.options.inputConstantValue));
136+
}
137+
}
138+
else
139+
{
140+
// Use rank-based pattern: value[rank][i] = (rank + i) % 256 (default behavior)
141+
CHECK_CALL(tempInputCpu.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false));
142+
}
112143

113144
// Copy the pre-scaled input into GPU memory for the correct rank
114145
if (rank == collArgs.globalRank)
@@ -144,6 +175,38 @@ namespace RcclUnitTesting
144175
{
145176
CHECK_CALL(result.DivideByInt(collArgs.dataType, collArgs.numInputElements, collArgs.totalRanks));
146177
}
178+
179+
// Add bias to expected output if bias is enabled
180+
if (collArgs.options.useBias && (isAllReduce || collArgs.options.root == collArgs.globalRank))
181+
{
182+
// Initialize bias data on CPU
183+
if (collArgs.options.biasConstantValue >= 0)
184+
{
185+
// Use constant value for all bias elements (useful for ncclProd to avoid overflow)
186+
for (size_t i = 0; i < collArgs.numBiasElements; i++)
187+
{
188+
CHECK_CALL(collArgs.biasCpu.Set(collArgs.dataType, i,
189+
collArgs.options.biasConstantValue,
190+
(double)collArgs.options.biasConstantValue));
191+
}
192+
}
193+
else
194+
{
195+
// Use incremental pattern: bias[i] = i (default behavior)
196+
CHECK_CALL(collArgs.biasCpu.FillPattern(collArgs.dataType, collArgs.numBiasElements, 0, false));
197+
}
198+
199+
// Copy bias data to GPU
200+
size_t const biasBytes = collArgs.numBiasBytesAllocated;
201+
CHECK_HIP(hipMemcpy(collArgs.biasGpu.ptr, collArgs.biasCpu.ptr, biasBytes, hipMemcpyHostToDevice));
202+
203+
// Apply bias to expected output using the SAME reduction operation as AllReduce
204+
CHECK_CALL(result.Reduce(collArgs.dataType, collArgs.numInputElements, collArgs.biasCpu, tempOp));
205+
206+
// Update the biasPtr in options to point to the GPU buffer
207+
collArgs.options.biasPtr = collArgs.biasGpu.ptr;
208+
}
209+
147210
return TEST_SUCCESS;
148211
}
149212

test/common/TestBed.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -636,12 +636,16 @@ namespace RcclUnitTesting
636636
std::vector<bool> const& inPlaceList,
637637
std::vector<bool> const& managedMemList,
638638
std::vector<bool> const& useHipGraphList,
639-
bool const& enableSweep)
639+
bool const& enableSweep,
640+
OptionalColArgs* const optionalArgsPtr)
640641
{
641642
// Sort numElements in descending order to cut down on # of allocations
642643
std::vector<int> sortedN = numElements;
643644
std::sort(sortedN.rbegin(), sortedN.rend());
644-
OptionalColArgs optionalArgs;
645+
646+
// Use provided OptionalColArgs or create default one
647+
OptionalColArgs defaultArgs;
648+
OptionalColArgs& optionalArgs = (optionalArgsPtr != nullptr) ? *optionalArgsPtr : defaultArgs;
645649
// Filter out any unsupported datatypes, in case only subset has been compiled for
646650
std::vector<ncclDataType_t> const& supportedDataTypes = this->GetAllSupportedDataTypes();
647651
std::vector<ncclDataType_t> dataTypes;
@@ -718,6 +722,11 @@ namespace RcclUnitTesting
718722
&numOutputElements);
719723
optionalArgs.redOp = redOps[rdIdx];
720724
optionalArgs.root = roots[rtIdx] % this->numActiveRanks;
725+
// Set biasNumElements if bias is enabled
726+
if (optionalArgs.useBias)
727+
{
728+
optionalArgs.biasNumElements = numOutputElements;
729+
}
721730
this->SetCollectiveArgs(funcTypes[ftIdx],
722731
dataTypes[dtIdx],
723732
numInputElements,

test/common/TestBed.hpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ namespace RcclUnitTesting
4141
std::vector<int> const& numStreamsPerGroup,
4242
int const numGroupCalls = 1,
4343
bool const useBlocking = true);
44-
44+
4545
// Prepare TestBed for use with GPUs across multiple child processes
4646
void InitComms(std::vector<std::vector<int>> const& deviceIdsPerChild,
4747
int const numCollectivesInGroup = 1,
@@ -96,7 +96,7 @@ namespace RcclUnitTesting
9696
// Execute all collectives on all test children
9797
// Blocks until collective is completed
9898
void ExecuteCollectives(std::vector<int> const &currentRanks = {},
99-
int const groupId = -1,
99+
int const groupId = -1,
100100
bool const useHipGraph = false);
101101

102102
// Perform results validation - compare output to expected
@@ -140,7 +140,7 @@ namespace RcclUnitTesting
140140
int const numGpus,
141141
int const ranksPerGpu,
142142
const std::vector<int>& gpuPriorityOrder);
143-
143+
144144
static std::vector<std::vector<int>> GetDeviceIdsList(int const numProcesses,
145145
int const numGpus,
146146
const std::vector<int>& gpuPriorityOrder);
@@ -166,7 +166,8 @@ namespace RcclUnitTesting
166166
std::vector<bool> const& inPlaceList,
167167
std::vector<bool> const& managedMemList,
168168
std::vector<bool> const& useHipGraphList,
169-
bool const& enableSweep = true);
169+
bool const& enableSweep = true,
170+
OptionalColArgs* const optionalArgs = nullptr);
170171

171172
// Wait for user-input if in interactive mode
172173
void InteractiveWait(std::string message);

0 commit comments

Comments
 (0)