@@ -47,6 +47,22 @@ namespace RcclUnitTesting
             collArgs.numOutputElements, collArgs.numOutputElementsAllocated);
       return TEST_FAIL;
     }
+
+    // Check bias allocation if bias is enabled
+    if (collArgs.options.useBias)
+    {
+      if (collArgs.numBiasElements == 0 || collArgs.numBiasBytesAllocated == 0)
+      {
+        ERROR("Bias is enabled but bias buffers are not allocated\n");
+        return TEST_FAIL;
+      }
+      if (collArgs.numBiasElements != collArgs.numOutputElements)
+      {
+        ERROR("Number of bias elements (%lu) must match number of output elements (%lu)\n",
+              collArgs.numBiasElements, collArgs.numOutputElements);
+        return TEST_FAIL;
+      }
+    }
     return TEST_SUCCESS;
   }

@@ -108,7 +124,22 @@ namespace RcclUnitTesting
     for (int rank = 0; rank < collArgs.totalRanks; ++rank)
     {
       // Generate temporary input for this rank
-      CHECK_CALL(tempInputCpu.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false));
+      if (collArgs.options.inputConstantValue >= 0)
+      {
+        // Use constant value for all input elements across all ranks
+        // This is useful for ncclProd at high rank counts to avoid factorial overflow
+        for (size_t i = 0; i < collArgs.numInputElements; i++)
+        {
+          CHECK_CALL(tempInputCpu.Set(collArgs.dataType, i,
+                                      collArgs.options.inputConstantValue,
+                                      (double)collArgs.options.inputConstantValue));
+        }
+      }
+      else
+      {
+        // Use rank-based pattern: value[rank][i] = (rank + i) % 256 (default behavior)
+        CHECK_CALL(tempInputCpu.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false));
+      }

       // Copy the pre-scaled input into GPU memory for the correct rank
       if (rank == collArgs.globalRank)
@@ -144,6 +175,38 @@ namespace RcclUnitTesting
     {
       CHECK_CALL(result.DivideByInt(collArgs.dataType, collArgs.numInputElements, collArgs.totalRanks));
     }
+
+    // Add bias to expected output if bias is enabled
+    if (collArgs.options.useBias && (isAllReduce || collArgs.options.root == collArgs.globalRank))
+    {
+      // Initialize bias data on CPU
+      if (collArgs.options.biasConstantValue >= 0)
+      {
+        // Use constant value for all bias elements (useful for ncclProd to avoid overflow)
+        for (size_t i = 0; i < collArgs.numBiasElements; i++)
+        {
+          CHECK_CALL(collArgs.biasCpu.Set(collArgs.dataType, i,
+                                          collArgs.options.biasConstantValue,
+                                          (double)collArgs.options.biasConstantValue));
+        }
+      }
+      else
+      {
+        // Use incremental pattern: bias[i] = i (default behavior)
+        CHECK_CALL(collArgs.biasCpu.FillPattern(collArgs.dataType, collArgs.numBiasElements, 0, false));
+      }
+
+      // Copy bias data to GPU
+      size_t const biasBytes = collArgs.numBiasBytesAllocated;
+      CHECK_HIP(hipMemcpy(collArgs.biasGpu.ptr, collArgs.biasCpu.ptr, biasBytes, hipMemcpyHostToDevice));
+
+      // Apply bias to expected output using the SAME reduction operation as AllReduce
+      CHECK_CALL(result.Reduce(collArgs.dataType, collArgs.numInputElements, collArgs.biasCpu, tempOp));
+
+      // Update the biasPtr in options to point to the GPU buffer
+      collArgs.options.biasPtr = collArgs.biasGpu.ptr;
+    }
+
     return TEST_SUCCESS;
   }

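For reference, the expected-output logic that the new bias branch performs can be sketched standalone. The sketch below is an illustration only, not part of the commit: it assumes plain float buffers and an ncclSum-style reduction, and replaces the test-harness types (CollectiveArgs, PtrUnion) and CHECK_* macros with ordinary C++ so the computation is visible in isolation.

#include <cstddef>
#include <vector>

// Hypothetical standalone model of the expected value with bias enabled:
//   expected[i] = reduce_over_ranks(input[rank][i])  op  bias[i]
// where "op" is the same reduction used by the collective (sum in this sketch).
std::vector<float> ExpectedAllReduceWithBias(int totalRanks,
                                             std::size_t numElements,
                                             float inputConstantValue,  // mirrors options.inputConstantValue
                                             float biasConstantValue)   // mirrors options.biasConstantValue
{
  std::vector<float> expected(numElements, 0.0f);

  // Accumulate the per-rank inputs; every rank contributes the same constant,
  // matching the inputConstantValue >= 0 branch in the diff above.
  for (int rank = 0; rank < totalRanks; ++rank)
    for (std::size_t i = 0; i < numElements; ++i)
      expected[i] += inputConstantValue;

  // Fold the bias into the expected result using the same reduction operation,
  // matching result.Reduce(..., collArgs.biasCpu, tempOp) in the diff above.
  for (std::size_t i = 0; i < numElements; ++i)
    expected[i] += biasConstantValue;

  return expected;
}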