Skip to content

Commit 3afd6c0

Browse files
authored
fix sampling if data overflow after temperature penalty (#3508)
* fix sampling if data overflow after temperature penalty
* using float type for sampling
* prevent potential index error
* update
* fix lint
* fix batch
* update name
* update name
* update check
1 parent b057894 commit 3afd6c0

16 files changed

+160
-139
lines changed

src/turbomind/engine/model_request.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ auto ModelRequest::Forward(InputParam param, std::function<void()> cb) -> Output
8989
}
9090

9191
if (param.gen_cfg.output_logprobs) {
92-
add(outputs_, "logprob_vals", data_type_, kCPU, max_out_len, kMaxLogProb);
92+
add(outputs_, "logprob_vals", data_type_v<float>, kCPU, max_out_len, kMaxLogProb);
9393
add(outputs_, "logprob_indexes", data_type_v<int>, kCPU, max_out_len, kMaxLogProb);
9494
add(outputs_, "logprob_nums", data_type_v<int>, kCPU, max_out_len);
9595
}

src/turbomind/kernels/ban_bad_words.cu

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -163,12 +163,6 @@ void invokeBanBadWords(T* logits,
163163
size_t step, \
164164
cudaStream_t stream);
165165

166-
#ifdef ENABLE_FP32
167166
INSTANTIATE_INVOKE_BAN_BAD_WORDS(float);
168-
#endif
169-
INSTANTIATE_INVOKE_BAN_BAD_WORDS(half);
170-
#ifdef ENABLE_BF16
171-
INSTANTIATE_INVOKE_BAN_BAD_WORDS(__nv_bfloat16);
172-
#endif
173167

174168
} // namespace turbomind

src/turbomind/kernels/reduce_kernel_utils.cuh

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,32 @@ __device__ inline __nv_bfloat16 getMaxValue<__nv_bfloat16>()
7979
}
8080
#endif
8181

82+
template<typename T>
83+
__device__ inline T getInfValue();
84+
85+
template<>
86+
__device__ inline float getInfValue<float>()
87+
{
88+
return INFINITY;
89+
}
90+
91+
template<>
92+
__device__ inline half getInfValue<half>()
93+
{
94+
return __ushort_as_half((unsigned short)0x7C00U);
95+
}
96+
97+
#ifdef ENABLE_BF16
98+
template<>
99+
__device__ inline __nv_bfloat16 getInfValue<__nv_bfloat16>()
100+
{
101+
#if __CUDA_ARCH__ >= 800
102+
return __ushort_as_bfloat16((unsigned short)0x7F80U);
103+
#endif
104+
return {};
105+
}
106+
#endif
107+
82108
template<int Bytes>
83109
__device__ inline void copy(const void* local, void* data)
84110
{
@@ -344,8 +370,8 @@ __device__ __forceinline__ TopK<T, MAX_K> reduce_topk_op(const TopK<T, MAX_K>& a
344370

345371
template<typename T>
346372
struct TopK_2 {
347-
int p = -1;
348-
T u = -getMaxValue<T>();
373+
int p = 0;
374+
T u = -getInfValue<T>();
349375

350376
__device__ __forceinline__ void insert(T elem, int elem_id)
351377
{
@@ -357,8 +383,8 @@ struct TopK_2 {
357383

358384
__device__ __forceinline__ void init()
359385
{
360-
u = -getMaxValue<T>();
361-
p = -1;
386+
u = -getInfValue<T>();
387+
p = 0;
362388
}
363389
};
364390

src/turbomind/kernels/sampling_kernels.cu

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -97,12 +97,6 @@ void invokeSampling(SamplingParams& params, cudaStream_t stream)
9797
params.sampled_nums);
9898
}
9999

100-
#ifdef ENABLE_FP32
101100
template void invokeSampling<float>(SamplingParams& params, cudaStream_t stream);
102-
#endif
103-
template void invokeSampling<half>(SamplingParams& params, cudaStream_t stream);
104-
#ifdef ENABLE_BF16
105-
template void invokeSampling<nv_bfloat16>(SamplingParams& params, cudaStream_t stream);
106-
#endif
107101

108102
} // namespace turbomind

src/turbomind/kernels/sampling_penalty_kernels.cu

Lines changed: 1 addition & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -109,13 +109,7 @@ void invokeApplyTemperaturePenalty(T* logits,
109109
const int vocab_size_padd, \
110110
cudaStream_t stream);
111111

112-
#ifdef ENABLE_FP32
113112
INISTANTIATE_INVOKE_APPLY_TEMPERATURE_PENALTY(float);
114-
#endif
115-
INISTANTIATE_INVOKE_APPLY_TEMPERATURE_PENALTY(half);
116-
#ifdef ENABLE_BF16
117-
INISTANTIATE_INVOKE_APPLY_TEMPERATURE_PENALTY(__nv_bfloat16);
118-
#endif
119113

120114
template<typename T>
121115
__global__ void batchApplyTemperaturePenalty(T* logits,
@@ -215,13 +209,7 @@ void invokeBatchApplyTemperaturePenalty(T* logits,
215209
const int vocab_size_padd, \
216210
cudaStream_t stream);
217211

218-
#ifdef ENABLE_FP32
219212
INISTANTIATE_INVOKE_BATCH_APPLY_TEMPERATURE_PENALTY(float);
220-
#endif
221-
INISTANTIATE_INVOKE_BATCH_APPLY_TEMPERATURE_PENALTY(half);
222-
#ifdef ENABLE_BF16
223-
INISTANTIATE_INVOKE_BATCH_APPLY_TEMPERATURE_PENALTY(__nv_bfloat16);
224-
#endif
225213

226214
template<typename T, int vec_size>
227215
__global__ void batchApplyTemperaturePenalty_v2(T* logits,
@@ -268,7 +256,7 @@ __global__ void batchApplyTemperaturePenalty_v2(T* logits,
268256
vec[c] = (float)vec[c] * scale;
269257
}
270258
else {
271-
vec[c] = -getMaxValue<T>();
259+
vec[c] = -getInfValue<T>();
272260
}
273261
}
274262

@@ -328,13 +316,7 @@ void invokeBatchApplyTemperaturePenalty_v2(T* logits,
328316
const int vocab_size_padded, \
329317
cudaStream_t stream);
330318

331-
#ifdef ENABLE_FP32
332319
INSTANTIATE_INVOKE_BATCH_APPLY_TEMPERATURE_PENALTY_V2(float);
333-
#endif
334-
INSTANTIATE_INVOKE_BATCH_APPLY_TEMPERATURE_PENALTY_V2(half);
335-
#ifdef ENABLE_BF16
336-
INSTANTIATE_INVOKE_BATCH_APPLY_TEMPERATURE_PENALTY_V2(__nv_bfloat16);
337-
#endif
338320

339321
template<typename T, RepetitionPenaltyType penalty_type>
340322
__global__ void applyRepetitionPenalty(T* logits,
@@ -466,13 +448,7 @@ void invokeApplyRepetitionPenalty(T* logits,
466448
const RepetitionPenaltyType penalty_type, \
467449
cudaStream_t stream);
468450

469-
#ifdef ENABLE_FP32
470451
INISTANTIATE_INVOKE_APPLY_REPETITION_PENALTY(float);
471-
#endif
472-
INISTANTIATE_INVOKE_APPLY_REPETITION_PENALTY(half);
473-
#ifdef ENABLE_BF16
474-
INISTANTIATE_INVOKE_APPLY_REPETITION_PENALTY(__nv_bfloat16);
475-
#endif
476452

477453
template<typename T, RepetitionPenaltyType penalty_type>
478454
__global__ void batchApplyRepetitionPenalty(T* logits,
@@ -598,13 +574,7 @@ void invokeBatchApplyRepetitionPenalty(T* logits,
598574
RepetitionPenaltyType penalty_type, \
599575
cudaStream_t stream);
600576

601-
#ifdef ENABLE_FP32
602577
INSTANTIATE_INVOKE_BATCH_APPLY_REPETITION_PENALTY(float);
603-
#endif
604-
INSTANTIATE_INVOKE_BATCH_APPLY_REPETITION_PENALTY(half);
605-
#ifdef ENABLE_BF16
606-
INSTANTIATE_INVOKE_BATCH_APPLY_REPETITION_PENALTY(__nv_bfloat16);
607-
#endif
608578

609579
template<typename T>
610580
__global__ void batchApplyMinLengthPenalty(T* __restrict__ logits,
@@ -653,12 +623,6 @@ void invokeMinLengthPenalty(T* logits,
653623
const int end_ids_size, \
654624
cudaStream_t stream);
655625

656-
#ifdef ENABLE_FP32
657626
INSTANTIATE_INVOKE_MIN_LENGTH_PENALTY(float);
658-
#endif
659-
INSTANTIATE_INVOKE_MIN_LENGTH_PENALTY(half);
660-
#ifdef ENABLE_BF16
661-
INSTANTIATE_INVOKE_MIN_LENGTH_PENALTY(__nv_bfloat16);
662-
#endif
663627

664628
} // namespace turbomind

src/turbomind/kernels/sampling_topk_kernels.cu

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ __global__ void topKSortStage1(T* logits,
108108
if (tid == 0) {
109109
topk_tmp_id_buf[ite] = total.p;
110110
topk_tmp_val_buf[ite] = total.u;
111-
if (total.p != -1) {
111+
if (total.u != -getInfValue<T>()) {
112112
logits[total.p] = -MAX_T_VAL;
113113
}
114114
}
@@ -244,12 +244,6 @@ void invokeTopKSortFilter(TopKSortFilterParams& params, cudaStream_t stream)
244244
}
245245
}
246246

247-
#ifdef ENABLE_FP32
248247
template void invokeTopKSortFilter<float>(TopKSortFilterParams& params, cudaStream_t stream);
249-
#endif
250-
template void invokeTopKSortFilter<half>(TopKSortFilterParams& params, cudaStream_t stream);
251-
#ifdef ENABLE_BF16
252-
template void invokeTopKSortFilter<nv_bfloat16>(TopKSortFilterParams& params, cudaStream_t stream);
253-
#endif
254248

255249
} // namespace turbomind

src/turbomind/kernels/sampling_topp_kernels.cu

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -145,13 +145,7 @@ void invokeSoftmax(T* logits,
145145
const int* kept, \
146146
cudaStream_t stream);
147147

148-
#ifdef ENABLE_FP32
149148
INSTANTIATE_INVOKE_SOFTMAX(float);
150-
#endif
151-
INSTANTIATE_INVOKE_SOFTMAX(half);
152-
#ifdef ENABLE_BF16
153-
INSTANTIATE_INVOKE_SOFTMAX(nv_bfloat16);
154-
#endif
155149

156150
template<typename T, int MAX_K, int THREADBLOCK_SIZE>
157151
__launch_bounds__(THREADBLOCK_SIZE) __global__ void topp_beam_topk_kernel(const T* logits,
@@ -290,13 +284,7 @@ void invokeTopPSort(TopPSortParams& params, cudaStream_t stream)
290284
stream)); // cudaStream_t
291285
}
292286

293-
#ifdef ENABLE_FP32
294287
template void invokeTopPSort<float>(TopPSortParams& params, cudaStream_t stream);
295-
#endif
296-
template void invokeTopPSort<half>(TopPSortParams& params, cudaStream_t stream);
297-
#ifdef ENABLE_BF16
298-
template void invokeTopPSort<nv_bfloat16>(TopPSortParams& params, cudaStream_t stream);
299-
#endif
300288

301289
template<typename T, int BLOCK_SIZE>
302290
__global__ void topPMinPFilter(T* sorted_logits,
@@ -404,12 +392,6 @@ void invokeTopPMinPFilter(TopPMinPFilterParams& params, cudaStream_t stream)
404392
params.min_ps);
405393
}
406394

407-
#ifdef ENABLE_FP32
408395
template void invokeTopPMinPFilter<float>(TopPMinPFilterParams& params, cudaStream_t stream);
409-
#endif
410-
template void invokeTopPMinPFilter<half>(TopPMinPFilterParams& params, cudaStream_t stream);
411-
#ifdef ENABLE_BF16
412-
template void invokeTopPMinPFilter<nv_bfloat16>(TopPMinPFilterParams& params, cudaStream_t stream);
413-
#endif
414396

415397
} // namespace turbomind

src/turbomind/layers/DynamicDecodeLayer.cc

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,14 +32,11 @@ DynamicDecodeLayer::DynamicDecodeLayer(DataType dtype,
3232
const cudaDeviceProp* device_prop)
3333
{
3434
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
35-
auto dispatch = [&](auto t) {
36-
using T = decltype(t);
37-
BaseDynamicDecodeLayer::BaseParam param{max_batch_size, vocab_size, vocab_size_padded, stream, device_prop};
38-
layers_.emplace_back(new LogitsProcessorLayer<T>{param});
39-
layers_.emplace_back(new SamplingLayer<T>{param});
40-
layers_.emplace_back(new StopCriteriaLayer<T>{param});
41-
};
42-
TM_DISPATCH_PRIMARY_DTYPES(dtype, dispatch);
35+
TM_CHECK(dtype == kFloat32);
36+
BaseDynamicDecodeLayer::BaseParam param{max_batch_size, vocab_size, vocab_size_padded, stream, device_prop};
37+
layers_.emplace_back(new LogitsProcessorLayer<float>{param});
38+
layers_.emplace_back(new SamplingLayer<float>{param});
39+
layers_.emplace_back(new StopCriteriaLayer<float>{param});
4340
}
4441

4542
DynamicDecodeLayer::~DynamicDecodeLayer() {}

src/turbomind/layers/sampling_layers/LogitsProcessorLayer.cc

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -234,11 +234,6 @@ void LogitsProcessorLayer<T>::Setup(const std::vector<const Request*>& rs, const
234234
TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
235235
}
236236

237-
#ifdef ENABLE_FP32
238237
template class LogitsProcessorLayer<float>;
239-
#endif
240-
template class LogitsProcessorLayer<half>;
241-
#ifdef ENABLE_BF16
242-
template class LogitsProcessorLayer<__nv_bfloat16>;
243-
#endif
238+
244239
} // namespace turbomind

src/turbomind/layers/sampling_layers/SamplingLayer.cc

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -178,12 +178,6 @@ void SamplingLayer<T>::Setup(const std::vector<const Request*>& rs, const Tensor
178178
core::Copy(min_p_.data(), bsz, min_p_buf_.data());
179179
}
180180

181-
#ifdef ENABLE_FP32
182181
template class SamplingLayer<float>;
183-
#endif
184-
template class SamplingLayer<half>;
185-
#ifdef ENABLE_BF16
186-
template class SamplingLayer<nv_bfloat16>;
187-
#endif
188182

189183
} // namespace turbomind

0 commit comments

Comments (0)