Skip to content

Commit 3afd6c0

Browse files
authored
fix sampling if data overflow after temperature penalty (#3508)
* fix sampling if data overflow after temperature penalty
* using float type for sampling
* prevent potential index error
* update
* fix lint
* fix batch
* update name
* update name
* update check
1 parent b057894 commit 3afd6c0

16 files changed

+160
-139
lines changed

src/turbomind/engine/model_request.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ auto ModelRequest::Forward(InputParam param, std::function<void()> cb) -> Output
8989
}
9090

9191
if (param.gen_cfg.output_logprobs) {
92-
add(outputs_, "logprob_vals", data_type_, kCPU, max_out_len, kMaxLogProb);
92+
add(outputs_, "logprob_vals", data_type_v<float>, kCPU, max_out_len, kMaxLogProb);
9393
add(outputs_, "logprob_indexes", data_type_v<int>, kCPU, max_out_len, kMaxLogProb);
9494
add(outputs_, "logprob_nums", data_type_v<int>, kCPU, max_out_len);
9595
}

src/turbomind/kernels/ban_bad_words.cu

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -163,12 +163,6 @@ void invokeBanBadWords(T* logits,
163163
size_t step, \
164164
cudaStream_t stream);
165165

166-
#ifdef ENABLE_FP32
167166
INSTANTIATE_INVOKE_BAN_BAD_WORDS(float);
168-
#endif
169-
INSTANTIATE_INVOKE_BAN_BAD_WORDS(half);
170-
#ifdef ENABLE_BF16
171-
INSTANTIATE_INVOKE_BAN_BAD_WORDS(__nv_bfloat16);
172-
#endif
173167

174168
} // namespace turbomind

src/turbomind/kernels/reduce_kernel_utils.cuh

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,32 @@ __device__ inline __nv_bfloat16 getMaxValue<__nv_bfloat16>()
7979
}
8080
#endif
8181

82+
template<typename T>
83+
__device__ inline T getInfValue();
84+
85+
template<>
86+
__device__ inline float getInfValue<float>()
87+
{
88+
return INFINITY;
89+
}
90+
91+
template<>
92+
__device__ inline half getInfValue<half>()
93+
{
94+
return __ushort_as_half((unsigned short)0x7C00U);
95+
}
96+
97+
#ifdef ENABLE_BF16
98+
template<>
99+
__device__ inline __nv_bfloat16 getInfValue<__nv_bfloat16>()
100+
{
101+
#if __CUDA_ARCH__ >= 800
102+
return __ushort_as_bfloat16((unsigned short)0x7F80U);
103+
#endif
104+
return {};
105+
}
106+
#endif
107+
82108
template<int Bytes>
83109
__device__ inline void copy(const void* local, void* data)
84110
{
@@ -344,8 +370,8 @@ __device__ __forceinline__ TopK<T, MAX_K> reduce_topk_op(const TopK<T, MAX_K>& a
344370

345371
template<typename T>
346372
struct TopK_2 {
347-
int p = -1;
348-
T u = -getMaxValue<T>();
373+
int p = 0;
374+
T u = -getInfValue<T>();
349375

350376
__device__ __forceinline__ void insert(T elem, int elem_id)
351377
{
@@ -357,8 +383,8 @@ struct TopK_2 {
357383

358384
__device__ __forceinline__ void init()
359385
{
360-
u = -getMaxValue<T>();
361-
p = -1;
386+
u = -getInfValue<T>();
387+
p = 0;
362388
}
363389
};
364390

src/turbomind/kernels/sampling_kernels.cu

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -97,12 +97,6 @@ void invokeSampling(SamplingParams& params, cudaStream_t stream)
9797
params.sampled_nums);
9898
}
9999

100-
#ifdef ENABLE_FP32
101100
template void invokeSampling<float>(SamplingParams& params, cudaStream_t stream);
102-
#endif
103-
template void invokeSampling<half>(SamplingParams& params, cudaStream_t stream);
104-
#ifdef ENABLE_BF16
105-
template void invokeSampling<nv_bfloat16>(SamplingParams& params, cudaStream_t stream);
106-
#endif
107101

108102
} // namespace turbomind

src/turbomind/kernels/sampling_penalty_kernels.cu

Lines changed: 1 addition & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -109,13 +109,7 @@ void invokeApplyTemperaturePenalty(T* logits,
109109
const int vocab_size_padd, \
110110
cudaStream_t stream);
111111

112-
#ifdef ENABLE_FP32
113112
INISTANTIATE_INVOKE_APPLY_TEMPERATURE_PENALTY(float);
114-
#endif
115-
INISTANTIATE_INVOKE_APPLY_TEMPERATURE_PENALTY(half);
116-
#ifdef ENABLE_BF16
117-
INISTANTIATE_INVOKE_APPLY_TEMPERATURE_PENALTY(__nv_bfloat16);
118-
#endif
119113

120114
template<typename T>
121115
__global__ void batchApplyTemperaturePenalty(T* logits,
@@ -215,13 +209,7 @@ void invokeBatchApplyTemperaturePenalty(T* logits,
215209
const int vocab_size_padd, \
216210
cudaStream_t stream);
217211

218-
#ifdef ENABLE_FP32
219212
INISTANTIATE_INVOKE_BATCH_APPLY_TEMPERATURE_PENALTY(float);
220-
#endif
221-
INISTANTIATE_INVOKE_BATCH_APPLY_TEMPERATURE_PENALTY(half);
222-
#ifdef ENABLE_BF16
223-
INISTANTIATE_INVOKE_BATCH_APPLY_TEMPERATURE_PENALTY(__nv_bfloat16);
224-
#endif
225213

226214
template<typename T, int vec_size>
227215
__global__ void batchApplyTemperaturePenalty_v2(T* logits,
@@ -268,7 +256,7 @@ __global__ void batchApplyTemperaturePenalty_v2(T* logits,
268256
vec[c] = (float)vec[c] * scale;
269257
}
270258
else {
271-
vec[c] = -getMaxValue<T>();
259+
vec[c] = -getInfValue<T>();
272260
}
273261
}
274262

@@ -328,13 +316,7 @@ void invokeBatchApplyTemperaturePenalty_v2(T* logits,
328316
const int vocab_size_padded, \
329317
cudaStream_t stream);
330318

331-
#ifdef ENABLE_FP32
332319
INSTANTIATE_INVOKE_BATCH_APPLY_TEMPERATURE_PENALTY_V2(float);
333-
#endif
334-
INSTANTIATE_INVOKE_BATCH_APPLY_TEMPERATURE_PENALTY_V2(half);
335-
#ifdef ENABLE_BF16
336-
INSTANTIATE_INVOKE_BATCH_APPLY_TEMPERATURE_PENALTY_V2(__nv_bfloat16);
337-
#endif
338320

339321
template<typename T, RepetitionPenaltyType penalty_type>
340322
__global__ void applyRepetitionPenalty(T* logits,
@@ -466,13 +448,7 @@ void invokeApplyRepetitionPenalty(T* logits,
466448
const RepetitionPenaltyType penalty_type, \
467449
cudaStream_t stream);
468450

469-
#ifdef ENABLE_FP32
470451
INISTANTIATE_INVOKE_APPLY_REPETITION_PENALTY(float);
471-
#endif
472-
INISTANTIATE_INVOKE_APPLY_REPETITION_PENALTY(half);
473-
#ifdef ENABLE_BF16
474-
INISTANTIATE_INVOKE_APPLY_REPETITION_PENALTY(__nv_bfloat16);
475-
#endif
476452

477453
template<typename T, RepetitionPenaltyType penalty_type>
478454
__global__ void batchApplyRepetitionPenalty(T* logits,
@@ -598,13 +574,7 @@ void invokeBatchApplyRepetitionPenalty(T* logits,
598574
RepetitionPenaltyType penalty_type, \
599575
cudaStream_t stream);
600576

601-
#ifdef ENABLE_FP32
602577
INSTANTIATE_INVOKE_BATCH_APPLY_REPETITION_PENALTY(float);
603-
#endif
604-
INSTANTIATE_INVOKE_BATCH_APPLY_REPETITION_PENALTY(half);
605-
#ifdef ENABLE_BF16
606-
INSTANTIATE_INVOKE_BATCH_APPLY_REPETITION_PENALTY(__nv_bfloat16);
607-
#endif
608578

609579
template<typename T>
610580
__global__ void batchApplyMinLengthPenalty(T* __restrict__ logits,
@@ -653,12 +623,6 @@ void invokeMinLengthPenalty(T* logits,
653623
const int end_ids_size, \
654624
cudaStream_t stream);
655625

656-
#ifdef ENABLE_FP32
657626
INSTANTIATE_INVOKE_MIN_LENGTH_PENALTY(float);
658-
#endif
659-
INSTANTIATE_INVOKE_MIN_LENGTH_PENALTY(half);
660-
#ifdef ENABLE_BF16
661-
INSTANTIATE_INVOKE_MIN_LENGTH_PENALTY(__nv_bfloat16);
662-
#endif
663627

664628
} // namespace turbomind

src/turbomind/kernels/sampling_topk_kernels.cu

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ __global__ void topKSortStage1(T* logits,
108108
if (tid == 0) {
109109
topk_tmp_id_buf[ite] = total.p;
110110
topk_tmp_val_buf[ite] = total.u;
111-
if (total.p != -1) {
111+
if (total.u != -getInfValue<T>()) {
112112
logits[total.p] = -MAX_T_VAL;
113113
}
114114
}
@@ -244,12 +244,6 @@ void invokeTopKSortFilter(TopKSortFilterParams& params, cudaStream_t stream)
244244
}
245245
}
246246

247-
#ifdef ENABLE_FP32
248247
template void invokeTopKSortFilter<float>(TopKSortFilterParams& params, cudaStream_t stream);
249-
#endif
250-
template void invokeTopKSortFilter<half>(TopKSortFilterParams& params, cudaStream_t stream);
251-
#ifdef ENABLE_BF16
252-
template void invokeTopKSortFilter<nv_bfloat16>(TopKSortFilterParams& params, cudaStream_t stream);
253-
#endif
254248

255249
} // namespace turbomind

src/turbomind/kernels/sampling_topp_kernels.cu

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -145,13 +145,7 @@ void invokeSoftmax(T* logits,
145145
const int* kept, \
146146
cudaStream_t stream);
147147

148-
#ifdef ENABLE_FP32
149148
INSTANTIATE_INVOKE_SOFTMAX(float);
150-
#endif
151-
INSTANTIATE_INVOKE_SOFTMAX(half);
152-
#ifdef ENABLE_BF16
153-
INSTANTIATE_INVOKE_SOFTMAX(nv_bfloat16);
154-
#endif
155149

156150
template<typename T, int MAX_K, int THREADBLOCK_SIZE>
157151
__launch_bounds__(THREADBLOCK_SIZE) __global__ void topp_beam_topk_kernel(const T* logits,
@@ -290,13 +284,7 @@ void invokeTopPSort(TopPSortParams& params, cudaStream_t stream)
290284
stream)); // cudaStream_t
291285
}
292286

293-
#ifdef ENABLE_FP32
294287
template void invokeTopPSort<float>(TopPSortParams& params, cudaStream_t stream);
295-
#endif
296-
template void invokeTopPSort<half>(TopPSortParams& params, cudaStream_t stream);
297-
#ifdef ENABLE_BF16
298-
template void invokeTopPSort<nv_bfloat16>(TopPSortParams& params, cudaStream_t stream);
299-
#endif
300288

301289
template<typename T, int BLOCK_SIZE>
302290
__global__ void topPMinPFilter(T* sorted_logits,
@@ -404,12 +392,6 @@ void invokeTopPMinPFilter(TopPMinPFilterParams& params, cudaStream_t stream)
404392
params.min_ps);
405393
}
406394

407-
#ifdef ENABLE_FP32
408395
template void invokeTopPMinPFilter<float>(TopPMinPFilterParams& params, cudaStream_t stream);
409-
#endif
410-
template void invokeTopPMinPFilter<half>(TopPMinPFilterParams& params, cudaStream_t stream);
411-
#ifdef ENABLE_BF16
412-
template void invokeTopPMinPFilter<nv_bfloat16>(TopPMinPFilterParams& params, cudaStream_t stream);
413-
#endif
414396

415397
} // namespace turbomind

src/turbomind/layers/DynamicDecodeLayer.cc

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,14 +32,11 @@ DynamicDecodeLayer::DynamicDecodeLayer(DataType dtype,
3232
const cudaDeviceProp* device_prop)
3333
{
3434
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
35-
auto dispatch = [&](auto t) {
36-
using T = decltype(t);
37-
BaseDynamicDecodeLayer::BaseParam param{max_batch_size, vocab_size, vocab_size_padded, stream, device_prop};
38-
layers_.emplace_back(new LogitsProcessorLayer<T>{param});
39-
layers_.emplace_back(new SamplingLayer<T>{param});
40-
layers_.emplace_back(new StopCriteriaLayer<T>{param});
41-
};
42-
TM_DISPATCH_PRIMARY_DTYPES(dtype, dispatch);
35+
TM_CHECK(dtype == kFloat32);
36+
BaseDynamicDecodeLayer::BaseParam param{max_batch_size, vocab_size, vocab_size_padded, stream, device_prop};
37+
layers_.emplace_back(new LogitsProcessorLayer<float>{param});
38+
layers_.emplace_back(new SamplingLayer<float>{param});
39+
layers_.emplace_back(new StopCriteriaLayer<float>{param});
4340
}
4441

4542
DynamicDecodeLayer::~DynamicDecodeLayer() {}

src/turbomind/layers/sampling_layers/LogitsProcessorLayer.cc

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -234,11 +234,6 @@ void LogitsProcessorLayer<T>::Setup(const std::vector<const Request*>& rs, const
234234
TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
235235
}
236236

237-
#ifdef ENABLE_FP32
238237
template class LogitsProcessorLayer<float>;
239-
#endif
240-
template class LogitsProcessorLayer<half>;
241-
#ifdef ENABLE_BF16
242-
template class LogitsProcessorLayer<__nv_bfloat16>;
243-
#endif
238+
244239
} // namespace turbomind

src/turbomind/layers/sampling_layers/SamplingLayer.cc

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -178,12 +178,6 @@ void SamplingLayer<T>::Setup(const std::vector<const Request*>& rs, const Tensor
178178
core::Copy(min_p_.data(), bsz, min_p_buf_.data());
179179
}
180180

181-
#ifdef ENABLE_FP32
182181
template class SamplingLayer<float>;
183-
#endif
184-
template class SamplingLayer<half>;
185-
#ifdef ENABLE_BF16
186-
template class SamplingLayer<nv_bfloat16>;
187-
#endif
188182

189183
} // namespace turbomind

0 commit comments

Comments (0)