@@ -109,13 +109,7 @@ void invokeApplyTemperaturePenalty(T* logits,
109109 const int vocab_size_padd, \
110110 cudaStream_t stream);
111111
112- #ifdef ENABLE_FP32
113112INISTANTIATE_INVOKE_APPLY_TEMPERATURE_PENALTY (float );
114- #endif
115- INISTANTIATE_INVOKE_APPLY_TEMPERATURE_PENALTY (half);
116- #ifdef ENABLE_BF16
117- INISTANTIATE_INVOKE_APPLY_TEMPERATURE_PENALTY (__nv_bfloat16);
118- #endif
119113
120114template <typename T>
121115__global__ void batchApplyTemperaturePenalty (T* logits,
@@ -215,13 +209,7 @@ void invokeBatchApplyTemperaturePenalty(T* logits,
215209 const int vocab_size_padd, \
216210 cudaStream_t stream);
217211
218- #ifdef ENABLE_FP32
219212INISTANTIATE_INVOKE_BATCH_APPLY_TEMPERATURE_PENALTY (float );
220- #endif
221- INISTANTIATE_INVOKE_BATCH_APPLY_TEMPERATURE_PENALTY (half);
222- #ifdef ENABLE_BF16
223- INISTANTIATE_INVOKE_BATCH_APPLY_TEMPERATURE_PENALTY (__nv_bfloat16);
224- #endif
225213
226214template <typename T, int vec_size>
227215__global__ void batchApplyTemperaturePenalty_v2 (T* logits,
@@ -268,7 +256,7 @@ __global__ void batchApplyTemperaturePenalty_v2(T* logits,
268256 vec[c] = (float )vec[c] * scale;
269257 }
270258 else {
271- vec[c] = -getMaxValue <T>();
259+ vec[c] = -getInfValue <T>();
272260 }
273261 }
274262
@@ -328,13 +316,7 @@ void invokeBatchApplyTemperaturePenalty_v2(T* logits,
328316 const int vocab_size_padded, \
329317 cudaStream_t stream);
330318
331- #ifdef ENABLE_FP32
332319INSTANTIATE_INVOKE_BATCH_APPLY_TEMPERATURE_PENALTY_V2 (float );
333- #endif
334- INSTANTIATE_INVOKE_BATCH_APPLY_TEMPERATURE_PENALTY_V2 (half);
335- #ifdef ENABLE_BF16
336- INSTANTIATE_INVOKE_BATCH_APPLY_TEMPERATURE_PENALTY_V2 (__nv_bfloat16);
337- #endif
338320
339321template <typename T, RepetitionPenaltyType penalty_type>
340322__global__ void applyRepetitionPenalty (T* logits,
@@ -466,13 +448,7 @@ void invokeApplyRepetitionPenalty(T* logits,
466448 const RepetitionPenaltyType penalty_type, \
467449 cudaStream_t stream);
468450
469- #ifdef ENABLE_FP32
470451INISTANTIATE_INVOKE_APPLY_REPETITION_PENALTY (float );
471- #endif
472- INISTANTIATE_INVOKE_APPLY_REPETITION_PENALTY (half);
473- #ifdef ENABLE_BF16
474- INISTANTIATE_INVOKE_APPLY_REPETITION_PENALTY (__nv_bfloat16);
475- #endif
476452
477453template <typename T, RepetitionPenaltyType penalty_type>
478454__global__ void batchApplyRepetitionPenalty (T* logits,
@@ -598,13 +574,7 @@ void invokeBatchApplyRepetitionPenalty(T* logits,
598574 RepetitionPenaltyType penalty_type, \
599575 cudaStream_t stream);
600576
601- #ifdef ENABLE_FP32
602577INSTANTIATE_INVOKE_BATCH_APPLY_REPETITION_PENALTY (float );
603- #endif
604- INSTANTIATE_INVOKE_BATCH_APPLY_REPETITION_PENALTY (half);
605- #ifdef ENABLE_BF16
606- INSTANTIATE_INVOKE_BATCH_APPLY_REPETITION_PENALTY (__nv_bfloat16);
607- #endif
608578
609579template <typename T>
610580__global__ void batchApplyMinLengthPenalty (T* __restrict__ logits,
@@ -653,12 +623,6 @@ void invokeMinLengthPenalty(T* logits,
653623 const int end_ids_size, \
654624 cudaStream_t stream);
655625
656- #ifdef ENABLE_FP32
657626INSTANTIATE_INVOKE_MIN_LENGTH_PENALTY (float );
658- #endif
659- INSTANTIATE_INVOKE_MIN_LENGTH_PENALTY (half);
660- #ifdef ENABLE_BF16
661- INSTANTIATE_INVOKE_MIN_LENGTH_PENALTY (__nv_bfloat16);
662- #endif
663627
664628} // namespace turbomind
0 commit comments