Skip to content
Open
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
a5a52a8
Adds cub as a submodule
rhenry-nv Oct 17, 2020
7085b44
Redo topk implementation
rhenry-nv Oct 19, 2020
bae684c
Cuda 11 Support for nvidia's improvements
XapaJIaMnu Oct 21, 2020
464bd52
Remove redundant includes in topk
rhenry-nv Oct 22, 2020
023f7e1
Fixes topk when all elements are the lowest value of the input type
rhenry-nv Oct 23, 2020
e463f76
Adds license to nth_element.cu
rhenry-nv Oct 23, 2020
66d879c
Removes unneccessary cub include outside of header guards
rhenry-nv Oct 26, 2020
ce2da48
Handles not found in topk stage 2
rhenry-nv Oct 26, 2020
2270945
Removes unneeded include
rhenry-nv Oct 26, 2020
5bdcf5d
Refactor topk to make more generic. Towards integration with topk ope…
rhenry-nv Dec 4, 2020
7c369ea
Refactors topk to replace the implementation of the existing topk ope…
rhenry-nv Dec 4, 2020
32445a9
Finishes refactor of topk.cuh and replaces the topk call in topk.cu. …
rhenry-nv Dec 4, 2020
e028945
Adds comment to topKLauncher template parameters. Regression tests pass
rhenry-nv Dec 4, 2020
fa7a616
Fix change log
rhenry-nv Dec 4, 2020
640aad3
Removes cub include outside of header guard. IDE keep automatically a…
rhenry-nv Dec 4, 2020
9f4ae14
Adds license to topk.cu
rhenry-nv Dec 4, 2020
9ad2ca8
Fixes TopK so that the correct result is returned if the input array …
rhenry-nv Dec 11, 2020
f9e8b83
Sets the minimal value to inf in topk
rhenry-nv Dec 11, 2020
8ac9300
Adds comments around topk struct, renames fields so that they are mor…
rhenry-nv Dec 18, 2020
808d2f4
Adds comments to topk_stage1
rhenry-nv Dec 19, 2020
835b080
Fixes code in topk_stage2
rhenry-nv Dec 19, 2020
abf2a58
Fixes some comments
rhenry-nv Dec 19, 2020
b77725e
Fixes renames beams_per_batch to items_per_row and starts commenting …
rhenry-nv Dec 19, 2020
f6641ca
Adds more comments and gives some variables better names
rhenry-nv Dec 19, 2020
ea26be4
Removes individual NVIDIA licenses and adds NVIDIA to the LICENSE.md …
rhenry-nv Dec 19, 2020
c6a4df1
changes some variable names
rhenry-nv Dec 19, 2020
5c7f240
Merge master into topk_refactor
rhenry-nv Mar 5, 2021
3130aa7
Removes diff marker from changelog
rhenry-nv Mar 5, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,6 @@
[submodule "src/3rd_party/simple-websocket-server"]
path = src/3rd_party/simple-websocket-server
url = https://github.com/marian-nmt/Simple-WebSocket-Server
[submodule "src/3rd_party/cub"]
path = src/3rd_party/cub
url = https://github.com/NVIDIA/cub
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
## [Unreleased]

### Added

- Includes cub as a dependency
- Replaces the topK implementation in nth_element.cu and topk.cu
- Add --train-embedder-rank for fine-tuning any encoder(-decoder) model for multi-lingual similarity via softmax-margin loss
- Add --logical-epoch that allows to redefine the displayed epoch counter as a multiple of n data epochs, updates or labels. Also allows to define width of fractional part with second argument.
- Add --metrics chrf for computing ChrF according to https://www.aclweb.org/anthology/W15-3049/ and SacreBLEU reference implementation
Expand Down
1 change: 1 addition & 0 deletions src/3rd_party/cub
Submodule cub added at 52d58a
302 changes: 302 additions & 0 deletions src/3rd_party/topk.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,302 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* This code is modified from the topk implementation in NVIDIA's faster
* transformer repository. The original source code files can be found here:
*
* https://github.com/NVIDIA/DeepLearningExamples/blob/master/FasterTransformer/v3.0/fastertransformer/cuda/topk_kernels.cu
* https://github.com/NVIDIA/DeepLearningExamples/blob/master/FasterTransformer/v3.0/fastertransformer/cuda/topk_kernels.cuh
*/

#pragma once
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cmath>
#if CUDA_VERSION >= 11000
#include <cub/cub.cuh>
#else
#include "cub/cub/cub.cuh"
#endif

// Largest number of thread blocks that cooperate on a single input row in topk_stage_1.
// It is passed as the BLOCKS_PER_BEAM_ template argument by the launcher cases for small k
// (k = 1, 2, 4, 6, 8, 10); the cases for larger k use fewer blocks per row.
// NOTE(review): callers presumably size the temporary id/value buffers using this constant - confirm.
#define MAX_BLOCKS_PER_BEAM 8

// Trait that yields positive infinity for a floating point type T.
// Only the specializations below are defined; using any other T is a compile error.
template <typename T>
struct FpInfinity;

// Positive infinity for single precision.
template <>
struct FpInfinity<float> {
  static __host__ __device__ __forceinline__ float infinity() {
    const float positiveInf = INFINITY;
    return positiveInf;
  }
};

// Positive infinity for half precision, obtained by converting the float infinity.
template <>
struct FpInfinity<__half> {
  static __host__ __device__ __forceinline__ __half infinity() {
    const float positiveInf = INFINITY;
    return __float2half(positiveInf);
  }
};

// A (value, index) candidate used while searching for the best element.
//   p - index of the current best element
//   u - value of the current best element
// A default-constructed TopK is set up for a maximum search (u = -infinity, p = 0).
template<typename IndexType, typename T>
struct TopK {
  IndexType p = 0;
  T u = -FpInfinity<T>::infinity();

  // Adopts (elem, elem_id) only when elem is strictly greater than the stored value.
  __device__ __forceinline__ void insertKeepMax(T elem, IndexType elem_id) {
    if(!(elem > u))
      return;
    u = elem;
    p = elem_id;
  }

  // Adopts (elem, elem_id) only when elem is strictly smaller than the stored value.
  __device__ __forceinline__ void insertKeepMin(T elem, IndexType elem_id) {
    if(!(elem < u))
      return;
    u = elem;
    p = elem_id;
  }

  // Resets the candidate to the neutral element of the search:
  // -infinity when looking for maxima (descendingOrder), +infinity when looking for minima.
  __device__ __forceinline__ void init(bool descendingOrder) {
    p = 0;
    if(descendingOrder) {
      u = -FpInfinity<T>::infinity();
    } else {
      u = FpInfinity<T>::infinity();
    }
  }
};

// Reduction operator for a maximum search: returns `a` only when its value is strictly
// greater than that of `b`; in every other case (including equal values) returns `b`.
template<typename IndexType, typename T>
__device__ __forceinline__ TopK<IndexType, T> reduce_topk_max(const TopK<IndexType, T>& a, const TopK<IndexType, T>& b) {
  if(a.u > b.u)
    return a;
  return b;
}

// Reduction operator for a minimum search: returns `a` only when its value is strictly
// smaller than that of `b`; in every other case (including equal values) returns `b`.
template<typename IndexType, typename T>
__device__ __forceinline__ TopK<IndexType, T> reduce_topk_min(const TopK<IndexType, T>& a, const TopK<IndexType, T>& b) {
  if(a.u < b.u)
    return a;
  return b;
}

// Stage 1 of the two-stage top-k: every block extracts the k best elements of its slice of one row.
//
// Launch layout: one-dimensional grid in which BLOCKS_PER_BEAM_ consecutive blocks share a row
// (row_id = blockIdx.x / BLOCKS_PER_BEAM_), with BLOCK_SIZE_ threads per block. Block `block_lane`
// of a row visits the elements threadStart, threadStart + BLOCK_SIZE_ * BLOCKS_PER_BEAM_, ... of
// that row, so the blocks of a row work on disjoint elements.
//
// Parameters:
//   log_probs        - input values, rows of length vocab_size stored back to back. Selected entries
//                      are temporarily overwritten with the sentinel and restored before the kernel ends.
//   topk_tmp_id_buf  - output: index of each candidate, k entries per block, stored at
//                      row_id * BLOCKS_PER_BEAM_ * k + block_lane * k.
//   topk_tmp_val_buf - output: value of each candidate, same layout as topk_tmp_id_buf.
//   k                - number of candidates each block extracts.
//   vocab_size       - number of elements per row.
//   descendingOrder  - non-zero selects the largest elements, zero selects the smallest.
//
// Template parameter getRowOffsets: when true the stored indices are offsets within the row,
// otherwise they are offsets from the log_probs base pointer.
//
// The sentinel ("minimal") is -infinity for a descending search and +infinity for an ascending one.
// A slot whose value equals the sentinel means no candidate was left for that iteration.
template<typename IndexType, typename T, int BLOCK_SIZE_, int BLOCKS_PER_BEAM_, bool getRowOffsets>
__global__ void topk_stage_1(T* log_probs,
                             IndexType* topk_tmp_id_buf,
                             T* topk_tmp_val_buf,
                             const int k,
                             const int vocab_size,
                             const int descendingOrder) {

  typedef cub::BlockReduce<TopK<IndexType, T>, BLOCK_SIZE_> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  const IndexType tid = threadIdx.x;
  const IndexType bid = blockIdx.x;
  const IndexType row_id = bid / BLOCKS_PER_BEAM_;      // row of log_probs handled by this block
  const IndexType block_lane = bid % BLOCKS_PER_BEAM_;  // position of this block among the blocks of its row
  const IndexType tmp_log_buf_index = row_id * vocab_size;
  const IndexType tmp_topk_buf_index = row_id * BLOCKS_PER_BEAM_ * k + block_lane * k;
  const IndexType threadStart = tid + block_lane * BLOCK_SIZE_;
  const T minimal = descendingOrder ? -FpInfinity<T>::infinity() : FpInfinity<T>::infinity();

  // Index reported when this thread finds no candidate. It is expressed in the same coordinate
  // system as the indices passed to insertKeepMax/insertKeepMin below (offset from the log_probs
  // base pointer), so it stays inside this block's row both before and after the getRowOffsets
  // adjustment. Threads that start beyond the end of the row fall back to the first element of the row.
  const IndexType fallbackIndex
      = tmp_log_buf_index + (threadStart < vocab_size ? threadStart : IndexType(0));

  TopK<IndexType, T> partial;

  for(int ite = 0; ite < k; ite++) {
    partial.init(descendingOrder);
    partial.p = fallbackIndex;

    #pragma unroll
    for(IndexType elem_id = threadStart; elem_id < vocab_size; elem_id += BLOCK_SIZE_ * BLOCKS_PER_BEAM_) {
      const IndexType index = elem_id + tmp_log_buf_index;
      if(descendingOrder) {
        partial.insertKeepMax(log_probs[index], index);
      } else {
        partial.insertKeepMin(log_probs[index], index);
      }
    }

    TopK<IndexType, T> total = BlockReduce(temp_storage).Reduce(partial, descendingOrder ? reduce_topk_max<IndexType, T> : reduce_topk_min<IndexType, T>);

    // Only thread 0 uses the reduction result.
    if(tid == 0) {
      const IndexType index = tmp_topk_buf_index + ite;
      topk_tmp_id_buf[index] = getRowOffsets ? total.p - tmp_log_buf_index : total.p;
      topk_tmp_val_buf[index] = total.u;
      // If a candidate was found, blank it out so the next iteration picks the next best one.
      // Otherwise every remaining value already equals the sentinel and no write is needed.
      if(total.u != minimal) log_probs[total.p] = minimal;
    }
    // Makes the blanked-out value visible to the whole block and allows temp_storage to be reused.
    __syncthreads();
  }

  // Restore the entries of log_probs that were blanked out above.
  for(int beam = tid; beam < k; beam += BLOCK_SIZE_) {
    const IndexType index = tmp_topk_buf_index + beam;
    const T val = topk_tmp_val_buf[index];
    // A slot holding the sentinel means nothing was blanked out for it, so there is nothing to restore.
    if(val != minimal) {
      const IndexType k_idx = getRowOffsets ? topk_tmp_id_buf[index] + tmp_log_buf_index : topk_tmp_id_buf[index];
      log_probs[k_idx] = val;
    }
  }
}

// Stage 2 of the two-stage top-k: one block per batch entry merges the candidates written by
// topk_stage_1 into the final k results.
//
// Launch layout: blockIdx.x selects the batch entry, BLOCK_SIZE_ threads per block, and
// k * sizeof(TopK<IndexType, T>) bytes of dynamic shared memory for the per-block result list.
//
// Parameters:
//   topk_tmp_id_buf  - candidate indices written by stage 1.
//   topk_tmp_val_buf - candidate values written by stage 1; selected entries are overwritten
//                      with the sentinel (-infinity for descending, +infinity for ascending).
//   top              - optional output of (index, value) pairs, k per batch entry; skipped when null.
//   outIndices       - optional output of indices, k per batch entry; skipped when null.
//   outVals          - optional output of values, k per batch entry; skipped when null.
//   beams_per_batch  - number of rows per batch entry.
//   k                - number of results to produce per batch entry.
//   descendingOrder  - true selects the largest values, false the smallest.
template<typename IndexType, typename T, int BLOCK_SIZE_, int BLOCKS_PER_BEAM_>
__global__ void topk_stage_2(const IndexType* __restrict topk_tmp_id_buf,
                             T* topk_tmp_val_buf,
                             TopK<IndexType, T>* top,
                             IndexType* outIndices,
                             T* outVals,
                             const int beams_per_batch,
                             const int k,
                             bool descendingOrder) {

  // Number of stage-1 candidates that belong to one batch entry.
  const int candidatesPerBatch = beams_per_batch * k * BLOCKS_PER_BEAM_;
  const int lane = threadIdx.x;
  const int batch = blockIdx.x;
  const T sentinel = descendingOrder ? -FpInfinity<T>::infinity() : FpInfinity<T>::infinity();

  typedef cub::BlockReduce<TopK<IndexType, T>, BLOCK_SIZE_> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  extern __shared__ char array[];

  // Candidate values of this batch entry and the shared list of winners found so far.
  T* candidateVals = topk_tmp_val_buf + batch * candidatesPerBatch;
  TopK<IndexType, T>* winners = (TopK<IndexType, T>*)(array);

  TopK<IndexType, T> best;

  for(int round = 0; round < k; ++round) {
    best.init(descendingOrder);
    #pragma unroll
    for(IndexType pos = lane; pos < candidatesPerBatch; pos += BLOCK_SIZE_) {
      if(descendingOrder) {
        best.insertKeepMax(candidateVals[pos], pos);
      } else {
        best.insertKeepMin(candidateVals[pos], pos);
      }
    }

    TopK<IndexType, T> blockBest = BlockReduce(temp_storage).Reduce(best, descendingOrder ? reduce_topk_max<IndexType, T> : reduce_topk_min<IndexType, T>);

    // Thread 0 records the winner of this round and blanks it out for the next one.
    if(lane == 0) {
      winners[round] = blockBest;
      candidateVals[blockBest.p] = sentinel;
    }
    __syncthreads();
  }

  // Translate each winner's position in the candidate list back to the index stored by stage 1
  // and write it to every output buffer that was provided.
  for(int slot = lane; slot < k; slot += BLOCK_SIZE_) {
    const IndexType candidatePos = winners[slot].p;
    const int outPos = batch * k + slot;

    TopK<IndexType, T> result;
    result.p = topk_tmp_id_buf[batch * candidatesPerBatch + candidatePos];
    result.u = winners[slot].u;

    if(top) {
      top[outPos] = result;
    }
    if(outIndices) {
      outIndices[outPos] = result.p;
    }
    if(outVals) {
      outVals[outPos] = result.u;
    }
  }
}

// Expands to one `case K:` label of the launcher's switch statement. It launches topk_stage_1 with
// BLOCK_SIZE_1_ threads per block and BLOCKS_PER_BEAM_ blocks per row, followed by topk_stage_2 with
// BLOCK_SIZE_2_ threads per block and K * sizeof(TopK<IndexType, T>) bytes of dynamic shared memory.
//
// The macro is not self-contained: it relies on the following names being visible where it is expanded:
// IndexType, T, getRowOffsets, log_probs, topk_tmp_id_buf, topk_tmp_val_buf, tops, outIndices, outVals,
// batch_size, beams_per_batch, k, vocab_size, descendingOrder and stream.
//
// The last line deliberately carries no line continuation so that the line following the macro can never
// be spliced into its body.
#define CASE_K(K,BLOCK_SIZE_1_, BLOCK_SIZE_2_, BLOCKS_PER_BEAM_) \
case K: \
topk_stage_1<IndexType, T, BLOCK_SIZE_1_, BLOCKS_PER_BEAM_, getRowOffsets><<<batch_size * beams_per_batch * BLOCKS_PER_BEAM_, BLOCK_SIZE_1_, 0, stream>>>( \
log_probs, \
topk_tmp_id_buf, \
topk_tmp_val_buf, \
k, vocab_size, descendingOrder); \
topk_stage_2<IndexType, T, BLOCK_SIZE_2_, BLOCKS_PER_BEAM_><<<batch_size, BLOCK_SIZE_2_, K * sizeof(TopK<IndexType, T>), stream>>>( \
topk_tmp_id_buf, \
topk_tmp_val_buf, \
tops, \
outIndices, \
outVals, \
beams_per_batch, \
k, descendingOrder); \
break;

// Launches the two-stage top-k selection on `stream` and writes the results as TopK structs.
//
// The getRowOffsets template parameter is added so the topk implementation works with both nth_element.cu
// and the topk operator. It is a template parameter since we know at compile time which version of topk we
// want to call. This flag can be removed whenever nth_element.cu is removed. When this flag is true, the
// indices returned are the offsets within the row. When the flag is false, the indices returned are offset
// from the base pointer.
//
// Parameters:
//   log_probs        - input values; stage 1 modifies them temporarily and restores them.
//   topk_tmp_id_buf  - scratch buffer for candidate indices produced by stage 1.
//   topk_tmp_val_buf - scratch buffer for candidate values produced by stage 1.
//   tops             - output, k (index, value) pairs per batch entry.
//   batch_size       - number of batch entries (one stage-2 block each).
//   beams_per_batch  - number of rows per batch entry.
//   k                - number of results per batch entry.
//   vocab_size       - number of elements per row.
//   descendingOrder  - true selects the largest values, false the smallest.
//   stream           - CUDA stream both kernels are enqueued on.
//
// The launches are asynchronous and this function does not query the CUDA error state; callers that need
// to detect launch failures have to do so themselves.
template <typename IndexType, typename T, bool getRowOffsets=false>
void topK_kernelLauncher(T* log_probs,
                         IndexType* topk_tmp_id_buf,
                         T* topk_tmp_val_buf,
                         TopK<IndexType, T>* tops,
                         const int batch_size,
                         const int beams_per_batch,
                         const int k,
                         const int vocab_size,
                         bool descendingOrder,
                         cudaStream_t stream) {

  // Nothing to select. Returning here avoids launching a kernel with a zero-sized grid, which is an
  // invalid launch configuration, and avoids stage 2 touching an empty scratch row.
  if(batch_size <= 0 || beams_per_batch <= 0 || k <= 0)
    return;

  // This overload only produces TopK structs; the separate index/value outputs stay disabled.
  IndexType* outIndices = nullptr;
  T* outVals = nullptr;

  // Tuned launch configurations for common values of k; any other k uses the generic configuration.
  switch(k) {
    CASE_K(1,128,128,MAX_BLOCKS_PER_BEAM);
    CASE_K(2,128,128,MAX_BLOCKS_PER_BEAM);
    CASE_K(4,128,128,MAX_BLOCKS_PER_BEAM);
    CASE_K(6,128,128,MAX_BLOCKS_PER_BEAM);
    CASE_K(8,128,128,MAX_BLOCKS_PER_BEAM);
    CASE_K(10,128,128,MAX_BLOCKS_PER_BEAM);
    CASE_K(16,128,128,5);
    CASE_K(32,256,128,1);
    CASE_K(64,256,256,1);
    default:
      topk_stage_1<IndexType, T, 128, 1, getRowOffsets><<<batch_size * beams_per_batch * 1, 128, 0, stream>>>(log_probs,
                                                                                                          topk_tmp_id_buf,
                                                                                                          topk_tmp_val_buf,
                                                                                                          k,
                                                                                                          vocab_size,
                                                                                                          descendingOrder);

      topk_stage_2<IndexType, T, 128, 1><<<batch_size, 128, k * sizeof(TopK<IndexType, T>), stream>>>(topk_tmp_id_buf,
                                                                                                   topk_tmp_val_buf,
                                                                                                   tops,
                                                                                                   outIndices,
                                                                                                   outVals,
                                                                                                   beams_per_batch,
                                                                                                   k,
                                                                                                   descendingOrder);
      break;
  }
}

// Launches the two-stage top-k selection on `stream` and writes the results into separate index and
// value arrays.
//
// Template parameter getRowOffsets: when true the returned indices are offsets within the row, when
// false they are offsets from the log_probs base pointer.
//
// Parameters:
//   log_probs        - input values; stage 1 modifies them temporarily and restores them.
//   topk_tmp_id_buf  - scratch buffer for candidate indices produced by stage 1.
//   topk_tmp_val_buf - scratch buffer for candidate values produced by stage 1.
//   outIndices       - output, k indices per batch entry; skipped by stage 2 when null.
//   outVals          - output, k values per batch entry; skipped by stage 2 when null.
//   batch_size       - number of batch entries (one stage-2 block each).
//   beams_per_batch  - number of rows per batch entry.
//   k                - number of results per batch entry.
//   vocab_size       - number of elements per row.
//   descendingOrder  - true selects the largest values, false the smallest.
//   stream           - CUDA stream both kernels are enqueued on.
//
// The launches are asynchronous and this function does not query the CUDA error state; callers that need
// to detect launch failures have to do so themselves.
template <typename IndexType, typename T, bool getRowOffsets=false>
void topK_kernelLauncher(T* log_probs,
                         IndexType* topk_tmp_id_buf,
                         T* topk_tmp_val_buf,
                         IndexType* outIndices,
                         T* outVals,
                         const int batch_size,
                         const int beams_per_batch,
                         const int k,
                         const int vocab_size,
                         bool descendingOrder,
                         cudaStream_t stream) {

  // Nothing to select. Returning here avoids launching a kernel with a zero-sized grid, which is an
  // invalid launch configuration, and avoids stage 2 touching an empty scratch row.
  if(batch_size <= 0 || beams_per_batch <= 0 || k <= 0)
    return;

  // This overload only produces separate index/value arrays; the TopK struct output stays disabled.
  TopK<IndexType, T>* tops = nullptr;

  // Tuned launch configurations for common values of k; any other k uses the generic configuration.
  switch(k) {
    CASE_K(1,128,128,MAX_BLOCKS_PER_BEAM);
    CASE_K(2,128,128,MAX_BLOCKS_PER_BEAM);
    CASE_K(4,128,128,MAX_BLOCKS_PER_BEAM);
    CASE_K(6,128,128,MAX_BLOCKS_PER_BEAM);
    CASE_K(8,128,128,MAX_BLOCKS_PER_BEAM);
    CASE_K(10,128,128,MAX_BLOCKS_PER_BEAM);
    CASE_K(16,128,128,5);
    CASE_K(32,256,128,1);
    CASE_K(64,256,256,1);
    default:
      topk_stage_1<IndexType, T, 128, 1, getRowOffsets><<<batch_size * beams_per_batch * 1, 128, 0, stream>>>(log_probs,
                                                                                                          topk_tmp_id_buf,
                                                                                                          topk_tmp_val_buf,
                                                                                                          k,
                                                                                                          vocab_size,
                                                                                                          descendingOrder);

      topk_stage_2<IndexType, T, 128, 1><<<batch_size, 128, k * sizeof(TopK<IndexType, T>), stream>>>(topk_tmp_id_buf,
                                                                                                   topk_tmp_val_buf,
                                                                                                   tops,
                                                                                                   outIndices,
                                                                                                   outVals,
                                                                                                   beams_per_batch,
                                                                                                   k,
                                                                                                   descendingOrder);
      break;
  }
}
6 changes: 6 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# Part of this file was contributed by NVIDIA under license:
# Copyright (C) 2020 NVIDIA Corporation
# SPDX-License-Identifier: MIT

add_definitions(-DCUB_IGNORE_DEPRECATED_CPP_DIALECT=1)
add_definitions(-DTHRUST_IGNORE_DEPRECATED_CPP_DIALECT=1)
add_subdirectory(3rd_party)

include_directories(.)
Expand Down
Loading