-
Notifications
You must be signed in to change notification settings - Fork 145
Convert non-type template parameters to runtime parameters in CAGRA search to cut binary size #1498
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
|
Auto-sync is disabled for draft pull requests in this repository. Workflows must be run manually. Contributors can view more details about this message here. |
| template <class K, class V, unsigned warp_size = 32> | ||
| struct warp_merge_core_n { | ||
| RAFT_DEVICE_INLINE_FUNCTION void operator()( | ||
| K* ks, V* vs, unsigned n, const std::uint32_t range, const bool asc) | ||
| { | ||
| const auto lane_id = threadIdx.x % warp_size; | ||
|
|
||
| if (range == 1) { | ||
| for (std::uint32_t b = 2; b <= N; b <<= 1) { | ||
| for (std::uint32_t b = 2; b <= n; b <<= 1) { | ||
| for (std::uint32_t c = b / 2; c >= 1; c >>= 1) { | ||
| #pragma unroll | ||
| for (std::uint32_t i = 0; i < N; i++) { | ||
| for (std::uint32_t i = 0; i < n; i++) { | ||
| std::uint32_t j = i ^ c; | ||
| if (i >= j) continue; | ||
| const auto line_id = i + (N * lane_id); | ||
| const auto line_id = i + (n * lane_id); | ||
| const auto p = static_cast<bool>(line_id & b) == static_cast<bool>(line_id & c); | ||
| swap_if_needed(k[i], v[i], k[j], v[j], p); | ||
| swap_if_needed(ks[i], vs[i], ks[j], vs[j], p); | ||
| } | ||
| } | ||
| } | ||
| return; | ||
| } | ||
|
|
||
| const std::uint32_t b = range; | ||
| for (std::uint32_t c = b / 2; c >= 1; c >>= 1) { | ||
| const auto p = static_cast<bool>(lane_id & b) == static_cast<bool>(lane_id & c); | ||
| #pragma unroll | ||
| for (std::uint32_t i = 0; i < N; i++) { | ||
| swap_if_needed(k[i], v[i], c, p); | ||
| } | ||
| } | ||
| const auto p = ((lane_id & b) == 0); | ||
| for (std::uint32_t c = N / 2; c >= 1; c >>= 1) { | ||
| } else { | ||
| const std::uint32_t b = range; | ||
| for (std::uint32_t c = b / 2; c >= 1; c >>= 1) { | ||
| const auto p = static_cast<bool>(lane_id & b) == static_cast<bool>(lane_id & c); | ||
| #pragma unroll | ||
| for (std::uint32_t i = 0; i < N; i++) { | ||
| std::uint32_t j = i ^ c; | ||
| if (i >= j) continue; | ||
| swap_if_needed(k[i], v[i], k[j], v[j], p); | ||
| } | ||
| } | ||
| } | ||
| }; | ||
|
|
||
| template <class K, class V, unsigned warp_size> | ||
| struct warp_merge_core<K, V, 6, warp_size> { | ||
| RAFT_DEVICE_INLINE_FUNCTION void operator()(K k[6], | ||
| V v[6], | ||
| const std::uint32_t range, | ||
| const bool asc) | ||
| { | ||
| constexpr unsigned N = 6; | ||
| const auto lane_id = threadIdx.x % warp_size; | ||
|
|
||
| if (range == 1) { | ||
| for (std::uint32_t i = 0; i < N; i += 3) { | ||
| const auto p = (i == 0); | ||
| swap_if_needed(k[0 + i], v[0 + i], k[1 + i], v[1 + i], p); | ||
| swap_if_needed(k[1 + i], v[1 + i], k[2 + i], v[2 + i], p); | ||
| swap_if_needed(k[0 + i], v[0 + i], k[1 + i], v[1 + i], p); | ||
| } | ||
| const auto p = ((lane_id & 1) == 0); | ||
| for (std::uint32_t i = 0; i < 3; i++) { | ||
| std::uint32_t j = i + 3; | ||
| swap_if_needed(k[i], v[i], k[j], v[j], p); | ||
| } | ||
| for (std::uint32_t i = 0; i < N; i += 3) { | ||
| swap_if_needed(k[0 + i], v[0 + i], k[1 + i], v[1 + i], p); | ||
| swap_if_needed(k[1 + i], v[1 + i], k[2 + i], v[2 + i], p); | ||
| swap_if_needed(k[0 + i], v[0 + i], k[1 + i], v[1 + i], p); | ||
| } | ||
| return; | ||
| } | ||
|
|
||
| const std::uint32_t b = range; | ||
| for (std::uint32_t c = b / 2; c >= 1; c >>= 1) { | ||
| const auto p = static_cast<bool>(lane_id & b) == static_cast<bool>(lane_id & c); | ||
| #pragma unroll | ||
| for (std::uint32_t i = 0; i < N; i++) { | ||
| swap_if_needed(k[i], v[i], c, p); | ||
| for (std::uint32_t i = 0; i < n; i++) { | ||
| swap_if_needed(ks[i], vs[i], c, p); | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This change effectively removes all of the loop unrolling, because n is not known at compile time (you can safely remove #pragma unroll as it does nothing now btw). In particular, this means the input arrays k and v cannot be passed and accessed via registers. This will likely have a huge impact on performance.
Please run a few benchmarks using ANN_BENCH first to see the impact on the throughput. From there, we can decide whether (a) performance is acceptable, (b) we need to profile the kernel using NCU and try to improve performance, or (c) the perf state is hopeless and cannot be recovered without manual loop unrolling / restoring the template parameter.
For the benchmarks, I'd suggest the following parameter sweep:
./build.sh -n libcuvs bench-ann --limit-bench-ann=CUVS_CAGRA_ANN_BENCH
./cpp/build/bench/ann/CUVS_CAGRA_ANN_BENCH \
--search \
--benchmark_min_time=10s \
--benchmark_min_warmup_time=0.001 \
--benchmark_counters_tabular=true \
--benchmark_out=cagra-search-`git rev-parse --abbrev-ref HEAD`.csv \
--benchmark_out_format=csv \
--data_prefix=<data folder> \
--index_prefix=<index folder> \
--override_kv=algo:\"single_cta\" \
--override_kv=k:10:100 \
--override_kv=itopk:32:64:128:256:512 \
--override_kv=max_iterations:20 \
--override_kv=n_queries:10000 \
<config file>
… on successive warp_merge calls when N is large, we now handle run-time branch in a higher level
…c sort key, value pairs
…ch the original code
…re of large N path negatively impacting the performance of small N cases
…own (16-17%) max_itopk=512 with batch size 10000 cases
This PR converts
MAX_ITOPK & MAX_CANDIDATES in single-CTA search and
MAX_ELEMENTS in multi-CTA search to runtime parameters.
Cut libcuvs.so size (CUDA 13,
build.sh --allgpuarch -n libcuvs) from 459 MB to 350 MB. For correctness testing,
ran
To evaluate performance impact,
ran cuvs_bench with batch sizes (10, 100, 1000, 10000) and k (10 and 100) (default options for other parameters) and
deep-image-96-inner for both single-CTA and multi-CTA searches.
Performance impact varies based on MAX_ITOPK and MAX_CANDIDATES combinations, but performance numbers were roughly comparable (slightly slower on average, with the maximum slowdown around 10%).
Let me know if there are other benchmarks I need to run to test performance.
Some performance logs from the original code and the updated code for anyone interested (batch size = 10K, K=100).
Original (main)
Updated