From 659dc6ab3645039d4c7cf6cb01d77db522a8c8be Mon Sep 17 00:00:00 2001 From: Alexey Kukanov Date: Tue, 21 Oct 2025 23:16:26 +0200 Subject: [PATCH 1/7] Unify simd_calc_mask* --- include/oneapi/dpl/pstl/algorithm_impl.h | 5 +++-- include/oneapi/dpl/pstl/unseq_backend_simd.h | 21 +++----------------- 2 files changed, 6 insertions(+), 20 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index b6a7e0acb9b..ba4c723c228 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -1272,7 +1272,7 @@ ::std::pair<_DifferenceType, _DifferenceType> __brick_calc_mask_1(_RandomAccessIterator __first, _RandomAccessIterator __last, bool* __mask, _UnaryPredicate __pred, /*vector=*/::std::true_type) noexcept { - auto __result = __unseq_backend::__simd_calc_mask_1(__first, __last - __first, __mask, __pred); + auto __result = __unseq_backend::__simd_compute_mask(__mask, __last - __first, __pred, __first); return ::std::make_pair(__result, (__last - __first) - __result); } @@ -1720,7 +1720,8 @@ _DifferenceType __brick_calc_mask_2(_RandomAccessIterator __first, _RandomAccessIterator __last, bool* __restrict __mask, _BinaryPredicate __pred, /*vector=*/::std::true_type) noexcept { - return __unseq_backend::__simd_calc_mask_2(__first, __last - __first, __mask, __pred); + return __unseq_backend::__simd_compute_mask(__mask, __last - __first, __not_pred<_BinaryPredicate&>(__pred), + __first, __first - 1); } template +template _DifferenceType -__simd_calc_mask_2(_InputIterator __first, _DifferenceType __n, bool* __mask, _BinaryPredicate __pred) noexcept +__simd_compute_mask(bool* __mask, _DifferenceType __n, _Predicate __pred, _Iterators... __it) noexcept { _DifferenceType __count = 0; _ONEDPL_PRAGMA_SIMD_REDUCTION(+ : __count) for (_DifferenceType __i = 0; __i < __n; ++__i) { - __mask[__i] = !__pred(__first[__i], __first[__i - 1]); - __count += __mask[__i]; - } - return __count; -} - -template -_DifferenceType -__simd_calc_mask_1(_InputIterator __first, _DifferenceType __n, bool* __mask, _UnaryPredicate __pred) noexcept -{ - _DifferenceType __count = 0; - - _ONEDPL_PRAGMA_SIMD_REDUCTION(+ : __count) - for (_DifferenceType __i = 0; __i < __n; ++__i) - { - __mask[__i] = __pred(__first[__i]); + __mask[__i] = __pred(__it[__i]... ); __count += __mask[__i]; } return __count; From aa8d51d26b11942f9a88937ca575dde45e8504f5 Mon Sep 17 00:00:00 2001 From: Alexey Kukanov Date: Wed, 22 Oct 2025 11:03:45 +0200 Subject: [PATCH 2/7] Unify brick_calc_mask* --- include/oneapi/dpl/pstl/algorithm_fwd.h | 29 +++---- include/oneapi/dpl/pstl/algorithm_impl.h | 90 +++++++------------- include/oneapi/dpl/pstl/unseq_backend_simd.h | 2 +- 3 files changed, 40 insertions(+), 81 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_fwd.h b/include/oneapi/dpl/pstl/algorithm_fwd.h index 773343f9259..6dd4aeb3be5 100644 --- a/include/oneapi/dpl/pstl/algorithm_fwd.h +++ b/include/oneapi/dpl/pstl/algorithm_fwd.h @@ -442,14 +442,15 @@ __brick_bounded_copy_if(_RandomAccessIterator1, typename std::iterator_traits<_R _RandomAccessIterator2, typename std::iterator_traits<_RandomAccessIterator2>::difference_type, _UnaryPredicate, /*vector=*/std::true_type) noexcept; -template -::std::pair<_DifferenceType, _DifferenceType> -__brick_calc_mask_1(_ForwardIterator, _ForwardIterator, bool* __restrict, _UnaryPredicate, - /*vector=*/::std::false_type) noexcept; -template -::std::pair<_DifferenceType, _DifferenceType> -__brick_calc_mask_1(_RandomAccessIterator, _RandomAccessIterator, bool* __restrict, _UnaryPredicate, - /*vector=*/::std::true_type) noexcept; +template +std::pair<_DifferenceType, _DifferenceType> +__brick_compute_mask(/*vector=*/std::false_type, bool* __mask, _DifferenceType __len, _Predicate __pred, + _RandomAccessIterator... __it) noexcept; + +template +std::pair<_DifferenceType, _DifferenceType> +__brick_compute_mask(/*vector=*/std::true_type, bool* __mask, _DifferenceType __len, _Predicate __pred, + _RandomAccessIterator... __it) noexcept; template void @@ -458,7 +459,7 @@ __brick_copy_by_mask(_ForwardIterator, _ForwardIterator, _OutputIterator, bool*, template void -__brick_copy_by_mask(_RandomAccessIterator, _RandomAccessIterator, _OutputIterator, bool* __restrict, +__brick_copy_by_mask(_RandomAccessIterator, _RandomAccessIterator, _OutputIterator, bool*, /*vector=*/::std::true_type) noexcept; template @@ -561,16 +562,6 @@ _OutputIterator __pattern_unique_copy(_Tag, _ExecutionPolicy&&, _ForwardIterator, _ForwardIterator, _OutputIterator, _BinaryPredicate) noexcept; -template -_DifferenceType -__brick_calc_mask_2(_RandomAccessIterator, _RandomAccessIterator, bool* __restrict, _BinaryPredicate, - /*vector=*/::std::false_type) noexcept; - -template -_DifferenceType -__brick_calc_mask_2(_RandomAccessIterator, _RandomAccessIterator, bool* __restrict, _BinaryPredicate, - /*vector=*/::std::true_type) noexcept; - template _RandomAccessIterator2 diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index ba4c723c228..6710a269159 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -1244,36 +1244,27 @@ __brick_bounded_copy_if(_RandomAccessIterator1 __first, return {__first, __result}; } -// TODO: Try to use transform_reduce for combining __brick_copy_if_phase1 on IsVector. -template -::std::pair<_DifferenceType, _DifferenceType> -__brick_calc_mask_1(_ForwardIterator __first, _ForwardIterator __last, bool* __restrict __mask, _UnaryPredicate __pred, - /*vector=*/::std::false_type) noexcept +template +std::pair<_DifferenceType, _DifferenceType> +__brick_compute_mask(/*vector=*/std::false_type, bool* __mask, _DifferenceType __len, _Predicate __pred, + _RandomAccessIterator... __it) noexcept { - auto __count_true = _DifferenceType(0); - auto __size = __last - __first; - - static_assert(__is_random_access_iterator_v<_ForwardIterator>, - "Pattern-brick error. Should be a random access iterator."); - - for (; __first != __last; ++__first, (void)++__mask) + _DifferenceType __count_true = 0; + for (_DifferenceType __i = 0; __i < __len; ++__i) { - *__mask = __pred(*__first); - if (*__mask) - { - ++__count_true; - } + __mask[__i] = __pred(__it[__i]...); + __count_true += __mask[__i]; } - return ::std::make_pair(__count_true, __size - __count_true); + return std::make_pair(__count_true, __len - __count_true); } -template -::std::pair<_DifferenceType, _DifferenceType> -__brick_calc_mask_1(_RandomAccessIterator __first, _RandomAccessIterator __last, bool* __mask, _UnaryPredicate __pred, - /*vector=*/::std::true_type) noexcept +template +std::pair<_DifferenceType, _DifferenceType> +__brick_compute_mask(/*vector=*/std::true_type, bool* __mask, _DifferenceType __len, _Predicate __pred, + _RandomAccessIterator... __it) noexcept { - auto __result = __unseq_backend::__simd_compute_mask(__mask, __last - __first, __pred, __first); - return ::std::make_pair(__result, (__last - __first) - __result); + auto __count_true = __unseq_backend::__simd_compute_mask(__mask, __len, __pred, __it...); + return std::make_pair(__count_true, __len - __count_true); } template @@ -1294,7 +1285,7 @@ __brick_copy_by_mask(_ForwardIterator __first, _ForwardIterator __last, _OutputI template void __brick_copy_by_mask(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __result, - bool* __restrict __mask, _Assigner __assigner, /*vector=*/::std::true_type) noexcept + bool* __mask, _Assigner __assigner, /*vector=*/::std::true_type) noexcept { #if (_PSTL_MONOTONIC_PRESENT || _ONEDPL_MONOTONIC_PRESENT) __unseq_backend::__simd_copy_by_mask(__first, __last - __first, __result, __mask, __assigner); @@ -1324,7 +1315,7 @@ __brick_bounded_copy_by_mask(_RandomAccessIterator1 __first, _Bound __in_len, _R template _Bound __brick_bounded_copy_by_mask(_RandomAccessIterator1 __first, _Bound __in_len, _RandomAccessIterator2 __result, - _Bound __out_len, bool* __restrict __mask, _Assigner __assigner, + _Bound __out_len, bool* __mask, _Assigner __assigner, /*vector=*/std::true_type) noexcept { #if (_PSTL_MONOTONIC_PRESENT || _ONEDPL_MONOTONIC_PRESENT) @@ -1413,8 +1404,7 @@ __pattern_copy_if(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomA __par_backend::__parallel_strict_scan( __backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __n, _DifferenceType(0), [=](_DifferenceType __i, _DifferenceType __len) { // Reduce - return __internal::__brick_calc_mask_1<_DifferenceType>(__first + __i, __first + (__i + __len), - __mask + __i, __pred, _IsVector{}) + return __internal::__brick_compute_mask(_IsVector{}, __mask + __i, __len, __pred, __first + __i) .first; }, ::std::plus<_DifferenceType>(), // Combine @@ -1445,13 +1435,15 @@ __pattern_bounded_copy_if(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, __par_backend::__buffer __mask_buf(__n); bool* __mask = __mask_buf.get(); - return __internal::__except_handler([&__exec, __n, __first, __result, __pred, __mask, __n_out]() { + auto __it_pred = + [=](_RandomAccessIterator1 __it, _DifferenceType __idx) { return std::invoke(__pred, __it[__idx]); }; + return __internal::__except_handler([&__exec, __n, __first, __result, __it_pred, __mask, __n_out]() { _DifferenceType __res_in{__n}, __res_out{__n_out}; __par_backend::__parallel_strict_scan( __backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __n, _DifferenceType(0), [=](_DifferenceType __i, _DifferenceType __len) { // Reduce - return __internal::__brick_calc_mask_1<_DifferenceType>( - __first + __i, __first + (__i + __len), __mask + __i, __pred, _IsVector{}).first; + return __internal::__brick_compute_mask(__first + __i, __len, __it_pred, __mask + __i, _IsVector{}) + .first; }, std::plus<_DifferenceType>(), // Combine [=, &__res_in](_DifferenceType __i, _DifferenceType __len, _DifferenceType __initial) { // Scan @@ -1701,29 +1693,6 @@ __pattern_unique_copy(_Tag, _ExecutionPolicy&&, _ForwardIterator __first, _Forwa return __internal::__brick_unique_copy(__first, __last, __result, __pred, typename _Tag::__is_vector{}); } -template -_DifferenceType -__brick_calc_mask_2(_RandomAccessIterator __first, _RandomAccessIterator __last, bool* __restrict __mask, - _BinaryPredicate __pred, /*vector=*/::std::false_type) noexcept -{ - _DifferenceType __count = 0; - for (; __first != __last; ++__first, (void)++__mask) - { - *__mask = !__pred(*__first, *(__first - 1)); - __count += *__mask; - } - return __count; -} - -template -_DifferenceType -__brick_calc_mask_2(_RandomAccessIterator __first, _RandomAccessIterator __last, bool* __restrict __mask, - _BinaryPredicate __pred, /*vector=*/::std::true_type) noexcept -{ - return __unseq_backend::__simd_compute_mask(__mask, __last - __first, __not_pred<_BinaryPredicate&>(__pred), - __first, __first - 1); -} - template _RandomAccessIterator2 @@ -1739,12 +1708,12 @@ __pattern_unique_copy(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Ran __par_backend::__buffer __mask_buf(__n); if (_DifferenceType(2) < __n) { - return __internal::__except_handler([&__exec, __n, __first, __result, __pred, &__mask_buf]() { + return __internal::__except_handler([&__exec, __n, __first, __result, &__pred, &__mask_buf]() { bool* __mask = __mask_buf.get(); _DifferenceType __m{}; __par_backend::__parallel_strict_scan( __backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __n, _DifferenceType(0), - [=](_DifferenceType __i, _DifferenceType __len) -> _DifferenceType { // Reduce + [=, &__pred](_DifferenceType __i, _DifferenceType __len) -> _DifferenceType { // Reduce _DifferenceType __extra = 0; if (__i == 0) { @@ -1755,9 +1724,9 @@ __pattern_unique_copy(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Ran ++__i; ++__extra; } - return __internal::__brick_calc_mask_2<_DifferenceType>(__first + __i, __first + (__i + __len), - __mask + __i, __pred, _IsVector{}) + - __extra; + __not_pred<_BinaryPredicate&> __pred_negated(__pred); + return __internal::__brick_compute_mask(_IsVector{}, __mask + __i, __len, __pred_negated, + __first + __i, __first + (__i - 1)).first + __extra; }, ::std::plus<_DifferenceType>(), // Combine [=](_DifferenceType __i, _DifferenceType __len, _DifferenceType __initial) { // Scan @@ -2505,8 +2474,7 @@ __pattern_partition_copy(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _ __backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __n, std::make_pair(_DifferenceType(0), _DifferenceType(0)), [=](_DifferenceType __i, _DifferenceType __len) { // Reduce - return __internal::__brick_calc_mask_1<_DifferenceType>(__first + __i, __first + (__i + __len), - __mask + __i, __pred, _IsVector{}); + return __internal::__brick_compute_mask(_IsVector{}, __mask + __i, __len, __pred, __first + __i); }, [](const _ReturnType& __x, const _ReturnType& __y) -> _ReturnType { return ::std::make_pair(__x.first + __y.first, __x.second + __y.second); diff --git a/include/oneapi/dpl/pstl/unseq_backend_simd.h b/include/oneapi/dpl/pstl/unseq_backend_simd.h index 3ebd1fd425b..8c34dcd86b7 100644 --- a/include/oneapi/dpl/pstl/unseq_backend_simd.h +++ b/include/oneapi/dpl/pstl/unseq_backend_simd.h @@ -291,7 +291,7 @@ __simd_compute_mask(bool* __mask, _DifferenceType __n, _Predicate __pred, _Itera _ONEDPL_PRAGMA_SIMD_REDUCTION(+ : __count) for (_DifferenceType __i = 0; __i < __n; ++__i) { - __mask[__i] = __pred(__it[__i]... ); + __mask[__i] = __pred(__it[__i]...); __count += __mask[__i]; } return __count; From 7877346662406c3639a76d2b8f15d160d1130520 Mon Sep 17 00:00:00 2001 From: Alexey Kukanov Date: Tue, 21 Oct 2025 23:41:39 +0200 Subject: [PATCH 3/7] Improve handling of the first element by unique_copy --- include/oneapi/dpl/pstl/algorithm_impl.h | 57 ++++++++++-------------- 1 file changed, 23 insertions(+), 34 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index 6710a269159..738597426db 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -1702,43 +1702,32 @@ __pattern_unique_copy(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Ran using __backend_tag = typename __parallel_tag<_IsVector>::__backend_tag; using _DifferenceType = typename std::iterator_traits<_RandomAccessIterator1>::difference_type; - const _DifferenceType __n = __last - __first; + _DifferenceType __n = __last - __first; if (_DifferenceType(2) < __n) { + *__result++ = *__first++; // Always copy the first element + --__n; __par_backend::__buffer __mask_buf(__n); - if (_DifferenceType(2) < __n) - { - return __internal::__except_handler([&__exec, __n, __first, __result, &__pred, &__mask_buf]() { - bool* __mask = __mask_buf.get(); - _DifferenceType __m{}; - __par_backend::__parallel_strict_scan( - __backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __n, _DifferenceType(0), - [=, &__pred](_DifferenceType __i, _DifferenceType __len) -> _DifferenceType { // Reduce - _DifferenceType __extra = 0; - if (__i == 0) - { - // Special boundary case - __mask[__i] = true; - if (--__len == 0) - return 1; - ++__i; - ++__extra; - } - __not_pred<_BinaryPredicate&> __pred_negated(__pred); - return __internal::__brick_compute_mask(_IsVector{}, __mask + __i, __len, __pred_negated, - __first + __i, __first + (__i - 1)).first + __extra; - }, - ::std::plus<_DifferenceType>(), // Combine - [=](_DifferenceType __i, _DifferenceType __len, _DifferenceType __initial) { // Scan - // Phase 2 is same as for __pattern_copy_if - __internal::__brick_copy_by_mask( - __first + __i, __first + (__i + __len), __result + __initial, __mask + __i, - [](_RandomAccessIterator1 __x, _RandomAccessIterator2 __z) { *__z = *__x; }, _IsVector{}); - }, - [&__m](_DifferenceType __total) { __m = __total; }); - return __result + __m; - }); - } + return __internal::__except_handler([&__exec, __n, __first, __result, &__pred, &__mask_buf]() { + bool* __mask = __mask_buf.get(); + _DifferenceType __m{}; + __par_backend::__parallel_strict_scan( + __backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __n, _DifferenceType(0), + [=, &__pred](_DifferenceType __i, _DifferenceType __len) -> _DifferenceType { // Reduce + __not_pred<_BinaryPredicate&> __pred_negated(__pred); + return __internal::__brick_compute_mask(_IsVector{}, __mask + __i, __len, __pred_negated, + __first + __i, __first + (__i - 1)).first; + }, + std::plus<_DifferenceType>(), // Combine + [=](_DifferenceType __i, _DifferenceType __len, _DifferenceType __initial) { // Scan + // Phase 2 is same as for __pattern_copy_if + __internal::__brick_copy_by_mask( + __first + __i, __first + (__i + __len), __result + __initial, __mask + __i, + [](_RandomAccessIterator1 __x, _RandomAccessIterator2 __z) { *__z = *__x; }, _IsVector{}); + }, + [&__m](_DifferenceType __total) { __m = __total; }); // Apex + return __result + __m; + }); } // trivial sequence - use serial algorithm return __internal::__brick_unique_copy(__first, __last, __result, __pred, _IsVector{}); From eb67f7770c2d2da65c5b59617c2e79316fd42924 Mon Sep 17 00:00:00 2001 From: Alexey Kukanov Date: Fri, 24 Oct 2025 14:19:25 +0200 Subject: [PATCH 4/7] Rework *_compute_mask without variadics --- include/oneapi/dpl/pstl/algorithm_fwd.h | 12 +++--- include/oneapi/dpl/pstl/algorithm_impl.h | 40 +++++++++++--------- include/oneapi/dpl/pstl/unseq_backend_simd.h | 6 +-- 3 files changed, 31 insertions(+), 27 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_fwd.h b/include/oneapi/dpl/pstl/algorithm_fwd.h index 6dd4aeb3be5..2f1abd2ff5f 100644 --- a/include/oneapi/dpl/pstl/algorithm_fwd.h +++ b/include/oneapi/dpl/pstl/algorithm_fwd.h @@ -442,15 +442,15 @@ __brick_bounded_copy_if(_RandomAccessIterator1, typename std::iterator_traits<_R _RandomAccessIterator2, typename std::iterator_traits<_RandomAccessIterator2>::difference_type, _UnaryPredicate, /*vector=*/std::true_type) noexcept; -template +template std::pair<_DifferenceType, _DifferenceType> -__brick_compute_mask(/*vector=*/std::false_type, bool* __mask, _DifferenceType __len, _Predicate __pred, - _RandomAccessIterator... __it) noexcept; +__brick_compute_mask(_RandomAccessIterator, _DifferenceType, _IterPredicate, bool*, + /*vector=*/std::false_type) noexcept; -template +template std::pair<_DifferenceType, _DifferenceType> -__brick_compute_mask(/*vector=*/std::true_type, bool* __mask, _DifferenceType __len, _Predicate __pred, - _RandomAccessIterator... __it) noexcept; +__brick_compute_mask(_RandomAccessIterator, _DifferenceType, _IterPredicate, bool*, + /*vector=*/std::true_type) noexcept; template void diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index 738597426db..f714c356f0e 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -1244,26 +1244,26 @@ __brick_bounded_copy_if(_RandomAccessIterator1 __first, return {__first, __result}; } -template +template std::pair<_DifferenceType, _DifferenceType> -__brick_compute_mask(/*vector=*/std::false_type, bool* __mask, _DifferenceType __len, _Predicate __pred, - _RandomAccessIterator... __it) noexcept +__brick_compute_mask(_RandomAccessIterator __first, _DifferenceType __len, _IterPredicate __pred, bool* __mask, + /*vector=*/std::false_type) noexcept { _DifferenceType __count_true = 0; for (_DifferenceType __i = 0; __i < __len; ++__i) { - __mask[__i] = __pred(__it[__i]...); + __mask[__i] = __pred(__first, __i); __count_true += __mask[__i]; } return std::make_pair(__count_true, __len - __count_true); } -template +template std::pair<_DifferenceType, _DifferenceType> -__brick_compute_mask(/*vector=*/std::true_type, bool* __mask, _DifferenceType __len, _Predicate __pred, - _RandomAccessIterator... __it) noexcept +__brick_compute_mask(_RandomAccessIterator __first, _DifferenceType __len, _IterPredicate __pred, bool* __mask, + /*vector=*/std::true_type) noexcept { - auto __count_true = __unseq_backend::__simd_compute_mask(__mask, __len, __pred, __it...); + auto __count_true = __unseq_backend::__simd_compute_mask(__first, __len, __pred, __mask); return std::make_pair(__count_true, __len - __count_true); } @@ -1398,14 +1398,15 @@ __pattern_copy_if(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomA if (_DifferenceType(1) < __n) { __par_backend::__buffer __mask_buf(__n); - return __internal::__except_handler([&__exec, __n, __first, __result, __pred, &__mask_buf]() { + return __internal::__except_handler([&__exec, __n, __first, __result, &__pred, &__mask_buf]() { bool* __mask = __mask_buf.get(); _DifferenceType __m{}; __par_backend::__parallel_strict_scan( __backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __n, _DifferenceType(0), - [=](_DifferenceType __i, _DifferenceType __len) { // Reduce - return __internal::__brick_compute_mask(_IsVector{}, __mask + __i, __len, __pred, __first + __i) - .first; + [=, &__pred](_DifferenceType __i, _DifferenceType __len) { // Reduce + return __internal::__brick_compute_mask(__first + __i, __len, + [&__pred](_RandomAccessIterator1 __it, _DifferenceType __idx){ return __pred(__it[__idx]);}, + __mask + __i, _IsVector{}).first; }, ::std::plus<_DifferenceType>(), // Combine [=](_DifferenceType __i, _DifferenceType __len, _DifferenceType __initial) { // Scan @@ -1714,9 +1715,10 @@ __pattern_unique_copy(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _Ran __par_backend::__parallel_strict_scan( __backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __n, _DifferenceType(0), [=, &__pred](_DifferenceType __i, _DifferenceType __len) -> _DifferenceType { // Reduce - __not_pred<_BinaryPredicate&> __pred_negated(__pred); - return __internal::__brick_compute_mask(_IsVector{}, __mask + __i, __len, __pred_negated, - __first + __i, __first + (__i - 1)).first; + return __internal::__brick_compute_mask(__first + __i, __len, + [&__pred](_RandomAccessIterator1 __it, _DifferenceType __idx){ + return !__pred(__it[__idx], __it[__idx - 1]); + }, __mask + __i, _IsVector{}).first; }, std::plus<_DifferenceType>(), // Combine [=](_DifferenceType __i, _DifferenceType __len, _DifferenceType __initial) { // Scan @@ -2456,14 +2458,16 @@ __pattern_partition_copy(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _ if (_DifferenceType(1) < __n) { __par_backend::__buffer __mask_buf(__n); - return __internal::__except_handler([&__exec, __n, __first, __out_true, __out_false, __pred, &__mask_buf]() { + return __internal::__except_handler([&__exec, __n, __first, __out_true, __out_false, &__pred, &__mask_buf]() { bool* __mask = __mask_buf.get(); _ReturnType __m{}; __par_backend::__parallel_strict_scan( __backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __n, std::make_pair(_DifferenceType(0), _DifferenceType(0)), - [=](_DifferenceType __i, _DifferenceType __len) { // Reduce - return __internal::__brick_compute_mask(_IsVector{}, __mask + __i, __len, __pred, __first + __i); + [=, &__pred](_DifferenceType __i, _DifferenceType __len) { // Reduce + return __internal::__brick_compute_mask(__first + __i, __len, + [&__pred](_RandomAccessIterator1 __it, _DifferenceType __idx){ return __pred(__it[__idx]);}, + __mask + __i, _IsVector{}); }, [](const _ReturnType& __x, const _ReturnType& __y) -> _ReturnType { return ::std::make_pair(__x.first + __y.first, __x.second + __y.second); diff --git a/include/oneapi/dpl/pstl/unseq_backend_simd.h b/include/oneapi/dpl/pstl/unseq_backend_simd.h index 8c34dcd86b7..d9861f5d74c 100644 --- a/include/oneapi/dpl/pstl/unseq_backend_simd.h +++ b/include/oneapi/dpl/pstl/unseq_backend_simd.h @@ -282,16 +282,16 @@ __simd_copy_if(_InputIterator __first, _DifferenceType __n, _OutputIterator __re return __cnt; } -template +template _DifferenceType -__simd_compute_mask(bool* __mask, _DifferenceType __n, _Predicate __pred, _Iterators... __it) noexcept +__simd_compute_mask(_Iterator __first, _DifferenceType __n, _IterPredicate __pred, bool* __mask) noexcept { _DifferenceType __count = 0; _ONEDPL_PRAGMA_SIMD_REDUCTION(+ : __count) for (_DifferenceType __i = 0; __i < __n; ++__i) { - __mask[__i] = __pred(__it[__i]...); + __mask[__i] = __pred(__first, __i); __count += __mask[__i]; } return __count; From c042e9dc6d34f93bd440ac41428bc58a122662bd Mon Sep 17 00:00:00 2001 From: Alexey Kukanov Date: Fri, 24 Oct 2025 15:24:51 +0200 Subject: [PATCH 5/7] Unify the common part of copy_if and unique_copy patterns --- include/oneapi/dpl/pstl/algorithm_fwd.h | 10 ++- include/oneapi/dpl/pstl/algorithm_impl.h | 95 +++++++++++------------- 2 files changed, 51 insertions(+), 54 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_fwd.h b/include/oneapi/dpl/pstl/algorithm_fwd.h index 2f1abd2ff5f..43692b7b2b9 100644 --- a/include/oneapi/dpl/pstl/algorithm_fwd.h +++ b/include/oneapi/dpl/pstl/algorithm_fwd.h @@ -444,12 +444,12 @@ __brick_bounded_copy_if(_RandomAccessIterator1, typename std::iterator_traits<_R template std::pair<_DifferenceType, _DifferenceType> -__brick_compute_mask(_RandomAccessIterator, _DifferenceType, _IterPredicate, bool*, +__brick_compute_mask(_RandomAccessIterator, _DifferenceType, _IterPredicate, bool*, /*vector=*/std::false_type) noexcept; template std::pair<_DifferenceType, _DifferenceType> -__brick_compute_mask(_RandomAccessIterator, _DifferenceType, _IterPredicate, bool*, +__brick_compute_mask(_RandomAccessIterator, _DifferenceType, _IterPredicate, bool*, /*vector=*/std::true_type) noexcept; template @@ -482,6 +482,12 @@ void __brick_partition_by_mask(_RandomAccessIterator, _RandomAccessIterator, _OutputIterator1, _OutputIterator2, bool*, /*vector=*/::std::true_type) noexcept; +template +_RandomAccessIterator2 +__parallel_selective_copy(__parallel_tag<_IsVector>, _ExecutionPolicy&&, _RandomAccessIterator1, _DifferenceType, + _RandomAccessIterator2, _IterPredicate); + template _OutputIterator __pattern_copy_if(_Tag, _ExecutionPolicy&&, _ForwardIterator, _ForwardIterator, _OutputIterator, diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index f714c356f0e..b25898c1e46 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -1375,6 +1375,34 @@ __brick_partition_by_mask(_RandomAccessIterator1 __first, _RandomAccessIterator1 #endif } +template +_RandomAccessIterator2 +__parallel_selective_copy(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first, + _DifferenceType __n, _RandomAccessIterator2 __result, _IterPredicate __pred) +{ + using __backend_tag = typename __parallel_tag<_IsVector>::__backend_tag; + __par_backend::__buffer __mask_buf(__n); + bool* __mask = __mask_buf.get(); + + return __internal::__except_handler([&__exec, __n, __first, __result, __pred, __mask]() { + _DifferenceType __m{}; + __par_backend::__parallel_strict_scan( + __backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __n, _DifferenceType(0), + [=](_DifferenceType __i, _DifferenceType __len) { // Reduce + return __internal::__brick_compute_mask(__first + __i, __len, __pred, __mask + __i, _IsVector{}).first; + }, + std::plus<_DifferenceType>(), // Combine + [=](_DifferenceType __i, _DifferenceType __len, _DifferenceType __initial) { // Scan + __internal::__brick_copy_by_mask( + __first + __i, __first + (__i + __len), __result + __initial, __mask + __i, + [](_RandomAccessIterator1 __x, _RandomAccessIterator2 __z) { *__z = *__x; }, _IsVector{}); + }, + [&__m](_DifferenceType __total) { __m = __total; }); // Apex + return __result + __m; + }); +} + template _OutputIterator __pattern_copy_if(_Tag, _ExecutionPolicy&&, _ForwardIterator __first, _ForwardIterator __last, _OutputIterator __result, @@ -1388,35 +1416,16 @@ __pattern_copy_if(_Tag, _ExecutionPolicy&&, _ForwardIterator __first, _ForwardIt template _RandomAccessIterator2 -__pattern_copy_if(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first, +__pattern_copy_if(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __result, _UnaryPredicate __pred) { - using __backend_tag = typename __parallel_tag<_IsVector>::__backend_tag; - using _DifferenceType = typename std::iterator_traits<_RandomAccessIterator1>::difference_type; const _DifferenceType __n = __last - __first; if (_DifferenceType(1) < __n) { - __par_backend::__buffer __mask_buf(__n); - return __internal::__except_handler([&__exec, __n, __first, __result, &__pred, &__mask_buf]() { - bool* __mask = __mask_buf.get(); - _DifferenceType __m{}; - __par_backend::__parallel_strict_scan( - __backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __n, _DifferenceType(0), - [=, &__pred](_DifferenceType __i, _DifferenceType __len) { // Reduce - return __internal::__brick_compute_mask(__first + __i, __len, - [&__pred](_RandomAccessIterator1 __it, _DifferenceType __idx){ return __pred(__it[__idx]);}, - __mask + __i, _IsVector{}).first; - }, - ::std::plus<_DifferenceType>(), // Combine - [=](_DifferenceType __i, _DifferenceType __len, _DifferenceType __initial) { // Scan - __internal::__brick_copy_by_mask( - __first + __i, __first + (__i + __len), __result + __initial, __mask + __i, - [](_RandomAccessIterator1 __x, _RandomAccessIterator2 __z) { *__z = *__x; }, _IsVector{}); - }, - [&__m](_DifferenceType __total) { __m = __total; }); - return __result + __m; - }); + return __parallel_selective_copy( + __tag, std::forward<_ExecutionPolicy>(__exec), __first, __n, __result, + [&__pred](_RandomAccessIterator1 __it, _DifferenceType __idx) { return __pred(__it[__idx]); }); } // trivial sequence - use serial algorithm return __internal::__brick_copy_if(__first, __last, __result, __pred, _IsVector{}); @@ -1436,8 +1445,9 @@ __pattern_bounded_copy_if(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, __par_backend::__buffer __mask_buf(__n); bool* __mask = __mask_buf.get(); - auto __it_pred = - [=](_RandomAccessIterator1 __it, _DifferenceType __idx) { return std::invoke(__pred, __it[__idx]); }; + auto __it_pred = [=](_RandomAccessIterator1 __it, _DifferenceType __idx) { + return std::invoke(__pred, __it[__idx]); + }; return __internal::__except_handler([&__exec, __n, __first, __result, __it_pred, __mask, __n_out]() { _DifferenceType __res_in{__n}, __res_out{__n_out}; __par_backend::__parallel_strict_scan( @@ -1697,39 +1707,19 @@ __pattern_unique_copy(_Tag, _ExecutionPolicy&&, _ForwardIterator __first, _Forwa template _RandomAccessIterator2 -__pattern_unique_copy(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first, +__pattern_unique_copy(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __result, _BinaryPredicate __pred) { - using __backend_tag = typename __parallel_tag<_IsVector>::__backend_tag; - using _DifferenceType = typename std::iterator_traits<_RandomAccessIterator1>::difference_type; _DifferenceType __n = __last - __first; if (_DifferenceType(2) < __n) { *__result++ = *__first++; // Always copy the first element --__n; - __par_backend::__buffer __mask_buf(__n); - return __internal::__except_handler([&__exec, __n, __first, __result, &__pred, &__mask_buf]() { - bool* __mask = __mask_buf.get(); - _DifferenceType __m{}; - __par_backend::__parallel_strict_scan( - __backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __n, _DifferenceType(0), - [=, &__pred](_DifferenceType __i, _DifferenceType __len) -> _DifferenceType { // Reduce - return __internal::__brick_compute_mask(__first + __i, __len, - [&__pred](_RandomAccessIterator1 __it, _DifferenceType __idx){ - return !__pred(__it[__idx], __it[__idx - 1]); - }, __mask + __i, _IsVector{}).first; - }, - std::plus<_DifferenceType>(), // Combine - [=](_DifferenceType __i, _DifferenceType __len, _DifferenceType __initial) { // Scan - // Phase 2 is same as for __pattern_copy_if - __internal::__brick_copy_by_mask( - __first + __i, __first + (__i + __len), __result + __initial, __mask + __i, - [](_RandomAccessIterator1 __x, _RandomAccessIterator2 __z) { *__z = *__x; }, _IsVector{}); - }, - [&__m](_DifferenceType __total) { __m = __total; }); // Apex - return __result + __m; - }); + return __parallel_selective_copy(__tag, std::forward<_ExecutionPolicy>(__exec), __first, __n, __result, + [&__pred](_RandomAccessIterator1 __it, _DifferenceType __idx) { + return !__pred(__it[__idx], __it[__idx - 1]); + }); } // trivial sequence - use serial algorithm return __internal::__brick_unique_copy(__first, __last, __result, __pred, _IsVector{}); @@ -2465,8 +2455,9 @@ __pattern_partition_copy(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _ __backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __n, std::make_pair(_DifferenceType(0), _DifferenceType(0)), [=, &__pred](_DifferenceType __i, _DifferenceType __len) { // Reduce - return __internal::__brick_compute_mask(__first + __i, __len, - [&__pred](_RandomAccessIterator1 __it, _DifferenceType __idx){ return __pred(__it[__idx]);}, + return __internal::__brick_compute_mask( + __first + __i, __len, + [&__pred](_RandomAccessIterator1 __it, _DifferenceType __idx) { return __pred(__it[__idx]); }, __mask + __i, _IsVector{}); }, [](const _ReturnType& __x, const _ReturnType& __y) -> _ReturnType { From 028945e7a4c2f9c3df495bad0cf7aafc23d04995 Mon Sep 17 00:00:00 2001 From: Alexey Kukanov Date: Wed, 3 Dec 2025 18:31:01 +0100 Subject: [PATCH 6/7] Improve test diagnostic formatting --- test/parallel_api/ranges/std_ranges_test.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/parallel_api/ranges/std_ranges_test.h b/test/parallel_api/ranges/std_ranges_test.h index 62747313758..0a343506d3b 100644 --- a/test/parallel_api/ranges/std_ranges_test.h +++ b/test/parallel_api/ranges/std_ranges_test.h @@ -368,7 +368,7 @@ struct test void process_data_in(int max_n, Policy&& exec, Algo algo, Checker& checker, TransIn tr_in, auto... args) { - std::string sizes{"for "}; + std::string sizes{" for "}; sizes += std::to_string(max_n) + " elements"; Container cont_in(exec, max_n, DataGen1{}); @@ -407,7 +407,7 @@ struct test TransOut tr_out, auto... args) { static_assert(mode == data_in_out || mode == data_in_out_lim); - std::string sizes{"for "}; + std::string sizes{" for "}; sizes += std::to_string(n_in) + " elements and " + std::to_string(n_out) + " space"; Container cont_in(exec, n_in, DataGen1{}); @@ -506,7 +506,7 @@ struct test process_data_in_in(int max_n, int n_in1, int n_in2, Policy&& exec, Algo algo, Checker& checker, TransIn tr_in, auto... args) { - std::string sizes{"for "}; + std::string sizes{" for "}; sizes += std::to_string(n_in1) + " and " + std::to_string(n_in2) + " elements"; assert(n_in1 <= max_n); @@ -551,7 +551,7 @@ struct test TransIn tr_in, TransOut tr_out, auto... args) { static_assert(mode == data_in_in_out || mode == data_in_in_out_lim); - std::string sizes{"for "}; + std::string sizes{" for "}; sizes += std::to_string(n_in1) + " and " + std::to_string(n_in2) + " elements and " + std::to_string(n_out) + " space"; Container cont_in1(exec, n_in1, DataGen1{}); From b1751dee58074598d6dee59e8dbe4c59464cf7e6 Mon Sep 17 00:00:00 2001 From: Alexey Kukanov Date: Thu, 4 Dec 2025 23:23:20 +0100 Subject: [PATCH 7/7] Change simd_copy_by_mask to use scan instead of ordered monotonic --- include/oneapi/dpl/pstl/algorithm_impl.h | 4 ---- include/oneapi/dpl/pstl/unseq_backend_simd.h | 14 ++++++-------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index b25898c1e46..c6ea546eb1a 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -1287,11 +1287,7 @@ void __brick_copy_by_mask(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __result, bool* __mask, _Assigner __assigner, /*vector=*/::std::true_type) noexcept { -#if (_PSTL_MONOTONIC_PRESENT || _ONEDPL_MONOTONIC_PRESENT) __unseq_backend::__simd_copy_by_mask(__first, __last - __first, __result, __mask, __assigner); -#else - __internal::__brick_copy_by_mask(__first, __last, __result, __mask, __assigner, ::std::false_type()); -#endif } template diff --git a/include/oneapi/dpl/pstl/unseq_backend_simd.h b/include/oneapi/dpl/pstl/unseq_backend_simd.h index d9861f5d74c..40edb99a84c 100644 --- a/include/oneapi/dpl/pstl/unseq_backend_simd.h +++ b/include/oneapi/dpl/pstl/unseq_backend_simd.h @@ -302,20 +302,18 @@ _DifferenceType __simd_copy_by_mask(_InputIterator __first, _DifferenceType __n, _OutputIterator __result, bool* __mask, _Assigner __assigner) noexcept { - _DifferenceType __cnt = 0; - _ONEDPL_PRAGMA_SIMD + std::make_signed_t<_DifferenceType> __cnt = -1; // to use inclusive scan of the mask + _ONEDPL_PRAGMA_SIMD_SCAN(+ : __cnt) for (_DifferenceType __i = 0; __i < __n; ++__i) { + __cnt += __mask[__i]; + _ONEDPL_PRAGMA_SIMD_INCLUSIVE_SCAN(__cnt) if (__mask[__i]) { - _ONEDPL_PRAGMA_SIMD_ORDERED_MONOTONIC(__cnt : 1) - { - __assigner(__first + __i, __result + __cnt); - ++__cnt; - } + __assigner(__first + __i, __result + __cnt); } } - return __cnt; + return __cnt + 1; // accounts for the initial -1 } template