diff --git a/docs/libcudacxx/extended_api/math.rst b/docs/libcudacxx/extended_api/math.rst index ab491edc63c..83d0d781032 100644 --- a/docs/libcudacxx/extended_api/math.rst +++ b/docs/libcudacxx/extended_api/math.rst @@ -97,3 +97,8 @@ Math - Most significant half of the product - CCCL 3.2.0 - CUDA 13.2 + + * - :ref:`sincos ` + - Computes sine and cosine of a value at the same time. + - CCCL 3.3.0 + - CUDA 13.3 diff --git a/docs/libcudacxx/extended_api/math/sincos.rst b/docs/libcudacxx/extended_api/math/sincos.rst new file mode 100644 index 00000000000..40d87ce9333 --- /dev/null +++ b/docs/libcudacxx/extended_api/math/sincos.rst @@ -0,0 +1,68 @@ +.. _libcudacxx-extended-api-math-sincos: + +``cuda::sincos`` +==================================== + +Defined in the ```` header. + +.. code:: cuda + + namespace cuda { + + template + struct sincos_result + { + T sin; + T cos; + }; + + template + [[nodiscard]] __host__ __device__ + sincos_result sincos(T value) noexcept; // (1) + + template + [[nodiscard]] __host__ __device__ + sincos_result sincos(Integral value) noexcept; // (2) + + } // namespace cuda + +Computes :math:`\sin value` and :math:`\cos value` at the same time using more efficient algorithms than if operations were computed separately. + +**Parameters** + +- ``value``: The input value. + +**Return value** + +- ``cuda::sincos_result`` object with both values set to ``NaN`` if the input value is :math:`\pm\infty` or ``NaN`` and to results of :math:`\sin value` and :math:`\cos value` otherwise. (1) +- if ``T`` is an integral type, the input value is treated as ``double``. (2) + +**Constraints** + +- ``T`` is an arithmetic type. + +**Performance considerations** + +- If available, the functionality is implemented by compiler builtins, otherwise fallbacks to ``cuda::std::sin(value)`` and ``cuda::std::cos(value)``. + +Example +------- + +.. code:: cuda + + #include + #include + + __global__ void sincos_kernel() { + auto [sin_pi, cos_pi] = cuda::sincos(0.f); + assert(sin_pi == 0.f); + assert(cos_pi == 1.f); + } + + int main() { + sincos_kernel<<<1, 1>>>(); + cudaDeviceSynchronize(); + return 0; + } + +`See it on Godbolt 🔗 `__ diff --git a/libcudacxx/include/cuda/__cmath/sincos.h b/libcudacxx/include/cuda/__cmath/sincos.h new file mode 100644 index 00000000000..a74c11082a9 --- /dev/null +++ b/libcudacxx/include/cuda/__cmath/sincos.h @@ -0,0 +1,134 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___CMATH_SINCOS_H +#define _CUDA___CMATH_SINCOS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include + +#include + +#if _CCCL_HAS_BUILTIN(__builtin_sincosf) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_SINCOSF(...) __builtin_sincosf(__VA_ARGS__) +#endif // _CCCL_HAS_BUILTIN(__builtin_sincosf) || _CCCL_COMPILER(GCC) + +#if _CCCL_HAS_BUILTIN(__builtin_sincos) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_SINCOS(...) __builtin_sincos(__VA_ARGS__) +#endif // _CCCL_HAS_BUILTIN(__builtin_sincos) || _CCCL_COMPILER(GCC) + +#if _CCCL_HAS_BUILTIN(__builtin_sincosl) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_SINCOSL(...) __builtin_sincosl(__VA_ARGS__) +#endif // _CCCL_HAS_BUILTIN(__builtin_sincosl) || _CCCL_COMPILER(GCC) + +// clang-cuda crashes if these builtins are used. +#if _CCCL_CUDA_COMPILER(CLANG) +# undef _CCCL_BUILTIN_SINCOSF +# undef _CCCL_BUILTIN_SINCOS +# undef _CCCL_BUILTIN_SINCOSL +#endif // _CCCL_CUDA_COMPILER(CLANG) + +_CCCL_BEGIN_NAMESPACE_CUDA + +//! @brief Type returned by \c cuda::sincos. +template +struct _CCCL_TYPE_VISIBILITY_DEFAULT sincos_result +{ + _Tp sin; //!< The sin result. + _Tp cos; //!< The cos result. +}; + +//! @brief Computes sin and cos operation of a value. +//! +//! @param __v The value. +//! +//! @return The \c cuda::sincos_result with the results of sin and cos operations. +_CCCL_TEMPLATE(class _Tp) +_CCCL_REQUIRES(::cuda::std::__is_extended_arithmetic_v<_Tp>) +[[nodiscard]] _CCCL_API auto sincos(_Tp __v) noexcept + -> sincos_result<::cuda::std::conditional_t<::cuda::std::is_integral_v<_Tp>, double, _Tp>> +{ + if constexpr (::cuda::std::is_integral_v<_Tp>) + { + return ::cuda::sincos(static_cast(__v)); + } + else + { + [[maybe_unused]] sincos_result<_Tp> __ret{}; +#if defined(_CCCL_BUILTIN_SINCOSF) + if constexpr (::cuda::std::is_same_v<_Tp, float>) + { + _CCCL_BUILTIN_SINCOSF(__v, &__ret.sin, &__ret.cos); + return __ret; + } +#endif // _CCCL_BUILTIN_SINCOSF +#if defined(_CCCL_BUILTIN_SINCOS) + if constexpr (::cuda::std::is_same_v<_Tp, double>) + { + _CCCL_BUILTIN_SINCOS(__v, &__ret.sin, &__ret.cos); + return __ret; + } +#endif // _CCCL_BUILTIN_SINCOS +#if _CCCL_HAS_LONG_DOUBLE() && defined(_CCCL_BUILTIN_SINCOSL) + if constexpr (::cuda::std::is_same_v<_Tp, long double>) + { + _CCCL_BUILTIN_SINCOSL(__v, &__ret.sin, &__ret.cos); + return __ret; + } +#endif // _CCCL_HAS_LONG_DOUBLE() && _CCCL_BUILTIN_SINCOSL + + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + if constexpr (::cuda::std::is_same_v<_Tp, float>) + { + NV_IF_TARGET(NV_IS_DEVICE, (::sincosf(__v, &__ret.sin, &__ret.cos); return __ret;)) + } + if constexpr (::cuda::std::is_same_v<_Tp, double>) + { + NV_IF_TARGET(NV_IS_DEVICE, (::sincos(__v, &__ret.sin, &__ret.cos); return __ret;)) + } +#if _LIBCUDACXX_HAS_NVFP16() + if constexpr (::cuda::std::is_same_v<_Tp, ::__half>) + { + const auto __result_float = ::cuda::sincos(::__half2float(__v)); + return {::__float2half(__result_float.sin), ::__float2half(__result_float.cos)}; + } +#endif // _LIBCUDACXX_HAS_NVFP16() +#if _LIBCUDACXX_HAS_NVBF16() + if constexpr (::cuda::std::is_same_v<_Tp, ::__nv_bfloat16>) + { + const auto __result_float = ::cuda::sincos(::__bfloat162float(__v)); + return {::__float2bfloat16(__result_float.sin), ::__float2bfloat16(__result_float.cos)}; + } +#endif // _LIBCUDACXX_HAS_NVBF16() + } + return {::cuda::std::sin(__v), ::cuda::std::cos(__v)}; + } +} + +_CCCL_END_NAMESPACE_CUDA + +#include + +#endif // _CUDA___CMATH_SINCOS_H diff --git a/libcudacxx/include/cuda/cmath b/libcudacxx/include/cuda/cmath index 9cc7ec56ead..6c6534bb6de 100644 --- a/libcudacxx/include/cuda/cmath +++ b/libcudacxx/include/cuda/cmath @@ -31,6 +31,7 @@ #include #include #include +#include #include #include diff --git a/libcudacxx/include/cuda/std/__complex/exponential_functions.h b/libcudacxx/include/cuda/std/__complex/exponential_functions.h index 129ea3fa5e9..7d985ebcb97 100644 --- a/libcudacxx/include/cuda/std/__complex/exponential_functions.h +++ b/libcudacxx/include/cuda/std/__complex/exponential_functions.h @@ -21,6 +21,7 @@ # pragma system_header #endif // no system header +#include #include #include #include @@ -67,8 +68,9 @@ template return complex<_Tp>(__x.real(), __i); } } - _Tp __e = ::cuda::std::exp(__x.real()); - return complex<_Tp>(__e * ::cuda::std::cos(__i), __e * ::cuda::std::sin(__i)); + _Tp __e = ::cuda::std::exp(__x.real()); + const auto [__i_sin, __i_cos] = ::cuda::sincos(__i); + return complex<_Tp>(__e * __i_cos, __e * __i_sin); } // A real exp that doesn't combine the final polynomial estimate with the ldexp factor. @@ -175,12 +177,7 @@ _CCCL_API inline complex exp(const complex& __x) __exp_r_reduced = (__r < 0.0f) ? 0.0f : 1e3f; } - // Compile to sincos when possible: - float __sin_i; - float __cos_i; - NV_IF_ELSE_TARGET(NV_IS_DEVICE, - (::sincosf(__i, &__sin_i, &__cos_i);), - (__sin_i = ::cuda::std::sinf(__i); __cos_i = ::cuda::std::cosf(__i);)) + const auto [__sin_i, __cos_i] = ::cuda::sincos(__i); // Our answer now is: (ldexp(__exp_r_reduced * __sin_r, __j_int), ldexp(__exp_r_reduced * __sin_r, __j_int)) // However we don't need a full ldexp here, and if __exp_r_reduced*__sin_r is denormal we can lose bits. @@ -265,12 +262,7 @@ _CCCL_API inline complex exp(const complex& __x) __exp_r_reduced = (__r < 0.0) ? 0.0 : 1e10; } - // Compile to sincos when possible: - double __sin_i; - double __cos_i; - NV_IF_ELSE_TARGET(NV_IS_DEVICE, - (::sincos(__i, &__sin_i, &__cos_i);), - (__sin_i = ::cuda::std::sin(__i); __cos_i = ::cuda::std::cos(__i);)) + const auto [__sin_i, __cos_i] = ::cuda::sincos(__i); // Our answer now is: (ldexp(__exp_mant * __sin_r, __j_int), ldexp(__exp_mant * __sin_r, __j_int)) // However we don't need a full ldexp here, and if __exp_mant*__sin_r is denormal we can lose bits. diff --git a/libcudacxx/include/cuda/std/__complex/hyperbolic_functions.h b/libcudacxx/include/cuda/std/__complex/hyperbolic_functions.h index f70d4e9609d..4f826218e2f 100644 --- a/libcudacxx/include/cuda/std/__complex/hyperbolic_functions.h +++ b/libcudacxx/include/cuda/std/__complex/hyperbolic_functions.h @@ -21,6 +21,7 @@ # pragma system_header #endif // no system header +#include #include #include #include @@ -52,8 +53,8 @@ template { return __x; } - return complex<_Tp>(::cuda::std::sinh(__x.real()) * ::cuda::std::cos(__x.imag()), - ::cuda::std::cosh(__x.real()) * ::cuda::std::sin(__x.imag())); + const auto [__im_sin, __im_cos] = ::cuda::sincos(__x.imag()); + return complex<_Tp>(::cuda::std::sinh(__x.real()) * __im_cos, ::cuda::std::cosh(__x.real()) * __im_sin); } // cosh @@ -77,8 +78,8 @@ template { return complex<_Tp>(::cuda::std::abs(__x.real()), __x.imag()); } - return complex<_Tp>(::cuda::std::cosh(__x.real()) * ::cuda::std::cos(__x.imag()), - ::cuda::std::sinh(__x.real()) * ::cuda::std::sin(__x.imag())); + const auto [__im_sin, __im_cos] = ::cuda::sincos(__x.imag()); + return complex<_Tp>(::cuda::std::cosh(__x.real()) * __im_cos, ::cuda::std::sinh(__x.real()) * __im_sin); } // tanh @@ -101,13 +102,14 @@ template } _Tp __2r(_Tp(2) * __x.real()); _Tp __2i(_Tp(2) * __x.imag()); - _Tp __d(::cuda::std::cosh(__2r) + ::cuda::std::cos(__2i)); + const auto [__2i_sin, __2i_cos] = ::cuda::sincos(__2i); + _Tp __d(::cuda::std::cosh(__2r) + __2i_cos); _Tp __2rsh(::cuda::std::sinh(__2r)); if (::cuda::std::isinf(__2rsh) && ::cuda::std::isinf(__d)) { return complex<_Tp>(__2rsh > _Tp(0) ? _Tp(1) : _Tp(-1), __2i > _Tp(0) ? _Tp(0) : _Tp(-0.)); } - return complex<_Tp>(__2rsh / __d, ::cuda::std::sin(__2i) / __d); + return complex<_Tp>(__2rsh / __d, __2i_sin / __d); } _CCCL_END_NAMESPACE_CUDA_STD diff --git a/libcudacxx/include/cuda/std/__complex/math.h b/libcudacxx/include/cuda/std/__complex/math.h index cd765bbe3a1..e6b51c21308 100644 --- a/libcudacxx/include/cuda/std/__complex/math.h +++ b/libcudacxx/include/cuda/std/__complex/math.h @@ -21,6 +21,7 @@ # pragma system_header #endif // no system header +#include #include #include #include @@ -139,12 +140,13 @@ template } return complex<_Tp>(numeric_limits<_Tp>::quiet_NaN(), numeric_limits<_Tp>::quiet_NaN()); } - _Tp __x = __rho * ::cuda::std::cos(__theta); + const auto [__sin_theta, __cos_theta] = ::cuda::sincos(__theta); + _Tp __x = __rho * __cos_theta; if (::cuda::std::isnan(__x)) { __x = 0; } - _Tp __y = __rho * ::cuda::std::sin(__theta); + _Tp __y = __rho * __sin_theta; if (::cuda::std::isnan(__y)) { __y = 0; diff --git a/libcudacxx/test/libcudacxx/cuda/cmath/sincos.pass.cpp b/libcudacxx/test/libcudacxx/cuda/cmath/sincos.pass.cpp new file mode 100644 index 00000000000..c952a73fb45 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/cmath/sincos.pass.cpp @@ -0,0 +1,145 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// + +#include +#include +#include +#include +#include + +__host__ __device__ bool is_about(float x, float y) +{ + return (cuda::std::abs((x - y) / (x + y)) < 1.e-6); +} + +__host__ __device__ bool is_about(double x, double y) +{ + return (cuda::std::abs((x - y) / (x + y)) < 1.e-14); +} + +#if _CCCL_HAS_LONG_DOUBLE() +__host__ __device__ bool is_about(long double x, long double y) +{ + return (cuda::std::abs((x - y) / (x + y)) < 1.e-14); +} +#endif // _CCCL_HAS_LONG_DOUBLE() + +#if _LIBCUDACXX_HAS_NVFP16() +__host__ __device__ bool is_about(__half x, __half y) +{ + return (cuda::std::fabs((x - y) / (x + y)) <= __half(1e-3)); +} +#endif // _LIBCUDACXX_HAS_NVFP16() + +#if _LIBCUDACXX_HAS_NVBF16() +__host__ __device__ bool is_about(__nv_bfloat16 x, __nv_bfloat16 y) +{ + return (cuda::std::fabs((x - y) / (x + y)) <= __nv_bfloat16(5e-3)); +} +#endif // _LIBCUDACXX_HAS_NVBF16() + +template +__host__ __device__ void test_type(float zero) +{ + using Result = cuda::std::conditional_t, double, T>; + + // 1. Test signature. + static_assert(cuda::std::is_same_v, decltype(cuda::sincos(T{}))>); + static_assert(noexcept(cuda::sincos(cuda::std::declval()))); + + // 2. Test sincos(0). + { + auto result = cuda::sincos(static_cast(zero)); + static_assert(cuda::std::is_same_v); + static_assert(cuda::std::is_same_v); + assert(result.sin == Result{0}); + assert(result.cos == Result{1}); + } + + // 3. Test sincos(value) to result of separate sin/cos calls. + { + const auto value = static_cast(4); + auto result = cuda::sincos(value); + assert(is_about(result.sin, cuda::std::sin(value))); + assert(is_about(result.cos, cuda::std::cos(value))); + } + + // 4. Test sincos(+-inf) + if constexpr (cuda::std::numeric_limits::has_infinity && cuda::std::numeric_limits::has_quiet_NaN) + { + auto pos_result = cuda::sincos(cuda::std::numeric_limits::infinity()); + assert(cuda::std::isnan(pos_result.sin)); + assert(cuda::std::isnan(pos_result.cos)); + + auto neg_result = cuda::sincos(-cuda::std::numeric_limits::infinity()); + assert(cuda::std::isnan(neg_result.sin)); + assert(cuda::std::isnan(neg_result.cos)); + } + + // 5. Test sincos(+-nan) + if constexpr (cuda::std::numeric_limits::has_quiet_NaN) + { + auto pos_result = cuda::sincos(cuda::std::numeric_limits::quiet_NaN()); + assert(cuda::std::isnan(pos_result.sin)); + assert(cuda::std::isnan(pos_result.cos)); + + auto neg_result = cuda::sincos(-cuda::std::numeric_limits::quiet_NaN()); + assert(cuda::std::isnan(neg_result.sin)); + assert(cuda::std::isnan(neg_result.cos)); + } +} + +__host__ __device__ void test(float zero) +{ + test_type(zero); + test_type(zero); +#if _CCCL_HAS_LONG_DOUBLE() + test_type(zero); +#endif // _CCCL_HAS_LONG_DOUBLE() +#if _LIBCUDACXX_HAS_NVFP16() + test_type<__half>(zero); +#endif // _LIBCUDACXX_HAS_NVFP16() +#if _LIBCUDACXX_HAS_NVBF16() + test_type<__nv_bfloat16>(zero); +#endif // _LIBCUDACXX_HAS_NVBF16() + + // todo: add tests for f128 once supported + + test_type(zero); + test_type(zero); + test_type(zero); + test_type(zero); + test_type(zero); +#if _CCCL_HAS_INT128() && !_CCCL_CUDA_COMPILER(CLANG) // clang-cuda crashes with int128 + test_type<__int128_t>(static_cast(zero)); +#endif // _CCCL_HAS_INT128() && !_CCCL_CUDA_COMPILER(CLANG) + + test_type(zero); + test_type(zero); + test_type(zero); + test_type(zero); + test_type(zero); +#if _CCCL_HAS_INT128() && !_CCCL_CUDA_COMPILER(CLANG) // clang-cuda crashes with int128 + test_type<__uint128_t>(static_cast(zero)); +#endif // _CCCL_HAS_INT128() && !_CCCL_CUDA_COMPILER(CLANG) +} + +__global__ void kernel() +{ + test(0.f); +} + +int main(int, char**) +{ + volatile float zero = 0.0f; + test(zero); + return 0; +}