Implement cuda::sincos

davebayer · davebayer · commit 419077d3eaa6 · 2025-11-24T08:53:51.000+01:00
diff --git a/docs/libcudacxx/extended_api/math.rst b/docs/libcudacxx/extended_api/math.rst
@@ -97,3 +97,8 @@ Math
      - Most significant half of the product
      - CCCL 3.2.0
      - CUDA 13.2
+
+   * - :ref:`sincos <libcudacxx-extended-api-math-sincos>`
+     - Computes sine and cosine of a value at the same time.
+     - CCCL 3.3.0
+     - CUDA 13.3
diff --git a/docs/libcudacxx/extended_api/math/sincos.rst b/docs/libcudacxx/extended_api/math/sincos.rst
@@ -0,0 +1,64 @@
+.. _libcudacxx-extended-api-math-sincos:
+
+``cuda::sincos``
+====================================
+
+Defined in the ``<cuda/cmath>`` header.
+
+.. code:: cuda
+
+   namespace cuda {
+
+   template <class T>
+   struct sincos_result
+   {
+     T sin;
+     T cos;
+   };
+
+   template <class T>
+   [[nodiscard]] __host__ __device__
+   sincos_result<T> sincos(T value) noexcept;
+
+   } // namespace cuda
+
+Computes :math:`\sin value` and :math:`\cos value` in a single call, which can be more efficient than computing the two operations separately.
+
+**Parameters**
+
+- ``value``: The input value.
+
+**Return value**
+
+- A ``cuda::sincos_result`` object holding the results of :math:`\sin value` and :math:`\cos value`. Both members are ``NaN`` if ``value`` is :math:`\pm\infty` or ``NaN``.
+
+**Constraints**
+
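+- ``T`` is a floating-point or an integral type. An integral ``value`` is converted to ``double`` and the result type is ``cuda::sincos_result<double>``.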
+
+**Performance considerations**
+
+- If available, the functionality is implemented by compiler builtins or, in device code, by the CUDA ``sincosf``/``sincos`` functions; otherwise it falls back to calling ``cuda::std::sin(value)`` and ``cuda::std::cos(value)`` separately.
+
+Example
+-------
+
+.. code:: cuda
+
+    #include <cuda/cmath>
+    #include <cuda/std/cassert>
+    #include <cuda/std/numbers>
+
+    __global__ void sincos_kernel() {
+        // pi_v<float> is only an approximation of pi, so compare with a tolerance.
+        auto [sin_pi, cos_pi] = cuda::sincos(cuda::std::numbers::pi_v<float>);
+        assert(cuda::std::fabs(sin_pi) < 1e-6f);       // sin(pi) ==  0
+        assert(cuda::std::fabs(cos_pi + 1.f) < 1e-6f); // cos(pi) == -1
+    }
+
+    int main() {
+        sincos_kernel<<<1, 1>>>();
+        cudaDeviceSynchronize(); // wait for the kernel; device-side assert failures surface at a sync
+        return 0;
+    }
+
+`See it on Godbolt 🔗 <https://godbolt.org/z/WYfEnhGaq>`__
diff --git a/libcudacxx/include/cuda/__cmath/sincos.h b/libcudacxx/include/cuda/__cmath/sincos.h
@@ -0,0 +1,128 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA___CMATH_SINCOS_H
+#define _CUDA___CMATH_SINCOS_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__cmath/trigonometric_functions.h>
+#include <cuda/std/__concepts/concept_macros.h>
+#include <cuda/std/__floating_point/traits.h>
+#include <cuda/std/__type_traits/is_integral.h>
+#include <cuda/std/__type_traits/is_same.h>
+
+#include <cuda/std/__cccl/prologue.h>
+
+// Wrap the __builtin_sincos{f,,l} builtins in macros so that the code below can test for them with `defined(...)`.
+// NOTE(review): GCC is enabled unconditionally, presumably because it provides these builtins regardless of what
+// _CCCL_HAS_BUILTIN reports - confirm.
+#if _CCCL_HAS_BUILTIN(__builtin_sincosf) || _CCCL_COMPILER(GCC)
+#  define _CCCL_BUILTIN_SINCOSF(...) __builtin_sincosf(__VA_ARGS__)
+#endif // _CCCL_HAS_BUILTIN(__builtin_sincosf) || _CCCL_COMPILER(GCC)
+
+#if _CCCL_HAS_BUILTIN(__builtin_sincos) || _CCCL_COMPILER(GCC)
+#  define _CCCL_BUILTIN_SINCOS(...) __builtin_sincos(__VA_ARGS__)
+#endif // _CCCL_HAS_BUILTIN(__builtin_sincos) || _CCCL_COMPILER(GCC)
+
+#if _CCCL_HAS_BUILTIN(__builtin_sincosl) || _CCCL_COMPILER(GCC)
+#  define _CCCL_BUILTIN_SINCOSL(...) __builtin_sincosl(__VA_ARGS__)
+#endif // _CCCL_HAS_BUILTIN(__builtin_sincosl) || _CCCL_COMPILER(GCC)
+
+// clang-cuda crashes if these builtins are used, so drop the wrappers again for that compiler.
+#if _CCCL_CUDA_COMPILER(CLANG)
+#  undef _CCCL_BUILTIN_SINCOSF
+#  undef _CCCL_BUILTIN_SINCOS
+#  undef _CCCL_BUILTIN_SINCOSL
+#endif // _CCCL_CUDA_COMPILER(CLANG)
+
+_CCCL_BEGIN_NAMESPACE_CUDA
+
+//! @brief Type returned by \c cuda::sincos, holding both results computed for one input value.
+//!
+//! Callers decompose it with structured bindings as `[sin, cos]`, so the member order is part of the interface.
+//!
+//! @tparam _Tp Type of the stored sine and cosine values.
+template <class _Tp>
+struct _CCCL_TYPE_VISIBILITY_DEFAULT sincos_result
+{
+  _Tp sin; //!< Sine of the input value.
+  _Tp cos; //!< Cosine of the input value.
+};
+
+//! @brief Computes the sine and the cosine of a floating-point value in one call.
+//!
+//! Dispatch order, as written below:
+//!  1. the matching \c __builtin_sincos{f,,l} wrapper macro, when it is defined, for \c float, \c double and
+//!     \c long double;
+//!  2. in device code (except under clang-cuda), the global \c ::sincosf / \c ::sincos functions for \c float and
+//!     \c double;
+//!  3. otherwise separate calls to \c cuda::std::sin and \c cuda::std::cos.
+//!
+//! @tparam _Tp A type accepted by \c cuda::std::__is_fp_v.
+//!
+//! @param __v The value.
+//!
+//! @return The \c cuda::sincos_result with the results of sin and cos operations.
+_CCCL_TEMPLATE(class _Tp)
+_CCCL_REQUIRES(::cuda::std::__is_fp_v<_Tp>)
+[[nodiscard]] _CCCL_API sincos_result<_Tp> sincos(_Tp __v) noexcept
+{
+  // Value-initialized so that both members are written through pointers by the combined calls below.
+  sincos_result<_Tp> __ret{};
+#if defined(_CCCL_BUILTIN_SINCOSF)
+  if constexpr (::cuda::std::is_same_v<_Tp, float>)
+  {
+    _CCCL_BUILTIN_SINCOSF(__v, &__ret.sin, &__ret.cos);
+    return __ret;
+  }
+#endif // _CCCL_BUILTIN_SINCOSF
+#if defined(_CCCL_BUILTIN_SINCOS)
+  if constexpr (::cuda::std::is_same_v<_Tp, double>)
+  {
+    _CCCL_BUILTIN_SINCOS(__v, &__ret.sin, &__ret.cos);
+    return __ret;
+  }
+#endif // _CCCL_BUILTIN_SINCOS
+#if _CCCL_HAS_LONG_DOUBLE() && defined(_CCCL_BUILTIN_SINCOSL)
+  if constexpr (::cuda::std::is_same_v<_Tp, long double>)
+  {
+    _CCCL_BUILTIN_SINCOSL(__v, &__ret.sin, &__ret.cos);
+    return __ret;
+  }
+#endif // _CCCL_HAS_LONG_DOUBLE() && _CCCL_BUILTIN_SINCOSL
+
+  // Device-only path through the global ::sincosf / ::sincos functions.
+  // clang-cuda crashes if these builtins are used.
+#if !_CCCL_CUDA_COMPILER(CLANG)
+  // NOTE(review): per the macro's name this branch is skipped during constant evaluation - confirm.
+  _CCCL_IF_NOT_CONSTEVAL_DEFAULT
+  {
+    if constexpr (::cuda::std::is_same_v<_Tp, float>)
+    {
+      NV_IF_TARGET(NV_IS_DEVICE, (::sincosf(__v, &__ret.sin, &__ret.cos); return __ret;))
+    }
+    if constexpr (::cuda::std::is_same_v<_Tp, double>)
+    {
+      NV_IF_TARGET(NV_IS_DEVICE, (::sincos(__v, &__ret.sin, &__ret.cos); return __ret;))
+    }
+  }
+#endif // !_CCCL_CUDA_COMPILER(CLANG)
+  // Generic fallback: compute the two results independently.
+  __ret.sin = ::cuda::std::sin(__v);
+  __ret.cos = ::cuda::std::cos(__v);
+  return __ret;
+}
+
+//! @brief Integral overload of \c cuda::sincos.
+//!
+//! The argument is converted to \c double and forwarded to the floating-point overload, so the result always holds
+//! \c double values regardless of the integral input type.
+//!
+//! @param __v The integral value.
+//!
+//! @return The \c cuda::sincos_result<double> with the sine and cosine of \c __v.
+_CCCL_TEMPLATE(class _Tp)
+_CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>)
+[[nodiscard]] _CCCL_API sincos_result<double> sincos(_Tp __v) noexcept
+{
+  const double __as_double = static_cast<double>(__v);
+  return ::cuda::sincos(__as_double);
+}
+
+_CCCL_END_NAMESPACE_CUDA
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA___CMATH_SINCOS_H
diff --git a/libcudacxx/include/cuda/cmath b/libcudacxx/include/cuda/cmath
@@ -31,6 +31,7 @@
 #include <cuda/__cmath/pow2.h>
 #include <cuda/__cmath/round_down.h>
 #include <cuda/__cmath/round_up.h>
+#include <cuda/__cmath/sincos.h>
 #include <cuda/__cmath/uabs.h>
 #include <cuda/std/cmath>
 
diff --git a/libcudacxx/include/cuda/std/__complex/exponential_functions.h b/libcudacxx/include/cuda/std/__complex/exponential_functions.h
@@ -21,6 +21,7 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cuda/__cmath/sincos.h>
 #include <cuda/std/__cmath/copysign.h>
 #include <cuda/std/__cmath/isfinite.h>
 #include <cuda/std/__cmath/isinf.h>
@@ -175,12 +176,7 @@ _CCCL_API inline complex<float> exp(const complex<float>& __x)
     __exp_r_reduced = (__r < 0.0f) ? 0.0f : 1e3f;
   }
 
-  // Compile to sincos when possible:
-  float __sin_i;
-  float __cos_i;
-  NV_IF_ELSE_TARGET(NV_IS_DEVICE,
-                    (::sincosf(__i, &__sin_i, &__cos_i);),
-                    (__sin_i = ::cuda::std::sinf(__i); __cos_i = ::cuda::std::cosf(__i);))
+  const auto [__sin_i, __cos_i] = ::cuda::sincos(__i);
 
   // Our answer now is: (ldexp(__exp_r_reduced * __sin_r, __j_int), ldexp(__exp_r_reduced * __sin_r, __j_int))
   // However we don't need a full ldexp here, and if __exp_r_reduced*__sin_r is denormal we can lose bits.
@@ -265,12 +261,7 @@ _CCCL_API inline complex<double> exp<double>(const complex<double>& __x)
     __exp_r_reduced = (__r < 0.0) ? 0.0 : 1e10;
   }
 
-  // Compile to sincos when possible:
-  double __sin_i;
-  double __cos_i;
-  NV_IF_ELSE_TARGET(NV_IS_DEVICE,
-                    (::sincos(__i, &__sin_i, &__cos_i);),
-                    (__sin_i = ::cuda::std::sin(__i); __cos_i = ::cuda::std::cos(__i);))
+  const auto [__sin_i, __cos_i] = ::cuda::sincos(__i);
 
   // Our answer now is: (ldexp(__exp_mant * __sin_r, __j_int), ldexp(__exp_mant * __sin_r, __j_int))
   // However we don't need a full ldexp here, and if __exp_mant*__sin_r is denormal we can lose bits.
diff --git a/libcudacxx/test/libcudacxx/cuda/cmath/sincos.pass.cpp b/libcudacxx/test/libcudacxx/cuda/cmath/sincos.pass.cpp
@@ -0,0 +1,101 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// <cuda/cmath>
+
+#include <cuda/cmath>
+#include <cuda/std/cassert>
+#include <cuda/std/limits>
+#include <cuda/std/type_traits>
+#include <cuda/std/utility>
+
+// Checks cuda::sincos for one argument type T: the return type and noexcept-ness, the result for a zero argument,
+// and that infinities and NaNs (when T has them) produce NaN in both members.
+// `zero` is supplied by the caller and converted to T before the call.
+template <class T>
+__host__ __device__ void test_type(float zero)
+{
+  // Integral arguments are expected to yield double results; floating-point arguments keep their own type.
+  using Result = cuda::std::conditional_t<cuda::std::is_integral_v<T>, double, T>;
+
+  // 1. Test signature.
+  static_assert(cuda::std::is_same_v<cuda::sincos_result<Result>, decltype(cuda::sincos(T{}))>);
+  static_assert(noexcept(cuda::sincos(cuda::std::declval<T>())));
+
+  // 2. Test sincos(0).
+  {
+    auto result = cuda::sincos(static_cast<T>(zero));
+    static_assert(cuda::std::is_same_v<Result, decltype(result.sin)>);
+    static_assert(cuda::std::is_same_v<Result, decltype(result.cos)>);
+    assert(result.sin == Result{0});
+    assert(result.cos == Result{1});
+  }
+
+  // 3. Test sincos(+-inf)
+  if constexpr (cuda::std::numeric_limits<T>::has_infinity && cuda::std::numeric_limits<T>::has_quiet_NaN)
+  {
+    auto pos_result = cuda::sincos(cuda::std::numeric_limits<T>::infinity());
+    assert(cuda::std::isnan(pos_result.sin));
+    assert(cuda::std::isnan(pos_result.cos));
+
+    auto neg_result = cuda::sincos(-cuda::std::numeric_limits<T>::infinity());
+    assert(cuda::std::isnan(neg_result.sin));
+    assert(cuda::std::isnan(neg_result.cos));
+  }
+
+  // 4. Test sincos(+-nan)
+  if constexpr (cuda::std::numeric_limits<T>::has_quiet_NaN)
+  {
+    auto pos_result = cuda::sincos(cuda::std::numeric_limits<T>::quiet_NaN());
+    assert(cuda::std::isnan(pos_result.sin));
+    assert(cuda::std::isnan(pos_result.cos));
+
+    auto neg_result = cuda::sincos(-cuda::std::numeric_limits<T>::quiet_NaN());
+    assert(cuda::std::isnan(neg_result.sin));
+    assert(cuda::std::isnan(neg_result.cos));
+  }
+}
+
+// Runs test_type for the floating-point types and the signed/unsigned integer types listed below; the
+// optional types are guarded by their feature macros.
+__host__ __device__ void test(float zero)
+{
+  test_type<float>(zero);
+  test_type<double>(zero);
+#if _CCCL_HAS_LONG_DOUBLE()
+  test_type<long double>(zero);
+#endif // _CCCL_HAS_LONG_DOUBLE()
+#if _LIBCUDACXX_HAS_NVFP16()
+  test_type<__half>(zero);
+#endif // _LIBCUDACXX_HAS_NVFP16()
+#if _LIBCUDACXX_HAS_NVBF16()
+  test_type<__nv_bfloat16>(zero);
+#endif // _LIBCUDACXX_HAS_NVBF16()
+
+  // todo: add tests for f128 once supported
+
+  test_type<signed char>(zero);
+  test_type<signed short>(zero);
+  test_type<signed int>(zero);
+  test_type<signed long>(zero);
+  test_type<signed long long>(zero);
+#if _CCCL_HAS_INT128()
+  test_type<__int128_t>(zero);
+#endif // _CCCL_HAS_INT128()
+
+  test_type<unsigned char>(zero);
+  test_type<unsigned short>(zero);
+  test_type<unsigned int>(zero);
+  test_type<unsigned long>(zero);
+  test_type<unsigned long long>(zero);
+#if _CCCL_HAS_INT128()
+  test_type<__uint128_t>(zero);
+#endif // _CCCL_HAS_INT128()
+}
+
+int main(int, char**)
+{
+  // NOTE(review): presumably volatile so that the zero passed to the tests is read at run time rather than
+  // constant-folded - confirm.
+  volatile float zero = 0.0f;
+  test(zero);
+  return 0;
+}