NVIDIA · davebayer · Nov 24, 2025 · Nov 24, 2025 · Nov 24, 2025 · Nov 24, 2025
@@ -97,3 +97,8 @@ Math
      - Most significant half of the product
      - CCCL 3.2.0
      - CUDA 13.2
+
+   * - :ref:`sincos <libcudacxx-extended-api-math-sincos>`
+     - Computes sine and cosine of a value at the same time.
+     - CCCL 3.3.0
+     - CUDA 13.3
@@ -0,0 +1,68 @@
+.. _libcudacxx-extended-api-math-sincos:
+
+``cuda::sincos``
+====================================
+
+Defined in the ``<cuda/cmath>`` header.
+
+.. code:: cuda
+
+   namespace cuda {
+
+   template <class T>
+   struct sincos_result
+   {
+     T sin;
+     T cos;
+   };
+
+   template </*floating-point-type*/ T>
+   [[nodiscard]] __host__ __device__
+   sincos_result<T> sincos(T value) noexcept; // (1)
+
+   template <class Integral>
+   [[nodiscard]] __host__ __device__
+   sincos_result<double> sincos(Integral value) noexcept; // (2)
+
+   } // namespace cuda
+
+Computes :math:`\sin value` and :math:`\cos value` at the same time, which can be more efficient than computing the two operations separately.
+
+**Parameters**
+
+- ``value``: The input value.
+
+**Return value**
+
+- (1) A ``cuda::sincos_result`` object holding the results of :math:`\sin value` and :math:`\cos value`. If the input value is :math:`\pm\infty` or ``NaN``, both members are set to ``NaN``.
+- (2) If the argument is of an integral type, it is converted to ``double`` and the result is computed as in (1).
+
+**Constraints**
+
+- ``T`` is an arithmetic type.
+
+**Performance considerations**
+
+- If available, the functionality is implemented by compiler builtins; otherwise it falls back to ``cuda::std::sin(value)`` and ``cuda::std::cos(value)``.
+
+Example
+-------
+
+.. code:: cuda
+
+    #include <cuda/cmath>
+    #include <cuda/std/cassert>
+
+    __global__ void sincos_kernel() {
+        // The argument is 0, so the expected results are sin(0) == 0 and cos(0) == 1.
+        auto [sin_0, cos_0] = cuda::sincos(0.f);
+        assert(sin_0 == 0.f);
+        assert(cos_0 == 1.f);
+    }
+
+    int main() {
+        sincos_kernel<<<1, 1>>>();
+        // A bad launch is reported by cudaGetLastError(); a failed device-side assert is reported by the next
+        // synchronizing call. Turn either one into a non-zero exit code instead of silently returning success.
+        if (cudaGetLastError() != cudaSuccess || cudaDeviceSynchronize() != cudaSuccess) {
+            return 1;
+        }
+        return 0;
+    }
+
+`See it on Godbolt 🔗 <https://godbolt.org/z/99PP9s1z6>`__
@@ -0,0 +1,134 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA___CMATH_SINCOS_H
+#define _CUDA___CMATH_SINCOS_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__cmath/trigonometric_functions.h>
+#include <cuda/std/__concepts/concept_macros.h>
+#include <cuda/std/__type_traits/conditional.h>
+#include <cuda/std/__type_traits/is_extended_arithmetic.h>
+#include <cuda/std/__type_traits/is_integral.h>
+#include <cuda/std/__type_traits/is_same.h>
+
+#include <cuda/std/__cccl/prologue.h>
+
+#if _CCCL_HAS_BUILTIN(__builtin_sincosf) || _CCCL_COMPILER(GCC)
+#  define _CCCL_BUILTIN_SINCOSF(...) __builtin_sincosf(__VA_ARGS__)
+#endif // _CCCL_HAS_BUILTIN(__builtin_sincosf) || _CCCL_COMPILER(GCC)
+
+#if _CCCL_HAS_BUILTIN(__builtin_sincos) || _CCCL_COMPILER(GCC)
+#  define _CCCL_BUILTIN_SINCOS(...) __builtin_sincos(__VA_ARGS__)
+#endif // _CCCL_HAS_BUILTIN(__builtin_sincos) || _CCCL_COMPILER(GCC)
+
+#if _CCCL_HAS_BUILTIN(__builtin_sincosl) || _CCCL_COMPILER(GCC)
+#  define _CCCL_BUILTIN_SINCOSL(...) __builtin_sincosl(__VA_ARGS__)
+#endif // _CCCL_HAS_BUILTIN(__builtin_sincosl) || _CCCL_COMPILER(GCC)
+
+// clang-cuda crashes if these builtins are used, so undefine the wrappers and let cuda::sincos skip its builtin paths.
+#if _CCCL_CUDA_COMPILER(CLANG)
+#  undef _CCCL_BUILTIN_SINCOSF
+#  undef _CCCL_BUILTIN_SINCOS
+#  undef _CCCL_BUILTIN_SINCOSL
+#endif // _CCCL_CUDA_COMPILER(CLANG)
+
+_CCCL_BEGIN_NAMESPACE_CUDA
+
+//! @brief Aggregate returned by \c cuda::sincos, holding the sine and the cosine computed for one input value.
+template <class _Tp>
+struct _CCCL_TYPE_VISIBILITY_DEFAULT sincos_result
+{
+  _Tp sin; //!< The sine of the input value.
+  _Tp cos; //!< The cosine of the input value.
+};
+
+//! @brief Computes the sine and the cosine of a value with a single call.
+//!
+//! Integral arguments are converted to \c double before the computation. For floating-point arguments a combined
+//! sincos builtin or device function is used when one is available; otherwise the result is assembled from
+//! \c cuda::std::sin and \c cuda::std::cos.
+//!
+//! @param __v The value whose sine and cosine are computed.
+//!
+//! @return A \c cuda::sincos_result holding the sine and the cosine of \c __v.
+_CCCL_TEMPLATE(class _Tp)
+_CCCL_REQUIRES(::cuda::std::__is_extended_arithmetic_v<_Tp>)
+[[nodiscard]] _CCCL_API auto sincos(_Tp __v) noexcept
+  -> sincos_result<::cuda::std::conditional_t<::cuda::std::is_integral_v<_Tp>, double, _Tp>>
+{
+  if constexpr (!::cuda::std::is_integral_v<_Tp>)
+  {
+    // Written by whichever combined sincos implementation applies to _Tp; stays unused on the generic path.
+    [[maybe_unused]] sincos_result<_Tp> __out{};
+
+    // 1) Compiler builtins, when the wrappers are defined for this toolchain.
+#if defined(_CCCL_BUILTIN_SINCOSF)
+    if constexpr (::cuda::std::is_same_v<_Tp, float>)
+    {
+      _CCCL_BUILTIN_SINCOSF(__v, &__out.sin, &__out.cos);
+      return __out;
+    }
+#endif // defined(_CCCL_BUILTIN_SINCOSF)
+#if defined(_CCCL_BUILTIN_SINCOS)
+    if constexpr (::cuda::std::is_same_v<_Tp, double>)
+    {
+      _CCCL_BUILTIN_SINCOS(__v, &__out.sin, &__out.cos);
+      return __out;
+    }
+#endif // defined(_CCCL_BUILTIN_SINCOS)
+#if _CCCL_HAS_LONG_DOUBLE() && defined(_CCCL_BUILTIN_SINCOSL)
+    if constexpr (::cuda::std::is_same_v<_Tp, long double>)
+    {
+      _CCCL_BUILTIN_SINCOSL(__v, &__out.sin, &__out.cos);
+      return __out;
+    }
+#endif // _CCCL_HAS_LONG_DOUBLE() && defined(_CCCL_BUILTIN_SINCOSL)
+
+    // 2) Paths that are only taken outside of constant evaluation.
+    _CCCL_IF_NOT_CONSTEVAL_DEFAULT
+    {
+      // float and double: call the combined ::sincosf / ::sincos functions when compiling for the device.
+      if constexpr (::cuda::std::is_same_v<_Tp, float>)
+      {
+        NV_IF_TARGET(NV_IS_DEVICE, (::sincosf(__v, &__out.sin, &__out.cos); return __out;))
+      }
+      if constexpr (::cuda::std::is_same_v<_Tp, double>)
+      {
+        NV_IF_TARGET(NV_IS_DEVICE, (::sincos(__v, &__out.sin, &__out.cos); return __out;))
+      }
+      // __half and __nv_bfloat16: compute in float and convert both results back to the narrow type.
+#if _LIBCUDACXX_HAS_NVFP16()
+      if constexpr (::cuda::std::is_same_v<_Tp, ::__half>)
+      {
+        const auto __wide = ::cuda::sincos(::__half2float(__v));
+        return {::__float2half(__wide.sin), ::__float2half(__wide.cos)};
+      }
+#endif // _LIBCUDACXX_HAS_NVFP16()
+#if _LIBCUDACXX_HAS_NVBF16()
+      if constexpr (::cuda::std::is_same_v<_Tp, ::__nv_bfloat16>)
+      {
+        const auto __wide = ::cuda::sincos(::__bfloat162float(__v));
+        return {::__float2bfloat16(__wide.sin), ::__float2bfloat16(__wide.cos)};
+      }
+#endif // _LIBCUDACXX_HAS_NVBF16()
+    }
+
+    // 3) Generic fallback: two separate calls.
+    return {::cuda::std::sin(__v), ::cuda::std::cos(__v)};
+  }
+  else
+  {
+    // Integral input: compute in double.
+    return ::cuda::sincos(static_cast<double>(__v));
+  }
+}
+
+_CCCL_END_NAMESPACE_CUDA
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA___CMATH_SINCOS_H
@@ -31,6 +31,7 @@
 #include <cuda/__cmath/pow2.h>
 #include <cuda/__cmath/round_down.h>
 #include <cuda/__cmath/round_up.h>
+#include <cuda/__cmath/sincos.h>
 #include <cuda/__cmath/uabs.h>
 #include <cuda/std/cmath>
 

@@ -21,6 +21,7 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cuda/__cmath/sincos.h>
 #include <cuda/std/__cmath/copysign.h>
 #include <cuda/std/__cmath/isfinite.h>
 #include <cuda/std/__cmath/isinf.h>
@@ -67,8 +68,9 @@ template <class _Tp>
       return complex<_Tp>(__x.real(), __i);
     }
   }
-  _Tp __e = ::cuda::std::exp(__x.real());
-  return complex<_Tp>(__e * ::cuda::std::cos(__i), __e * ::cuda::std::sin(__i));
+  _Tp __e                       = ::cuda::std::exp(__x.real());
+  const auto [__i_sin, __i_cos] = ::cuda::sincos(__i);
+  return complex<_Tp>(__e * __i_cos, __e * __i_sin);
 }
 
 // A real exp that doesn't combine the final polynomial estimate with the ldexp factor.
@@ -175,12 +177,7 @@ _CCCL_API inline complex<float> exp(const complex<float>& __x)
     __exp_r_reduced = (__r < 0.0f) ? 0.0f : 1e3f;
   }
 
-  // Compile to sincos when possible:
-  float __sin_i;
-  float __cos_i;
-  NV_IF_ELSE_TARGET(NV_IS_DEVICE,
-                    (::sincosf(__i, &__sin_i, &__cos_i);),
-                    (__sin_i = ::cuda::std::sinf(__i); __cos_i = ::cuda::std::cosf(__i);))
+  const auto [__sin_i, __cos_i] = ::cuda::sincos(__i);
 
   // Our answer now is: (ldexp(__exp_r_reduced * __sin_r, __j_int), ldexp(__exp_r_reduced * __sin_r, __j_int))
   // However we don't need a full ldexp here, and if __exp_r_reduced*__sin_r is denormal we can lose bits.
@@ -265,12 +262,7 @@ _CCCL_API inline complex<double> exp<double>(const complex<double>& __x)
     __exp_r_reduced = (__r < 0.0) ? 0.0 : 1e10;
   }
 
-  // Compile to sincos when possible:
-  double __sin_i;
-  double __cos_i;
-  NV_IF_ELSE_TARGET(NV_IS_DEVICE,
-                    (::sincos(__i, &__sin_i, &__cos_i);),
-                    (__sin_i = ::cuda::std::sin(__i); __cos_i = ::cuda::std::cos(__i);))
+  const auto [__sin_i, __cos_i] = ::cuda::sincos(__i);
 
   // Our answer now is: (ldexp(__exp_mant * __sin_r, __j_int), ldexp(__exp_mant * __sin_r, __j_int))
   // However we don't need a full ldexp here, and if __exp_mant*__sin_r is denormal we can lose bits.

@@ -21,6 +21,7 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cuda/__cmath/sincos.h>
 #include <cuda/std/__cmath/abs.h>
 #include <cuda/std/__cmath/copysign.h>
 #include <cuda/std/__cmath/hyperbolic_functions.h>
@@ -52,8 +53,8 @@ template <class _Tp>
   {
     return __x;
   }
-  return complex<_Tp>(::cuda::std::sinh(__x.real()) * ::cuda::std::cos(__x.imag()),
-                      ::cuda::std::cosh(__x.real()) * ::cuda::std::sin(__x.imag()));
+  const auto [__im_sin, __im_cos] = ::cuda::sincos(__x.imag());
+  return complex<_Tp>(::cuda::std::sinh(__x.real()) * __im_cos, ::cuda::std::cosh(__x.real()) * __im_sin);
 }
 
 // cosh
@@ -77,8 +78,8 @@ template <class _Tp>
   {
     return complex<_Tp>(::cuda::std::abs(__x.real()), __x.imag());
   }
-  return complex<_Tp>(::cuda::std::cosh(__x.real()) * ::cuda::std::cos(__x.imag()),
-                      ::cuda::std::sinh(__x.real()) * ::cuda::std::sin(__x.imag()));
+  const auto [__im_sin, __im_cos] = ::cuda::sincos(__x.imag());
+  return complex<_Tp>(::cuda::std::cosh(__x.real()) * __im_cos, ::cuda::std::sinh(__x.real()) * __im_sin);
 }
 
 // tanh
@@ -101,13 +102,14 @@ template <class _Tp>
   }
   _Tp __2r(_Tp(2) * __x.real());
   _Tp __2i(_Tp(2) * __x.imag());
-  _Tp __d(::cuda::std::cosh(__2r) + ::cuda::std::cos(__2i));
+  const auto [__2i_sin, __2i_cos] = ::cuda::sincos(__2i);
+  _Tp __d(::cuda::std::cosh(__2r) + __2i_cos);
   _Tp __2rsh(::cuda::std::sinh(__2r));
   if (::cuda::std::isinf(__2rsh) && ::cuda::std::isinf(__d))
   {
     return complex<_Tp>(__2rsh > _Tp(0) ? _Tp(1) : _Tp(-1), __2i > _Tp(0) ? _Tp(0) : _Tp(-0.));
   }
-  return complex<_Tp>(__2rsh / __d, ::cuda::std::sin(__2i) / __d);
+  return complex<_Tp>(__2rsh / __d, __2i_sin / __d);
 }
 
 _CCCL_END_NAMESPACE_CUDA_STD

@@ -21,6 +21,7 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cuda/__cmath/sincos.h>
 #include <cuda/std/__cmath/abs.h>
 #include <cuda/std/__cmath/hypot.h>
 #include <cuda/std/__cmath/inverse_trigonometric_functions.h>
@@ -139,12 +140,13 @@ template <class _Tp>
     }
     return complex<_Tp>(numeric_limits<_Tp>::quiet_NaN(), numeric_limits<_Tp>::quiet_NaN());
   }
-  _Tp __x = __rho * ::cuda::std::cos(__theta);
+  const auto [__sin_theta, __cos_theta] = ::cuda::sincos(__theta);
+  _Tp __x                               = __rho * __cos_theta;
   if (::cuda::std::isnan(__x))
   {
     __x = 0;
   }
-  _Tp __y = __rho * ::cuda::std::sin(__theta);
+  _Tp __y = __rho * __sin_theta;
   if (::cuda::std::isnan(__y))
   {
     __y = 0;