Implement cuda::sincos

davebayer · davebayer · commit a5b2ae0da4ee · 2025-11-23T13:02:50.000+01:00
diff --git a/libcudacxx/include/cuda/__cmath/sincos.h b/libcudacxx/include/cuda/__cmath/sincos.h
@@ -0,0 +1,113 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA___CMATH_SINCOS_H
+#define _CUDA___CMATH_SINCOS_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__cmath/trigonometric_functions.h>
+#include <cuda/std/__concepts/concept_macros.h>
+#include <cuda/std/__floating_point/traits.h>
+#include <cuda/std/__type_traits/is_integral.h>
+#include <cuda/std/__type_traits/is_same.h>
+
+#include <cuda/std/__cccl/prologue.h>
+
+// Expose the compiler's combined sine/cosine builtins as _CCCL_BUILTIN_SINCOSF (float), _CCCL_BUILTIN_SINCOS
+// (double) and _CCCL_BUILTIN_SINCOSL (long double). Each macro is defined only when _CCCL_HAS_BUILTIN reports the
+// matching builtin or when the compiler is GCC, so code using them must first test `defined(...)`.
+#if _CCCL_HAS_BUILTIN(__builtin_sincosf) || _CCCL_COMPILER(GCC)
+#  define _CCCL_BUILTIN_SINCOSF(...) __builtin_sincosf(__VA_ARGS__)
+#endif // _CCCL_HAS_BUILTIN(__builtin_sincosf) || _CCCL_COMPILER(GCC)
+
+#if _CCCL_HAS_BUILTIN(__builtin_sincos) || _CCCL_COMPILER(GCC)
+#  define _CCCL_BUILTIN_SINCOS(...) __builtin_sincos(__VA_ARGS__)
+#endif // _CCCL_HAS_BUILTIN(__builtin_sincos) || _CCCL_COMPILER(GCC)
+
+#if _CCCL_HAS_BUILTIN(__builtin_sincosl) || _CCCL_COMPILER(GCC)
+#  define _CCCL_BUILTIN_SINCOSL(...) __builtin_sincosl(__VA_ARGS__)
+#endif // _CCCL_HAS_BUILTIN(__builtin_sincosl) || _CCCL_COMPILER(GCC)
+
+// clang-cuda crashes when using these builtins in device code
+// Removing the macros here makes every `defined(_CCCL_BUILTIN_SINCOS*)` check below fail for that configuration,
+// so the builtin code paths are not compiled at all.
+#if _CCCL_CUDA_COMPILER(CLANG) && _CCCL_DEVICE_COMPILATION()
+#  undef _CCCL_BUILTIN_SINCOSF
+#  undef _CCCL_BUILTIN_SINCOS
+#  undef _CCCL_BUILTIN_SINCOSL
+#endif // _CCCL_CUDA_COMPILER(CLANG) && _CCCL_DEVICE_COMPILATION()
+
+_CCCL_BEGIN_NAMESPACE_CUDA
+
+//! @brief Result type of `cuda::sincos`: holds the sine and the cosine computed for one input value.
+//!
+//! The members are declared in the order `sin`, `cos`; structured bindings on this type rely on that order.
+//!
+//! @tparam _Tp Type of the stored sine and cosine values.
+template <class _Tp>
+struct _CCCL_TYPE_VISIBILITY_DEFAULT sincos_result
+{
+  _Tp sin; //!< Sine of the input value.
+  _Tp cos; //!< Cosine of the input value.
+};
+
+//! @brief Computes the sine and the cosine of a floating-point value with a single call.
+//!
+//! The implementation is selected in this order, using the first option that is available:
+//!   1. the compiler builtin `__builtin_sincosf` / `__builtin_sincos` / `__builtin_sincosl` for `float`, `double`
+//!      and `long double` respectively,
+//!   2. the global `::sincosf` / `::sincos` functions for `float` / `double` when targeting device code,
+//!   3. separate calls to `cuda::std::sin` and `cuda::std::cos` for everything else.
+//!
+//! @tparam _Tp A type accepted by `cuda::std::__is_fp_v`.
+//! @param __v The input value.
+//! @return A `sincos_result<_Tp>` whose `sin` and `cos` members hold the sine and cosine of `__v`.
+_CCCL_TEMPLATE(class _Tp)
+_CCCL_REQUIRES(::cuda::std::__is_fp_v<_Tp>)
+[[nodiscard]] _CCCL_API sincos_result<_Tp> sincos(_Tp __v) noexcept
+{
+  sincos_result<_Tp> __result{};
+
+  // 1. Compiler builtins; each branch is compiled only when the matching macro was defined.
+#if defined(_CCCL_BUILTIN_SINCOSF)
+  if constexpr (::cuda::std::is_same_v<_Tp, float>)
+  {
+    _CCCL_BUILTIN_SINCOSF(__v, &__result.sin, &__result.cos);
+    return __result;
+  }
+#endif // _CCCL_BUILTIN_SINCOSF
+#if defined(_CCCL_BUILTIN_SINCOS)
+  if constexpr (::cuda::std::is_same_v<_Tp, double>)
+  {
+    _CCCL_BUILTIN_SINCOS(__v, &__result.sin, &__result.cos);
+    return __result;
+  }
+#endif // _CCCL_BUILTIN_SINCOS
+#if _CCCL_HAS_LONG_DOUBLE() && defined(_CCCL_BUILTIN_SINCOSL)
+  if constexpr (::cuda::std::is_same_v<_Tp, long double>)
+  {
+    _CCCL_BUILTIN_SINCOSL(__v, &__result.sin, &__result.cos);
+    return __result;
+  }
+#endif // _CCCL_HAS_LONG_DOUBLE() && _CCCL_BUILTIN_SINCOSL
+
+  // 2. Device-only combined functions for float and double. The NV_IF_TARGET bodies exist only when targeting
+  //    device code; elsewhere these branches are empty and control falls through to the generic path.
+  _CCCL_IF_NOT_CONSTEVAL_DEFAULT
+  {
+    if constexpr (::cuda::std::is_same_v<_Tp, float>)
+    {
+      NV_IF_TARGET(NV_IS_DEVICE, (::sincosf(__v, &__result.sin, &__result.cos); return __result;))
+    }
+    else if constexpr (::cuda::std::is_same_v<_Tp, double>)
+    {
+      NV_IF_TARGET(NV_IS_DEVICE, (::sincos(__v, &__result.sin, &__result.cos); return __result;))
+    }
+  }
+
+  // 3. Generic path: evaluate sine and cosine independently.
+  __result.sin = ::cuda::std::sin(__v);
+  __result.cos = ::cuda::std::cos(__v);
+  return __result;
+}
+
+//! @brief Integral overload of `cuda::sincos`.
+//!
+//! The argument is converted to `double` and forwarded to the floating-point overload, so the result is always a
+//! `sincos_result<double>`.
+//!
+//! @param __v The integral input value.
+//! @return The sine and cosine of `static_cast<double>(__v)`.
+_CCCL_TEMPLATE(class _Tp)
+_CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>)
+[[nodiscard]] _CCCL_API sincos_result<double> sincos(_Tp __v) noexcept
+{
+  const double __as_double = static_cast<double>(__v);
+  return ::cuda::sincos(__as_double);
+}
+
+_CCCL_END_NAMESPACE_CUDA
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA___CMATH_SINCOS_H
diff --git a/libcudacxx/include/cuda/cmath b/libcudacxx/include/cuda/cmath
@@ -31,6 +31,7 @@
 #include <cuda/__cmath/pow2.h>
 #include <cuda/__cmath/round_down.h>
 #include <cuda/__cmath/round_up.h>
+#include <cuda/__cmath/sincos.h>
 #include <cuda/__cmath/uabs.h>
 #include <cuda/std/cmath>
 
diff --git a/libcudacxx/include/cuda/std/__complex/exponential_functions.h b/libcudacxx/include/cuda/std/__complex/exponential_functions.h
@@ -21,6 +21,7 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cuda/__cmath/sincos.h>
 #include <cuda/std/__cmath/copysign.h>
 #include <cuda/std/__cmath/isfinite.h>
 #include <cuda/std/__cmath/isinf.h>
@@ -175,12 +176,7 @@ _CCCL_API inline complex<float> exp(const complex<float>& __x)
     __exp_r_reduced = (__r < 0.0f) ? 0.0f : 1e3f;
   }
 
-  // Compile to sincos when possible:
-  float __sin_i;
-  float __cos_i;
-  NV_IF_ELSE_TARGET(NV_IS_DEVICE,
-                    (::sincosf(__i, &__sin_i, &__cos_i);),
-                    (__sin_i = ::cuda::std::sinf(__i); __cos_i = ::cuda::std::cosf(__i);))
+  const auto [__sin_i, __cos_i] = ::cuda::sincos(__i);
 
   // Our answer now is: (ldexp(__exp_r_reduced * __sin_r, __j_int), ldexp(__exp_r_reduced * __sin_r, __j_int))
   // However we don't need a full ldexp here, and if __exp_r_reduced*__sin_r is denormal we can lose bits.
@@ -265,12 +261,7 @@ _CCCL_API inline complex<double> exp<double>(const complex<double>& __x)
     __exp_r_reduced = (__r < 0.0) ? 0.0 : 1e10;
   }
 
-  // Compile to sincos when possible:
-  double __sin_i;
-  double __cos_i;
-  NV_IF_ELSE_TARGET(NV_IS_DEVICE,
-                    (::sincos(__i, &__sin_i, &__cos_i);),
-                    (__sin_i = ::cuda::std::sin(__i); __cos_i = ::cuda::std::cos(__i);))
+  const auto [__sin_i, __cos_i] = ::cuda::sincos(__i);
 
   // Our answer now is: (ldexp(__exp_mant * __sin_r, __j_int), ldexp(__exp_mant * __sin_r, __j_int))
   // However we don't need a full ldexp here, and if __exp_mant*__sin_r is denormal we can lose bits.
diff --git a/libcudacxx/test/libcudacxx/cuda/cmath/sincos.pass.cpp b/libcudacxx/test/libcudacxx/cuda/cmath/sincos.pass.cpp
@@ -0,0 +1,72 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// <cuda/cmath>
+
+#include <cuda/cmath>
+#include <cuda/std/cassert>
+#include <cuda/std/type_traits>
+#include <cuda/std/utility>
+
+// Verifies cuda::sincos for argument type T: the call is noexcept, it returns cuda::sincos_result<Expected>, and
+// for an input of zero it yields sin == 0 and cos == 1.
+template <class T>
+__host__ __device__ void test_type(float zero)
+{
+  // Integral arguments are expected to produce double results; every other type is expected to be preserved.
+  using Expected = cuda::std::conditional_t<cuda::std::is_integral_v<T>, double, T>;
+
+  // Interface checks.
+  static_assert(noexcept(cuda::sincos(cuda::std::declval<T>())));
+  static_assert(cuda::std::is_same_v<decltype(cuda::sincos(T{})), cuda::sincos_result<Expected>>);
+
+  // Value checks, using the zero passed in by the caller.
+  auto at_zero = cuda::sincos(static_cast<T>(zero));
+  static_assert(cuda::std::is_same_v<decltype(at_zero.sin), Expected>);
+  static_assert(cuda::std::is_same_v<decltype(at_zero.cos), Expected>);
+  assert(at_zero.sin == Expected{0});
+  assert(at_zero.cos == Expected{1});
+}
+
+// Runs test_type for every floating-point and integer argument type enabled in the current configuration.
+__host__ __device__ void test(float zero)
+{
+  // Floating-point types.
+  test_type<float>(zero);
+  test_type<double>(zero);
+#if _CCCL_HAS_LONG_DOUBLE()
+  test_type<long double>(zero);
+#endif // _CCCL_HAS_LONG_DOUBLE()
+#if _LIBCUDACXX_HAS_NVFP16()
+  test_type<__half>(zero);
+#endif // _LIBCUDACXX_HAS_NVFP16()
+#if _LIBCUDACXX_HAS_NVBF16()
+  test_type<__nv_bfloat16>(zero);
+#endif // _LIBCUDACXX_HAS_NVBF16()
+
+  // todo: add tests for f128 once supported
+
+  // Integer types, signed and unsigned side by side for each width.
+  test_type<signed char>(zero);
+  test_type<unsigned char>(zero);
+
+  test_type<signed short>(zero);
+  test_type<unsigned short>(zero);
+
+  test_type<signed int>(zero);
+  test_type<unsigned int>(zero);
+
+  test_type<signed long>(zero);
+  test_type<unsigned long>(zero);
+
+  test_type<signed long long>(zero);
+  test_type<unsigned long long>(zero);
+
+#if _CCCL_HAS_INT128()
+  test_type<__int128_t>(zero);
+  test_type<__uint128_t>(zero);
+#endif // _CCCL_HAS_INT128()
+}
+
+// Test entry point: runs the whole suite with an input of zero.
+int main(int, char**)
+{
+  // The zero is read from a volatile object — presumably so the sincos calls are evaluated at run time instead of
+  // being folded by the compiler.
+  volatile float runtime_zero = 0.0f;
+  test(runtime_zero);
+  return 0;
+}