Skip to content

Update PAPI Instrumentation for device BLAS #118

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ else
(( err += $? ))

# CUDA, HIP, or SYCL. These fail gracefully when GPUs are absent.
./run_tests.py ${args} --blas1-device --blas3-device
./run_tests.py ${args} --blas1-device --blas2-device --blas3-device
(( err += $? ))

./run_tests.py ${args} --batch-blas3-device
Expand Down
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,7 @@ if (CUDAToolkit_FOUND)
blaspp
PRIVATE
src/cuda/device_shift_vec.cu
src/cuda/device_conj.cu
)
# Some platforms need these to be public libraries.
target_link_libraries(
Expand All @@ -400,6 +401,7 @@ elseif (rocblas_FOUND)
blaspp
PRIVATE
src/hip/device_shift_vec.hip
src/hip/device_conj.hip
)
# Some platforms need these to be public libraries.
target_link_libraries(
Expand Down
8 changes: 5 additions & 3 deletions config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -715,16 +715,18 @@ def sycl_onemkl_library():
Does not actually run the resulting exe, to allow compiling on a
machine without GPUs.
'''
libs = '-lmkl_sycl -lsycl -lOpenCL'
ldflags = '-fsycl'
libs = '-lmkl_sycl -lsycl -lOpenCL'
print_subhead( 'SYCL and oneMKL libraries' )
print_test( ' ' + libs )
print_test( ' ' + ldflags + ' ' + libs )

# Intel compiler vars.sh defines $CMPLR_ROOT
root = environ['CMPLR_ROOT'] or environ['CMPROOT']
inc = ''
if (root):
inc = '-I' + root + '/linux/include ' # space at end for concat
env = {'LIBS': libs,
env = {'LDFLAGS': ldflags,
'LIBS': libs,
'CXXFLAGS': inc + define('HAVE_SYCL')
+ ' -fsycl -Wno-deprecated-declarations'}
(rc, out, err) = compile_exe( 'config/onemkl.cc', env )
Expand Down
340 changes: 273 additions & 67 deletions include/blas/counter.hh

Large diffs are not rendered by default.

39 changes: 35 additions & 4 deletions include/blas/device.hh
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#endif

#include <hip/hip_runtime.h>
#include <hip/hip_complex.h>

// Headers moved in ROCm 5.2
#if HIP_VERSION >= 50200000
Expand Down Expand Up @@ -699,13 +700,43 @@ void Queue::work_ensure_size( size_t lwork )
}
}

//------------------------------------------------------------------------------
/// Add a constant c to an n-element vector v.
///
template <typename scalar_t>
void shift_vec( int64_t n, scalar_t* v, scalar_t c, blas::Queue& queue );

//------------------------------------------------------------------------------
/// Conjugates each element of an n-element vector:
/// dst[i] = conj( src[i] ). Implemented per-backend in
/// src/cuda/device_conj.cu and src/hip/device_conj.hip; the SYCL
/// implementation is inline below.
template <typename TS, typename TD>
void conj(
    int64_t n,
    TS const* src, int64_t inc_src,
    TD* dst, int64_t inc_dst,
    blas::Queue& queue );

#if defined(BLAS_HAVE_SYCL)

//------------------------------------------------------------------------------
/// SYCL implementation of conj: dst[i] = conj( src[i] ) for i = 0, ..., n-1.
/// The kernel is submitted asynchronously on queue.stream().
/// NOTE(review): assumes TS/TD are std::complex types usable in SYCL device
/// code — confirm against supported instantiations.
template <typename TS, typename TD>
void conj(
    int64_t n,
    TS const* src, int64_t inc_src,
    TD* dst, int64_t inc_dst,
    blas::Queue& queue )
{
    // Bring std::conj into scope so the unqualified call in the kernel
    // resolves for std::complex element types.
    using std::conj;

    // Quick return for empty vectors.
    if (n <= 0) {
        return;
    }

    // For non-positive increments, offset to the far end of the vector so a
    // forward kernel index walks it backward (BLAS negative-stride convention).
    int64_t i_src = (inc_src > 0 ? 0 : (1 - n) * inc_src);
    int64_t i_dst = (inc_dst > 0 ? 0 : (1 - n) * inc_dst);

    // One work-item per element; the command group captures the handler by
    // reference (setup is synchronous), the kernel captures data by value.
    queue.stream().submit( [&]( sycl::handler& h ) {
        h.parallel_for( sycl::range<1>(n), [=]( sycl::id<1> i ) {
            dst[ i*inc_dst + i_dst ] = conj( src[ i*inc_src + i_src ] );
        } );
    } );
}

#endif // BLAS_HAVE_SYCL

} // namespace blas

#endif // #ifndef BLAS_DEVICE_HH
10 changes: 10 additions & 0 deletions include/blas/util.hh
Original file line number Diff line number Diff line change
Expand Up @@ -640,6 +640,16 @@ inline void abort_if( bool cond, const char* func, const char* format, ... )

#endif

//------------------------------------------------------------------------------
/// Integer division rounding up instead of down.
/// Implemented as quotient plus a remainder test, rather than
/// (x + y - 1)/y, which overflows when x is near the maximum of its type
/// and rounds incorrectly when the operands have mixed signs.
/// @return ceil( x / y ), for integer types T1, T2. y must be nonzero.
template <typename T1, typename T2>
inline constexpr std::common_type_t<T1, T2> ceildiv( T1 x, T2 y )
{
    using T = std::common_type_t<T1, T2>;
    T xx = T( x );
    T yy = T( y );
    // C++ integer division truncates toward zero; bump the quotient by one
    // only when there is a remainder and the exact quotient is positive
    // (operands have the same sign). For unsigned T the sign tests fold to
    // a constant true.
    return T( xx / yy + ( xx % yy != 0 && (xx < 0) == (yy < 0) ) );
}

} // namespace blas

#endif // #ifndef BLAS_UTIL_HH
104 changes: 104 additions & 0 deletions src/cuda/device_conj.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
#include "blas/device.hh"

#include "thrust/complex.h"

#include <algorithm>

#if defined(BLAS_HAVE_CUBLAS)

namespace blas {

//------------------------------------------------------------------------------
/// Device-side conjugation of a std::complex<float>: negates the imaginary
/// part through a cuComplex view of the same storage.
/// NOTE(review): relies on std::complex<float> and cuComplex sharing layout
/// (two contiguous floats); avoids std::conj, which is not __device__-callable.
__device__ std::complex<float> conj_convert(
    std::complex<float> z)
{
    ((cuComplex*) &z)->y *= -1;
    return z;
}

//------------------------------------------------------------------------------
/// Device-side conjugation of a std::complex<double>, via cuDoubleComplex.
__device__ std::complex<double> conj_convert(
    std::complex<double> z)
{
    ((cuDoubleComplex*) &z)->y *= -1;
    return z;
}

// Each thread conjugates 1 item:
// dst[ i*inc_dst + i_dst ] = conj( src[ i*inc_src + i_src ] ),
// where i_src/i_dst are precomputed offsets for negative strides.
template <typename TS, typename TD>
__global__ void conj_kernel(
    int64_t n,
    TS const* src, int64_t inc_src, int64_t i_src,
    TD* dst, int64_t inc_dst, int64_t i_dst)
{
    // NOTE(review): this using-declaration is unused — the kernel calls
    // conj_convert below, not thrust::conj.
    using thrust::conj;

    // Global thread index; threads past the end do nothing.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[ i*inc_dst + i_dst ] = conj_convert( src[ i*inc_src + i_src ] );
}

//------------------------------------------------------------------------------
/// Conjugates each element of the vector src and stores in dst.
///
/// @param[in] n
///     Number of elements in the vector. n >= 0. Quick return if n <= 0.
///
/// @param[in] src
///     Pointer to the input vector of length n.
///
/// @param[in] inc_src
///     Stride between elements of src. inc_src != 0; a negative stride
///     traverses the vector backward (BLAS convention).
///
/// @param[out] dst
///     Pointer to output vector
///     On exit, each element dst[i] is updated as dst[i] = conj( src[i] ).
///     dst may be the same as src.
///
/// @param[in] inc_dst
///     Stride between elements of dst. inc_dst != 0; may be negative.
///
/// @param[in] queue
///     BLAS++ queue to execute in.
///
template <typename TS, typename TD>
void conj(
    int64_t n,
    TS const* src, int64_t inc_src,
    TD* dst, int64_t inc_dst,
    blas::Queue& queue )
{
    // Quick return for empty vectors.
    if (n <= 0) {
        return;
    }

    const int64_t BlockSize = 128;

    // One thread per element, capped at BlockSize threads per block.
    // std::min (qualified) for portable host code, matching the HIP
    // implementation; unqualified min is not guaranteed by standard C++.
    int64_t n_threads = std::min( BlockSize, n );
    int64_t n_blocks = ceildiv( n, n_threads );

    // For non-positive increments, offset to the far end so the kernel's
    // forward index walks the vector backward.
    int64_t i_src = (inc_src > 0 ? 0 : (1 - n) * inc_src);
    int64_t i_dst = (inc_dst > 0 ? 0 : (1 - n) * inc_dst);

    // Ensure the launch happens on the queue's device.
    blas_dev_call(
        cudaSetDevice( queue.device() ) );

    conj_kernel<<<n_blocks, n_threads, 0, queue.stream()>>>(
        n, src, inc_src, i_src, dst, inc_dst, i_dst );

    // Catch launch-configuration errors; the kernel itself runs
    // asynchronously on queue.stream().
    blas_dev_call(
        cudaGetLastError() );
}

//------------------------------------------------------------------------------
// Explicit instantiations.
// Only complex element types are instantiated; conjugation of a real
// vector is the identity.
template void conj(
    int64_t n,
    std::complex<float> const* src, int64_t inc_src,
    std::complex<float>* dst, int64_t inc_dst,
    blas::Queue& queue);

template void conj(
    int64_t n,
    std::complex<double> const* src, int64_t inc_src,
    std::complex<double>* dst, int64_t inc_dst,
    blas::Queue& queue);

} // namespace blas

#endif // BLAS_HAVE_CUBLAS
1 change: 1 addition & 0 deletions src/device_asum.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.

#include "blas/device_blas.hh"
#include "blas/counter.hh"

#include "device_internal.hh"

Expand Down
1 change: 1 addition & 0 deletions src/device_axpy.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.

#include "blas/device_blas.hh"
#include "blas/counter.hh"

#include "device_internal.hh"

Expand Down
2 changes: 1 addition & 1 deletion src/device_batch_gemm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ void gemm(
element = { transA_, transB_, m_, n_, k_, batch_size };
counter::insert( element, counter::Id::dev_batch_gemm );

double gflops = 1e9 * blas::Gflop< scalar_t >::gemm( m, n, k );
double gflops = 1e9 * blas::Gflop< scalar_t >::gemm( m_, n_, k_ );
counter::inc_flop_count( (long long int)gflops );
#endif

Expand Down
2 changes: 1 addition & 1 deletion src/device_batch_hemm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ void hemm(
element = { batch_size };
counter::insert( element, counter::Id::dev_batch_hemm );

double gflops = 1e9 * blas::Gflop< scalar_t >::hemm( side, m, n );
double gflops = 1e9 * blas::Gflop< scalar_t >::hemm( side[0], m[0], n[0] );
counter::inc_flop_count( (long long int)gflops );
#endif

Expand Down
1 change: 1 addition & 0 deletions src/device_iamax.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.

#include "blas/device_blas.hh"
#include "blas/counter.hh"

#include "device_internal.hh"

Expand Down
78 changes: 78 additions & 0 deletions src/hip/device_conj.hip
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#include "blas/device.hh"
#include <hip/hip_complex.h>

#if defined(BLAS_HAVE_ROCBLAS)

namespace blas {

//------------------------------------------------------------------------------
/// Device-side conjugation of a std::complex<float> via hipConjf.
/// NOTE(review): relies on std::complex<float> and hipFloatComplex sharing
/// layout (two contiguous floats) for the pointer casts.
__device__ std::complex<float> conj_convert(
    std::complex<float> z)
{
    hipFloatComplex res = hipConjf(*(hipFloatComplex*) &z);
    return *(std::complex<float>*) &res;
}

//------------------------------------------------------------------------------
/// Device-side conjugation of a std::complex<double> via hipConj.
__device__ std::complex<double> conj_convert(
    std::complex<double> z)
{
    hipDoubleComplex res = hipConj(*(hipDoubleComplex*) &z);
    return *(std::complex<double>*) &res;
}

//------------------------------------------------------------------------------
// Each thread conjugates 1 item:
// dst[ i*inc_dst + i_dst ] = conj( src[ i*inc_src + i_src ] ),
// where i_src/i_dst are precomputed offsets for negative strides.
template <typename TS, typename TD>
__global__ void conj_kernel(
    int64_t n,
    TS const* src, int64_t inc_src, int64_t i_src,
    TD* dst, int64_t inc_dst, int64_t i_dst)
{
    // Global thread index; threads past the end do nothing.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[ i*inc_dst + i_dst ] = conj_convert( src[ i*inc_src + i_src ] );
}

//------------------------------------------------------------------------------
/// Conjugates each element of the vector src and stores in dst:
/// dst[i] = conj( src[i] ), i = 0, ..., n-1.
/// HIP counterpart of the CUDA implementation in src/cuda/device_conj.cu.
///
/// @param[in] n
///     Number of elements in the vector. Quick return if n <= 0.
///
/// @param[in] src
///     Pointer to the input vector of length n.
///
/// @param[in] inc_src
///     Stride between elements of src; a negative stride traverses the
///     vector backward (BLAS convention).
///
/// @param[out] dst
///     Pointer to the output vector; may be the same as src.
///
/// @param[in] inc_dst
///     Stride between elements of dst; may be negative.
///
/// @param[in] queue
///     BLAS++ queue to execute in.
///
template <typename TS, typename TD>
void conj(
    int64_t n,
    TS const* src, int64_t inc_src,
    TD* dst, int64_t inc_dst,
    blas::Queue& queue )
{
    // Quick return for empty vectors.
    if (n <= 0) {
        return;
    }

    const int BlockSize = 128;

    // One thread per element, capped at BlockSize threads per block.
    int64_t n_threads = std::min( int64_t( BlockSize ), n );
    int64_t n_blocks = ceildiv(n, n_threads);

    // For non-positive increments, offset to the far end so the kernel's
    // forward index walks the vector backward.
    int64_t i_src = (inc_src > 0 ? 0 : (1 - n) * inc_src);
    int64_t i_dst = (inc_dst > 0 ? 0 : (1 - n) * inc_dst);

    // Ensure the launch happens on the queue's device.
    blas_dev_call(
        hipSetDevice( queue.device() ) );

    conj_kernel<<<n_blocks, n_threads, 0, queue.stream()>>>(
        n, src, inc_src, i_src, dst, inc_dst, i_dst );

    // Catch launch-configuration errors; the kernel itself runs
    // asynchronously on queue.stream().
    blas_dev_call(
        hipGetLastError() );
}

//------------------------------------------------------------------------------
// Explicit instantiations.
// Only complex element types are instantiated; conjugation of a real
// vector is the identity.
template void conj(
    int64_t n,
    std::complex<float> const* src, int64_t inc_src,
    std::complex<float>* dst, int64_t inc_dst,
    blas::Queue& queue);

template void conj(
    int64_t n,
    std::complex<double> const* src, int64_t inc_src,
    std::complex<double>* dst, int64_t inc_dst,
    blas::Queue& queue);

} // namespace blas

#endif // BLAS_HAVE_ROCBLAS
4 changes: 4 additions & 0 deletions test/run_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
group_cat.add_argument( '--host', action='store_true', help='run all CPU host routines' ),

group_cat.add_argument( '--blas1-device', action='store_true', help='run Level 1 BLAS on devices (GPUs)' ),
group_cat.add_argument( '--blas2-device', action='store_true', help='run Level 2 BLAS on devices (GPUs)' ),
group_cat.add_argument( '--blas3-device', action='store_true', help='run Level 3 BLAS on devices (GPUs)' ),
group_cat.add_argument( '--batch-blas3-device', action='store_true', help='run Level 3 Batch BLAS on devices (GPUs)' ),

Expand Down Expand Up @@ -317,6 +318,9 @@ def filter_csv( values, csv ):
[ 'trmv', dtype + layout + align + uplo + trans + diag + n + incx ],
[ 'trsv', dtype + layout + align + uplo + trans + diag + n + incx ],
]

if (opts.blas2_device):
cmds += []

# Level 3
if (opts.blas3):
Expand Down