Skip to content

Commit f08d23e

Browse files
authored
Zero imag portion of diagonal in syrk/herk (#1648) (#1651)
* Zero imag portion of diagonal in syrk/herk (#1648)
* Bump version to 4.4.1 and update changelog
1 parent 7f88fac commit f08d23e

File tree

7 files changed

+50
-32
lines changed

7 files changed

+50
-32
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,12 @@
33
rocBLAS documentation is available at
44
[https://rocm.docs.amd.com/projects/rocBLAS/en/latest/index.html](https://rocm.docs.amd.com/projects/rocBLAS/en/latest/index.html).
55

6+
## rocBLAS 4.4.1 for ROCm 6.4.2
7+
8+
### Resolved issues
9+
10+
* Fixed cherk/zherk on gfx90a/gfx942 to zero the imaginary portion of the diagonal of the C matrix for problem sizes `k > 500`
11+
612
## rocBLAS 4.4.0 for ROCm 6.4
713

814
### Added

CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# ########################################################################
2-
# Copyright (C) 2016-2024 Advanced Micro Devices, Inc. All rights reserved.
2+
# Copyright (C) 2016-2025 Advanced Micro Devices, Inc. All rights reserved.
33
#
44
# Permission is hereby granted, free of charge, to any person obtaining a copy
55
# of this software and associated documentation files (the "Software"), to deal
@@ -100,7 +100,7 @@ get_os_id(OS_ID)
100100
message (STATUS "OS detected is ${OS_ID}")
101101

102102
# Versioning via rocm-cmake
103-
set ( VERSION_STRING "4.4.0" )
103+
set ( VERSION_STRING "4.4.1" )
104104
rocm_setup_version( VERSION ${VERSION_STRING} )
105105

106106
# Users may override HIP path by specifying their own in CMAKE_MODULE_PATH

clients/include/blas3/testing_herk.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,7 @@ void testing_herk(const Arguments& arg)
212212
rocblas_init_matrix(
213213
hA, arg, rocblas_client_alpha_sets_nan, rocblas_client_general_matrix, true, true);
214214
rocblas_init_matrix(
215-
hC, arg, rocblas_client_beta_sets_nan, rocblas_client_hermitian_matrix, false, true);
215+
hC, arg, rocblas_client_beta_sets_nan, rocblas_client_general_matrix, false, true);
216216

217217
hC_gold = hC;
218218

clients/include/blas3/testing_herk_batched.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,7 @@ void testing_herk_batched(const Arguments& arg)
259259
rocblas_init_matrix(
260260
hA, arg, rocblas_client_alpha_sets_nan, rocblas_client_general_matrix, true, true);
261261
rocblas_init_matrix(
262-
hC, arg, rocblas_client_beta_sets_nan, rocblas_client_hermitian_matrix, false, true);
262+
hC, arg, rocblas_client_beta_sets_nan, rocblas_client_general_matrix, false, true);
263263

264264
hC_gold.copy_from(hC);
265265

clients/include/blas3/testing_herk_strided_batched.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -412,7 +412,7 @@ void testing_herk_strided_batched(const Arguments& arg)
412412
rocblas_init_matrix(
413413
hA, arg, rocblas_client_alpha_sets_nan, rocblas_client_general_matrix, true, true);
414414
rocblas_init_matrix(
415-
hC, arg, rocblas_client_beta_sets_nan, rocblas_client_hermitian_matrix, false, true);
415+
hC, arg, rocblas_client_beta_sets_nan, rocblas_client_general_matrix, false, true);
416416

417417
hC_gold.copy_from(hC);
418418

library/src/blas3/herk_syrk_device.hpp

Lines changed: 34 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/* ************************************************************************
2-
* Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved.
2+
* Copyright (C) 2020-2025 Advanced Micro Devices, Inc. All rights reserved.
33
*
44
* Permission is hereby granted, free of charge, to any person obtaining a copy
55
* of this software and associated documentation files (the "Software"), to deal
@@ -1327,14 +1327,20 @@ rocblas_status rocblas_syr2k_her2k_dispatch(rocblas_handle handle,
13271327
return rocblas_status_success;
13281328
}
13291329

1330-
template <bool copy_from_C_to_W_C, bool is_upper, typename T, typename TPtr, int DIM_X, int DIM_Y>
1330+
template <bool copy_from_C_to_W_C,
1331+
bool is_upper,
1332+
bool HERM,
1333+
typename T,
1334+
typename TPtr,
1335+
int DIM_X,
1336+
int DIM_Y>
13311337
ROCBLAS_KERNEL(DIM_X* DIM_Y)
1332-
rocblas_copy_triangular_excluding_diagonal_kernel(rocblas_int n,
1333-
TPtr d_C,
1334-
rocblas_int ldc,
1335-
rocblas_stride stride_C,
1336-
T* W_C,
1337-
rocblas_int batch_count)
1338+
rocblas_copy_triangular_syrk_herk_kernel(rocblas_int n,
1339+
TPtr d_C,
1340+
rocblas_int ldc,
1341+
rocblas_stride stride_C,
1342+
T* W_C,
1343+
rocblas_int batch_count)
13381344
{
13391345
uint32_t batch = blockIdx.z;
13401346

@@ -1379,19 +1385,24 @@ rocblas_copy_triangular_excluding_diagonal_kernel(rocblas_int n,
13791385
}
13801386
}
13811387

1388+
// When copying back to C, we need to zero-out diagonal imaginary
1389+
if constexpr(HERM && !copy_from_C_to_W_C)
1390+
if(row == col && row < n)
1391+
C[row + row * int64_t(ldc)] = std::real(C[row + row * int64_t(ldc)]);
1392+
13821393
#if DEVICE_GRID_YZ_16BIT
13831394
}
13841395
#endif
13851396
}
13861397

1387-
template <bool copy_from_C_to_W_C, bool is_upper, typename T, typename TPtr>
1388-
rocblas_status rocblas_copy_triangular_excluding_diagonal(rocblas_handle handle,
1389-
rocblas_int n,
1390-
TPtr C,
1391-
rocblas_int ldc,
1392-
rocblas_stride stride_C,
1393-
T* W_C,
1394-
rocblas_int batch_count)
1398+
template <bool copy_from_C_to_W_C, bool is_upper, bool HERM, typename T, typename TPtr>
1399+
rocblas_status rocblas_copy_triangular_syrk_herk(rocblas_handle handle,
1400+
rocblas_int n,
1401+
TPtr C,
1402+
rocblas_int ldc,
1403+
rocblas_stride stride_C,
1404+
T* W_C,
1405+
rocblas_int batch_count)
13951406
{
13961407
hipStream_t rocblas_stream = handle->get_stream();
13971408

@@ -1405,12 +1416,13 @@ rocblas_status rocblas_copy_triangular_excluding_diagonal(rocblas_handle handle,
14051416
dim3 gridDim((n - 1) / blockDim.x + 1, (n - 1) / blockDim.y + 1, batches);
14061417

14071418
// Launch kernel
1408-
ROCBLAS_LAUNCH_KERNEL((rocblas_copy_triangular_excluding_diagonal_kernel<copy_from_C_to_W_C,
1409-
is_upper,
1410-
T,
1411-
TPtr,
1412-
DIM_X,
1413-
DIM_Y>),
1419+
ROCBLAS_LAUNCH_KERNEL((rocblas_copy_triangular_syrk_herk_kernel<copy_from_C_to_W_C,
1420+
is_upper,
1421+
HERM,
1422+
T,
1423+
TPtr,
1424+
DIM_X,
1425+
DIM_Y>),
14141426
gridDim,
14151427
blockDim,
14161428
0,

library/src/blas3/rocblas_syrk_herk_kernels.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/* ************************************************************************
2-
* Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved.
2+
* Copyright (C) 2020-2025 Advanced Micro Devices, Inc. All rights reserved.
33
*
44
* Permission is hereby granted, free of charge, to any person obtaining a copy
55
* of this software and associated documentation files (the "Software"), to deal
@@ -104,10 +104,10 @@ rocblas_status rocblas_internal_syrk_herk_template(rocblas_handle handle,
104104

105105
// Launch kernel to copy the data from triangular matrix to the workspace memory
106106
if(rocblas_fill_upper == uplo)
107-
RETURN_IF_ROCBLAS_ERROR((rocblas_copy_triangular_excluding_diagonal<true, true>(
107+
RETURN_IF_ROCBLAS_ERROR((rocblas_copy_triangular_syrk_herk<true, true, HERM>(
108108
handle, n, C, ldc, stride_C, (T*)w_mem, batch_count)));
109109
else
110-
RETURN_IF_ROCBLAS_ERROR((rocblas_copy_triangular_excluding_diagonal<true, false>(
110+
RETURN_IF_ROCBLAS_ERROR((rocblas_copy_triangular_syrk_herk<true, false, HERM>(
111111
handle, n, C, ldc, stride_C, (T*)w_mem, batch_count)));
112112

113113
RETURN_IF_ROCBLAS_ERROR((rocblas_internal_gemm_64<BATCHED>(handle,
@@ -134,10 +134,10 @@ rocblas_status rocblas_internal_syrk_herk_template(rocblas_handle handle,
134134

135135
// Launch kernel to copy the data from workspace memory back to triangular matrix
136136
if(rocblas_fill_upper == uplo)
137-
RETURN_IF_ROCBLAS_ERROR((rocblas_copy_triangular_excluding_diagonal<false, true>(
137+
RETURN_IF_ROCBLAS_ERROR((rocblas_copy_triangular_syrk_herk<false, true, HERM>(
138138
handle, n, C, ldc, stride_C, (T*)w_mem, batch_count)));
139139
else
140-
RETURN_IF_ROCBLAS_ERROR((rocblas_copy_triangular_excluding_diagonal<false, false>(
140+
RETURN_IF_ROCBLAS_ERROR((rocblas_copy_triangular_syrk_herk<false, false, HERM>(
141141
handle, n, C, ldc, stride_C, (T*)w_mem, batch_count)));
142142

143143
return rocblas_status_success;

0 commit comments

Comments
 (0)