Conversation
| ASSERT_EQ(err, hipSuccess) << hipGetErrorString(err); | ||
|
|
||
| const float amax = 1.0f; | ||
| input.set_tensor_amax(amax); |
There was a problem hiding this comment.
set_scale() instead?
There was a problem hiding this comment.
Yeah, I think for dequantization, the scale is needed
There was a problem hiding this comment.
This leads to memory fault run-time error, whereas my current method (set_tensor_amax) works fine. Leaving as is for now.
There was a problem hiding this comment.
Please double-check. Quantization does not need amax, dequant should not have it either.
ipanfilo
left a comment
There was a problem hiding this comment.
It is based on PR#472. To avoid reviewing the same changes twice, let's wait for that PR to merge first.
| ASSERT_EQ(err, hipSuccess) << hipGetErrorString(err); | ||
|
|
||
| const float amax = 1.0f; | ||
| input.set_tensor_amax(amax); |
There was a problem hiding this comment.
Yeah, I think for dequantization, the scale is needed
2682291 to
1d0a70e
Compare
| #ifdef __HIP_PLATFORM_AMD__ | ||
| static __device__ constexpr uint64_t WARP_REDUCE_AMAX_GROUP_MASKS[8] = { | ||
| 0x0101010101010101ULL, 0x0202020202020202ULL, | ||
| 0x0404040404040404ULL, 0x0808080808080808ULL, | ||
| 0x1010101010101010ULL, 0x2020202020202020ULL, | ||
| 0x4040404040404040ULL, 0x8080808080808080ULL}; | ||
| #else | ||
| static __device__ constexpr unsigned int WARP_REDUCE_AMAX_GROUP_MASKS[8] = { | ||
| 0x01010101, 0x02020202, 0x04040404, 0x08080808, 0x10101010, 0x20202020, 0x40404040, 0x80808080}; | ||
| #endif | ||
|
|
||
| // max for every group_size elements in warp | ||
| template <int group_size, int shfl_down_stride> | ||
| __device__ __forceinline__ float groupMax(float val, unsigned int groupMask) { | ||
| __device__ __forceinline__ float groupMax(float val, | ||
| #ifdef __HIP_PLATFORM_AMD__ | ||
| uint64_t groupMask) { | ||
| #else | ||
| unsigned int groupMask) { | ||
| #endif |
There was a problem hiding this comment.
I think the changes in this file are due to a merge error and should not be necessary.
| size_t divide_round_up(size_t x, size_t y) { | ||
| return (x + y - 1) / y; | ||
| } |
There was a problem hiding this comment.
Isn't this part of test_common.h?
| const uint8_t bits = static_cast<uint8_t>(dis(gen)); | ||
|
|
||
| fp8e4m3 candidate; | ||
| std::memcpy(&candidate, &bits, sizeof(bits)); | ||
|
|
||
| const float decoded = static_cast<float>(candidate); | ||
| if (std::isfinite(decoded)) { | ||
| scale_buffer[idx] = candidate; | ||
| break; | ||
| } |
There was a problem hiding this comment.
This section of code that generates a valid fp8e4m3 value is reused in the 2D-scale path as well; let's consolidate it into a shared helper to avoid maintaining duplicated copies.
| for (size_t block = 0; block < mathematical_blocks_per_row; ++block) { | ||
| const size_t idx = row * physical_row_stride + block; | ||
|
|
||
| while (true) { |
There was a problem hiding this comment.
By the way, is there a way to generate an fp8e4m3 value without the retry loop? I understand that with repeated attempts we will eventually hit a finite fp8e4m3, but it is somewhat wasteful of random draws and execution time. The fp8e4m3 format is well documented — perhaps we can determine up front which bit patterns encode finite values and sample only from those?
| std::memcpy(&candidate, &bits, sizeof(bits)); | ||
|
|
||
| const float decoded = static_cast<float>(candidate); | ||
| if (std::isfinite(decoded)) { |
There was a problem hiding this comment.
Scales also need to be non-negative, right?
| generate_1d_scales(host_scales_rowwise_1d.get(), | ||
| unpadded_blocks_Y, | ||
| unpadded_blocks_X, | ||
| scales_stride, | ||
| gen, | ||
| fp8_dis); |
| generate_1d_scales(host_scales_colwise_1d.get(), | ||
| unpadded_blocks_Y_t, | ||
| unpadded_blocks_X_t, | ||
| scales_stride_t, | ||
| gen, | ||
| fp8_dis); |
| const size_t mathematical_rows, | ||
| const size_t mathematical_blocks_per_row, | ||
| const size_t physical_row_stride, |
There was a problem hiding this comment.
You can reuse the existing names (unpadded_blocks_Y, unpadded_blocks_X, and scales_stride)
| } | ||
|
|
||
| // Decode a single FP4 (E2M1) value from packed storage. | ||
| float get_fp4_value(const fp4e2m1* data, const size_t mathematical_idx) { |
There was a problem hiding this comment.
Only the scale tensor has the padding/alignment distinction, i.e. mathematical (unpadded) index vs. padded index. I recall that for rowwise/columnwise data we don't have this padding issue — if so, we can rename mathematical_idx to idx here.
| float *amax_gpu = nullptr; | ||
| NVTE_CHECK_CUDA(cudaMalloc(&amax_gpu, sizeof(float))); | ||
| NVTE_CHECK_CUDA(cudaMemcpy(amax_gpu, amax_cpu_data_.get(), | ||
| sizeof(float), cudaMemcpyHostToDevice)); | ||
|
|
||
| tensor_.set_amax(amax_gpu, DType::kFloat32, tensor_.defaultShape); |
There was a problem hiding this comment.
use from_cpu()
TransformerEngine/tests/cpp/test_common.cu
Line 481 in 1d0a70e
| tensor_.set_amax(nullptr, DType::kFloat32, tensor_.defaultShape); | ||
| } | ||
|
|
||
| void set_tensor_amax(float amax) { |
There was a problem hiding this comment.
Please guard our ROCm-specific changes with a macro (e.g. __HIP_PLATFORM_AMD__).
| constexpr size_t scale_tensor_alignment_X_colwise = 128; | ||
| #endif | ||
|
|
||
| static constexpr float E2M1_LUT[16] = { |
Description
Fixes https://github.com/ROCm/frameworks-internal/issues/15998
Enable NVFP4 dequantization on AMD GPU (gfx950) and add unit test.
Type of change
Changes
Please list the changes introduced in this PR:
Checklist: