Commit 4f779ef

cpu: rv64: add rvv batch normalization integration
1 parent e45c87c

4 files changed: +345 −0 lines changed

src/cpu/cpu_batch_normalization_list.cpp

Lines changed: 6 additions & 0 deletions
@@ -34,6 +34,11 @@ using namespace dnnl::impl::cpu::x64;
 #include "cpu/aarch64/acl_batch_normalization.hpp"
 #endif
 using namespace dnnl::impl::cpu::aarch64;
+#elif DNNL_RV64
+#if defined(DNNL_RISCV_USE_RVV_INTRINSICS)
+#include "cpu/rv64/rvv_batch_normalization.hpp"
+using namespace dnnl::impl::cpu::rv64;
+#endif
 #endif

 namespace dnnl {

@@ -59,6 +64,7 @@ const std::map<pk_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map() {
         CPU_INSTANCE_AARCH64(jit_uni_batch_normalization_fwd_t<sve_256>)
         CPU_INSTANCE_AARCH64(jit_uni_batch_normalization_fwd_t<asimd>)
         CPU_INSTANCE_AARCH64_ACL(acl_batch_normalization_fwd_t)
+        CPU_INSTANCE_RV64GCV(rvv_batch_normalization_fwd_t)
         CPU_INSTANCE(ncsp_batch_normalization_fwd_t<f32>)
         CPU_INSTANCE(ncsp_batch_normalization_fwd_t<bf16>)
         CPU_INSTANCE(ncsp_batch_normalization_fwd_t<f16>)
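
For context, CPU_INSTANCE_RV64GCV follows the same guarded-macro pattern as the other CPU_INSTANCE_* entries: on builds without RVV support the entry expands to nothing, so dispatch falls through to the generic ncsp/nspc implementations listed after it. A rough sketch of the pattern (the actual macro is defined elsewhere in oneDNN and may differ in detail):

// Sketch only: the real definition is not part of this diff. On non-RVV
// builds the macro expands to nothing, removing the entry from the list.
#if defined(DNNL_RV64) && defined(DNNL_RISCV_USE_RVV_INTRINSICS)
#define CPU_INSTANCE_RV64GCV(...) \
    impl_list_item_t(impl_list_item_t::type_deduction_helper_t<__VA_ARGS__::pd_t>()),
#else
#define CPU_INSTANCE_RV64GCV(...)
#endif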
src/cpu/rv64/rvv_batch_normalization.cpp

Lines changed: 195 additions & 0 deletions (new file)

/******************************************************************************
 * Copyright 2025
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/

#include <assert.h>
#include <math.h>
#include <vector>
#include <riscv_vector.h>

#include "common/c_types_map.hpp"
#include "common/dnnl_thread.hpp"
#include "common/type_helpers.hpp"

#include "cpu/rv64/rvv_batch_normalization.hpp"

namespace dnnl {
namespace impl {
namespace cpu {
namespace rv64 {

namespace {

// If per_elem_params is false, uses broadcast scalars mean/sm/sv
// (mean[0], sm[0], sv[0]). If true, loads per-element mean/sm/sv from the
// provided arrays.
static inline void bn_fwd_kernel_f32(const void *s_base, void *d_base,
        size_t len, const float *mean, const float *sm, const float *sv,
        bool per_elem_params, const rv64::rvv_postops_t &po) {
    const size_t data_size = types::data_type_size(data_type::f32);
    for (size_t i = 0; i < len;) {
        size_t vl = __riscv_vsetvl_e32m1(len - i);

        const float *s_ptr = reinterpret_cast<const float *>(
                reinterpret_cast<const char *>(s_base) + i * data_size);
        float *d_ptr = reinterpret_cast<float *>(
                reinterpret_cast<char *>(d_base) + i * data_size);

        vfloat32m1_t vx = __riscv_vle32_v_f32m1(s_ptr, vl);

        vfloat32m1_t vmean_v;
        vfloat32m1_t vsm_v;
        vfloat32m1_t vsv_v;
        if (per_elem_params) {
            vmean_v = __riscv_vle32_v_f32m1(mean + i, vl);
            vsm_v = __riscv_vle32_v_f32m1(sm + i, vl);
            vsv_v = __riscv_vle32_v_f32m1(sv + i, vl);
        } else {
            vmean_v = __riscv_vfmv_v_f_f32m1(mean[0], vl);
            vsm_v = __riscv_vfmv_v_f_f32m1(sm[0], vl);
            vsv_v = __riscv_vfmv_v_f_f32m1(sv[0], vl);
        }

        vfloat32m1_t vtmp = __riscv_vfsub_vv_f32m1(vx, vmean_v, vl);
        vfloat32m1_t vout = __riscv_vfmul_vv_f32m1(vtmp, vsm_v, vl);
        vout = __riscv_vfadd_vv_f32m1(vout, vsv_v, vl);
        vout = po.apply(vout, vl);

        __riscv_vse32_v_f32m1(d_ptr, vout, vl);
        i += vl;
    }
}

} // namespace

status_t rvv_batch_normalization_fwd_t::execute_forward(
        const exec_ctx_t &ctx) const {
    const memory_desc_wrapper data_d(pd()->src_md());
    const auto dtsrc = pd()->src_md()->data_type;
    const int ndims = data_d.ndims();

    const dim_t N = pd()->MB();
    const dim_t C = pd()->C();
    const dim_t D = pd()->D();
    const dim_t H = pd()->H();
    const dim_t W = pd()->W();

    const float eps = pd()->desc()->batch_norm_epsilon;

    void *dst = CTX_OUT_MEM(void *, DNNL_ARG_DST);
    const void *src = CTX_IN_MEM(const void *, DNNL_ARG_SRC);
    const float *mean = CTX_IN_MEM(const float *, DNNL_ARG_MEAN);
    const float *var = CTX_IN_MEM(const float *, DNNL_ARG_VARIANCE);
    const float *scale = pd()->use_scale()
            ? CTX_IN_MEM(const float *, DNNL_ARG_SCALE)
            : nullptr;
    const float *shift = pd()->use_shift()
            ? CTX_IN_MEM(const float *, DNNL_ARG_SHIFT)
            : nullptr;

    rv64::rvv_postops_t po = pd()->fused_relu_in_kernel()
            ? rv64::rvv_postops_t(alg_kind::eltwise_relu)
            : rv64::rvv_postops_t(pd()->attr()->post_ops_);

    auto off = [&](dim_t n, dim_t c, dim_t d, dim_t h, dim_t w) -> size_t {
        switch (ndims) {
            case 3: return data_d.off(n, c, w);
            case 4: return data_d.off(n, c, h, w);
            case 5: return data_d.off(n, c, d, h, w);
            default: assert(!"unsupported ndims"); return dim_t(0);
        }
    };

    const bool channels_dense = data_d.blocking_desc().strides[1] == 1;

    if (!channels_dense) {
        // abx data tag: vectorize over W for fixed channel
        parallel_nd(C, N, D, H, [&](dim_t c, dim_t n, dim_t d, dim_t h) {
            const float vmean = mean[c];
            const float inv_std = 1.0f / sqrtf(var[c] + eps);
            const float vscale = scale ? scale[c] : 1.0f;
            const float vshift = shift ? shift[c] : 0.0f;
            const float sm = vscale * inv_std;
            const float sv = vshift;
            size_t base_off = off(n, c, d, h, 0);

            switch (dtsrc) {
                case data_type::f32: {
                    const size_t data_size
                            = types::data_type_size(data_type::f32);
                    const void *s_ptr = reinterpret_cast<const void *>(
                            reinterpret_cast<const char *>(src)
                            + base_off * data_size);
                    void *d_ptr = reinterpret_cast<void *>(
                            reinterpret_cast<char *>(dst)
                            + base_off * data_size);
                    const float mean_b[1] = {vmean};
                    const float sm_b[1] = {sm};
                    const float sv_b[1] = {sv};
                    bn_fwd_kernel_f32(s_ptr, d_ptr, static_cast<size_t>(W),
                            mean_b, sm_b, sv_b, /*per_elem_params=*/false, po);
                    break;
                }
                default:
                    assert(!"Unsupported data type for RVV batch "
                            "normalization");
            }
        });
    } else {
        // axb data tag: vectorize across channels
        auto &grantor = ctx.get_scratchpad_grantor();
        float *sm_arr = grantor.template get<float>(
                memory_tracking::names::key_bnorm_tmp_mean);
        float *sv_arr = grantor.template get<float>(
                memory_tracking::names::key_bnorm_tmp_var);
        for (dim_t c = 0; c < C; ++c) {
            const float inv_std = 1.0f / sqrtf(var[c] + eps);
            const float vscale = scale ? scale[c] : 1.0f;
            const float vshift = shift ? shift[c] : 0.0f;
            sm_arr[static_cast<size_t>(c)] = vscale * inv_std;
            sv_arr[static_cast<size_t>(c)] = vshift;
        }

        parallel_nd(N, D, H, W, [&](dim_t n, dim_t d, dim_t h, dim_t w) {
            switch (dtsrc) {
                case data_type::f32: {
                    const size_t data_size
                            = types::data_type_size(data_type::f32);
                    size_t base_off = off(n, 0, d, h, w);
                    const void *s_ptr = reinterpret_cast<const void *>(
                            reinterpret_cast<const char *>(src)
                            + base_off * data_size);
                    void *d_ptr = reinterpret_cast<void *>(
                            reinterpret_cast<char *>(dst)
                            + base_off * data_size);

                    bn_fwd_kernel_f32(s_ptr, d_ptr, static_cast<size_t>(C),
                            mean, sm_arr, sv_arr,
                            /*per_elem_params=*/true, po);
                    break;
                }
                default:
                    assert(!"Unsupported data type for RVV batch "
                            "normalization");
            }
        });
    }

    return status::success;
}

} // namespace rv64
} // namespace cpu
} // namespace impl
} // namespace dnnl
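
The kernel folds each channel's statistics into two scalars ahead of the vector loop: sm = scale / sqrt(var + eps) and sv = shift, so the loop body is one subtract, one multiply, and one add per element. A minimal scalar sketch of the same transform (bn_fwd_ref_f32 is a hypothetical name for illustration, not part of the commit):

#include <cmath>
#include <cstddef>

// Scalar reference of the transform the RVV kernel vectorizes:
// dst[i] = (src[i] - mean) * sm + sv, with sm = scale / sqrt(var + eps)
// and sv = shift, optionally followed by the parameter-free ReLU post-op.
static void bn_fwd_ref_f32(const float *src, float *dst, std::size_t len,
        float mean, float sm, float sv, bool fuse_relu) {
    for (std::size_t i = 0; i < len; ++i) {
        float v = (src[i] - mean) * sm + sv;
        if (fuse_relu && v < 0.f) v = 0.f; // eltwise_relu with alpha == 0
        dst[i] = v;
    }
}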
src/cpu/rv64/rvv_batch_normalization.hpp

Lines changed: 142 additions & 0 deletions (new file)

/******************************************************************************
 * Copyright 2025
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/

#ifndef CPU_RV64_RVV_BATCH_NORMALIZATION_HPP
#define CPU_RV64_RVV_BATCH_NORMALIZATION_HPP

#include "common/memory_tracking.hpp"
#include "common/primitive.hpp"

#include "cpu/cpu_batch_normalization_pd.hpp"
#include "cpu/platform.hpp"
#include "cpu/rv64/rvv_postops.hpp"

namespace dnnl {
namespace impl {
namespace cpu {
namespace rv64 {

struct rvv_batch_normalization_fwd_t : public primitive_t {
    struct pd_t : public cpu_batch_normalization_fwd_pd_t {
        using cpu_batch_normalization_fwd_pd_t::
                cpu_batch_normalization_fwd_pd_t;

        DECLARE_COMMON_PD_T_("RISCV64GCV", rvv_batch_normalization_fwd_t);

        status_t init(engine_t *engine) {
            UNUSED(engine);

            using namespace data_type;

            VDISPATCH_BNORM(is_fwd(), VERBOSE_BAD_PROPKIND);

            const data_type_t dtsrc = src_md()->data_type;
            const data_type_t dtdst = dst_md()->data_type;
            bool types_ok = (dtsrc == f32 && dtdst == f32)
                    && platform::has_data_type_support(dtsrc)
                    && IMPLICATION(is_training(),
                            platform::has_training_support(dtsrc));
            VDISPATCH_BNORM(types_ok, VERBOSE_UNSUPPORTED_DT);

            VDISPATCH_BNORM(!has_zero_dim_memory(), VERBOSE_EMPTY_TENSOR, "");

            // Require global stats (G). Flags C/H/R (inference) are optional.
            // Disallow none and A.
            VDISPATCH_BNORM(!fuse_norm_add_relu(), VERBOSE_UNSUPPORTED_FEATURE,
                    "fuse_norm_add_relu not supported");
            VDISPATCH_BNORM(use_global_stats(), VERBOSE_UNSUPPORTED_FEATURE,
                    "stats must already have been computed (use global stats)");
            using smask_t = primitive_attr_t::skip_mask_t;
            VDISPATCH_BNORM(!(fuse_norm_relu()
                                    && desc()->prop_kind
                                            == prop_kind::forward_training),
                    VERBOSE_UNSUPPORTED_FEATURE,
                    "forward training with fused ReLU is not supported");
            // Only support eltwise ReLU without alpha/beta post-op, as the
            // current rvv_postops requires.
            VDISPATCH_BNORM(attr()->has_default_values(smask_t::post_ops),
                    VERBOSE_UNSUPPORTED_ATTR);
            {
                const post_ops_t &po = attr()->post_ops_;
                bool relu_no_params_ok = true;
                if (po.len() == 1) {
                    const auto &e = po.entry_[0];
                    relu_no_params_ok = e.is_eltwise()
                            && e.eltwise.alg == alg_kind::eltwise_relu
                            && e.eltwise.alpha == 0.f && e.eltwise.beta == 0.f;
                } else if (po.len() > 1) {
                    relu_no_params_ok = false;
                }
                VDISPATCH_BNORM(relu_no_params_ok, VERBOSE_UNSUPPORTED_ATTR);
            }
            VDISPATCH_BNORM(rv64::rvv_postops_t::post_ops_ok(attr()->post_ops_),
                    VERBOSE_UNSUPPORTED_ATTR);

            // Simplest memory layouts only: plain, dense, same layout src/dst,
            // no blocking/padding.
            VDISPATCH_BNORM(
                    set_default_formats_common(), VERBOSE_UNSUPPORTED_TAG);
            const memory_desc_wrapper src_d(src_md());
            const memory_desc_wrapper dst_d(dst_md());
            VDISPATCH_BNORM(
                    check_layouts(src_d, dst_d), VERBOSE_UNSUPPORTED_TAG);

            fused_relu_in_kernel_ = fuse_norm_relu();
            init_scratchpad();

            return status::success;
        }

        bool check_layouts(const memory_desc_wrapper &src_d,
                const memory_desc_wrapper &dst_d) const {
            // Require plain, dense, no blocking/padding, same plain layout.
            bool ndims_ok = utils::one_of(ndims(), 3, 4, 5);
            bool plain_dense = src_d.blocking_desc().inner_nblks == 0
                    && dst_d.blocking_desc().inner_nblks == 0
                    && src_d.is_dense(/*with_padding=*/false)
                    && dst_d.is_dense(/*with_padding=*/false)
                    && src_d.is_plain() && dst_d.is_plain();
            bool same_layouts = src_d.similar_to(dst_d, /*with_strides=*/true,
                    /*with_pads=*/false);
            return ndims_ok && plain_dense && same_layouts;
        }

        bool fused_relu_in_kernel() const { return fused_relu_in_kernel_; }

    private:
        void init_scratchpad() {
            using namespace memory_tracking::names;
            auto scratchpad = scratchpad_registry().registrar();
            // Reserve per-channel temporary buffers for axb (channels-dense)
            // path
            scratchpad.template book<float>(key_bnorm_tmp_mean, C());
            scratchpad.template book<float>(key_bnorm_tmp_var, C());
        }
        bool fused_relu_in_kernel_ = false;
    };

    rvv_batch_normalization_fwd_t(const pd_t *apd) : primitive_t(apd) {}

    status_t execute(const exec_ctx_t &ctx) const override {
        return execute_forward(ctx);
    }

private:
    status_t execute_forward(const exec_ctx_t &ctx) const;
    const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
};

} // namespace rv64
} // namespace cpu
} // namespace impl
} // namespace dnnl

#endif // CPU_RV64_RVV_BATCH_NORMALIZATION_HPP
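
A minimal usage sketch that should satisfy this implementation's dispatch checks, assuming a oneDNN v3.x build with this backend enabled: f32 src/dst in a plain dense layout, forward inference with global stats, plus the optional scale/shift and fused-ReLU flags. Shapes and epsilon are illustrative only.

#include "dnnl.hpp"

int main() {
    using namespace dnnl;
    engine eng(engine::kind::cpu, 0);
    stream strm(eng);

    const memory::dim N = 2, C = 16, H = 8, W = 8;
    // Plain dense f32 layouts are what pd_t::init() accepts.
    memory::desc data_md({N, C, H, W}, memory::data_type::f32,
            memory::format_tag::nchw);
    memory::desc stat_md({C}, memory::data_type::f32, memory::format_tag::x);

    // use_global_stats is mandatory for this impl; scale/shift and the
    // fused-ReLU flag are optional (fused ReLU only for inference).
    auto flags = normalization_flags::use_global_stats
            | normalization_flags::use_scale
            | normalization_flags::use_shift
            | normalization_flags::fuse_norm_relu;

    batch_normalization_forward::primitive_desc pd(eng,
            prop_kind::forward_inference, data_md, data_md, 1e-5f, flags);

    memory src(data_md, eng), dst(data_md, eng);
    memory mean(stat_md, eng), var(stat_md, eng);
    memory scale(stat_md, eng), shift(stat_md, eng);

    batch_normalization_forward(pd).execute(strm,
            {{DNNL_ARG_SRC, src}, {DNNL_ARG_DST, dst},
                    {DNNL_ARG_MEAN, mean}, {DNNL_ARG_VARIANCE, var},
                    {DNNL_ARG_SCALE, scale}, {DNNL_ARG_SHIFT, shift}});
    strm.wait();
    return 0;
}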

src/cpu/rv64/rvv_postops.hpp

Lines changed: 2 additions & 0 deletions
@@ -86,6 +86,8 @@ struct rvv_postops_t {
         return status::success;
     }

+    explicit rvv_postops_t(alg_kind_t alg) : alg_(alg) {}
+
     static bool post_ops_ok(const post_ops_t &po) {
         if (po.len() == 0) return true;
         if (po.len() > 1) return false;