Tencent
diff --git a/‎docs/developer-guide/operators.md‎
Lines changed: 13 additions & 0 deletions b/‎docs/developer-guide/operators.md‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎src/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎src/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/layer/rotaryembed.cpp‎
Lines changed: 88 additions & 0 deletions b/‎src/layer/rotaryembed.cpp‎
Lines changed: 88 additions & 0 deletions
diff --git a/‎src/layer/rotaryembed.h‎
Lines changed: 26 additions & 0 deletions b/‎src/layer/rotaryembed.h‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎tests/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎tests/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎tests/test_rotaryembed.cpp‎
Lines changed: 52 additions & 0 deletions b/‎tests/test_rotaryembed.cpp‎
Lines changed: 52 additions & 0 deletions
diff --git a/‎tests/test_rotaryembed_oom.cpp‎
Lines changed: 46 additions & 0 deletions b/‎tests/test_rotaryembed_oom.cpp‎
Lines changed: 46 additions & 0 deletions
diff --git a/‎tools/modelwriter.h‎
Lines changed: 8 additions & 0 deletions b/‎tools/modelwriter.h‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎tools/pnnx/src/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎tools/pnnx/src/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎tools/pnnx/src/pass_level5/fuse_rmsnorm.cpp‎
Lines changed: 69 additions & 0 deletions b/‎tools/pnnx/src/pass_level5/fuse_rmsnorm.cpp‎
Lines changed: 69 additions & 0 deletions
@@ -76,6 +76,7 @@
 * [Reshape](#reshape)
 * [RMSNorm](#rmsnorm)
 * [RNN](#rnn)
+* [RotaryEmbed](#rotaryembed)
 * [Scale](#scale)
 * [SDPA](#sdpa)
 * [SELU](#selu)
@@ -1778,6 +1779,18 @@ Direction flag:
 - 1 = reverse only
 - 2 = bidirectional
 
+# RotaryEmbed
+Apply rotary positional embedding to the input blob using precomputed cos and sin cache blobs. The layer takes three inputs: x, cos cache and sin cache.
+
+```
+y1 = x1 * cos - x2 * sin
+y2 = x1 * sin + x2 * cos
+```
+
+| param id  | name          | type  | default   | description       |
+| --------- | ------------- | ----- | --------- | ----------------- |
+| 0         | interleaved   | int   | 0         | 0 = pair x[j] with x[j + embed_dim/2], nonzero = pair adjacent elements x[2j] and x[2j+1] |
+
 # Scale
 ```
 if scale_data_size == -233  y = x0 * x1
 
@@ -172,6 +172,7 @@ ncnn_add_layer(Spectrogram)
 ncnn_add_layer(InverseSpectrogram)
 ncnn_add_layer(Flip)
 ncnn_add_layer(SDPA)
+ncnn_add_layer(RotaryEmbed)
 
 if(NCNN_VULKAN)
     ncnn_add_shader(${CMAKE_CURRENT_SOURCE_DIR}/convert_ycbcr.comp)
 
@@ -0,0 +1,88 @@
+// Copyright 2025 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "rotaryembed.h"
+
+namespace ncnn {
+
+// Nothing is set up at construction time; `interleaved` receives its value in load_param().
+RotaryEmbed::RotaryEmbed()
+{
+}
+
+// Read the layer parameters: id 0 selects the pairing layout and is stored in
+// `interleaved`, with 0 passed as the fallback value. Always returns 0.
+int RotaryEmbed::load_param(const ParamDict& pd)
+{
+    const int interleaved_param = pd.get(0, 0);
+    interleaved = interleaved_param;
+
+    return 0;
+}
+
+// Rotate every (x1, x2) pair of the input by the angle stored in the cos/sin caches:
+//   y1 = x1 * cos - x2 * sin
+//   y2 = x1 * sin + x2 * cos
+//
+// bottom_blobs[0]  input, read as w = embed_dim, h = seqlen, c = num_heads
+// bottom_blobs[1]  cos cache, row i holds the embed_dim / 2 cosines for position i
+// bottom_blobs[2]  sin cache, same layout as the cos cache
+// top_blobs[0]     output, created with the same shape as the input
+//
+// interleaved != 0 pairs adjacent elements (x[2j], x[2j + 1]); otherwise element j of
+// the first half of a row is paired with element j of the second half.
+//
+// Returns 0 on success, -1 when the blob lists or the cache shapes cannot serve the
+// input, and -100 when the output blob cannot be allocated.
+int RotaryEmbed::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    // input, cos cache and sin cache are all indexed below
+    if (bottom_blobs.size() < 3 || top_blobs.empty())
+        return -1;
+
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& cos_cache = bottom_blobs[1];
+    const Mat& sin_cache = bottom_blobs[2];
+
+    const int embed_dim = bottom_blob.w;
+    const int seqlen = bottom_blob.h;
+    const int num_heads = bottom_blob.c;
+    const int half_dim = embed_dim / 2;
+
+    // an odd embed_dim leaves one trailing element per row without a partner
+    const bool has_tail = (embed_dim % 2) != 0;
+
+    // every cache row is read half_dim floats deep for each of the seqlen positions,
+    // so a smaller cache would be read out of bounds
+    if (cos_cache.w < half_dim || cos_cache.h < seqlen || sin_cache.w < half_dim || sin_cache.h < seqlen)
+        return -1;
+
+    Mat& top_blob = top_blobs[0];
+    top_blob.create_like(bottom_blob, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q = 0; q < num_heads; q++)
+    {
+        const Mat head = bottom_blob.channel(q);
+        Mat out_head = top_blob.channel(q);
+
+        for (int i = 0; i < seqlen; i++)
+        {
+            // the same cache row serves every head at sequence position i
+            const float* cos_ptr = cos_cache.row(i);
+            const float* sin_ptr = sin_cache.row(i);
+
+            if (interleaved)
+            {
+                const float* ptr = head.row(i);
+                float* outptr = out_head.row(i);
+
+                for (int j = 0; j < half_dim; j++)
+                {
+                    const float x0 = ptr[0];
+                    const float x1 = ptr[1];
+                    const float cos_val = *cos_ptr++;
+                    const float sin_val = *sin_ptr++;
+                    outptr[0] = x0 * cos_val - x1 * sin_val;
+                    outptr[1] = x0 * sin_val + x1 * cos_val;
+                    ptr += 2;
+                    outptr += 2;
+                }
+
+                // copy the unpaired element so the output holds no uninitialized value
+                if (has_tail)
+                    *outptr = *ptr;
+            }
+            else
+            {
+                const float* ptr0 = head.row(i);
+                const float* ptr1 = ptr0 + half_dim;
+                float* outptr0 = out_head.row(i);
+                float* outptr1 = outptr0 + half_dim;
+
+                for (int j = 0; j < half_dim; j++)
+                {
+                    const float x0 = *ptr0++;
+                    const float x1 = *ptr1++;
+                    const float cos_val = *cos_ptr++;
+                    const float sin_val = *sin_ptr++;
+                    *outptr0++ = x0 * cos_val - x1 * sin_val;
+                    *outptr1++ = x0 * sin_val + x1 * cos_val;
+                }
+
+                // ptr1 / outptr1 now sit on the unpaired last element, if there is one
+                if (has_tail)
+                    *outptr1 = *ptr1;
+            }
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
@@ -0,0 +1,26 @@
+// Copyright 2025 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef LAYER_ROTARYEMBED_H
+#define LAYER_ROTARYEMBED_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+// Layer that applies rotary positional embedding using cos and sin cache blobs.
+class RotaryEmbed : public Layer
+{
+public:
+    RotaryEmbed();
+
+    // reads param id 0 into `interleaved`
+    virtual int load_param(const ParamDict& pd);
+
+    // reads three bottom blobs (input, cos cache, sin cache) and writes one top blob
+    // with the same shape as the input
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+
+public:
+    // param id 0: nonzero rotates adjacent element pairs (x[2j], x[2j + 1]),
+    // zero pairs element j with element j + embed_dim / 2
+    int interleaved;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_ROTARYEMBED_H
@@ -150,6 +150,7 @@ ncnn_add_layer_test(RMSNorm)
 ncnn_add_layer_test(RNN)
 ncnn_add_layer_test(ROIPooling)
 ncnn_add_layer_test(ROIAlign)
+ncnn_add_layer_test(RotaryEmbed)
 ncnn_add_layer_test(Scale)
 ncnn_add_layer_test(SDPA)
 ncnn_add_layer_test(SELU)
 
@@ -0,0 +1,52 @@
+// Copyright 2025 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "testutil.h"
+
+// Run the RotaryEmbed layer test on input `a` (w = embed_dim, h = seqlen) with
+// randomly filled cos/sin caches of size (embed_dim / 2) x seqlen.
+// `interleaved` is forwarded to the layer as param id 0.
+// Returns the value reported by test_layer(); a failure is also logged to stderr.
+static int test_rotaryembed(const ncnn::Mat& a, int interleaved)
+{
+    const int embed_dim = a.w;
+    const int seqlen = a.h;
+
+    ncnn::Mat cos_cache = RandomMat(embed_dim / 2, seqlen);
+    ncnn::Mat sin_cache = RandomMat(embed_dim / 2, seqlen);
+
+    ncnn::ParamDict pd;
+    pd.set(0, interleaved);
+
+    std::vector<ncnn::Mat> weights(0);
+
+    // bottom blob order expected by the layer: input, cos cache, sin cache
+    std::vector<ncnn::Mat> as(3);
+    as[0] = a;
+    as[1] = cos_cache;
+    as[2] = sin_cache;
+
+    int ret = test_layer("RotaryEmbed", pd, weights, as, 1);
+    if (ret != 0)
+    {
+        fprintf(stderr, "test_rotaryembed failed a=(%d %d %d) interleaved=%d\n", a.w, a.h, a.c, interleaved);
+    }
+
+    return ret;
+}
+
+// Run every shape / layout combination in order and stop at the first failure.
+// Returns 0 when all cases pass, 1 otherwise.
+static int test_rotaryembed_0()
+{
+    // { w, h, c, interleaved }
+    static const int cases[][4] = {
+        {32, 66, 8, 0},
+        {26, 64, 8, 1},
+        {64, 28, 12, 0},
+        {48, 22, 12, 1},
+        {44, 28, 64, 0},
+        {12, 27, 64, 1},
+        {28, 17, 15, 0},
+        {28, 17, 15, 1},
+    };
+    const int case_count = sizeof(cases) / sizeof(cases[0]);
+
+    for (int k = 0; k < case_count; k++)
+    {
+        const int* p = cases[k];
+
+        // the input is generated right before its case runs, as in a short-circuit chain
+        if (test_rotaryembed(RandomMat(p[0], p[1], p[2]), p[3]) != 0)
+            return 1;
+    }
+
+    return 0;
+}
+
+// Test entry point: seed the random generator with a fixed value, then run the suite.
+int main()
+{
+    SRAND(7767517);
+
+    const int result = test_rotaryembed_0();
+    return result;
+}
@@ -0,0 +1,46 @@
+// Copyright 2025 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "testutil.h"
+
+// Run the RotaryEmbed out-of-memory test on input `a` (w = embed_dim, h = seqlen)
+// with randomly filled cos/sin caches of size (embed_dim / 2) x seqlen.
+// `interleaved` is forwarded to the layer as param id 0.
+// Returns the value reported by test_layer_oom(); a failure is also logged to stderr.
+static int test_rotaryembed_oom(const ncnn::Mat& a, int interleaved)
+{
+    const int embed_dim = a.w;
+    const int seqlen = a.h;
+
+    ncnn::Mat cos_cache = RandomMat(embed_dim / 2, seqlen);
+    ncnn::Mat sin_cache = RandomMat(embed_dim / 2, seqlen);
+
+    ncnn::ParamDict pd;
+    pd.set(0, interleaved);
+
+    std::vector<ncnn::Mat> weights(0);
+
+    // bottom blob order expected by the layer: input, cos cache, sin cache
+    std::vector<ncnn::Mat> as(3);
+    as[0] = a;
+    as[1] = cos_cache;
+    as[2] = sin_cache;
+
+    int ret = test_layer_oom("RotaryEmbed", pd, weights, as, 1);
+    if (ret != 0)
+    {
+        fprintf(stderr, "test_rotaryembed_oom failed a=(%d %d %d) interleaved=%d\n", a.w, a.h, a.c, interleaved);
+    }
+
+    return ret;
+}
+
+// Run the out-of-memory cases in order, one per pairing layout, and stop at the
+// first failure. Returns 0 when both pass, 1 otherwise.
+static int test_rotaryembed_0()
+{
+    if (test_rotaryembed_oom(RandomMat(32, 66, 8), 0) != 0)
+        return 1;
+
+    if (test_rotaryembed_oom(RandomMat(28, 17, 15), 1) != 0)
+        return 1;
+
+    return 0;
+}
+
+// Test entry point: seed the random generator with a fixed value, then run the suite.
+int main()
+{
+    SRAND(7767517);
+
+    const int result = test_rotaryembed_0();
+    return result;
+}
@@ -92,6 +92,7 @@
 #include "layer/rnn.h"
 #include "layer/roialign.h"
 #include "layer/roipooling.h"
+#include "layer/rotaryembed.h"
 #include "layer/scale.h"
 #include "layer/sdpa.h"
 #include "layer/shufflechannel.h"
@@ -2407,6 +2408,13 @@ int ModelWriter::save(const char* parampath, const char* binpath)
             fprintf_param_value(" 1=%d", pooled_height)
             fprintf_param_value(" 2=%e", spatial_scale)
         }
+        else if (layer->type == "RotaryEmbed")
+        {
+            ncnn::RotaryEmbed* op = (ncnn::RotaryEmbed*)layer;
+            ncnn::RotaryEmbed* op_default = (ncnn::RotaryEmbed*)layer_default;
+
+            fprintf_param_value(" 0=%d", interleaved)
+        }
         else if (layer->type == "Scale")
         {
             ncnn::Scale* op = (ncnn::Scale*)layer;
 
@@ -430,6 +430,7 @@ set(pnnx_pass_ncnn_SRCS
     pass_ncnn/eliminate_output.cpp
     pass_ncnn/expand_expression.cpp
     pass_ncnn/fuse_convert_shufflechannel_slice.cpp
+    pass_ncnn/fuse_convert_rotaryembed.cpp
     pass_ncnn/insert_split.cpp
     pass_ncnn/chain_multi_output.cpp
     pass_ncnn/solve_batch_index.cpp
 
@@ -54,6 +54,71 @@ pnnx.Output             output      1 0 out
     }
 };
 
+// Rewrites the weight-free RMS normalization subgraph
+//   x * rsqrt(mean(x^2, dim=-1, keepdim=True) + eps)
+// into a single nn.RMSNorm operator with elementwise_affine=false.
+class fuse_rmsnorm_pass_without_gamma : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+5 4
+pnnx.Input              input       0 1 input
+pnnx.Expression         op_0        1 1 input sq expr=pow(@0,2)
+torch.mean              op_1        1 1 sq sqmean dim=(-1) keepdim=True
+pnnx.Expression         op_2        2 1 input sqmean out expr=mul(@0,rsqrt(add(@1,%eps)))
+pnnx.Output             output      1 0 out
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "nn.RMSNorm";
+    }
+
+    const char* name_str() const
+    {
+        return "t5ln";
+    }
+
+    // Accept the match only when the input of op_0 carries a shape, because
+    // normalized_shape is taken from that shape's last dimension in write().
+    bool match(const std::map<std::string, const Operator*>& matched_operators, const std::map<std::string, Parameter>& /*captured_params*/, const std::map<std::string, Attribute>& /*captured_attrs*/) const
+    {
+        const std::vector<int>& input_shape = matched_operators.at("op_0")->inputs[0]->shape;
+
+        // an empty shape means normalized_shape is unknown
+        return !input_shape.empty();
+    }
+
+    // Fill in the nn.RMSNorm parameters: no affine weight, the captured eps, and the
+    // last input dimension as the normalized shape.
+    void write(Operator* op, const std::map<std::string, Parameter>& captured_params) const
+    {
+        const int last_dim = op->inputs[0]->shape.back();
+
+        op->params["elementwise_affine"] = false;
+        op->params["eps"] = captured_params.at("eps");
+        op->params["normalized_shape"] = std::vector<int>(1, last_dim);
+    }
+};
+
+// Variant of fuse_rmsnorm_pass_without_gamma whose pattern spells the inverse root
+// as reciprocal(sqrt(...)) instead of rsqrt(...); everything else is inherited.
+class fuse_rmsnorm_pass_without_gamma_1 : public fuse_rmsnorm_pass_without_gamma
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+5 4
+pnnx.Input              input       0 1 input
+pnnx.Expression         op_0        1 1 input sq expr=pow(@0,2)
+torch.mean              op_1        1 1 sq sqmean dim=(-1) keepdim=True
+pnnx.Expression         op_2        2 1 input sqmean out expr=mul(@0,reciprocal(sqrt(add(@1,%eps))))
+pnnx.Output             output      1 0 out
+)PNNXIR";
+    }
+};
+
 class fuse_rmsnorm_pass_onnx : public fuse_rmsnorm_pass
 {
 public:
@@ -75,11 +140,15 @@ void fuse_rmsnorm(Graph& graph)
 {
     fuse_rmsnorm_pass a;
     fuse_rmsnorm_pass_1 a1;
+    fuse_rmsnorm_pass_without_gamma a2;
+    fuse_rmsnorm_pass_without_gamma_1 a3;
     fuse_rmsnorm_pass_onnx b;
     int opindex = 0;
 
     pnnx_graph_rewrite(graph, &a, opindex);
     pnnx_graph_rewrite(graph, &a1, opindex);
+    pnnx_graph_rewrite(graph, &a2, opindex);
+    pnnx_graph_rewrite(graph, &a3, opindex);
     pnnx_graph_rewrite(graph, &b, opindex);
 }