@@ -73,6 +73,11 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
 }
 
 GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph * cgraph) {
+    if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
+        std::string filename = "cgraph.txt";
+        dump_cgraph(cgraph, filename);
+    }
+
     m_cgraph = cgraph;
     for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
         auto * cur_node = cgraph->nodes[node_n];
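Note: the dump is gated on an environment variable, so it can be toggled without a rebuild, e.g. GGML_OPENVINO_DUMP_CGRAPH=1 before running; the graph is written to cgraph.txt in the working directory. Below is a hypothetical sketch of the kind of per-node listing dump_cgraph could produce; the real implementation lives elsewhere in this PR, and the formatting here is illustrative only.

// Hypothetical sketch, not the PR's dump_cgraph: walk the graph and print one
// line per node using only public ggml accessors.
#include <cstdio>
#include <string>
#include "ggml.h"

static void dump_cgraph_sketch(const struct ggml_cgraph * cgraph, const std::string & filename) {
    FILE * f = std::fopen(filename.c_str(), "w");
    if (!f) {
        return;
    }
    for (int i = 0; i < cgraph->n_nodes; i++) {
        const struct ggml_tensor * t = cgraph->nodes[i];
        std::fprintf(f, "%4d %-20s %-32s [%lld, %lld, %lld, %lld]\n",
                     i, ggml_op_name(t->op), t->name,
                     (long long) t->ne[0], (long long) t->ne[1],
                     (long long) t->ne[2], (long long) t->ne[3]);
    }
    std::fclose(f);
}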
@@ -173,49 +178,46 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {
         break;
     }
     case GGML_OP_CONT: {
-        if (ggml_nelements(node->src[0]) == ggml_nelements(node->src[0]->view_src)) {
-            // The input comes from a PERMUTE
-            m_op_case = 1;
-        } else {
-            // The input comes from a VIEW which is subtensor
-            m_op_case = 2;
-        }
-        break;
-    }
-    case GGML_OP_SET_ROWS: {
-        if (std::string(node->name).find("cache_k") == 0) {
+        if (node->src[0]->op == GGML_OP_PERMUTE) {
             m_op_case = 1;
-        } else {
+        } else if (node->src[0]->op == GGML_OP_TRANSPOSE) {
             m_op_case = 2;
+        } else if (node->src[0]->op == GGML_OP_VIEW) {
+            // The input comes from a VIEW which is a subtensor
+            m_op_case = 3;
         }
         break;
     }
     case GGML_OP_PERMUTE: {
-        if (node->src[0]->view_src == nullptr) {
-            // Permute Qcur
+        if (node->src[0]->op != GGML_OP_VIEW) {
             m_op_case = 1;
         } else if (ggml_is_contiguous(node->src[0])) {
             // Permute cache_k (view)
             m_op_case = 2;
         } else {
-            // Permute cache_v (view)
+            // Permute cache_v (view); deprecated, cache_v now also falls into case 2
+            m_op_case = 3;
+        }
+        break;
+    }
+    case GGML_OP_MUL_MAT: {
+        if (node->src[0]->op == GGML_OP_CONT && node->src[0]->src[0]->op == GGML_OP_TRANSPOSE) {
+            m_op_case = 2;
+        } else if (node->src[0]->op == GGML_OP_VIEW && node->src[1]->op == GGML_OP_VIEW) {
+            // test-backend-ops case
             m_op_case = 3;
         }
         break;
     }
     case GGML_OP_GET_ROWS: {
         if (node->src[1]->op == GGML_OP_VIEW) {
             m_op_case = 2;
-        } else {
-            m_op_case = 1;
         }
         break;
     }
     case GGML_OP_ROPE: {
         if (node->src[0]->op == GGML_OP_VIEW) {
             m_op_case = 2;
-        } else {
-            m_op_case = 1;
         }
         break;
     }
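The reworked classification keys on the producer node's op rather than on element-count or view_src heuristics. A minimal, self-contained check of that pattern, assuming only ggml's public graph-building API (the tensor sizes are arbitrary):

#include <cassert>
#include "ggml.h"

int main() {
    // no_alloc: we only need graph metadata, not tensor data
    struct ggml_init_params params = {16 * 1024 * 1024, nullptr, true};
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64, 8, 4);
    struct ggml_tensor * c = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));

    // The CONT node's input op is what set_input_output now dispatches on:
    assert(c->op == GGML_OP_CONT);
    assert(c->src[0]->op == GGML_OP_PERMUTE);  // -> m_op_case = 1

    ggml_free(ctx);
    return 0;
}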
@@ -270,19 +272,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co
     } else if (name.find("cache_k") == 0) {
         input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size};
     } else if (name.find("cache_v") == 0) {
-        input_shape = ov::PartialShape{m_num_heads_kv, m_head_size, m_context_size};
+        input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size};
     } else if (const auto * op = get_tensor_used_op(src); op && op->op == GGML_OP_SET_ROWS) {
-        input_shape = ov::PartialShape{1, 1, -1};
-        if (m_is_static) {
-            if (m_is_first_token) {
-                // Dummy static shape, since the indices are not used in this case
-                input_shape = ov::PartialShape{1};
-            } else if (std::string(op->name).find("cache_k") == 0) {
-                input_shape = ov::PartialShape{1, 1, 1};
-            } else {
-                input_shape = ov::PartialShape{1, 1, m_num_heads_kv * m_head_size};
-            }
-        }
+        input_shape = ov::PartialShape{1, 1, m_is_static ? 1 : -1};
     } else if (src->op == GGML_OP_VIEW) {
         // This case is added to make test-backend-ops work
         input_shape = ov::PartialShape{get_shape(src->view_src)};
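The collapsed SET_ROWS branch relies on ov::PartialShape treating a -1 extent as a fully dynamic dimension: the indices input becomes {1, 1, ?} on the dynamic path and {1, 1, 1} on the static one. A small standalone illustration of that semantics; is_static here is a stand-in for the decoder's m_is_static:

#include <iostream>
#include <openvino/core/partial_shape.hpp>

int main() {
    bool is_static = false;  // stand-in for the decoder's m_is_static
    ov::PartialShape indices_shape{1, 1, is_static ? 1 : -1};

    std::cout << indices_shape << '\n';               // [1,1,?]  (-1 => dynamic dim)
    std::cout << indices_shape.is_dynamic() << '\n';  // 1
    return 0;
}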
@@ -610,26 +602,28 @@ void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecode
 
 const std::string& GgmlOvDecoder::get_op_type() const {
     static const std::map<ggml_op, std::string> ops = {
-        {GGML_OP_NONE, "GGML_OP_NONE"},
-        {GGML_OP_ACC, "GGML_OP_ACC"},
-        {GGML_OP_ADD, "GGML_OP_ADD"},
-        {GGML_OP_ADD1, "GGML_OP_ADD1"},
-        {GGML_OP_CONT, "GGML_OP_CONT"},
-        {GGML_OP_DIV, "GGML_OP_DIV"},
-        {GGML_OP_DUP, "GGML_OP_DUP"},
-        {GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS"},
-        {GGML_OP_MUL, "GGML_OP_MUL"},
-        {GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT"},
-        {GGML_OP_PERMUTE, "GGML_OP_PERMUTE"},
-        {GGML_OP_RESHAPE, "GGML_OP_RESHAPE"},
-        {GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM"},
-        {GGML_OP_ROPE, "GGML_OP_ROPE"},
-        {GGML_OP_SCALE, "GGML_OP_SCALE"},
-        {GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX"},
-        {GGML_OP_SUB, "GGML_OP_SUB"},
-        {GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE"},
-        {GGML_OP_VIEW, "GGML_OP_VIEW"},
-        {GGML_OP_SET_ROWS, "GGML_OP_SET_ROWS"},
+        {GGML_OP_NONE,           "GGML_OP_NONE"          },
+        {GGML_OP_ACC,            "GGML_OP_ACC"           },
+        {GGML_OP_ADD,            "GGML_OP_ADD"           },
+        {GGML_OP_ADD1,           "GGML_OP_ADD1"          },
+        {GGML_OP_CONT,           "GGML_OP_CONT"          },
+        {GGML_OP_DIV,            "GGML_OP_DIV"           },
+        {GGML_OP_DUP,            "GGML_OP_DUP"           },
+        {GGML_OP_GET_ROWS,       "GGML_OP_GET_ROWS"      },
+        {GGML_OP_MUL,            "GGML_OP_MUL"           },
+        {GGML_OP_MUL_MAT,        "GGML_OP_MUL_MAT"       },
+        {GGML_OP_PERMUTE,        "GGML_OP_PERMUTE"       },
+        {GGML_OP_RESHAPE,        "GGML_OP_RESHAPE"       },
+        {GGML_OP_RMS_NORM,       "GGML_OP_RMS_NORM"      },
+        {GGML_OP_ROPE,           "GGML_OP_ROPE"          },
+        {GGML_OP_SCALE,          "GGML_OP_SCALE"         },
+        {GGML_OP_SOFT_MAX,       "GGML_OP_SOFT_MAX"      },
+        {GGML_OP_SUB,            "GGML_OP_SUB"           },
+        {GGML_OP_TRANSPOSE,      "GGML_OP_TRANSPOSE"     },
+        {GGML_OP_VIEW,           "GGML_OP_VIEW"          },
+        {GGML_OP_SET_ROWS,       "GGML_OP_SET_ROWS"      },
+        {GGML_OP_CPY,            "GGML_OP_CPY"           },
+        {GGML_OP_FLASH_ATTN_EXT, "GGML_OP_FLASH_ATTN_EXT"},
     };
     static const std::map<ggml_unary_op, std::string> unary_ops = {
         {GGML_UNARY_OP_ABS, "GGML_UNARY_OP_ABS"},
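The two new entries, GGML_OP_CPY and GGML_OP_FLASH_ATTN_EXT, register string identifiers for ops this decoder now reports; the rest of the block is a whitespace-only realignment. For reference, a sketch of the lookup idiom such a name table supports (illustrative helper, not the PR's code; the fallback string is an assumption):

#include <map>
#include <string>
#include "ggml.h"

// Illustrative lookup helper: resolve a ggml op to the string identifier used
// by the frontend, with an explicit miss case instead of map::at throwing.
static const std::string & op_type_name(enum ggml_op op) {
    static const std::map<ggml_op, std::string> ops = {
        {GGML_OP_CPY,            "GGML_OP_CPY"           },
        {GGML_OP_FLASH_ATTN_EXT, "GGML_OP_FLASH_ATTN_EXT"},
    };
    static const std::string unknown = "UNKNOWN_GGML_OP";
    auto it = ops.find(op);
    return it == ops.end() ? unknown : it->second;
}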