Merge pull request #5 from ravi9/fp32_matmul

cavusmustafa · web-flow · commit cd073e9897c9 · 2025-07-30T11:30:24.000-07:00
matmul cpu optimization
diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -212,6 +212,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {
             } else {
                 m_op_case = 1;
             }
+            break;
         }
         default:
             break;
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -139,7 +139,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     std::vector<std::string> m_output_names;
     std::string m_op_name;
     mutable std::string m_name;
-    int m_op_case;
+    int m_op_case = 0;
     std::vector<std::pair<std::string, std::string>> m_op_node_name;
     std::map<std::string, std::shared_ptr<ov::Node>> m_model_inputs;
     std::map<std::string, std::shared_ptr<ov::Node>> m_model_extra_inputs;
diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
@@ -12,6 +12,7 @@
 #include <openvino/op/slice.hpp>
 #include <openvino/op/transpose.hpp>
 #include <openvino/op/unsqueeze.hpp>
+#include <openvino/op/util/op_types.hpp>
 #include <vector>
 
 #include "../node_context.hpp"
@@ -28,7 +29,15 @@ OutputVector translate_mulmat(const NodeContext& context) {
 
     ov::Output<Node> res;
     ov::Output<ov::Node> B = context.get_input(0);
-    ov::Output<ov::Node> A = std::make_shared<ov::op::v0::Convert>(context.get_input(1), context.get_input_type(0));
+    ov::Output<ov::Node> A = context.get_input(1);
+
+    bool convert_out_type = false;
+    if (ov::op::util::is_constant(B.get_node()) && context.get_input_type(0) != context.get_input_type(1)) {
+        B = std::make_shared<ov::op::v0::Convert>(context.get_input(0), context.get_input_type(1));
+    } else if (context.get_input_type(0) != context.get_input_type(1)) {
+        A = std::make_shared<ov::op::v0::Convert>(context.get_input(1), context.get_input_type(0));
+        convert_out_type = true;
+    }
 
     auto B_shape = context.get_input_shape(0).to_shape();
     auto A_shape = context.get_input_shape(1).to_shape();
@@ -62,8 +71,12 @@ OutputVector translate_mulmat(const NodeContext& context) {
             A = Z;
         }
 
-        auto result_lp = std::make_shared<ov::op::v0::MatMul>(A, B, false, true);
-        res = std::make_shared<ov::op::v0::Convert>(result_lp, context.get_output_type(0));
+        if (convert_out_type) {
+            auto result_lp = std::make_shared<ov::op::v0::MatMul>(A, B, false, true);
+            res = std::make_shared<ov::op::v0::Convert>(result_lp, context.get_output_type(0));
+        } else {
+            res = std::make_shared<ov::op::v0::MatMul>(A, B, false, true);
+        }
 
         return rename_outputs_with_suffix({res}, context.get_name());
 }
diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp
@@ -53,16 +53,7 @@ OutputVector translate_soft_max(const NodeContext& context) {
 
     auto mask_node = context.get_input(1);
 
-    std::shared_ptr<ov::Node> token_len = get_dimensions(input_node, {1});
-    // Try using Q-cur to retrieve the token length, so that the translation of SOFT_MAX
-    // does not depend on the result of the QK MatMul, so that QK matmul + softmax + qkv matmul
-    // can be fused into SDPA.
-    if (input_node->get_type_info() == ov::op::v0::Convert::get_type_info_static()) {
-        auto qk = input_node->get_input_node_shared_ptr(0);
-        if (qk->get_type_info() == ov::op::v0::MatMul::get_type_info_static()) {
-            token_len = get_dimensions(qk->get_input_node_shared_ptr(0), {1});
-        }
-    }
+    auto token_len = context.get_input("token_len");
     auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
     auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
     std::shared_ptr<ov::Node> mask_node_sliced =
diff --git a/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.hpp b/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.hpp
@@ -0,0 +1,29 @@
+#pragma once
+
+#include "mark_decompression_convert_constant_folding.hpp"
+#include "openvino/pass/matcher_pass.hpp"
+#include "openvino/core/visibility.hpp"
+
+#ifdef OPENVINO_STATIC_LIBRARY
+#    define TRANSFORMATIONS_API
+#else
+#    ifdef IMPLEMENT_OPENVINO_API
+#        define TRANSFORMATIONS_API OPENVINO_CORE_EXPORTS
+#    else
+#        define TRANSFORMATIONS_API OPENVINO_CORE_IMPORTS
+#    endif  // IMPLEMENT_OPENVINO_API
+#endif      // OPENVINO_STATIC_LIBRARY
+
+namespace ov {
+namespace pass {
+
+class TRANSFORMATIONS_API MarkCompressedFloatConstants;
+
+}  // namespace pass
+}  // namespace ov
+
+class ov::pass::MarkCompressedFloatConstants : public MatcherPass {
+public:
+    OPENVINO_MATCHER_PASS_RTTI("MarkCompressedFloatConstants");
+    MarkCompressedFloatConstants();
+};
diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp
@@ -27,6 +27,7 @@
 #include "ggml-openvino/openvino/utils.hpp"
 #include "input_model.hpp"
 #include "pass/fuse_to_sdpa.hpp"
+#include "pass/mark_decompression_convert_constant_folding.hpp"
 
 namespace ov {
 namespace frontend {
@@ -253,21 +254,24 @@ std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputMo
     return resulting_model;
 }
 
-void TranslateSession::apply_transformations(const std::shared_ptr<Model>& model) {
+std::shared_ptr<Model> TranslateSession::apply_transformations(std::shared_ptr<Model> model) {
     auto ggml_model_decoder = std::dynamic_pointer_cast<InputModel>(m_input_model)->get_model_decoder();
+    {
+        ov::pass::Manager manager;
+        manager.set_per_pass_validation(true);
+        manager.register_pass<ov::pass::MarkCompressedFloatConstants>();
+        manager.register_pass<ov::pass::ConstantFolding>();
+
+        if (!ggml_model_decoder->is_static()) {
+            const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names();
+            const auto kv_param_res_pairs = get_kv_param_res_pairs(model, kv_param_res_names);
+            manager.register_pass<ov::pass::MakeStateful>(kv_param_res_pairs);
+        }
 
-    ov::pass::Manager manager;
-    manager.set_per_pass_validation(true);
-    manager.register_pass<ov::pass::ConstantFolding>();
-
-    if (!ggml_model_decoder->is_static()) {
-        const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names();
-        const auto kv_param_res_pairs = get_kv_param_res_pairs(model, kv_param_res_names);
-        manager.register_pass<ov::pass::MakeStateful>(kv_param_res_pairs);
+        manager.register_pass<pass::FuseToSDPA>();
+        manager.run_passes(model);
     }
-
-    manager.register_pass<pass::FuseToSDPA>();
-    manager.run_passes(model);
+    return model;
 }
 
 }  // namespace ggml
diff --git a/ggml/src/ggml-openvino/openvino/translate_session.hpp b/ggml/src/ggml-openvino/openvino/translate_session.hpp
@@ -16,7 +16,7 @@ class TranslateSession {
     std::shared_ptr<Model> translate_graph(const frontend::InputModel::Ptr& input_model);
 
 private:
-    void apply_transformations(const std::shared_ptr<Model>& model);
+    std::shared_ptr<Model> apply_transformations(std::shared_ptr<Model> model);
     const frontend::InputModel::Ptr m_input_model;
     const std::unordered_map<std::string, CreatorFunction>& m_translator_map;
     std::shared_ptr<Model> m_ov_model;

Original file line number	Diff line number	Diff line change
`@@ -212,6 +212,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {`
`212`	`212`	`} else {`
`213`	`213`	`m_op_case = 1;`
`214`	`214`	`}`
	`215`	`+ break;`
`215`	`216`	`}`
`216`	`217`	`default:`
`217`	`218`	`break;`