
Commit a5436a5

Xia-Weiwen authored and pytorchmergebot committed
[CPU] add onednn context cache for qlinear to improve performance (pytorch#168150)
**Summary**

We observed significant framework overhead in `qlinear`. Calling a oneDNN primitive requires preparing a number of data structures as its arguments, and that preparation is expensive. Previously, these structures were cached in a context attached to the TorchScript graph, but Inductor does not support non-tensor data on its graph. This PR instead caches them in a static `std::unordered_map` whose key is the weight's data address (as an `int64_t`) and whose value is a struct holding everything needed to run the primitive. The cache is safe in the common case where the weight's data address does not change during inference and the weight data is not reused by different layers. Since we cannot guarantee that assumption, the feature is gated behind the environment variable `ONEDNN_CACHE_CONTEXT_UNSAFE`; users enable it at their own risk. We measured a >5% end-to-end performance gain running ViT with PT2E static quantization on a 6th-gen Intel Xeon CPU.

**Test plan**

```
pytest -sv test/test_quantization.py -k "qlinear and pt2e"
```

Pull Request resolved: pytorch#168150
Approved by: https://github.com/mingfeima, https://github.com/jerryzh168
1 parent ca3e8b3 commit a5436a5
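
The core mechanism the commit message describes is small: an environment-variable gate plus a function-local static `std::unordered_map` keyed by the weight's data address, consulted before the expensive primitive setup. Below is a minimal, self-contained C++ sketch of that pattern. It deliberately uses placeholder names (`Params`, `build_params`, `run_with_params`, `qlinear_like_op`) rather than the real PyTorch/oneDNN APIs, so it illustrates the caching idea only, not the actual implementation shown in the diff below.

```cpp
// Sketch of an env-gated, address-keyed parameter cache (placeholders only;
// Params / build_params / run_with_params are NOT PyTorch or oneDNN APIs).
#include <cstdint>
#include <cstdlib>
#include <string>
#include <unordered_map>

struct Params {
  // In the real code this holds the matmul primitive, exec args,
  // packed weight, scales/zero points, and scratchpad.
  int dummy_state = 0;
};

static bool cache_enabled() {
  // Gate the unsafe cache behind an environment variable, as the PR does.
  static const char* env = std::getenv("ONEDNN_CACHE_CONTEXT_UNSAFE");
  static const bool enabled = env != nullptr && std::string(env) == "1";
  return enabled;
}

static Params build_params(int64_t weight_addr) {
  // Placeholder for the expensive setup (primitive + args) on the slow path.
  return Params{static_cast<int>(weight_addr % 97)};
}

static void run_with_params(const Params& /*params*/) {
  // Placeholder for primitive.execute(...).
}

void qlinear_like_op(int64_t weight_addr) {
  // Key the cache by the weight's data address; this assumes the address is
  // stable during inference and not shared across layers (hence "unsafe").
  static std::unordered_map<int64_t, Params> params_map;
  if (cache_enabled()) {
    auto it = params_map.find(weight_addr);
    if (it != params_map.end()) {
      run_with_params(it->second);  // fast path: reuse cached context
      return;
    }
  }
  Params params = build_params(weight_addr);  // slow path: rebuild everything
  run_with_params(params);
  if (cache_enabled()) {
    params_map[weight_addr] = params;  // populate cache for the next call
  }
}

int main() {
  // With ONEDNN_CACHE_CONTEXT_UNSAFE=1 in the environment, the second call
  // for the same address takes the fast path.
  qlinear_like_op(0x1000);
  qlinear_like_op(0x1000);
  return 0;
}
```

As in the PR, the map is never invalidated, so reuse of a weight address by a different layer would silently return stale parameters; that is why the flag carries the "UNSAFE" suffix.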

File tree: 3 files changed (+125, -33 lines)


aten/src/ATen/native/quantized/cpu/OnednnUtils.h

Lines changed: 36 additions & 0 deletions
```diff
@@ -462,4 +462,40 @@ at::Tensor _qconv_prepack_onednn(
 
 #define FP8E4M3_MAX 448.0
 
+#define CACHE_ONEDNN_CONTEXT_FLAG "ONEDNN_CACHE_CONTEXT_UNSAFE"
+
+struct QlinearForwardParams {
+  dnnl::matmul primitive;
+  ideep::exec_args args;
+  ideep::tensor packed_weight;
+  ideep::tensor weight_scales;
+  std::optional<ideep::tensor> src_scale;
+  std::optional<ideep::tensor> src_zero_point;
+  std::optional<ideep::tensor> dst_scale;
+  std::optional<ideep::tensor> dst_zero_point;
+  std::optional<ideep::tensor> bias;
+  ideep::tensor scratchpad;
+
+  void init_args() {
+    args.insert({DNNL_ARG_WEIGHTS, packed_weight});
+    args.insert({DNNL_ARG_SCRATCHPAD, scratchpad});
+    if (bias.has_value()) {
+      args.insert({DNNL_ARG_BIAS, bias.value()});
+    }
+    if (src_scale.has_value()) {
+      args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, src_scale.value()});
+    }
+    if (dst_scale.has_value()) {
+      args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, dst_scale.value()});
+    }
+    args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, weight_scales});
+    if (src_zero_point.has_value()) {
+      args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC, src_zero_point.value()});
+    }
+    if (dst_zero_point.has_value()) {
+      args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_DST, dst_zero_point.value()});
+    }
+  }
+};
+
 #endif // #if AT_MKLDNN_ENABLED()
```

aten/src/ATen/native/quantized/cpu/qlinear.cpp

Lines changed: 53 additions & 16 deletions
```diff
@@ -1147,24 +1147,13 @@ static at::Tensor linear_int8_with_onednn_weight(
       dim == 2 ? input.contiguous() : input.reshape({-1, input.size(dim - 1)}).contiguous();
 
   auto src = at::native::itensor_from_tensor(input_contig);
-  auto packed_weight = at::native::itensor_from_mkldnn(onednn_weight);
-  int64_t K = input.size(dim - 1), M = input.numel() / K, N = packed_weight.get_dim(1);
+  int64_t K = input.size(dim - 1), M = input.numel() / K, N = onednn_weight.size(1);
 
   auto output_size = input.sizes().vec();
   output_size[dim - 1] = N;
 
-  std::optional<ideep::tensor> onednn_bias{std::nullopt};
   bool with_bias = bias.has_value();
-  at::Tensor bias_val_float;
-  if (with_bias) {
-    bias_val_float = bias.value().to(at::kFloat);
-    if (bias_val_float.dim() == 1) {
-      auto b_reshape = bias_val_float.reshape({1, bias_val_float.size(0)});
-      onednn_bias = at::native::itensor_view_from_dense(b_reshape);
-    } else {
-      onednn_bias = at::native::itensor_view_from_dense(bias_val_float);
-    }
-  }
+
   std::vector<int64_t> src_dims = {M, K};
   std::vector<int64_t> dst_dims = {M, N};
   auto out_dtype = output_dtype.has_value() ? output_dtype.value() : input.scalar_type();
@@ -1185,14 +1174,47 @@ static at::Tensor linear_int8_with_onednn_weight(
       at::native::itensor_view_from_dense(other.value().reshape({-1, other.value().size(dim - 1)})) :
       empty_tensor;
 
+  // Fast path with cache of params
+  static const char* env_var = std::getenv(CACHE_ONEDNN_CONTEXT_FLAG);
+  static const std::string cache_flag_str = env_var ? std::string(env_var) : "";
+  static const bool context_cache_enabled = cache_flag_str != "" && cache_flag_str == "1";
+  static std::unordered_map<int64_t, QlinearForwardParams> qlinear_forward_params_map;
+  int64_t weight_addr = at::native::data_ptr_from_mkldnn(onednn_weight);
+  if (context_cache_enabled) {
+    auto it = qlinear_forward_params_map.find(weight_addr);
+    if (it != qlinear_forward_params_map.end()) {
+      auto& params = it->second;
+      auto& args = params.args;
+      args[DNNL_ARG_SRC] = std::move(src);
+      args[DNNL_ARG_DST] = std::move(dst);
+      if (binary_post_op == "add") {
+        args[DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1] = std::move(src1);
+      }
+      params.primitive.execute(ideep::stream::default_stream(), args);
+      return dim == 2 ? output : output.resize_(output_size);
+    }
+  }
+
+  // Regular path
+  auto packed_weight = at::native::itensor_from_mkldnn(onednn_weight);
+  tensor onednn_bias;
+  if (with_bias) {
+    at::Tensor bias_val_float = bias.value();
+    if (bias_val_float.dim() == 1) {
+      auto b_reshape = bias_val_float.reshape({1, bias_val_float.size(0)});
+      onednn_bias = at::native::itensor_view_from_dense(b_reshape);
+    } else {
+      onednn_bias = at::native::itensor_view_from_dense(bias_val_float);
+    }
+  }
   // Create onednn primitive
   auto src_dtype = at::native::get_mkldnn_dtype(input.scalar_type());
   auto src_desc = tensor::desc(src_dims, src_dtype, ideep::format_tag::any);
   auto weights_desc = packed_weight.get_desc();
   auto dst_dtype = dst.get_data_type();
   auto dst_desc = tensor::desc(dst_dims, dst_dtype, ideep::format_tag::any);
   auto bias_desc = with_bias ?
-      tensor::desc(onednn_bias.value().get_dims(), ideep::data_type::f32, ideep::format_tag::any) :
+      tensor::desc(onednn_bias.get_dims(), onednn_bias.get_data_type(), ideep::format_tag::any) :
       empty_tensor_desc;
   // Get op attr for primitive
   // Note: output_scale & output_zero_point are for re-quantization of the final output.
@@ -1249,7 +1271,7 @@ static at::Tensor linear_int8_with_onednn_weight(
   args.insert({DNNL_ARG_DST, dst});
   args.insert({DNNL_ARG_SCRATCHPAD, scratchpad});
   if (with_bias) {
-    args.insert({DNNL_ARG_BIAS, onednn_bias.value()});
+    args.insert({DNNL_ARG_BIAS, onednn_bias});
   }
   tensor src_scales_t = tensor(ideep::scale_t(1, input_scale));
   tensor wei_scales_t = at::native::itensor_from_tensor(weight_scales);
@@ -1273,7 +1295,22 @@ static at::Tensor linear_int8_with_onednn_weight(
     args.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, src1});
   }
   primitive.execute(ideep::stream::default_stream(), args);
-  return dim == 2 ? output : output.reshape(output_size);
+  // Update cache if needed
+  if (context_cache_enabled) {
+    QlinearForwardParams params;
+    params.primitive = primitive;
+    params.packed_weight = expected_weight;
+    params.weight_scales = wei_scales_t;
+    params.src_scale = input_scale != 1.0f ? std::make_optional<tensor>(src_scales_t) : std::nullopt;
+    params.dst_scale = output_scale != 1.0f ? std::make_optional<tensor>(dst_scales_t) : std::nullopt;
+    params.src_zero_point = input_zero_point != 0 ? std::make_optional<tensor>(src_zp_t) : std::nullopt;
+    params.dst_zero_point = output_zero_point != 0 ? std::make_optional<tensor>(dst_zp_t) : std::nullopt;
+    params.bias = with_bias ? std::make_optional<tensor>(onednn_bias) : std::nullopt;
+    params.scratchpad = scratchpad;
+    params.init_args();
+    qlinear_forward_params_map[weight_addr] = params;
+  }
+  return dim == 2 ? output : output.resize_(output_size);
 }
 
 #if AT_MKLDNN_ACL_ENABLED()
```

test/quantization/core/test_quantized_op.py

Lines changed: 36 additions & 17 deletions
```diff
@@ -4563,7 +4563,11 @@ def _test_qlinear_pt2e_helper(
         post_op="none",
         unary_post_op_args=(),
         post_op_algorithms=("none",),
+        test_fast_path=False,
     ):
+        if test_fast_path:
+            import os
+            os.environ["ONEDNN_CACHE_CONTEXT_UNSAFE"] = "1"
         qlinear_prepack = torch.ops.onednn.qlinear_prepack
         linear_op = F.linear
         in_channels_list = [4, 8]
@@ -4615,12 +4619,14 @@ def _test_qlinear_pt2e_helper(
             qw_cpu = qw.int_repr()
             qw_packed = qlinear_prepack(qw_cpu, x.shape)
 
+            num_iter = 2 if test_fast_path else 1  # rerun to use cache
             if post_op in ("none", "relu", "gelu"):
-                qy_cpu = qlinear_op(
-                    qx_cpu, x_scale, x_zp, qw_packed, w_scales, w_zps,
-                    b, used_y_scale, used_y_zp, output_dtype,
-                    post_op, unary_post_op_args, post_op_algo
-                )
+                for _ in range(num_iter):
+                    qy_cpu = qlinear_op(
+                        qx_cpu, x_scale, x_zp, qw_packed, w_scales, w_zps,
+                        b, used_y_scale, used_y_zp, output_dtype,
+                        post_op, unary_post_op_args, post_op_algo
+                    )
                 if post_op == "relu":
                     y_ref = F.relu(y_ref)
                 elif post_op == "gelu":
@@ -4637,12 +4643,14 @@ def _test_qlinear_pt2e_helper(
                 accum = qx2.int_repr() if output_dtype is None else qx2.dequantize()
                 if bfloat16_out:
                     accum = accum.bfloat16()
-                qy_cpu = qlinear_op(
-                    qx_cpu, x_scale, x_zp, qw_packed, w_scales, w_zps,
-                    accum, b, used_y_scale, used_y_zp, output_dtype,
-                    x2_scale, x2_zp, "sum", binary_alpha,
-                    unary_post_op, unary_post_op_args, post_op_algo
-                )
+                for _ in range(num_iter):
+                    # clone accum otherwise it gets accumulated multiple times
+                    qy_cpu = qlinear_op(
+                        qx_cpu, x_scale, x_zp, qw_packed, w_scales, w_zps,
+                        accum.clone(), b, used_y_scale, used_y_zp, output_dtype,
+                        x2_scale, x2_zp, "sum", binary_alpha,
+                        unary_post_op, unary_post_op_args, post_op_algo
+                    )
                 y_ref = y_ref + x2 * binary_alpha
                 if unary_post_op == "relu":
                     y_ref = F.relu(y_ref)
@@ -4655,12 +4663,13 @@ def _test_qlinear_pt2e_helper(
                 x2 = torch.randn(y_ref.size()) * 10
                 unary_post_op = "relu" if post_op == "add_relu" else "none"
                 binary_alpha = 1.0  # we only support alpha=1.0 now
-                qy_cpu = qlinear_op(
-                    qx_cpu, x_scale, x_zp, qw_packed, w_scales, w_zps,
-                    x2, b, used_y_scale, used_y_zp, output_dtype,
-                    1.0, 0, "add", binary_alpha,
-                    unary_post_op, unary_post_op_args, post_op_algo
-                )
+                for _ in range(num_iter):
+                    qy_cpu = qlinear_op(
+                        qx_cpu, x_scale, x_zp, qw_packed, w_scales, w_zps,
+                        x2, b, used_y_scale, used_y_zp, output_dtype,
+                        1.0, 0, "add", binary_alpha,
+                        unary_post_op, unary_post_op_args, post_op_algo
+                    )
                 y_ref = y_ref + x2 * binary_alpha
                 if unary_post_op == "relu":
                     y_ref = F.relu(y_ref)
@@ -4686,48 +4695,58 @@ def _test_qlinear_pt2e_helper(
                     y_s: {y_scale}, y_zp: {y_zp}""",
                 )
 
+        if test_fast_path:
+            del os.environ["ONEDNN_CACHE_CONTEXT_UNSAFE"]
+
     @unittest.skipIf(IS_FBCODE, "Skip pt2e ops in fbcode")
     @skipIfNoONEDNN
     def test_qlinear_pt2e(self):
         qlinear = torch.ops.onednn.qlinear_pointwise
         self._test_qlinear_pt2e_helper(qlinear, "none")
+        self._test_qlinear_pt2e_helper(qlinear, "none", test_fast_path=True)
 
     @unittest.skipIf(IS_FBCODE, "Skip pt2e ops in fbcode")
     @skipIfNoONEDNN
     def test_qlinear_relu_pt2e(self):
         qlinear = torch.ops.onednn.qlinear_pointwise
         self._test_qlinear_pt2e_helper(qlinear, "relu")
+        self._test_qlinear_pt2e_helper(qlinear, "relu", test_fast_path=True)
 
     @unittest.skipIf(IS_FBCODE, "Skip pt2e ops in fbcode")
     @skipIfNoONEDNN
     def test_qlinear_gelu_pt2e(self):
         qlinear = torch.ops.onednn.qlinear_pointwise
         post_op_algorithms = ['none', 'tanh']
         self._test_qlinear_pt2e_helper(qlinear, "gelu", post_op_algorithms=post_op_algorithms)
+        self._test_qlinear_pt2e_helper(qlinear, "gelu", post_op_algorithms=post_op_algorithms, test_fast_path=True)
 
     @unittest.skipIf(IS_FBCODE, "Skip pt2e ops in fbcode")
     @skipIfNoONEDNN
     def test_qlinear_sum_pt2e(self):
         qlinear = torch.ops.onednn.qlinear_pointwise.binary
         self._test_qlinear_pt2e_helper(qlinear, "sum")
+        self._test_qlinear_pt2e_helper(qlinear, "sum", test_fast_path=True)
 
     @unittest.skipIf(IS_FBCODE, "Skip pt2e ops in fbcode")
     @skipIfNoONEDNN
     def test_qlinear_sum_relu_pt2e(self):
         qlinear = torch.ops.onednn.qlinear_pointwise.binary
         self._test_qlinear_pt2e_helper(qlinear, "sum_relu")
+        self._test_qlinear_pt2e_helper(qlinear, "sum_relu", test_fast_path=True)
 
     @unittest.skipIf(IS_FBCODE, "Skip pt2e ops in fbcode")
     @skipIfNoONEDNN
     def test_qlinear_add_pt2e(self):
         qlinear = torch.ops.onednn.qlinear_pointwise.binary
         self._test_qlinear_pt2e_helper(qlinear, "add")
+        self._test_qlinear_pt2e_helper(qlinear, "add", test_fast_path=True)
 
     @unittest.skipIf(IS_FBCODE, "Skip pt2e ops in fbcode")
     @skipIfNoONEDNN
     def test_qlinear_add_relu_pt2e(self):
         qlinear = torch.ops.onednn.qlinear_pointwise.binary
         self._test_qlinear_pt2e_helper(qlinear, "add_relu")
+        self._test_qlinear_pt2e_helper(qlinear, "add_relu", test_fast_path=True)
 
     def _test_qlinear_fp8_helper(
         self,
```
