diff --git a/eigen b/eigen
index dde02fce..36b95962 160000
--- a/eigen
+++ b/eigen
@@ -1 +1 @@
-Subproject commit dde02fceedfc1ba09d4d4f71a2b5dafcfcb85491
+Subproject commit 36b95962756c1fce8e29b1f8bc45967f30773c00
diff --git a/python/thundersvm/thundersvm.dll b/python/thundersvm/thundersvm.dll
new file mode 100644
index 00000000..ad8cc526
Binary files /dev/null and b/python/thundersvm/thundersvm.dll differ
diff --git a/src/test/googletest b/src/test/googletest
index a325ad2d..df1544bc 160000
--- a/src/test/googletest
+++ b/src/test/googletest
@@ -1 +1 @@
-Subproject commit a325ad2db5deb623eab740527e559b81c0f39d65
+Subproject commit df1544bcee0c7ce35cd5ea0b3eb8cc81855a4140
diff --git a/src/thundersvm/kernel/kernelmatrix_kernel.cpp b/src/thundersvm/kernel/kernelmatrix_kernel.cpp
index f4187f92..7bc011ba 100644
--- a/src/thundersvm/kernel/kernelmatrix_kernel.cpp
+++ b/src/thundersvm/kernel/kernelmatrix_kernel.cpp
@@ -146,6 +146,65 @@ namespace svm_kernel {
         }
     }
 
+    /**
+     * One-vs-one evaluation on host buffers: for every instance and class pair (i, j), i < j, stores the
+     * decision value (coef . k_mat over the SV ranges of classes i and j, minus rho[k]) and casts one vote.
+     *
+     * @param coef            coefficient rows of length total_sv (row j - 1 for class i's range, row i for class j's)
+     * @param total_sv        row length of coef and k_mat
+     * @param sv_start        per-class start offset within a row
+     * @param sv_count        per-class number of entries starting at sv_start
+     * @param rho             value subtracted from the sum of pair k, pairs enumerated in (i, j) loop order
+     * @param k_mat           n_instances rows of total_sv kernel values
+     * @param predict_instant output: n_instances rows of n_classes * (n_classes - 1) / 2 decision values
+     * @param vote_device     n_instances rows of n_classes counters; incremented, never reset here
+     */
+    void sum_kernel_values_instant(const SyncArray<float_type>& coef, int total_sv, const SyncArray<int>& sv_start,
+        const SyncArray<int>& sv_count, const SyncArray<float_type>& rho,
+        const SyncArray<kernel_type>& k_mat,
+        SyncArray<float_type>& predict_instant, int n_classes, int n_instances,
+        SyncArray<float_type>& vote_device) {
+        const int* sv_start_data = sv_start.host_data();
+        const int* sv_count_data = sv_count.host_data();
+        const float_type* coef_data = coef.host_data();
+        const kernel_type* k_mat_data = k_mat.host_data();
+        float_type* predict_instant_data = predict_instant.host_data();
+        const float_type* rho_data = rho.host_data();
+        float_type* vote_device_data = vote_device.host_data();
+
+        const int n_binary_models = n_classes * (n_classes - 1) / 2;
+
+        // Parallelise over instances only: each iteration is the sole writer of row idx in both outputs, so
+        // no atomics are needed, and the inner dot products stay serial to avoid nested parallel regions.
+#pragma omp parallel for schedule(guided)
+        for (int idx = 0; idx < n_instances; idx++) {
+            // 64-bit row offsets: idx * total_sv can exceed the int range for large inputs.
+            const long long row = idx;
+            const kernel_type* k_values = k_mat_data + row * total_sv;
+            float_type* dec_values = predict_instant_data + row * n_binary_models;
+            float_type* votes = vote_device_data + row * n_classes;
+            int k = 0;
+            for (int i = 0; i < n_classes; ++i) {
+                for (int j = i + 1; j < n_classes; ++j) {
+                    const int si = sv_start_data[i], ci = sv_count_data[i];
+                    const int sj = sv_start_data[j], cj = sv_count_data[j];
+                    // Class i's range pairs with coefficient row j - 1; class j's range with row i.
+                    const float_type* coef1 = coef_data + static_cast<long long>(j - 1) * total_sv;
+                    const float_type* coef2 = coef_data + static_cast<long long>(i) * total_sv;
+
+                    double sum = 0.0;
+                    for (int l = 0; l < ci; ++l) sum += coef1[si + l] * k_values[si + l];
+                    for (int l = 0; l < cj; ++l) sum += coef2[sj + l] * k_values[sj + l];
+
+                    dec_values[k] = sum - rho_data[k];
+                    // A positive stored decision value votes for class i, anything else for class j.
+                    votes[dec_values[k] > 0 ? i : j] += 1;
+                    ++k;
+                }
+            }
+        }
+    }
+
     void dns_csr_mul(int m, int n, int k, const SyncArray<kernel_type> &dense_mat, const SyncArray<kernel_type> &csr_val,
                      const SyncArray<int> &csr_row_ptr, const SyncArray<int> &csr_col_ind, int nnz,
                      SyncArray<kernel_type> &result) {