Skip to content

Commit 4ec09e4

Browse files
committed
Working q4_0
1 parent eb7150a commit 4ec09e4

File tree

5 files changed

+246
-74
lines changed

5 files changed

+246
-74
lines changed

ggml/src/ggml-webgpu/ggml-webgpu.cpp

Lines changed: 53 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -92,11 +92,11 @@
9292
#define WEBGPU_MUL_MAT_SUBGROUP_MATRIX_M 4
9393
#define WEBGPU_MUL_MAT_SUBGROUP_MATRIX_N 2
9494

95-
// gemv parameters
96-
#define WEBGPU_GEMV_WG_SIZE 256
97-
// Must be multiple of 4 to work with vectorized paths, and must divide gemv wg size
98-
#define WEBGPU_GEMV_OUTPUTS_PER_WG 16
99-
#define WEBGPU_GEMV_TILE_K 128
95+
// Matrix-vector multiplication parameters
96+
#define WEBGPU_MUL_MAT_VEC_WG_SIZE 256
97+
// Must be multiple of 4 to work with vectorized paths, and must divide mul_mat_vec wg size
98+
#define WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG 64
99+
#define WEBGPU_MUL_MAT_VEC_TILE_K 256
100100

101101
/* End Constants */
102102

@@ -278,7 +278,7 @@ struct webgpu_context_struct {
278278
webgpu_pipeline memset_pipeline;
279279

280280
std::map<int, std::map<int, std::map<int, webgpu_pipeline>>> mul_mat_pipelines; // src0_type, src1_type, vectorized
281-
std::map<int, std::map<int, std::map<int, webgpu_pipeline>>> gemv_pipelines; // src0_type, src1_type, vectorized
281+
std::map<int, std::map<int, std::map<int, webgpu_pipeline>>> mul_mat_vec_pipelines; // src0_type, src1_type, vectorized
282282

283283
webgpu_pipeline mul_mat_pipeline[30][2];
284284
webgpu_pipeline set_rows_pipeline[1][2]; // dst->type, vectorized
@@ -957,6 +957,7 @@ static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx,
957957
switch (src0->type) {
958958
case GGML_TYPE_F32:
959959
case GGML_TYPE_F16:
960+
case GGML_TYPE_Q4_0:
960961
use_fast = true;
961962
break;
962963
default:
@@ -970,9 +971,11 @@ static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx,
970971
if (use_fast) {
971972
int vectorized = src0->ne[0] % 4 == 0 && dst->ne[0] % 4 == 0 && dst->ne[1] % 4 == 0;
972973
if (dst->ne[1] == 1) {
973-
pipeline = ctx->gemv_pipelines[src0->type][src1->type][vectorized];
974+
// We don't support vectorized mul_mat_vec for quantized types
975+
vectorized = vectorized && (src0->type < 2);
976+
pipeline = ctx->mul_mat_vec_pipelines[src0->type][src1->type][vectorized];
974977
uint32_t batches = dst->ne[2] * dst->ne[3];
975-
uint32_t output_groups = (dst->ne[0] + WEBGPU_GEMV_OUTPUTS_PER_WG - 1) / WEBGPU_GEMV_OUTPUTS_PER_WG;
978+
uint32_t output_groups = (dst->ne[0] + WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG - 1) / WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG;
976979
uint32_t total_wg = output_groups * batches;
977980
wg_x = total_wg % ctx->limits.maxComputeWorkgroupsPerDimension;
978981
wg_y = (total_wg + ctx->limits.maxComputeWorkgroupsPerDimension - 1) /
@@ -1777,6 +1780,10 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
17771780
ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f16, sg_matrix_repls);
17781781
std::string proc_mul_mat_subgroup_matrix_f16_f16_vec =
17791782
ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f16_vec, sg_matrix_repls);
1783+
std::string proc_mul_mat_subgroup_matrix_q4_0_f32 =
1784+
ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_q4_0_f32, sg_matrix_repls);
1785+
std::string proc_mul_mat_subgroup_matrix_q4_0_f32_vec =
1786+
ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_q4_0_f32_vec, sg_matrix_repls);
17801787

17811788
webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
17821789
webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f32_f32.c_str(), "mul_mat_subgroup_matrix_f32_f32");
@@ -1793,6 +1800,11 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
17931800
webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] =
17941801
ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f16_vec.c_str(),
17951802
"mul_mat_subgroup_matrix_f16_f16_vec");
1803+
webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
1804+
webgpu_ctx->device, proc_mul_mat_subgroup_matrix_q4_0_f32.c_str(), "mul_mat_subgroup_matrix_q4_0_f32");
1805+
webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][1] =
1806+
ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_q4_0_f32_vec.c_str(),
1807+
"mul_mat_subgroup_matrix_q4_0_f32_vec");
17961808
} else {
17971809
std::vector<wgpu::ConstantEntry> mul_mat_reg_tile_constants(3);
17981810
mul_mat_reg_tile_constants[0].key = "TILE_K";
@@ -1820,6 +1832,10 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
18201832
ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f16, reg_repls);
18211833
std::string proc_mul_mat_reg_tile_f16_f16_vec =
18221834
ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f16_vec, reg_repls);
1835+
std::string proc_mul_mat_reg_tile_q4_0_f32 =
1836+
ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_q4_0_f32, reg_repls);
1837+
std::string proc_mul_mat_reg_tile_q4_0_f32_vec =
1838+
ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_q4_0_f32_vec, reg_repls);
18231839

18241840
webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] =
18251841
ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f32_f32.c_str(),
@@ -1839,28 +1855,37 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
18391855
webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] =
18401856
ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f16_vec.c_str(),
18411857
"mul_mat_reg_tile_f16_f16_vec", mul_mat_reg_tile_constants);
1858+
webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] =
1859+
ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_q4_0_f32.c_str(),
1860+
"mul_mat_reg_tile_q4_0_f32", mul_mat_reg_tile_constants);
1861+
webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][1] =
1862+
ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_q4_0_f32_vec.c_str(),
1863+
"mul_mat_reg_tile_q4_0_f32_vec", mul_mat_reg_tile_constants);
1864+
18421865
}
18431866

1844-
std::vector<wgpu::ConstantEntry> gemv_constants(3);
1845-
gemv_constants[0].key = "WORKGROUP_SIZE";
1846-
gemv_constants[0].value = WEBGPU_GEMV_WG_SIZE;
1847-
gemv_constants[1].key = "TILE_K";
1848-
gemv_constants[1].value = WEBGPU_GEMV_TILE_K;
1849-
gemv_constants[2].key = "OUTPUTS_PER_WG";
1850-
gemv_constants[2].value = WEBGPU_GEMV_OUTPUTS_PER_WG;
1851-
1852-
webgpu_ctx->gemv_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] =
1853-
ggml_webgpu_create_pipeline2(webgpu_ctx->device, wgsl_gemv_f32_f32, "gemv_f32_f32", gemv_constants);
1854-
webgpu_ctx->gemv_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] =
1855-
ggml_webgpu_create_pipeline2(webgpu_ctx->device, wgsl_gemv_f32_f32_vec, "gemv_f32_f32_vec", gemv_constants);
1856-
webgpu_ctx->gemv_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] =
1857-
ggml_webgpu_create_pipeline2(webgpu_ctx->device, wgsl_gemv_f16_f32, "gemv_f16_f32", gemv_constants);
1858-
webgpu_ctx->gemv_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] =
1859-
ggml_webgpu_create_pipeline2(webgpu_ctx->device, wgsl_gemv_f16_f32_vec, "gemv_f16_f32_vec", gemv_constants);
1860-
webgpu_ctx->gemv_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] =
1861-
ggml_webgpu_create_pipeline2(webgpu_ctx->device, wgsl_gemv_f16_f16, "gemv_f16_f16", gemv_constants);
1862-
webgpu_ctx->gemv_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] =
1863-
ggml_webgpu_create_pipeline2(webgpu_ctx->device, wgsl_gemv_f16_f16_vec, "gemv_f16_f16_vec", gemv_constants);
1867+
std::vector<wgpu::ConstantEntry> mul_mat_vec_constants(3);
1868+
mul_mat_vec_constants[0].key = "WORKGROUP_SIZE";
1869+
mul_mat_vec_constants[0].value = WEBGPU_MUL_MAT_VEC_WG_SIZE;
1870+
mul_mat_vec_constants[1].key = "TILE_K";
1871+
mul_mat_vec_constants[1].value = WEBGPU_MUL_MAT_VEC_TILE_K;
1872+
mul_mat_vec_constants[2].key = "OUTPUTS_PER_WG";
1873+
mul_mat_vec_constants[2].value = WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG;
1874+
1875+
webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] =
1876+
ggml_webgpu_create_pipeline2(webgpu_ctx->device, wgsl_mul_mat_vec_f32_f32, "mul_mat_vec_f32_f32", mul_mat_vec_constants);
1877+
webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] =
1878+
ggml_webgpu_create_pipeline2(webgpu_ctx->device, wgsl_mul_mat_vec_f32_f32_vec, "mul_mat_vec_f32_f32_vec", mul_mat_vec_constants);
1879+
webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] =
1880+
ggml_webgpu_create_pipeline2(webgpu_ctx->device, wgsl_mul_mat_vec_f16_f32, "mul_mat_vec_f16_f32", mul_mat_vec_constants);
1881+
webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] =
1882+
ggml_webgpu_create_pipeline2(webgpu_ctx->device, wgsl_mul_mat_vec_f16_f32_vec, "mul_mat_vec_f16_f32_vec", mul_mat_vec_constants);
1883+
webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] =
1884+
ggml_webgpu_create_pipeline2(webgpu_ctx->device, wgsl_mul_mat_vec_f16_f16, "mul_mat_vec_f16_f16", mul_mat_vec_constants);
1885+
webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] =
1886+
ggml_webgpu_create_pipeline2(webgpu_ctx->device, wgsl_mul_mat_vec_f16_f16_vec, "mul_mat_vec_f16_f16_vec", mul_mat_vec_constants);
1887+
webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] =
1888+
ggml_webgpu_create_pipeline2(webgpu_ctx->device, wgsl_mul_mat_vec_q4_0_f32, "mul_mat_vec_q4_0_f32", mul_mat_vec_constants);
18641889
}
18651890

18661891
static void ggml_webgpu_init_set_rows_pipeline(webgpu_context & webgpu_ctx) {

ggml/src/ggml-webgpu/wgsl-shaders/mat_mul_decls.tmpl

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ fn store_shmem(val: f16, idx: u32) {
1313
}
1414
#enddecl(SHMEM_SCALAR)
1515

16-
#decl(INIT_SHMEM_FLOAT)
16+
#decl(INIT_SRC0_SHMEM_FLOAT)
1717

1818
fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
1919
for (var elem_idx = thread_id * {{VEC_SIZE}}; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE * {{VEC_SIZE}}) {
@@ -30,6 +30,10 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
3030
}
3131
}
3232

33+
#enddecl(INIT_SRC0_SHMEM_FLOAT)
34+
35+
#decl(INIT_SRC1_SHMEM)
36+
3337
fn init_shmem_src1(thread_id: u32, batch_offset: u32, offset_n: u32, k_outer: u32) {
3438
for (var elem_idx = thread_id * {{VEC_SIZE}}; elem_idx < TILE_SRC1_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE * {{VEC_SIZE}}) {
3539
let tile_n = elem_idx / TILE_K;
@@ -45,5 +49,50 @@ fn init_shmem_src1(thread_id: u32, batch_offset: u32, offset_n: u32, k_outer: u3
4549
}
4650
}
4751

48-
#enddecl(INIT_SHMEM_FLOAT)
52+
#enddecl(INIT_SRC1_SHMEM)
53+
54+
#decl(INIT_SRC0_SHMEM_Q4_0)
55+
56+
const BLOCK_SIZE = 32u;
57+
// the number of blocks per k-tile. Note that this currently only works if TILE_K is a multiple of BLOCK_SIZE, which may need to be rethought for larger quantized types.
58+
override BLOCKS_K = TILE_K/BLOCK_SIZE;
59+
const NQ = 16u;
60+
const F16_PER_BLOCK = 9u; // 1 scale + 8x4 packed weights
61+
const WEIGHTS_PER_F16 = 4u; // 4 weights per f16
62+
const F16_PER_THREAD = NQ / WEIGHTS_PER_F16;
63+
64+
fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
65+
for (var i = thread_id * NQ; i < TILE_SRC0_SHMEM; i += TOTAL_WORKGROUP_SIZE * NQ) {
66+
let blck_idx = i / BLOCK_SIZE;
67+
let block_offset = (i % BLOCK_SIZE) / WEIGHTS_PER_F16;
68+
let shmem_idx = blck_idx * BLOCK_SIZE + block_offset * 2u;
69+
70+
let tile_m = blck_idx / BLOCKS_K;
71+
let global_m = offset_m + tile_m;
72+
let block_k = blck_idx % BLOCKS_K;
73+
let global_k = k_outer / BLOCK_SIZE + block_k;
74+
75+
if (global_m < params.m && global_k < params.k / BLOCK_SIZE) {
76+
let src0_idx = batch_offset + global_m * params.stride_01 + global_k;
77+
let scale_idx = src0_idx * F16_PER_BLOCK;
78+
let d = src0[scale_idx];
79+
80+
for (var j = 0u; j < F16_PER_THREAD; j += 2) {
81+
let q_0 = src0[scale_idx + 1u + block_offset + j];
82+
let q_1 = src0[scale_idx + 1u + block_offset + j + 1];
83+
84+
let q_packed = bitcast<u32>(vec2(q_0, q_1));
85+
for (var k = 0u; k < 4u; k++) {
86+
let q_byte = get_byte(q_packed, k);
87+
let q_hi = (f16((q_byte >> 4) & 0xF) - 8.0) * d;
88+
let q_lo = (f16(q_byte & 0xF) - 8.0) * d;
89+
shmem[shmem_idx + j * 2 + k] = q_lo;
90+
shmem[shmem_idx + j * 2 + k + 16u] = q_hi;
91+
}
92+
}
93+
}
94+
}
95+
}
96+
97+
#enddecl(INIT_SRC0_SHMEM_Q4_0)
4998

ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl

Lines changed: 34 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@
77
"SRC1_TYPE" : "vec4<f32>",
88
"DST_TYPE" : "vec4<f32>",
99
"SHMEM_TYPE" : "vec4<f16>",
10-
"VEC_SIZE" : "4",
10+
"VEC_SIZE" : 4,
1111
},
12-
"DECLS": ["VEC", "SHMEM_VEC", "INIT_SHMEM_FLOAT"]
12+
"DECLS": ["VEC", "SHMEM_VEC", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"]
1313
},
1414
{
1515
"SHADER_SUFFIX": "f32_f32",
@@ -18,9 +18,9 @@
1818
"SRC1_TYPE" : "f32",
1919
"DST_TYPE" : "f32",
2020
"SHMEM_TYPE" : "f16",
21-
"VEC_SIZE" : "1",
21+
"VEC_SIZE" : 1,
2222
},
23-
"DECLS": ["SCALAR", "SHMEM_SCALAR", "INIT_SHMEM_FLOAT"]
23+
"DECLS": ["SCALAR", "SHMEM_SCALAR", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"]
2424
},
2525
{
2626
"SHADER_SUFFIX": "f16_f32_vec",
@@ -29,9 +29,9 @@
2929
"SRC1_TYPE" : "vec4<f32>",
3030
"DST_TYPE" : "vec4<f32>",
3131
"SHMEM_TYPE" : "vec4<f16>",
32-
"VEC_SIZE" : "4",
32+
"VEC_SIZE" : 4,
3333
},
34-
"DECLS": ["VEC", "SHMEM_VEC", "INIT_SHMEM_FLOAT"]
34+
"DECLS": ["VEC", "SHMEM_VEC", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"]
3535
},
3636
{
3737
"SHADER_SUFFIX": "f16_f32",
@@ -40,9 +40,9 @@
4040
"SRC1_TYPE" : "f32",
4141
"DST_TYPE" : "f32",
4242
"SHMEM_TYPE" : "f16",
43-
"VEC_SIZE" : "1",
43+
"VEC_SIZE" : 1,
4444
},
45-
"DECLS": ["SCALAR", "SHMEM_SCALAR", "INIT_SHMEM_FLOAT"]
45+
"DECLS": ["SCALAR", "SHMEM_SCALAR", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"]
4646
},
4747
{
4848
"SHADER_SUFFIX": "f16_f16_vec",
@@ -51,9 +51,9 @@
5151
"SRC1_TYPE" : "vec4<f16>",
5252
"DST_TYPE" : "vec4<f32>",
5353
"SHMEM_TYPE" : "vec4<f16>",
54-
"VEC_SIZE" : "4",
54+
"VEC_SIZE" : 4,
5555
},
56-
"DECLS": ["VEC", "SHMEM_VEC", "INIT_SHMEM_FLOAT"]
56+
"DECLS": ["VEC", "SHMEM_VEC", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"]
5757
},
5858
{
5959
"SHADER_SUFFIX": "f16_f16",
@@ -62,9 +62,31 @@
6262
"SRC1_TYPE" : "f16",
6363
"DST_TYPE" : "f32",
6464
"SHMEM_TYPE" : "f16",
65-
"VEC_SIZE" : "1",
65+
"VEC_SIZE" : 1,
6666
},
67-
"DECLS": ["SCALAR", "SHMEM_SCALAR", "INIT_SHMEM_FLOAT"]
67+
"DECLS": ["SCALAR", "SHMEM_SCALAR", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"]
68+
},
69+
{
70+
"SHADER_SUFFIX": "q4_0_f32_vec",
71+
"REPLS": {
72+
"SRC0_TYPE" : "f16",
73+
"SRC1_TYPE" : "vec4<f32>",
74+
"DST_TYPE" : "vec4<f32>",
75+
"SHMEM_TYPE" : "vec4<f16>",
76+
"VEC_SIZE" : 4,
77+
},
78+
"DECLS": ["BYTE_HELPERS", "VEC", "SHMEM_VEC", "INIT_SRC0_SHMEM_Q4_0", "INIT_SRC1_SHMEM"]
79+
},
80+
{
81+
"SHADER_SUFFIX": "q4_0_f32",
82+
"REPLS": {
83+
"SRC0_TYPE" : "f16",
84+
"SRC1_TYPE" : "f32",
85+
"DST_TYPE" : "f32",
86+
"SHMEM_TYPE" : "f16",
87+
"VEC_SIZE" : 1,
88+
},
89+
"DECLS": ["BYTE_HELPERS", "SCALAR", "SHMEM_SCALAR", "INIT_SRC0_SHMEM_Q4_0", "INIT_SRC1_SHMEM"]
6890
}
6991
]
7092

0 commit comments

Comments (0)