@@ -278,7 +278,8 @@ struct webgpu_context_struct {
     webgpu_pipeline memset_pipeline;
 
     std::map<int, std::map<int, std::map<int, webgpu_pipeline>>> mul_mat_pipelines;      // src0_type, src1_type, vectorized
-    std::map<int, std::map<int, std::map<int, webgpu_pipeline>>> mul_mat_vec_pipelines;  // src0_type, src1_type, vectorized
+    std::map<int, std::map<int, std::map<int, webgpu_pipeline>>>
+        mul_mat_vec_pipelines;  // src0_type, src1_type, vectorized
 
     webgpu_pipeline mul_mat_pipeline[30][2];
     webgpu_pipeline set_rows_pipeline[1][2];  // dst->type, vectorized
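For context, the nested map above forms a pipeline cache keyed by (src0_type, src1_type, vectorized). A minimal sketch of the lookup pattern such a cache supports, assuming ggml type enums are used as the integer keys; the stub type and helper name below are illustrative, not from the patch:

```cpp
#include <map>

struct webgpu_pipeline_stub {};  // stand-in for the backend's webgpu_pipeline

using pipeline_cache = std::map<int, std::map<int, std::map<int, webgpu_pipeline_stub>>>;

// Lookup-or-create: std::map::operator[] default-constructs missing entries,
// so a slot appears the first time a (src0_type, src1_type, vectorized) combination is seen.
static webgpu_pipeline_stub & lookup_pipeline(pipeline_cache & cache, int src0_type, int src1_type, bool vectorized) {
    return cache[src0_type][src1_type][vectorized ? 1 : 0];
}
```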
@@ -972,13 +973,14 @@ static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx,
     int vectorized = src0->ne[0] % 4 == 0 && dst->ne[0] % 4 == 0 && dst->ne[1] % 4 == 0;
     if (dst->ne[1] == 1) {
         // We don't support vectorized mul_mat_vec for quantized types
-        vectorized             = vectorized && (src0->type < 2);
-        pipeline               = ctx->mul_mat_vec_pipelines[src0->type][src1->type][vectorized];
-        uint32_t batches       = dst->ne[2] * dst->ne[3];
-        uint32_t output_groups = (dst->ne[0] + WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG - 1) / WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG;
-        uint32_t total_wg      = output_groups * batches;
-        wg_x                   = total_wg % ctx->limits.maxComputeWorkgroupsPerDimension;
-        wg_y                   = (total_wg + ctx->limits.maxComputeWorkgroupsPerDimension - 1) /
+        vectorized = vectorized && (src0->type < 2);
+        pipeline   = ctx->mul_mat_vec_pipelines[src0->type][src1->type][vectorized];
+        uint32_t batches       = dst->ne[2] * dst->ne[3];
+        uint32_t output_groups =
+            (dst->ne[0] + WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG - 1) / WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG;
+        uint32_t total_wg = output_groups * batches;
+        wg_x              = total_wg % ctx->limits.maxComputeWorkgroupsPerDimension;
+        wg_y              = (total_wg + ctx->limits.maxComputeWorkgroupsPerDimension - 1) /
                             ctx->limits.maxComputeWorkgroupsPerDimension;
     } else {
         pipeline = ctx->mul_mat_pipelines[src0->type][src1->type][vectorized];
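The dispatch math in this hunk leans on the ceiling-division idiom `(n + k - 1) / k` to count how many workgroups are needed when each one produces WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG outputs. A small self-contained sketch of that arithmetic; the helper name and the worked numbers are illustrative, not from the patch:

```cpp
#include <cstdint>

// Number of workgroups needed to cover `n` outputs when each workgroup
// produces `per_wg` of them: integer ceiling division.
static uint32_t ceil_div(uint32_t n, uint32_t per_wg) {
    return (n + per_wg - 1) / per_wg;
}

// Example: 130 output rows with 16 outputs per workgroup -> ceil_div(130, 16) == 9
// (8 full workgroups cover 128 rows, a ninth covers the remaining 2).
```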
@@ -1861,7 +1863,6 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
         webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][1] =
             ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_q4_0_f32_vec.c_str(),
                                          "mul_mat_reg_tile_q4_0_f32_vec", mul_mat_reg_tile_constants);
-
     }
 
     std::vector<wgpu::ConstantEntry> mul_mat_vec_constants(3);
@@ -1872,20 +1873,20 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
     mul_mat_vec_constants[2].key   = "OUTPUTS_PER_WG";
     mul_mat_vec_constants[2].value = WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG;
 
-    webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] =
-        ggml_webgpu_create_pipeline2(webgpu_ctx->device, wgsl_mul_mat_vec_f32_f32, "mul_mat_vec_f32_f32", mul_mat_vec_constants);
-    webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] =
-        ggml_webgpu_create_pipeline2(webgpu_ctx->device, wgsl_mul_mat_vec_f32_f32_vec, "mul_mat_vec_f32_f32_vec", mul_mat_vec_constants);
-    webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] =
-        ggml_webgpu_create_pipeline2(webgpu_ctx->device, wgsl_mul_mat_vec_f16_f32, "mul_mat_vec_f16_f32", mul_mat_vec_constants);
-    webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] =
-        ggml_webgpu_create_pipeline2(webgpu_ctx->device, wgsl_mul_mat_vec_f16_f32_vec, "mul_mat_vec_f16_f32_vec", mul_mat_vec_constants);
-    webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] =
-        ggml_webgpu_create_pipeline2(webgpu_ctx->device, wgsl_mul_mat_vec_f16_f16, "mul_mat_vec_f16_f16", mul_mat_vec_constants);
-    webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] =
-        ggml_webgpu_create_pipeline2(webgpu_ctx->device, wgsl_mul_mat_vec_f16_f16_vec, "mul_mat_vec_f16_f16_vec", mul_mat_vec_constants);
-    webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] =
-        ggml_webgpu_create_pipeline2(webgpu_ctx->device, wgsl_mul_mat_vec_q4_0_f32, "mul_mat_vec_q4_0_f32", mul_mat_vec_constants);
+    webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, wgsl_mul_mat_vec_f32_f32, "mul_mat_vec_f32_f32", mul_mat_vec_constants);
+    webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, wgsl_mul_mat_vec_f32_f32_vec, "mul_mat_vec_f32_f32_vec", mul_mat_vec_constants);
+    webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, wgsl_mul_mat_vec_f16_f32, "mul_mat_vec_f16_f32", mul_mat_vec_constants);
+    webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, wgsl_mul_mat_vec_f16_f32_vec, "mul_mat_vec_f16_f32_vec", mul_mat_vec_constants);
+    webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, wgsl_mul_mat_vec_f16_f16, "mul_mat_vec_f16_f16", mul_mat_vec_constants);
+    webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, wgsl_mul_mat_vec_f16_f16_vec, "mul_mat_vec_f16_f16_vec", mul_mat_vec_constants);
+    webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, wgsl_mul_mat_vec_q4_0_f32, "mul_mat_vec_q4_0_f32", mul_mat_vec_constants);
 }
 
 static void ggml_webgpu_init_set_rows_pipeline(webgpu_context & webgpu_ctx) {
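The `wgpu::ConstantEntry` list filled just above carries WGSL override constants into the pipeline-creation helper. A hedged sketch of how such a list can be built, assuming Dawn's C++ headers; only the "OUTPUTS_PER_WG" key is visible in this hunk, so the other key name below is a placeholder:

```cpp
#include <vector>
#include <webgpu/webgpu_cpp.h>

// Build an override-constant list like the one passed to ggml_webgpu_create_pipeline2.
// Only "OUTPUTS_PER_WG" appears in the hunk above; the other key name is assumed.
static std::vector<wgpu::ConstantEntry> make_mul_mat_vec_constants(double wg_size, double outputs_per_wg) {
    std::vector<wgpu::ConstantEntry> constants(2);
    constants[0].key   = "WORKGROUP_SIZE";  // assumed key name, not shown in this diff
    constants[0].value = wg_size;
    constants[1].key   = "OUTPUTS_PER_WG";  // key shown in the hunk above
    constants[1].value = outputs_per_wg;
    return constants;
}
```

In the patch itself the vector holds three entries and is reused for every mul_mat_vec pipeline variant created in this function.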
@@ -2382,12 +2383,12 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
     webgpu_context ctx = reg_ctx->webgpu_ctx;
 
     // TODO: track need for these toggles: https://issues.chromium.org/issues/42251215
-    const char * const adapterEnabledToggles[] = { "vulkan_enable_f16_on_nvidia", "use_vulkan_memory_model" };
+    const char * const          adapterEnabledToggles[] = { "vulkan_enable_f16_on_nvidia", "use_vulkan_memory_model" };
     wgpu::DawnTogglesDescriptor adapterTogglesDesc;
-    adapterTogglesDesc.enabledToggles    = adapterEnabledToggles;
-    adapterTogglesDesc.enabledToggleCount = 2;
-    wgpu::RequestAdapterOptions options = {};
-    options.nextInChain = &adapterTogglesDesc;
+    adapterTogglesDesc.enabledToggles     = adapterEnabledToggles;
+    adapterTogglesDesc.enabledToggleCount = 2;
+    wgpu::RequestAdapterOptions options   = {};
+    options.nextInChain                   = &adapterTogglesDesc;
     ctx->instance.WaitAny(ctx->instance.RequestAdapter(
                               &options, wgpu::CallbackMode::AllowSpontaneous,
                               [&ctx](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message) {
@@ -2432,7 +2433,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
 
     // For subgroup matrix code to be the most efficient, we would like the subgroup size to be consistent and accurate.
     // Unfortunately, that is not possible, so we use the maximum subgroup size reported by the adapter.
-    ctx->subgroup_size = info.subgroupMaxSize;
+    ctx->subgroup_size            = info.subgroupMaxSize;
     ctx->supports_subgroup_matrix = valid_subgroup_matrix_config;
 
     // Initialize device