Skip to content

Commit 9726640

Browse files
committed
Formatting
1 parent 4ec09e4 commit 9726640

File tree

4 files changed

+32
-32
lines changed

4 files changed

+32
-32
lines changed

ggml/src/ggml-webgpu/ggml-webgpu.cpp

Lines changed: 30 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,8 @@ struct webgpu_context_struct {
278278
webgpu_pipeline memset_pipeline;
279279

280280
std::map<int, std::map<int, std::map<int, webgpu_pipeline>>> mul_mat_pipelines; // src0_type, src1_type, vectorized
281-
std::map<int, std::map<int, std::map<int, webgpu_pipeline>>> mul_mat_vec_pipelines; // src0_type, src1_type, vectorized
281+
std::map<int, std::map<int, std::map<int, webgpu_pipeline>>>
282+
mul_mat_vec_pipelines; // src0_type, src1_type, vectorized
282283

283284
webgpu_pipeline mul_mat_pipeline[30][2];
284285
webgpu_pipeline set_rows_pipeline[1][2]; // dst->type, vectorized
@@ -972,13 +973,14 @@ static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx,
972973
int vectorized = src0->ne[0] % 4 == 0 && dst->ne[0] % 4 == 0 && dst->ne[1] % 4 == 0;
973974
if (dst->ne[1] == 1) {
974975
// We don't support vectorized mul_mat_vec for quantized types
975-
vectorized = vectorized && (src0->type < 2);
976-
pipeline = ctx->mul_mat_vec_pipelines[src0->type][src1->type][vectorized];
977-
uint32_t batches = dst->ne[2] * dst->ne[3];
978-
uint32_t output_groups = (dst->ne[0] + WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG - 1) / WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG;
979-
uint32_t total_wg = output_groups * batches;
980-
wg_x = total_wg % ctx->limits.maxComputeWorkgroupsPerDimension;
981-
wg_y = (total_wg + ctx->limits.maxComputeWorkgroupsPerDimension - 1) /
976+
vectorized = vectorized && (src0->type < 2);
977+
pipeline = ctx->mul_mat_vec_pipelines[src0->type][src1->type][vectorized];
978+
uint32_t batches = dst->ne[2] * dst->ne[3];
979+
uint32_t output_groups =
980+
(dst->ne[0] + WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG - 1) / WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG;
981+
uint32_t total_wg = output_groups * batches;
982+
wg_x = total_wg % ctx->limits.maxComputeWorkgroupsPerDimension;
983+
wg_y = (total_wg + ctx->limits.maxComputeWorkgroupsPerDimension - 1) /
982984
ctx->limits.maxComputeWorkgroupsPerDimension;
983985
} else {
984986
pipeline = ctx->mul_mat_pipelines[src0->type][src1->type][vectorized];
@@ -1861,7 +1863,6 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
18611863
webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][1] =
18621864
ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_q4_0_f32_vec.c_str(),
18631865
"mul_mat_reg_tile_q4_0_f32_vec", mul_mat_reg_tile_constants);
1864-
18651866
}
18661867

18671868
std::vector<wgpu::ConstantEntry> mul_mat_vec_constants(3);
@@ -1872,20 +1873,20 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
18721873
mul_mat_vec_constants[2].key = "OUTPUTS_PER_WG";
18731874
mul_mat_vec_constants[2].value = WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG;
18741875

1875-
webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] =
1876-
ggml_webgpu_create_pipeline2(webgpu_ctx->device, wgsl_mul_mat_vec_f32_f32, "mul_mat_vec_f32_f32", mul_mat_vec_constants);
1877-
webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] =
1878-
ggml_webgpu_create_pipeline2(webgpu_ctx->device, wgsl_mul_mat_vec_f32_f32_vec, "mul_mat_vec_f32_f32_vec", mul_mat_vec_constants);
1879-
webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] =
1880-
ggml_webgpu_create_pipeline2(webgpu_ctx->device, wgsl_mul_mat_vec_f16_f32, "mul_mat_vec_f16_f32", mul_mat_vec_constants);
1881-
webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] =
1882-
ggml_webgpu_create_pipeline2(webgpu_ctx->device, wgsl_mul_mat_vec_f16_f32_vec, "mul_mat_vec_f16_f32_vec", mul_mat_vec_constants);
1883-
webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] =
1884-
ggml_webgpu_create_pipeline2(webgpu_ctx->device, wgsl_mul_mat_vec_f16_f16, "mul_mat_vec_f16_f16", mul_mat_vec_constants);
1885-
webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] =
1886-
ggml_webgpu_create_pipeline2(webgpu_ctx->device, wgsl_mul_mat_vec_f16_f16_vec, "mul_mat_vec_f16_f16_vec", mul_mat_vec_constants);
1887-
webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] =
1888-
ggml_webgpu_create_pipeline2(webgpu_ctx->device, wgsl_mul_mat_vec_q4_0_f32, "mul_mat_vec_q4_0_f32", mul_mat_vec_constants);
1876+
webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
1877+
webgpu_ctx->device, wgsl_mul_mat_vec_f32_f32, "mul_mat_vec_f32_f32", mul_mat_vec_constants);
1878+
webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] = ggml_webgpu_create_pipeline2(
1879+
webgpu_ctx->device, wgsl_mul_mat_vec_f32_f32_vec, "mul_mat_vec_f32_f32_vec", mul_mat_vec_constants);
1880+
webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
1881+
webgpu_ctx->device, wgsl_mul_mat_vec_f16_f32, "mul_mat_vec_f16_f32", mul_mat_vec_constants);
1882+
webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] = ggml_webgpu_create_pipeline2(
1883+
webgpu_ctx->device, wgsl_mul_mat_vec_f16_f32_vec, "mul_mat_vec_f16_f32_vec", mul_mat_vec_constants);
1884+
webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] = ggml_webgpu_create_pipeline2(
1885+
webgpu_ctx->device, wgsl_mul_mat_vec_f16_f16, "mul_mat_vec_f16_f16", mul_mat_vec_constants);
1886+
webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] = ggml_webgpu_create_pipeline2(
1887+
webgpu_ctx->device, wgsl_mul_mat_vec_f16_f16_vec, "mul_mat_vec_f16_f16_vec", mul_mat_vec_constants);
1888+
webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
1889+
webgpu_ctx->device, wgsl_mul_mat_vec_q4_0_f32, "mul_mat_vec_q4_0_f32", mul_mat_vec_constants);
18891890
}
18901891

18911892
static void ggml_webgpu_init_set_rows_pipeline(webgpu_context & webgpu_ctx) {
@@ -2382,12 +2383,12 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
23822383
webgpu_context ctx = reg_ctx->webgpu_ctx;
23832384

23842385
// TODO: track need for these toggles: https://issues.chromium.org/issues/42251215
2385-
const char * const adapterEnabledToggles[] = { "vulkan_enable_f16_on_nvidia", "use_vulkan_memory_model" };
2386+
const char * const adapterEnabledToggles[] = { "vulkan_enable_f16_on_nvidia", "use_vulkan_memory_model" };
23862387
wgpu::DawnTogglesDescriptor adapterTogglesDesc;
2387-
adapterTogglesDesc.enabledToggles = adapterEnabledToggles;
2388-
adapterTogglesDesc.enabledToggleCount = 2;
2389-
wgpu::RequestAdapterOptions options = {};
2390-
options.nextInChain = &adapterTogglesDesc;
2388+
adapterTogglesDesc.enabledToggles = adapterEnabledToggles;
2389+
adapterTogglesDesc.enabledToggleCount = 2;
2390+
wgpu::RequestAdapterOptions options = {};
2391+
options.nextInChain = &adapterTogglesDesc;
23912392
ctx->instance.WaitAny(ctx->instance.RequestAdapter(
23922393
&options, wgpu::CallbackMode::AllowSpontaneous,
23932394
[&ctx](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message) {
@@ -2432,7 +2433,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
24322433

24332434
// For subgroup matrix code to be the most efficient, we would like the subgroup size to be consistent and accurate.
24342435
// Unfortunately, that is not possible, so we use the maximum subgroup size reported by the adapter.
2435-
ctx->subgroup_size = info.subgroupMaxSize;
2436+
ctx->subgroup_size = info.subgroupMaxSize;
24362437
ctx->supports_subgroup_matrix = valid_subgroup_matrix_config;
24372438

24382439
// Initialize device

ggml/src/ggml-webgpu/wgsl-shaders/mat_mul_decls.tmpl renamed to ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,4 +95,3 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
9595
}
9696

9797
#enddecl(INIT_SRC0_SHMEM_Q4_0)
98-

ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
200200

201201
for (var k_outer = 0u; k_outer < params.k; k_outer += TILE_K) {
202202

203-
// see mat_mul_decls.tmpl
203+
// see mul_mat_decls.tmpl
204204
init_shmem_src0(thread_id, src0_batch_offset, offset_m, k_outer);
205205
init_shmem_src1(thread_id, src1_batch_offset, offset_n, k_outer);
206206

ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
218218

219219
for (var k_outer = 0u; k_outer < params.k; k_outer += TILE_K) {
220220

221-
// see mat_mul_decls.tmpl
221+
// see mul_mat_decls.tmpl
222222
init_shmem_src0(thread_id, src0_batch_offset, offset_m, k_outer);
223223
init_shmem_src1(thread_id, src1_batch_offset, offset_n, k_outer);
224224

0 commit comments

Comments
 (0)