Skip to content

Commit 39aa11d

Browse files
Neha Abbas
authored and committed
f32 add all tests passing
1 parent 96d107e commit 39aa11d

File tree

2 files changed

+65
-32
lines changed

2 files changed

+65
-32
lines changed

ggml/src/ggml-webgpu/ggml-webgpu.cpp

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -451,6 +451,16 @@ static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_t
451451
ggml_backend_webgpu_build_and_enqueue(ctx, ctx->mul_mat_pipeline, params, entries, wg_x);
452452
}
453453

454+
// sample test
455+
// ADD(type=f32, ne=[10,5,4,3], nr=[2,1,1,1], nf=1)
456+
// ne: number of elements in each dimension of tensor b
457+
// nr: number of repetitions in each dimension
458+
// tensor b is the smaller tensor, and is broadcasted with repetitions to match the size of a
459+
// broadcasted with ne * nr
460+
// 10*2, 5*1, 4*1, 3*1 = [20, 5, 4, 3] is the shape of dst and a
461+
// essentially, if nr[x] is > 1, that dimension of b is repeated
462+
// nf: number of fused operations (1 means singular addition)
463+
454464
// adds src0 and src1 and puts in dst
455465
static void ggml_webgpu_add(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) {
456466
// each tensor in GGML is stored inside a buffer on the GPU
@@ -464,27 +474,27 @@ static void ggml_webgpu_add(webgpu_context & ctx, ggml_tensor * src0, ggml_tenso
464474
src0_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1);
465475

466476
size_t src1_offset = ggml_backend_webgpu_tensor_offset(src1);
467-
// assumes power of 2 offset alignment
468477
size_t src1_misalignment = src1_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1);
469-
// align to minimum offset alignment
470478
src1_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1);
471479

472480
size_t dst_offset = ggml_backend_webgpu_tensor_offset(dst);
473481
size_t dst_misalignment = dst_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1);
474482
dst_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1);
475-
483+
476484
// set up parameters
477485
std::vector<uint32_t> params = {
478486
// number of elements-- determines how many threads to dispatch (one for each addition operation)
479487
(uint32_t) ggml_nelements(dst),
480488

481489
// even though tensors are 4d, the actual data is stored linearly
482490
// stride = how many elements (or bytes) we must skip in memory to move from one value to another along a certain dimension
483-
// i.e.
484-
// nb[0] = 1 // each element is next to the previous
485-
// nb[1] = nb[0] * ne[0] = 5 // to move to next row, skip 5 elements
486-
// nb[2] = nb[1] * ne[1] = 20 // to next matrix, skip 20 elements
487-
// nb[3] = nb[2] * ne[2] = 60 // to next batch, skip 60 elements
491+
// i.e. tensor: [5, 6, 3, 2], ggml_type_size: 4 (each number is 4 bytes)
492+
// (nb = number of bytes to skip for each element (stride))
493+
// (ne = number of elements in that dimension)
494+
// nb[0] = 4 // each element is next to the previous, so only 4 bytes in between
495+
// nb[1] = nb[0] * ne[0] = 4 * 5 = 20 // to move to next row, skip 20 bytes
496+
// nb[2] = nb[1] * ne[1] = 20 * 6 = 120 // to next matrix, skip 120 bytes
497+
// nb[3] = nb[2] * ne[2] = 120 * 3 = 360 // to next batch, skip 360 bytes
488498

489499
// calculate element strides for each tensor
490500
(uint32_t) (src0->nb[0] / ggml_type_size(src0->type)),
@@ -502,16 +512,24 @@ static void ggml_webgpu_add(webgpu_context & ctx, ggml_tensor * src0, ggml_tenso
502512
(uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
503513
(uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
504514

505-
// number of elements in each dimension
515+
// number of elements in each dimension of larger tensors (src0 and dst)
506516
(uint32_t) dst->ne[0],
507517
(uint32_t) dst->ne[1],
508518
(uint32_t) dst->ne[2],
509519
(uint32_t) dst->ne[3],
510520

521+
// number of elements in each dimension of smaller tensor to be broadcasted (src1)
522+
(uint32_t) src1->ne[0],
523+
(uint32_t) src1->ne[1],
524+
(uint32_t) src1->ne[2],
525+
(uint32_t) src1->ne[3],
526+
511527
// offsets in terms of elements instead of bytes
512528
(uint32_t) (src0_misalignment / ggml_type_size(src0->type)),
513529
(uint32_t) (src1_misalignment / ggml_type_size(src1->type)),
514530
(uint32_t) (dst_misalignment / ggml_type_size(dst->type)),
531+
532+
515533
};
516534

517535
// bind group = groups together several GPU resources that shaders will use (e.g., buffers holding tensor data)

ggml/src/ggml-webgpu/wgsl-shaders/add.wgsl

Lines changed: 38 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ var<storage, read_write> src1: array<f32>;
1010
var<storage, read_write> dst: array<f32>;
1111

1212
struct Params {
13-
ne: u32, // total number of elements
13+
ne: u32,
1414

1515
stride_src0_0: u32,
1616
stride_src0_1: u32,
@@ -27,10 +27,15 @@ struct Params {
2727
stride_dst_2: u32,
2828
stride_dst_3: u32,
2929

30-
ne0: u32,
31-
ne1: u32,
32-
ne2: u32,
33-
ne3: u32,
30+
a_ne0: u32,
31+
a_ne1: u32,
32+
a_ne2: u32,
33+
a_ne3: u32,
34+
35+
b_ne0: u32,
36+
b_ne1: u32,
37+
b_ne2: u32,
38+
b_ne3: u32,
3439

3540
// offsets in elements
3641
offset_src0: u32,
@@ -48,31 +53,41 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
4853
return;
4954
}
5055

51-
var i = gid.x; // i = thread id
52-
53-
// compute indexes for each dimension of the tensor
54-
let i3 = i / (params.ne2 * params.ne1 * params.ne0);
55-
i = i % (params.ne2 * params.ne1 * params.ne0);
56+
// i = thread id, ranges from 0 --> total ne - 1
57+
// represents the position in the flat array a we are adding with array b
58+
var i = gid.x;
5659

57-
let i2 = i / (params.ne1 * params.ne0);
58-
i = i % (params.ne1 * params.ne0);
60+
// given the index of linear a, we want to compute the 4d index [a_i0, a_i1, a_i2, a_i3]
61+
// we need this because tensor a and b are different shapes
62+
// so the same linear index won't work for b, and we can only compute b's linear index from the 4d index of a
63+
64+
let a_i3 = i / (params.a_ne2 * params.a_ne1 * params.a_ne0);
65+
i = i % (params.a_ne2 * params.a_ne1 * params.a_ne0);
5966

60-
let i1 = i / params.ne0;
67+
let a_i2 = i / (params.a_ne1 * params.a_ne0);
68+
i = i % (params.a_ne1 * params.a_ne0);
6169

62-
let i0 = i % params.ne0;
70+
let a_i1 = i / params.a_ne0;
6371

64-
// compute indexes for position in each flat array
65-
let src0_idx = i0 * params.stride_src0_0 + i1 * params.stride_src0_1 +
66-
i2 * params.stride_src0_2 + i3 * params.stride_src0_3;
72+
let a_i0 = i % params.a_ne0;
6773

68-
let src1_idx = i0 * params.stride_src1_0 + i1 * params.stride_src1_1 +
69-
i2 * params.stride_src1_2 + i3 * params.stride_src1_3;
7074

71-
let dst_idx = i0 * params.stride_dst_0 + i1 * params.stride_dst_1 +
72-
i2 * params.stride_dst_2 + i3 * params.stride_dst_3;
75+
// handle repetition of b
76+
// index loops back to the beginning and repeats after elements are exhausted = modulo
77+
let b_i0 = a_i0 % params.b_ne0;
78+
let b_i1 = a_i1 % params.b_ne1;
79+
let b_i2 = a_i2 % params.b_ne2;
80+
let b_i3 = a_i3 % params.b_ne3;
7381

7482

75-
// dst[dst_idx] = src0[src0_idx] + src1[src1_idx];
83+
// compute index for position in b's flat array
84+
let src1_idx = b_i0 * params.stride_src1_0 +
85+
b_i1 * params.stride_src1_1 +
86+
b_i2 * params.stride_src1_2 +
87+
b_i3 * params.stride_src1_3;
7688

77-
dst[params.offset_dst + dst_idx] = src0[params.offset_src0 + src0_idx] + src1[params.offset_src1 + src1_idx];
89+
// actual addition operation, now that the indexes are all figured out
90+
// ensuring that the offsets are included
91+
// gid.x used for flat indexing into dst and a, since variable i was modified during calcs
92+
dst[params.offset_dst + gid.x] = src0[params.offset_src0 + gid.x] + src1[params.offset_src1 + src1_idx];
7893
}

0 commit comments

Comments
 (0)