Skip to content

Commit eabab9e

Browse files
committed
Add new pool for timestamp queries and clean up logging
1 parent 26c44f8 commit eabab9e

File tree

1 file changed

+92
-57
lines changed

1 file changed

+92
-57
lines changed

ggml/src/ggml-webgpu/ggml-webgpu.cpp

Lines changed: 92 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
std::chrono::duration<double, std::milli>(cpu_total_end_##id - cpu_total_start_##id).count(); \
3737
(ctx)->cpu_time_ms[#id] += cpu_total_time_##id;
3838

39-
// fine-grained timing (diagnostics only)
39+
// fine-grained timing (not included in totals)
4040
# define WEBGPU_CPU_PROFILE_DETAIL_START(id) auto cpu_detail_start_##id = std::chrono::high_resolution_clock::now();
4141

4242
# define WEBGPU_CPU_PROFILE_DETAIL_END(id, ctx) \
@@ -52,7 +52,7 @@
5252
#endif // GGML_WEBGPU_CPU_PROFILE
5353

5454
#ifdef GGML_WEBGPU_GPU_PROFILE
55-
# define WEBGPU_NUM_TIMESTAMP_QUERY_BUFS 100
55+
# define WEBGPU_NUM_TIMESTAMP_QUERY_BUFS 24
5656
# define WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES 16 // e.g. enough for two timestamps
5757
#endif
5858

@@ -67,7 +67,7 @@
6767
/* Constants */
6868

6969
#define WEBGPU_MUL_MAT_WG_SIZE 256
70-
#define WEBGPU_NUM_PARAM_BUFS 100
70+
#define WEBGPU_NUM_PARAM_BUFS 32
7171
#define WEBGPU_PARAMS_BUF_SIZE_BYTES 128 // enough for 32 parameters
7272
#define WEBGPU_NUM_SET_ROWS_ERROR_BUFS 32
7373
#define WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES 4
@@ -149,6 +149,68 @@ struct webgpu_buf_pool {
149149
}
150150
};
151151

152+
#ifdef GGML_WEBGPU_GPU_PROFILE
153+
// One set of GPU-profiling resources for timing a single compute pass.
// The set is handed out and returned as a unit by webgpu_gpu_profile_buf_pool.
struct webgpu_gpu_profile_bufs {
154+
// Host-side buffer: receives a copy of dev_buf and is mapped for reading to fetch the timestamps.
wgpu::Buffer host_buf;
155+
// Device-side buffer: destination of ResolveQuerySet for the two timestamps.
wgpu::Buffer dev_buf;
156+
// Timestamp query set with 2 entries (written at index 0 = beginning of pass, index 1 = end of pass).
wgpu::QuerySet query_set;
157+
};
158+
159+
// Holds a pool of GPU-profiling resource sets (host buffer, device buffer and
// timestamp query set) used to time WebGPU operations. alloc_bufs() blocks
// until a set is available; free_bufs() returns sets and wakes any waiters.
160+
struct webgpu_gpu_profile_buf_pool {
161+
// Sets that are currently available to be handed out.
std::vector<webgpu_gpu_profile_bufs> free;
162+
163+
// Guards every access to `free`.
std::mutex mutex;
164+
165+
// Signalled by free_bufs() when sets are returned to the pool.
std::condition_variable cv;
166+
167+
// Creates `num_bufs` resource sets and places them all in the free list.
// Each set gets a host buffer and a device buffer of `buf_size` bytes
// (created with `host_buf_usage` / `dev_buf_usage` respectively) and a
// timestamp query set with 2 entries.
void init(wgpu::Device device,
168+
int num_bufs,
169+
size_t buf_size,
170+
wgpu::BufferUsage dev_buf_usage,
171+
wgpu::BufferUsage host_buf_usage) {
172+
for (int i = 0; i < num_bufs; i++) {
173+
wgpu::Buffer host_buf;
174+
wgpu::Buffer dev_buf;
175+
ggml_webgpu_create_buffer(device, host_buf, buf_size, host_buf_usage, "ggml_webgpu_host_profile_buf");
176+
ggml_webgpu_create_buffer(device, dev_buf, buf_size, dev_buf_usage, "ggml_webgpu_dev_profile_buf");
177+
// Create a query set for 2 timestamps
178+
wgpu::QuerySetDescriptor ts_query_set_desc = {};
179+
180+
ts_query_set_desc.type = wgpu::QueryType::Timestamp;
181+
ts_query_set_desc.count = 2;
182+
wgpu::QuerySet ts_query_set = device.CreateQuerySet(&ts_query_set_desc);
183+
184+
free.push_back({ host_buf, dev_buf, ts_query_set });
185+
}
186+
}
187+
188+
// Takes one set from the back of the free list and returns it.
// Blocks the calling thread while the free list is empty.
// NOTE(review): a set only comes back via free_bufs(); the map-failure
// branch in the submit path logs an error without calling free_bufs(), so a
// failed map would permanently shrink the pool and could eventually leave
// this wait blocked forever - confirm whether that is intended.
webgpu_gpu_profile_bufs alloc_bufs() {
189+
std::unique_lock<std::mutex> lock(mutex);
190+
cv.wait(lock, [this] { return !free.empty(); });
191+
webgpu_gpu_profile_bufs bufs = free.back();
192+
free.pop_back();
193+
return bufs;
194+
}
195+
196+
// Appends all of `bufs` to the free list and wakes every thread waiting in alloc_bufs().
void free_bufs(std::vector<webgpu_gpu_profile_bufs> bufs) {
197+
std::lock_guard<std::mutex> lock(mutex);
198+
free.insert(free.end(), bufs.begin(), bufs.end());
199+
cv.notify_all();
200+
}
201+
202+
// Destroys the buffers and query set of every set currently in the free
// list, then empties the list. Sets that were handed out by alloc_bufs()
// and not yet returned are not in `free` and are therefore not destroyed here.
void cleanup() {
203+
std::lock_guard<std::mutex> lock(mutex);
204+
for (auto & bufs : free) {
205+
bufs.host_buf.Destroy();
206+
bufs.dev_buf.Destroy();
207+
bufs.query_set.Destroy();
208+
}
209+
free.clear();
210+
}
211+
};
212+
#endif
213+
152214
struct webgpu_pipeline {
153215
wgpu::ComputePipeline pipeline;
154216
std::string name;
@@ -159,8 +221,8 @@ struct webgpu_command {
159221
webgpu_pool_bufs params_bufs;
160222
std::optional<webgpu_pool_bufs> set_rows_error_bufs;
161223
#ifdef GGML_WEBGPU_GPU_PROFILE
162-
webgpu_pool_bufs timestamp_query_bufs;
163-
std::string pipeline_name;
224+
webgpu_gpu_profile_bufs timestamp_query_bufs;
225+
std::string pipeline_name;
164226
#endif
165227
};
166228

@@ -218,7 +280,7 @@ struct webgpu_context_struct {
218280
// Profiling: per-shader GPU time in ms
219281
std::unordered_map<std::string, double> shader_gpu_time_ms;
220282
// Profiling: pool of timestamp query buffers (one per operation)
221-
webgpu_buf_pool timestamp_query_buf_pool;
283+
webgpu_gpu_profile_buf_pool timestamp_query_buf_pool;
222284
#endif
223285
};
224286

@@ -259,8 +321,6 @@ static void ggml_webgpu_create_pipeline(wgpu::Device &
259321
const char * shader_code,
260322
const char * label,
261323
const std::vector<wgpu::ConstantEntry> & constants = {}) {
262-
WEBGPU_LOG_DEBUG("ggml_webgpu_create_pipeline()");
263-
264324
wgpu::ShaderSourceWGSL shader_source;
265325
shader_source.code = shader_code;
266326

@@ -286,8 +346,6 @@ static void ggml_webgpu_create_buffer(wgpu::Device & device,
286346
size_t size,
287347
wgpu::BufferUsage usage,
288348
const char * label) {
289-
WEBGPU_LOG_DEBUG("ggml_webgpu_create_buffer()");
290-
291349
wgpu::BufferDescriptor buffer_desc;
292350
buffer_desc.size = size;
293351
buffer_desc.usage = usage;
@@ -326,28 +384,6 @@ static void ggml_backend_webgpu_wait(webgpu_context &
326384
}
327385
}
328386

329-
#ifdef GGML_WEBGPU_GPU_PROFILE
330-
// Starts an asynchronous read-mapping of ts_bufs.host_buf and returns a
// FutureWaitInfo wrapping the resulting future.
// When the map succeeds, the callback reads two uint64 timestamps from the
// mapped range, adds their difference (scaled by 1e-6) to
// ctx->shader_gpu_time_ms[label], and returns ts_bufs to the timestamp pool.
// When the map fails, the callback only logs the error; ts_bufs is not
// returned to the pool on that path.
static wgpu::FutureWaitInfo ggml_backend_webgpu_process_timestamps(webgpu_context & ctx,
331-
webgpu_pool_bufs ts_bufs,
332-
std::string label) {
333-
wgpu::Future f = ts_bufs.host_buf.MapAsync(
334-
wgpu::MapMode::Read, 0, ts_bufs.host_buf.GetSize(), wgpu::CallbackMode::AllowSpontaneous,
335-
// ctx, ts_bufs and label are captured by copy so they stay valid until the callback runs.
[ctx, ts_bufs, label](wgpu::MapAsyncStatus status, wgpu::StringView message) {
336-
if (status != wgpu::MapAsyncStatus::Success) {
337-
GGML_LOG_ERROR("ggml_webgpu: Failed to map timestamp buffer: %s\n", std::string(message).c_str());
338-
} else {
339-
const uint64_t * ts_data = (const uint64_t *) ts_bufs.host_buf.GetConstMappedRange();
340-
// WebGPU timestamps are in ticks; convert to ms using device timestamp period if available
341-
double elapsed_ms = double(ts_data[1] - ts_data[0]) * 1e-6; // TODO: use actual timestamp period
342-
ctx->shader_gpu_time_ms[label] += elapsed_ms;
343-
// We can't unmap in here due to WebGPU reentrancy limitations.
344-
ctx->timestamp_query_buf_pool.free_bufs({ ts_bufs });
345-
}
346-
});
347-
return { f };
348-
}
349-
#endif
350-
351387
static void ggml_backend_webgpu_map_buffer(webgpu_context & ctx,
352388
wgpu::Buffer & buffer,
353389
wgpu::MapMode mode,
@@ -390,7 +426,7 @@ static std::vector<wgpu::FutureWaitInfo> ggml_backend_webgpu_submit(webgpu_conte
390426
std::vector<webgpu_pool_bufs> params_bufs;
391427
std::vector<webgpu_pool_bufs> set_rows_error_bufs;
392428
#ifdef GGML_WEBGPU_GPU_PROFILE
393-
std::vector<std::pair<std::string, webgpu_pool_bufs>> pipeline_name_and_ts_bufs;
429+
std::vector<std::pair<std::string, webgpu_gpu_profile_bufs>> pipeline_name_and_ts_bufs;
394430
#endif
395431

396432
for (const auto & command : commands) {
@@ -399,9 +435,6 @@ static std::vector<wgpu::FutureWaitInfo> ggml_backend_webgpu_submit(webgpu_conte
399435
if (command.set_rows_error_bufs) {
400436
set_rows_error_bufs.push_back(command.set_rows_error_bufs.value());
401437
}
402-
#ifdef GGML_WEBGPU_GPU_PROFILE
403-
pipeline_name_and_ts_bufs.push_back({ command.pipeline_name, command.timestamp_query_bufs });
404-
#endif
405438
}
406439
ctx->queue.Submit(command_buffers.size(), command_buffers.data());
407440

@@ -437,9 +470,25 @@ static std::vector<wgpu::FutureWaitInfo> ggml_backend_webgpu_submit(webgpu_conte
437470
}
438471

439472
#ifdef GGML_WEBGPU_GPU_PROFILE
440-
for (const auto & name_and_bufs : pipeline_name_and_ts_bufs) {
441-
wgpu::FutureWaitInfo f = ggml_backend_webgpu_process_timestamps(ctx, name_and_bufs.second, name_and_bufs.first);
442-
futures.push_back(f);
473+
for (const auto & command : commands) {
474+
auto label = command.pipeline_name;
475+
auto ts_bufs = command.timestamp_query_bufs;
476+
477+
wgpu::Future f = ts_bufs.host_buf.MapAsync(
478+
wgpu::MapMode::Read, 0, ts_bufs.host_buf.GetSize(), wgpu::CallbackMode::AllowSpontaneous,
479+
[ctx, ts_bufs, label](wgpu::MapAsyncStatus status, wgpu::StringView message) {
480+
if (status != wgpu::MapAsyncStatus::Success) {
481+
GGML_LOG_ERROR("ggml_webgpu: Failed to map timestamp buffer: %s\n", std::string(message).c_str());
482+
} else {
483+
const uint64_t * ts_data = (const uint64_t *) ts_bufs.host_buf.GetConstMappedRange();
484+
// WebGPU timestamps are in ns; convert to ms
485+
double elapsed_ms = double(ts_data[1] - ts_data[0]) * 1e-6;
486+
ctx->shader_gpu_time_ms[label] += elapsed_ms;
487+
// We can't unmap in here due to WebGPU reentrancy limitations.
488+
ctx->timestamp_query_buf_pool.free_bufs({ ts_bufs });
489+
}
490+
});
491+
futures.push_back({ f });
443492
}
444493
#endif
445494
return futures;
@@ -480,18 +529,12 @@ static webgpu_command ggml_backend_webgpu_build(webgpu_context &
480529
#ifdef GGML_WEBGPU_GPU_PROFILE
481530
// --- Profiling: GPU timestamp queries ---
482531
// Allocate a timestamp query buffer (2 timestamps: start/end)
483-
webgpu_pool_bufs ts_bufs = ctx->timestamp_query_buf_pool.alloc_bufs();
532+
webgpu_gpu_profile_bufs ts_bufs = ctx->timestamp_query_buf_pool.alloc_bufs();
484533
if (ts_bufs.host_buf.GetMapState() == wgpu::BufferMapState::Mapped) {
485534
ts_bufs.host_buf.Unmap();
486535
}
487536

488-
// Create a query set for 2 timestamps
489-
wgpu::QuerySetDescriptor ts_query_set_desc = {};
490-
ts_query_set_desc.type = wgpu::QueryType::Timestamp;
491-
ts_query_set_desc.count = 2;
492-
wgpu::QuerySet ts_query_set = ctx->device.CreateQuerySet(&ts_query_set_desc);
493-
494-
wgpu::PassTimestampWrites ts_writes = { .querySet = ts_query_set,
537+
wgpu::PassTimestampWrites ts_writes = { .querySet = ts_bufs.query_set,
495538
.beginningOfPassWriteIndex = 0,
496539
.endOfPassWriteIndex = 1 };
497540
wgpu::ComputePassDescriptor pass_desc = { .timestampWrites = &ts_writes };
@@ -506,7 +549,7 @@ static webgpu_command ggml_backend_webgpu_build(webgpu_context &
506549

507550
#ifdef GGML_WEBGPU_GPU_PROFILE
508551
// Resolve the query set into the device buffer
509-
encoder.ResolveQuerySet(ts_query_set, 0, 2, ts_bufs.dev_buf, 0);
552+
encoder.ResolveQuerySet(ts_bufs.query_set, 0, 2, ts_bufs.dev_buf, 0);
510553
encoder.CopyBufferToBuffer(ts_bufs.dev_buf, 0, ts_bufs.host_buf, 0, ts_bufs.host_buf.GetSize());
511554
#endif
512555

@@ -1137,7 +1180,7 @@ static webgpu_command ggml_webgpu_soft_max(webgpu_context & ctx,
11371180
ggml_nrows(dst));
11381181
}
11391182

1140-
// Returns true if node has enqueued work into the queue, false otherwise
1183+
// Returns the encoded command, or std::nullopt if the operation is a no-op
11411184
static std::optional<webgpu_command> ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) {
11421185
if (ggml_is_empty(node)) {
11431186
return std::nullopt;
@@ -1208,7 +1251,6 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
12081251

12091252
WEBGPU_CPU_PROFILE_TOTAL_START(graph_compute);
12101253

1211-
WEBGPU_CPU_PROFILE_DETAIL_START(graph_compute_encode);
12121254
std::vector<webgpu_command> commands;
12131255
std::vector<std::vector<wgpu::FutureWaitInfo>> futures;
12141256
for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -1227,14 +1269,8 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
12271269
std::vector<wgpu::FutureWaitInfo> new_futures = ggml_backend_webgpu_submit(ctx, commands);
12281270
futures.push_back({ new_futures });
12291271
}
1230-
WEBGPU_CPU_PROFILE_DETAIL_END(graph_compute_encode, ctx);
1231-
1232-
WEBGPU_CPU_PROFILE_DETAIL_START(graph_compute_wait);
12331272
ggml_backend_webgpu_wait(ctx, futures);
1234-
WEBGPU_CPU_PROFILE_DETAIL_END(graph_compute_wait, ctx);
1235-
12361273
WEBGPU_CPU_PROFILE_TOTAL_END(graph_compute, ctx);
1237-
12381274
return GGML_STATUS_SUCCESS;
12391275
}
12401276

@@ -1260,7 +1296,6 @@ static ggml_backend_i ggml_backend_webgpu_i = {
12601296
/* GGML Backend Buffer Interface */
12611297

12621298
static void ggml_backend_webgpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
1263-
WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_free_buffer()");
12641299
ggml_backend_webgpu_buffer_context * ctx = static_cast<ggml_backend_webgpu_buffer_context *>(buffer->context);
12651300
ctx->buffer.Destroy();
12661301
}

0 commit comments

Comments
 (0)