3636 std::chrono::duration<double , std::milli>(cpu_total_end_##id - cpu_total_start_##id).count(); \
3737 (ctx)->cpu_time_ms[#id] += cpu_total_time_##id;
3838
39- // fine-grained timing (diagnostics only )
39+ // fine-grained timing (not included in totals )
4040# define WEBGPU_CPU_PROFILE_DETAIL_START (id ) auto cpu_detail_start_##id = std::chrono::high_resolution_clock::now();
4141
4242# define WEBGPU_CPU_PROFILE_DETAIL_END (id, ctx ) \
5252#endif // GGML_WEBGPU_CPU_PROFILE
5353
5454#ifdef GGML_WEBGPU_GPU_PROFILE
55- # define WEBGPU_NUM_TIMESTAMP_QUERY_BUFS 100
55+ # define WEBGPU_NUM_TIMESTAMP_QUERY_BUFS 24
5656# define WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES 16 // e.g. enough for two timestamps
5757#endif
5858
6767/* Constants */
6868
6969#define WEBGPU_MUL_MAT_WG_SIZE 256
70- #define WEBGPU_NUM_PARAM_BUFS 100
70+ #define WEBGPU_NUM_PARAM_BUFS 32
7171#define WEBGPU_PARAMS_BUF_SIZE_BYTES 128 // enough for 32 parameters
7272#define WEBGPU_NUM_SET_ROWS_ERROR_BUFS 32
7373#define WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES 4
@@ -149,6 +149,68 @@ struct webgpu_buf_pool {
149149 }
150150};
151151
152+ #ifdef GGML_WEBGPU_GPU_PROFILE
153+ struct webgpu_gpu_profile_bufs {  // one GPU-profiling slot: two buffers plus the query set that feeds them
154+ wgpu::Buffer host_buf;  // mapped for reading on the CPU to fetch the two resolved timestamps
155+ wgpu::Buffer dev_buf;  // destination of ResolveQuerySet; its contents are then copied into host_buf
156+ wgpu::QuerySet query_set;  // timestamp query set with 2 entries (index 0 = start of pass, index 1 = end of pass)
157+ };
158+
159+ // Pool of GPU-profiling slots (host buffer, device buffer, timestamp query set); alloc_bufs() blocks until a slot is free
160+ struct webgpu_gpu_profile_buf_pool {
161+ std::vector<webgpu_gpu_profile_bufs> free;  // slots currently available for allocation
162+ 
163+ std::mutex mutex;  // guards `free` in alloc_bufs/free_bufs/cleanup (init does not lock)
164+ 
165+ std::condition_variable cv;  // notified by free_bufs when slots are returned
166+ 
167+ void init (wgpu::Device device,  // fills the pool with num_bufs slots, each with its own buffers and query set
168+ int num_bufs,
169+ size_t buf_size,
170+ wgpu::BufferUsage dev_buf_usage,
171+ wgpu::BufferUsage host_buf_usage) {
172+ for (int i = 0 ; i < num_bufs; i++) {
173+ wgpu::Buffer host_buf;
174+ wgpu::Buffer dev_buf;
175+ ggml_webgpu_create_buffer (device, host_buf, buf_size, host_buf_usage, " ggml_webgpu_host_profile_buf" );
176+ ggml_webgpu_create_buffer (device, dev_buf, buf_size, dev_buf_usage, " ggml_webgpu_dev_profile_buf" );
177+ // Create a query set for 2 timestamps
178+ wgpu::QuerySetDescriptor ts_query_set_desc = {};
179+ 
180+ ts_query_set_desc.type = wgpu::QueryType::Timestamp;
181+ ts_query_set_desc.count = 2 ;
182+ wgpu::QuerySet ts_query_set = device.CreateQuerySet (&ts_query_set_desc);
183+ 
184+ free.push_back ({ host_buf, dev_buf, ts_query_set });
185+ }
186+ }
187+ 
188+ webgpu_gpu_profile_bufs alloc_bufs () {  // removes and returns one slot from the pool
189+ std::unique_lock<std::mutex> lock (mutex);
190+ cv.wait (lock, [this ] { return !free.empty (); });  // blocks until at least one slot is available
191+ webgpu_gpu_profile_bufs bufs = free.back ();
192+ free.pop_back ();
193+ return bufs;
194+ }
195+ 
196+ void free_bufs (std::vector<webgpu_gpu_profile_bufs> bufs) {  // returns the given slots to the pool
197+ std::lock_guard<std::mutex> lock (mutex);
198+ free.insert (free.end (), bufs.begin (), bufs.end ());
199+ cv.notify_all ();  // wake every thread waiting in alloc_bufs
200+ }
201+ 
202+ void cleanup () {  // destroys the GPU objects of the slots currently in the pool; slots still handed out are not touched
203+ std::lock_guard<std::mutex> lock (mutex);
204+ for (auto & bufs : free) {
205+ bufs.host_buf .Destroy ();
206+ bufs.dev_buf .Destroy ();
207+ bufs.query_set .Destroy ();
208+ }
209+ free.clear ();
210+ }
211+ };
212+ #endif
213+
152214struct webgpu_pipeline {
153215 wgpu::ComputePipeline pipeline;
154216 std::string name;
@@ -159,8 +221,8 @@ struct webgpu_command {
159221 webgpu_pool_bufs params_bufs;
160222 std::optional<webgpu_pool_bufs> set_rows_error_bufs;
161223#ifdef GGML_WEBGPU_GPU_PROFILE
162- webgpu_pool_bufs timestamp_query_bufs;
163- std::string pipeline_name;
224+ webgpu_gpu_profile_bufs timestamp_query_bufs;
225+ std::string pipeline_name;
164226#endif
165227};
166228
@@ -218,7 +280,7 @@ struct webgpu_context_struct {
218280 // Profiling: per-shader GPU time in ms
219281 std::unordered_map<std::string, double > shader_gpu_time_ms;
220282 // Profiling: pool of timestamp query buffers and their query sets (one slot per operation)
221- webgpu_buf_pool timestamp_query_buf_pool;
283+ webgpu_gpu_profile_buf_pool timestamp_query_buf_pool;
222284#endif
223285};
224286
@@ -259,8 +321,6 @@ static void ggml_webgpu_create_pipeline(wgpu::Device &
259321 const char * shader_code,
260322 const char * label,
261323 const std::vector<wgpu::ConstantEntry> & constants = {}) {
262- WEBGPU_LOG_DEBUG (" ggml_webgpu_create_pipeline()" );
263-
264324 wgpu::ShaderSourceWGSL shader_source;
265325 shader_source.code = shader_code;
266326
@@ -286,8 +346,6 @@ static void ggml_webgpu_create_buffer(wgpu::Device & device,
286346 size_t size,
287347 wgpu::BufferUsage usage,
288348 const char * label) {
289- WEBGPU_LOG_DEBUG (" ggml_webgpu_create_buffer()" );
290-
291349 wgpu::BufferDescriptor buffer_desc;
292350 buffer_desc.size = size;
293351 buffer_desc.usage = usage;
@@ -326,28 +384,6 @@ static void ggml_backend_webgpu_wait(webgpu_context &
326384 }
327385}
328386
329- #ifdef GGML_WEBGPU_GPU_PROFILE
330- static wgpu::FutureWaitInfo ggml_backend_webgpu_process_timestamps (webgpu_context & ctx,
331- webgpu_pool_bufs ts_bufs,
332- std::string label) {
333- wgpu::Future f = ts_bufs.host_buf .MapAsync (
334- wgpu::MapMode::Read, 0 , ts_bufs.host_buf .GetSize (), wgpu::CallbackMode::AllowSpontaneous,
335- [ctx, ts_bufs, label](wgpu::MapAsyncStatus status, wgpu::StringView message) {
336- if (status != wgpu::MapAsyncStatus::Success) {
337- GGML_LOG_ERROR (" ggml_webgpu: Failed to map timestamp buffer: %s\n " , std::string (message).c_str ());
338- } else {
339- const uint64_t * ts_data = (const uint64_t *) ts_bufs.host_buf .GetConstMappedRange ();
340- // WebGPU timestamps are in ticks; convert to ms using device timestamp period if available
341- double elapsed_ms = double (ts_data[1 ] - ts_data[0 ]) * 1e-6 ; // TODO: use actual timestamp period
342- ctx->shader_gpu_time_ms [label] += elapsed_ms;
343- // We can't unmap in here due to WebGPU reentrancy limitations.
344- ctx->timestamp_query_buf_pool .free_bufs ({ ts_bufs });
345- }
346- });
347- return { f };
348- }
349- #endif
350-
351387static void ggml_backend_webgpu_map_buffer (webgpu_context & ctx,
352388 wgpu::Buffer & buffer,
353389 wgpu::MapMode mode,
@@ -390,7 +426,7 @@ static std::vector<wgpu::FutureWaitInfo> ggml_backend_webgpu_submit(webgpu_conte
390426 std::vector<webgpu_pool_bufs> params_bufs;
391427 std::vector<webgpu_pool_bufs> set_rows_error_bufs;
392428#ifdef GGML_WEBGPU_GPU_PROFILE
393- std::vector<std::pair<std::string, webgpu_pool_bufs >> pipeline_name_and_ts_bufs;
429+ std::vector<std::pair<std::string, webgpu_gpu_profile_bufs >> pipeline_name_and_ts_bufs;
394430#endif
395431
396432 for (const auto & command : commands) {
@@ -399,9 +435,6 @@ static std::vector<wgpu::FutureWaitInfo> ggml_backend_webgpu_submit(webgpu_conte
399435 if (command.set_rows_error_bufs ) {
400436 set_rows_error_bufs.push_back (command.set_rows_error_bufs .value ());
401437 }
402- #ifdef GGML_WEBGPU_GPU_PROFILE
403- pipeline_name_and_ts_bufs.push_back ({ command.pipeline_name , command.timestamp_query_bufs });
404- #endif
405438 }
406439 ctx->queue .Submit (command_buffers.size (), command_buffers.data ());
407440
@@ -437,9 +470,25 @@ static std::vector<wgpu::FutureWaitInfo> ggml_backend_webgpu_submit(webgpu_conte
437470 }
438471
439472#ifdef GGML_WEBGPU_GPU_PROFILE
440- for (const auto & name_and_bufs : pipeline_name_and_ts_bufs) {
441- wgpu::FutureWaitInfo f = ggml_backend_webgpu_process_timestamps (ctx, name_and_bufs.second , name_and_bufs.first );
442- futures.push_back (f);
473+ for (const auto & command : commands) {
474+ auto label = command.pipeline_name ;
475+ auto ts_bufs = command.timestamp_query_bufs ;
476+
477+ wgpu::Future f = ts_bufs.host_buf .MapAsync (
478+ wgpu::MapMode::Read, 0 , ts_bufs.host_buf .GetSize (), wgpu::CallbackMode::AllowSpontaneous,
479+ [ctx, ts_bufs, label](wgpu::MapAsyncStatus status, wgpu::StringView message) {
480+ if (status != wgpu::MapAsyncStatus::Success) {
481+ GGML_LOG_ERROR (" ggml_webgpu: Failed to map timestamp buffer: %s\n " , std::string (message).c_str ());
482+ } else {
483+ const uint64_t * ts_data = (const uint64_t *) ts_bufs.host_buf .GetConstMappedRange ();
484+ // WebGPU timestamps are in ns; convert to ms
485+ double elapsed_ms = double (ts_data[1 ] - ts_data[0 ]) * 1e-6 ;
486+ ctx->shader_gpu_time_ms [label] += elapsed_ms;
487+ // We can't unmap in here due to WebGPU reentrancy limitations.
488+ ctx->timestamp_query_buf_pool .free_bufs ({ ts_bufs });
489+ }
490+ });
491+ futures.push_back ({ f });
443492 }
444493#endif
445494 return futures;
@@ -480,18 +529,12 @@ static webgpu_command ggml_backend_webgpu_build(webgpu_context &
480529#ifdef GGML_WEBGPU_GPU_PROFILE
481530 // --- Profiling: GPU timestamp queries ---
482531 // Allocate a timestamp query buffer (2 timestamps: start/end)
483- webgpu_pool_bufs ts_bufs = ctx->timestamp_query_buf_pool .alloc_bufs ();
532+ webgpu_gpu_profile_bufs ts_bufs = ctx->timestamp_query_buf_pool .alloc_bufs ();
484533 if (ts_bufs.host_buf .GetMapState () == wgpu::BufferMapState::Mapped) {
485534 ts_bufs.host_buf .Unmap ();
486535 }
487536
488- // Create a query set for 2 timestamps
489- wgpu::QuerySetDescriptor ts_query_set_desc = {};
490- ts_query_set_desc.type = wgpu::QueryType::Timestamp;
491- ts_query_set_desc.count = 2 ;
492- wgpu::QuerySet ts_query_set = ctx->device .CreateQuerySet (&ts_query_set_desc);
493-
494- wgpu::PassTimestampWrites ts_writes = { .querySet = ts_query_set,
537+ wgpu::PassTimestampWrites ts_writes = { .querySet = ts_bufs.query_set ,
495538 .beginningOfPassWriteIndex = 0 ,
496539 .endOfPassWriteIndex = 1 };
497540 wgpu::ComputePassDescriptor pass_desc = { .timestampWrites = &ts_writes };
@@ -506,7 +549,7 @@ static webgpu_command ggml_backend_webgpu_build(webgpu_context &
506549
507550#ifdef GGML_WEBGPU_GPU_PROFILE
508551 // Resolve the query set into the device buffer
509- encoder.ResolveQuerySet (ts_query_set , 0 , 2 , ts_bufs.dev_buf , 0 );
552+ encoder.ResolveQuerySet (ts_bufs. query_set , 0 , 2 , ts_bufs.dev_buf , 0 );
510553 encoder.CopyBufferToBuffer (ts_bufs.dev_buf , 0 , ts_bufs.host_buf , 0 , ts_bufs.host_buf .GetSize ());
511554#endif
512555
@@ -1137,7 +1180,7 @@ static webgpu_command ggml_webgpu_soft_max(webgpu_context & ctx,
11371180 ggml_nrows (dst));
11381181}
11391182
1140- // Returns true if node has enqueued work into the queue, false otherwise
1183+ // Returns the encoded command, or std::nullopt if the operation is a no-op
11411184static std::optional<webgpu_command> ggml_webgpu_encode_node (webgpu_context ctx, ggml_tensor * node) {
11421185 if (ggml_is_empty (node)) {
11431186 return std::nullopt ;
@@ -1208,7 +1251,6 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
12081251
12091252 WEBGPU_CPU_PROFILE_TOTAL_START (graph_compute);
12101253
1211- WEBGPU_CPU_PROFILE_DETAIL_START (graph_compute_encode);
12121254 std::vector<webgpu_command> commands;
12131255 std::vector<std::vector<wgpu::FutureWaitInfo>> futures;
12141256 for (int i = 0 ; i < cgraph->n_nodes ; i++) {
@@ -1227,14 +1269,8 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
12271269 std::vector<wgpu::FutureWaitInfo> new_futures = ggml_backend_webgpu_submit (ctx, commands);
12281270 futures.push_back ({ new_futures });
12291271 }
1230- WEBGPU_CPU_PROFILE_DETAIL_END (graph_compute_encode, ctx);
1231-
1232- WEBGPU_CPU_PROFILE_DETAIL_START (graph_compute_wait);
12331272 ggml_backend_webgpu_wait (ctx, futures);
1234- WEBGPU_CPU_PROFILE_DETAIL_END (graph_compute_wait, ctx);
1235-
12361273 WEBGPU_CPU_PROFILE_TOTAL_END (graph_compute, ctx);
1237-
12381274 return GGML_STATUS_SUCCESS;
12391275}
12401276
@@ -1260,7 +1296,6 @@ static ggml_backend_i ggml_backend_webgpu_i = {
12601296/* GGML Backend Buffer Interface */
12611297
12621298static void ggml_backend_webgpu_buffer_free_buffer (ggml_backend_buffer_t buffer) {  // destroys the wgpu buffer held by this backend buffer's context
1263- WEBGPU_LOG_DEBUG (" ggml_backend_webgpu_buffer_free_buffer()" );
12641299 ggml_backend_webgpu_buffer_context * ctx = static_cast <ggml_backend_webgpu_buffer_context *>(buffer->context );
12651300 ctx->buffer .Destroy ();  // only the wgpu::Buffer is destroyed here; the context object itself is not freed in this function
12661301}
0 commit comments