diff --git a/docs/DEBUG_GUIDE.md b/docs/DEBUG_GUIDE.md index 74639a59..b40f18c1 100644 --- a/docs/DEBUG_GUIDE.md +++ b/docs/DEBUG_GUIDE.md @@ -165,7 +165,70 @@ with open('/tmp/musa_telemetry.json') as f: ## 3. 常用调试组合 -### 3.1 性能与热点(TensorFlow 侧) +### 3.1 三条流(计算/H2D/D2H)是怎么安排的? + +结合 `musa_ext/kernels/math/musa_matmul_op.cc` 和 +`musa_ext/mu/device/musa_device.cc`,当前插件的行为可以概括为: + +- **MatMul 只负责把计算 kernel 提交到计算流**。`MusaMatMulOp::Compute()` + 通过 `GetHandleByCtx(ctx)` 取得对应设备的 muDNN handle;该 handle + 在 `MusaDevice::MusaDevice()` 中已经通过 `mudnn_handle_->SetStream(stream_)` + 绑定到了 `stream_`(计算流)。 +- `IsExpensive()` **只是告诉 TensorFlow 这是个计算密集型算子**,方便 host 侧调度做成本估计; + **它不会让 TensorFlow 自动把一个 MatMul 拆成“计算/H2D/D2H 三条流排流水”**。 +- 三条流是 **MUSA 插件自己显式创建并赋予职责** 的: + - `stream_`:计算流 + - `h2d_stream_`:Host → Device 拷贝流 + - `d2h_stream_`:Device → Host 拷贝流 + +也就是说,**不是 TensorFlow 会针对单个 MatMul 自动设计出一套“三流并行流水线”**, +而是 `MusaDeviceContext` 这层在处理拷贝时,明确决定把哪些 memcpy 放到专用 +copy stream、以及何时插入 event/wait。 + +### 3.1.1 会自动排流水吗? + +**会有一定程度的异步排队,但不是 TensorFlow 通用地、全自动地替你规划三流流水线。** + +- 同一条流内部,MUSA runtime 保证 **FIFO 顺序执行**。 +- 不同流之间是否能并发,取决于: + 1. 插件是否把工作提交到了不同的流; + 2. 插件是否插入了 `musaEventRecord` / `musaStreamWaitEvent`; + 3. 底层硬件是否支持计算与拷贝重叠。 +- 在本实现里: + - **MatMul 计算**固定进入 `stream_` + - **H2D 拷贝**通常进入 `h2d_stream_` + - **D2H 拷贝**通常进入 `d2h_stream_` + +因此,**如果图里本来就存在“前后独立”的计算与拷贝**,这些工作有机会在不同流上重叠; +但**不是** TensorFlow 自动把单个 MatMul 操作改写成“三段式流水”。 + +### 3.1.2 会自动决定同步 / 异步吗? 
+ +**部分会,但“怎么同步、何时异步”主要是插件代码写死/显式决定的,不是 TensorFlow +统一自动推断出来的。** + +- **MatMul kernel launch 本身是异步入队** 到计算流的;`Compute()` 返回前通常不会阻塞到 + kernel 执行完成。 +- **H2D / D2H 拷贝** 是否走异步路径,由 `MusaDeviceContext` 决定: + - pinned host memory:优先 `musaMemcpyAsync(...)` + - pageable host memory:小拷贝或失败回退时会走同步 `musaMemcpy(...)` + - 环境变量 + `MUSA_PAGEABLE_H2D_ON_COMPUTE_STREAM=1` / + `MUSA_PINNED_H2D_ON_COMPUTE_STREAM=1` + 还可以把部分 H2D 直接改到计算流上 +- **跨流同步** 也是显式写出来的: + - H2D 前,会在计算流上 record event,再让 `h2d_stream_` wait,避免覆盖仍被旧计算使用的目标 buffer + - D2H 前,会在计算流上 record event,再让 `d2h_stream_` wait,保证先看到最新计算结果 + - 拷贝完成后,再通过 `done` callback / `event_mgr_` 通知 TensorFlow 后续可以继续调度 + +结论: + +- **TensorFlow 不会自动替这个 MatMul op 设计完整的三流流水策略** +- **当前仓库确实使用了三条流** +- **异步/同步与跨流依赖,主要由 `MusaDeviceContext` 中的 memcpy + event/wait 逻辑显式控制** +- **MatMul 只是稳定地跑在计算流上,本身不决定 H2D/D2H 的排流水策略** + +### 3.2 性能与热点(TensorFlow 侧) 内核级计时宏已移除。可结合 TensorFlow 自带能力做性能分析,例如: @@ -180,7 +243,7 @@ export TF_CPP_VMODULE="musa_graph_optimizer=1" python test_runner.py --single ops/matmul_op_test.py ``` -### 3.2 图优化调试 +### 3.3 图优化调试 ```bash # 查看图优化器的详细日志 @@ -197,7 +260,7 @@ export MUSA_DUMP_GRAPHDEF_DIR=/tmp/graphs python test_runner.py ``` -### 3.3 脏数据诊断 +### 3.4 脏数据诊断 ```bash # 启用遥测进行脏数据追溯 @@ -212,14 +275,14 @@ cd test && python test_runner.py grep "dirty_data_detected" /tmp/telemetry.json ``` -### 3.4 静音模式(仅显示错误) +### 3.5 静音模式(仅显示错误) ```bash export TF_CPP_MIN_LOG_LEVEL=2 python test_runner.py ``` -### 3.5 恢复默认配置 +### 3.6 恢复默认配置 ```bash unset MUSA_TELEMETRY_ENABLED MUSA_TELEMETRY_LOG_PATH diff --git a/musa_ext/kernels/math/musa_matmul_op.cc b/musa_ext/kernels/math/musa_matmul_op.cc index 71af405e..08b3e691 100644 --- a/musa_ext/kernels/math/musa_matmul_op.cc +++ b/musa_ext/kernels/math/musa_matmul_op.cc @@ -60,7 +60,10 @@ class MusaMatMulOp : public MusaOpKernel { } // MatMul/BatchMatMul is computationally intensive - // Mark as expensive for optimal scheduling (async execution) + // Mark as expensive for TensorFlow's op scheduler. 
+ // This affects host-side scheduling priority/costing only; the actual muDNN + // kernel launch still goes to the device compute stream that is bound in + // MusaDevice::MusaDevice via mudnn_handle_->SetStream(stream_). bool IsExpensive() override { return true; } void Compute(OpKernelContext* ctx) override { @@ -98,6 +101,9 @@ class MusaMatMulOp : public MusaOpKernel { auto flat_out = out->flat(); return; } + // MatMul itself does not choose between compute/H2D/D2H streams. It always + // launches on the compute stream attached to the per-device muDNN handle. + // Host-device transfers are orchestrated separately by MusaDeviceContext. auto& handle = GetHandleByCtx(ctx); handle.SetAllowTF32(tf32_enabled_); // Use TF32 setting from constructor mTensor mt_a = CreateMTensor(in0); @@ -192,4 +198,4 @@ REGISTER_MUSA_MATMUL_ALL(bfloat16); #undef REGISTER_MUSA_MATMUL_ALL } // namespace musa -} // namespace tensorflow \ No newline at end of file +} // namespace tensorflow diff --git a/musa_ext/mu/device/musa_device.cc b/musa_ext/mu/device/musa_device.cc index 2a25643c..42f45cbb 100644 --- a/musa_ext/mu/device/musa_device.cc +++ b/musa_ext/mu/device/musa_device.cc @@ -100,6 +100,13 @@ void MusaDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor, return Status::OK(); } + // The copy policy is implemented here by the plugin, not inferred + // automatically by TensorFlow from a particular op: + // 1. Record an event on the compute stream so H2D does not overwrite a + // destination buffer still referenced by earlier kernels. + // 2. Make the H2D stream wait on that event, then enqueue the copy on it. + // 3. Report completion through `done`, which lets TensorFlow continue + // scheduling downstream work only after the copy finishes. 
musaEvent_t sync_event; musaError_t err = musaEventCreateWithFlags(&sync_event, musaEventDisableTiming); @@ -496,6 +503,15 @@ MusaDevice::MusaDevice(Env* env, const DeviceAttributes& attributes, << " total_memory=" << total_memory << " free_memory=" << free_memory << " bfc_memory_limit=" << bfc_memory_limit; + // This device owns three streams with fixed roles: + // * stream_ : compute kernels (e.g. MatMul via muDNN/muBLAS handles) + // * h2d_stream_ : host-to-device copies + // * d2h_stream_ : device-to-host copies + // + // TensorFlow does not automatically turn every op into a generic 3-stream + // pipeline. The plugin decides when copies stay on dedicated copy streams, + // when they fall back to synchronous musaMemcpy, and when explicit + // cross-stream event waits are inserted. // Create main compute stream musaError_t stream_err = musaStreamCreate(&stream_); if (stream_err != musaSuccess) {