Commit 5a16b6d

Fixes for TX HDS and added to main HDS config (#1236)
## Summary by CodeRabbit

* **Bug Fixes**
  * Fixed packet segment chaining in scatter-gather transmit paths, improving transmission reliability.
* **New Features**
  * Added a large CPU-side TX memory region with many small buffers to increase transmit capacity.
* **Improvements**
  * Converted header/data split to a boolean flag across TX configs for clearer behavior.
  * Refined GPU-direct header and payload handling for more consistent transmission.
* **Chores**
  * Minor config formatting and comment updates.

Signed-off-by: Cliff Burdick <[email protected]>
Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
1 parent f0e97ca commit 5a16b6d

6 files changed, +34 -22 lines changed

applications/adv_networking_bench/adv_networking_bench_default_sw_loopback.yaml

Lines changed: 1 addition & 1 deletion

@@ -77,7 +77,7 @@ bench_rx:
 bench_tx:
   interface_name: "loopback_ports" # Name of the TX port from the advanced_network config
   gpu_direct: true # Set to true if using a GPU region for the Tx queues.
-  split_boundary: 0 # Byte boundary where header and data is split, 0 if no split
+  split_boundary: false # True if header-data split is enabled
   batch_size: 10240
   payload_size: 1000
   header_size: 64

applications/adv_networking_bench/adv_networking_bench_default_tx_rx.yaml

Lines changed: 1 addition & 1 deletion

@@ -88,7 +88,7 @@ bench_rx:
 bench_tx:
   interface_name: "tx_port" # Name of the TX port from the advanced_network config
   gpu_direct: true # Set to true if using a GPU region for the Tx queues.
-  split_boundary: 0 # Byte boundary where header and data is split, 0 if no split
+  split_boundary: false # True if header-data split is enabled
   batch_size: 10240
   payload_size: 1000
   header_size: 64

applications/adv_networking_bench/adv_networking_bench_default_tx_rx_hds.yaml

Lines changed: 8 additions & 2 deletions

@@ -31,6 +31,11 @@ advanced_network:
   loopback: ""

   memory_regions:
+  - name: "Data_TX_CPU"
+    kind: "huge"
+    affinity: 0
+    num_bufs: 51200
+    buf_size: 64
   - name: "Data_TX_GPU"
     kind: "device"
     affinity: 0
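
(A plausible reading of the sizing, not stated in the diff: buf_size: 64 matches the bench_tx header_size: 64 further down this file, and num_bufs: 51200 covers five full batches at batch_size: 10240, since 5 × 10240 = 51200, roughly 3.3 MB of hugepage memory for CPU-resident headers.)
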
@@ -57,11 +62,12 @@ advanced_network:
         batch_size: 10240
         cpu_core: 11
         memory_regions:
+        - "Data_TX_CPU"
         - "Data_TX_GPU"
         offloads:
         - "tx_eth_src"
   - name: "rx_port"
-    address: <0000:00:00.0> # The BUS address of the interface doing Rx
+    address: <0000:00:00.0>              # The BUS address of the interface doing Rx
     rx:
       flow_isolation: true
       queues:
@@ -94,7 +100,7 @@ bench_rx:
 bench_tx:
   interface_name: "tx_port" # Name of the TX port from the advanced_network config
   gpu_direct: true # Set to true if using a GPU region for the Tx queues.
-  split_boundary: 0 # Byte boundary where header and data is split, 0 if no split
+  split_boundary: true # Whether header and data is split (Header to CPU, payload to GPU)
   batch_size: 10240
   payload_size: 1000
   header_size: 64
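
Taken together with the queue's memory_regions order above, gpu_direct: true plus split_boundary: true sends each packet as two segments: the header drawn from the new Data_TX_CPU region and the payload from Data_TX_GPU, as the "(Header to CPU, payload to GPU)" comment indicates.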

applications/adv_networking_bench/adv_networking_bench_default_tx_rx_multi_q_hds.yaml

Lines changed: 1 addition & 1 deletion

@@ -340,7 +340,7 @@ bench_rx:
 bench_tx:
   interface_name: tx_port
   gpu_direct: false
-  split_boundary: 0
+  split_boundary: false
   batch_size: 10240
   payload_size: 1000
   header_size: 64

applications/adv_networking_bench/cpp/default_bench_op_tx.h

Lines changed: 20 additions & 16 deletions

@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -144,7 +144,7 @@ class AdvNetworkingBenchDefaultTxOp : public Operator {
     // This section simply serves as an example to get an Eth+IP+UDP header onto the GPU,
     // but this header will not be correct without modification of the IP and MAC. In a
     // real situation the header would likely be constructed on the GPU
-    if (gpu_direct_.get() && hds_.get() == 0) {
+    if (gpu_direct_.get() && !hds_.get()) {
       cudaMalloc(&gds_header_, header_size_.get());
       cudaMemset(gds_header_, 0, header_size_.get());
@@ -168,15 +168,15 @@ class AdvNetworkingBenchDefaultTxOp : public Operator {
                         "Payload size",
                         "Payload size to send including HDS portion",
                         1400);
-    spec.param<int>(hds_,
-                    "split_boundary",
-                    "Header-data split boundary",
-                    "Byte boundary where header and data is split",
-                    0);
+    spec.param<bool>(hds_,
+                     "split_boundary",
+                     "Header-data split boundary",
+                     "Whether header and data is split (Header to CPU, payload to GPU)",
+                     false);
     spec.param<bool>(gpu_direct_,
                      "gpu_direct",
                      "GPUDirect enabled",
-                     "Byte boundary where header and data is split",
+                     "Whether GPUDirect is enabled",
                      false);
     spec.param<std::string>(udp_src_port_str_,
                             "udp_src_port", "UDP source port",
@@ -212,7 +212,11 @@ class AdvNetworkingBenchDefaultTxOp : public Operator {
212212
}
213213

214214
auto msg = create_tx_burst_params();
215-
set_header(msg, port_id_, queue_id, batch_size_.get(), hds_.get() > 0 ? 2 : 1);
215+
set_header(msg,
216+
port_id_,
217+
queue_id,
218+
batch_size_.get(),
219+
(gpu_direct_.get() && hds_.get()) ? 2 : 1);
216220

217221
/**
218222
* Spin waiting until a buffer is free. This can be stalled by sending faster than the NIC can
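
The segment count now requires both flags: a burst advertises two segments only when GPUDirect and header-data split are enabled together, whereas the old expression hds_.get() > 0 ? 2 : 1 requested two segments for any non-zero split even with gpu_direct off. This matches the later get_segment_packet_ptr(msg, 1, num_pkt) access, which is likewise guarded by both flags.
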
@@ -242,7 +246,7 @@ class AdvNetworkingBenchDefaultTxOp : public Operator {

     // For HDS mode or CPU mode populate the packet headers
     for (int num_pkt = 0; num_pkt < get_num_packets(msg); num_pkt++) {
-      if (!gpu_direct_.get() || hds_.get() > 0) {
+      if (!gpu_direct_.get() || hds_.get()) {
         if ((ret = set_eth_header(msg, num_pkt, eth_dst_)) != Status::SUCCESS) {
           HOLOSCAN_LOG_ERROR("Failed to set Ethernet header for packet {}", num_pkt);
           free_all_packets_and_burst_tx(msg);
@@ -273,7 +277,7 @@ class AdvNetworkingBenchDefaultTxOp : public Operator {
        udp_dst_idx_ = (++udp_dst_idx_ % udp_dst_ports_.size());

        // Only set payload on CPU buffer if we're not in HDS mode
-       if (hds_.get() == 0) {
+       if (!hds_.get()) {
         if ((ret = set_udp_payload(
              msg,
              num_pkt,
@@ -287,10 +291,10 @@ class AdvNetworkingBenchDefaultTxOp : public Operator {
       }

       // Figure out the CPU and GPU length portions for advanced_network
-      if (gpu_direct_.get() && hds_.get() > 0) {
+      if (gpu_direct_.get() && hds_.get()) {
         gpu_bufs[cur_idx][num_pkt] =
             reinterpret_cast<uint8_t*>(get_segment_packet_ptr(msg, 1, num_pkt));
-        if ((ret = set_packet_lengths(msg, num_pkt, {hds_.get(), payload_size_.get()})) !=
+        if ((ret = set_packet_lengths(msg, num_pkt, {header_size_.get(), payload_size_.get()})) !=
             Status::SUCCESS) {
           HOLOSCAN_LOG_ERROR("Failed to set lengths for packet {}", num_pkt);
           free_all_packets_and_burst_tx(msg);
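
This also repairs a knock-on effect of the type change: with hds_ now a bool, the old initializer {hds_.get(), payload_size_.get()} would pass a header-segment length of 1 (or 0) rather than a byte offset, so the split point is now taken explicitly from header_size_.
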
@@ -313,7 +317,7 @@ class AdvNetworkingBenchDefaultTxOp : public Operator {
    }

    // In GPU-only mode copy the header
-   if (gpu_direct_.get() && hds_.get() == 0) {
+   if (gpu_direct_.get() && !hds_.get()) {
      copy_headers(gpu_bufs[cur_idx],
                   gds_header_,
                   header_size_.get(),
@@ -323,7 +327,7 @@ class AdvNetworkingBenchDefaultTxOp : public Operator {

    // Populate packets with 16-bit numbers of {0,0}, {1,1}, ...
    if (gpu_direct_.get()) {
-     const auto offset = (hds_.get() > 0) ? 0 : header_size_.get();
+     const auto offset = hds_.get() ? 0 : header_size_.get();
      populate_packets(gpu_bufs[cur_idx],
                       payload_size_.get(),
                       get_num_packets(msg),
@@ -371,7 +375,7 @@ class AdvNetworkingBenchDefaultTxOp : public Operator {
   size_t udp_dst_idx_ = 0;
   std::vector<uint16_t> udp_src_ports_;
   std::vector<uint16_t> udp_dst_ports_;
-  Parameter<int> hds_;          // Header-data split point
+  Parameter<bool> hds_;         // Header-data split enabled
   Parameter<bool> gpu_direct_;  // GPUDirect enabled
   Parameter<uint32_t> batch_size_;
   Parameter<uint16_t> header_size_;  // Header size of packet

operators/advanced_network/advanced_network/managers/dpdk/adv_network_dpdk_mgr.cpp

Lines changed: 3 additions & 1 deletion

@@ -2220,11 +2220,13 @@ int DpdkMgr::tx_core_worker(void* arg) {
       // Scatter mode needs to chain all the buffers
       if (msg->hdr.hdr.num_segs > 1) {
         for (size_t p = 0; p < msg->hdr.hdr.num_pkts; p++) {
-          for (int seg = 0; seg < msg->hdr.hdr.num_segs; seg++) {
+          for (int seg = 0; seg < msg->hdr.hdr.num_segs - 1; seg++) {
            auto* mbuf = reinterpret_cast<struct rte_mbuf*>(msg->pkts[seg][p]);
            mbuf->next = reinterpret_cast<struct rte_mbuf*>(msg->pkts[seg + 1][p]);
          }

+          // The next pointer of the last segment should be nullptr
+          reinterpret_cast<struct rte_mbuf*>(msg->pkts[msg->hdr.hdr.num_segs - 1][p])->next = nullptr;
          reinterpret_cast<struct rte_mbuf*>(msg->pkts[0][p])->nb_segs = msg->hdr.hdr.num_segs;
        }
      }
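
The bug this hunk fixes: the old loop ran seg up to num_segs - 1 inclusive, so its body read msg->pkts[seg + 1][p] one element past the last segment, and a recycled mbuf could also retain a stale next pointer from a previous chain. Below is a minimal standalone sketch of the corrected chaining pattern; chain_segments and the flat segs array are hypothetical illustrations, while the next and nb_segs fields are real rte_mbuf members.

#include <rte_mbuf.h>

// Chain per-packet segment mbufs into one multi-segment frame.
static void chain_segments(struct rte_mbuf** segs, uint16_t num_segs) {
  // Link each segment to the next; stopping one short of num_segs means
  // segs[seg + 1] never reads past the end of the array.
  for (uint16_t seg = 0; seg + 1 < num_segs; seg++) {
    segs[seg]->next = segs[seg + 1];
  }
  // Null-terminate the chain: a recycled mbuf can carry a stale next
  // pointer, which would splice garbage segments onto the frame.
  segs[num_segs - 1]->next = nullptr;
  // The head mbuf records how many segments make up the chain.
  segs[0]->nb_segs = num_segs;
}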
