
bond eth drop packets in af-packet #3659

@PFangwang

Description

The problem is at https://github.com/FDio/vpp/blob/master/src/plugins/af_packet/node.c:467 and https://github.com/FDio/vpp/blob/master/src/plugins/af_packet/node.c:721.
Trace:

```
00:14:41:954139: af-packet-input
  af_packet: hw_if_index 8 rx-queue 2 next-index 4
    tpacket2_hdr:
      status 0x20000051 len 60 snaplen 60 mac 66 net 80
      sec 0x6937cb50 nsec 0x3b159f97 vlan 401 vlan_tpid 33024
    vnet-hdr:
      flags 0x00 gso_type 0x00 hdr_len 0
      gso_size 0 csum_start 0 csum_offset 0
00:14:41:954148: ethernet-input
  ARP: 00:f1:f7:a0:39:0c -> 00:f1:f7:a0:39:1e 802.1q vlan 401
00:14:41:954155: error-drop
  rx:eth7
00:14:41:954157: drop
  ethernet-input: unknown vlan
```

Forcing the next index to VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT at those lines means bond-input never receives the packets; ethernet-input then drops them as "unknown vlan", as the trace above shows.
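
For context, this is roughly what happens at the linked lines today (a paraphrase based on the description above, not a verbatim quote of node.c):

```c
/* non-IP (ethernet) mode, once per packet -- paraphrased, not verbatim */
next0 = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT; /* unconditional override */
/* apif->per_interface_next_index (set when the interface is a bond
   member so packets go to bond-input) never takes effect here */
```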

My solution:

```c
always_inline uword
af_packet_v3_device_input_fn (vlib_main_t *vm, vlib_node_runtime_t *node,
                              vlib_frame_t *frame, af_packet_if_t *apif,
                              u16 queue_id, u8 is_cksum_gso_enabled)
{
  af_packet_main_t *apm = &af_packet_main;
  af_packet_queue_t *rx_queue = vec_elt_at_index (apif->rx_queues, queue_id);
  tpacket3_hdr_t *tph;
  u32 next_index;
  u32 n_free_bufs;
  u32 n_rx_packets = 0;
  u32 n_rx_bytes = 0;
  u32 timedout_blk = 0;
  u32 total = 0;
  u32 *to_next = 0;
  u32 block = rx_queue->next_rx_block;
  u32 block_nr = rx_queue->rx_req->req3.tp_block_nr;
  u8 *block_start = 0;
  uword n_trace = vlib_get_trace_count (vm, node);
  u32 thread_index = vm->thread_index;
  u32 n_buffer_bytes = vlib_buffer_get_default_data_size (vm);
  u32 min_bufs = rx_queue->rx_req->req3.tp_frame_size / n_buffer_bytes;
  u32 num_pkts = 0;
  u32 rx_frame_offset = 0;
  block_desc_t *bd = 0;
  vlib_buffer_t bt = {};
  u8 is_ip = (apif->mode == AF_PACKET_IF_MODE_IP);
  u32 next_index_template;

  /* compute the next node once per dispatch: honors a per-interface
     redirect (e.g. bond-input for a bond member) and the device-input
     feature arc instead of forcing ethernet-input per packet */
  if (is_ip)
    next_index_template = VNET_DEVICE_INPUT_NEXT_IP4_INPUT;
  else
    {
      next_index_template = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT;
      if (PREDICT_FALSE (apif->per_interface_next_index != ~0))
        next_index_template = apif->per_interface_next_index;

      /* redirect if feature path enabled */
      vnet_feature_start_device_input (apif->sw_if_index,
                                       &next_index_template, &bt);
    }

  next_index = next_index_template;

  if ((((block_desc_t *) (block_start = rx_queue->rx_ring[block]))
         ->hdr.bh1.block_status &
       TP_STATUS_USER) != 0)
    {
      u32 n_required = 0;
      bd = (block_desc_t *) block_start;

      if (PREDICT_FALSE (rx_queue->is_rx_pending))
        {
          num_pkts = rx_queue->num_rx_pkts;
          rx_frame_offset = rx_queue->rx_frame_offset;
          rx_queue->is_rx_pending = 0;
        }
      else
        {
          num_pkts = bd->hdr.bh1.num_pkts;
          rx_frame_offset = bd->hdr.bh1.offset_to_first_pkt;
          total++;

          if (TP_STATUS_BLK_TMO & bd->hdr.bh1.block_status)
            timedout_blk++;
        }

      n_required = clib_max (num_pkts, VLIB_FRAME_SIZE);
      n_free_bufs = vec_len (apm->rx_buffers[thread_index]);
      if (PREDICT_FALSE (n_free_bufs < n_required))
        {
          vec_validate (apm->rx_buffers[thread_index],
                        n_required + n_free_bufs - 1);
          n_free_bufs += vlib_buffer_alloc (
            vm, &apm->rx_buffers[thread_index][n_free_bufs], n_required);
          vec_set_len (apm->rx_buffers[thread_index], n_free_bufs);
        }

      while (num_pkts && (n_free_bufs >= min_bufs))
        {
          u32 n_left_to_next;

          vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

          while (num_pkts && n_left_to_next && (n_free_bufs >= min_bufs))
            {
              tph = (tpacket3_hdr_t *) (block_start + rx_frame_offset);

              if (num_pkts > 1)
                CLIB_PREFETCH (block_start + rx_frame_offset +
                                 tph->tp_next_offset,
                               2 * CLIB_CACHE_LINE_BYTES, LOAD);

              vlib_buffer_t *b0 = 0, *first_b0 = 0, *prev_b0 = 0;
              vnet_virtio_net_hdr_t *vnet_hdr = 0;
              u32 data_len = tph->tp_snaplen;
              u32 offset = 0;
              u32 bi0 = ~0, first_bi0 = ~0;
              u8 l4_hdr_sz = 0;
              /* start from the template instead of forcing ethernet-input */
              u32 next0 = next_index_template;

              if (is_cksum_gso_enabled)
                vnet_hdr =
                  (vnet_virtio_net_hdr_t *) ((u8 *) tph + tph->tp_mac -
                                             sizeof (vnet_virtio_net_hdr_t));

              /* save current state and return */
              if (PREDICT_FALSE (((data_len / n_buffer_bytes) + 1) >
                                 vec_len (apm->rx_buffers[thread_index])))
                {
                  rx_queue->rx_frame_offset = rx_frame_offset;
                  rx_queue->num_rx_pkts = num_pkts;
                  rx_queue->is_rx_pending = 1;
                  vlib_put_next_frame (vm, node, next_index, n_left_to_next);
                  goto done;
                }

              while (data_len)
                {
                  /* grab free buffer */
                  u32 last_empty_buffer =
                    vec_len (apm->rx_buffers[thread_index]) - 1;
                  bi0 = apm->rx_buffers[thread_index][last_empty_buffer];
                  vec_set_len (apm->rx_buffers[thread_index],
                               last_empty_buffer);
                  n_free_bufs--;

                  /* copy data */
                  u32 bytes_to_copy =
                    data_len > n_buffer_bytes ? n_buffer_bytes : data_len;
                  u32 vlan_len = 0;
                  u32 bytes_copied = 0;

                  b0 = vlib_get_buffer (vm, bi0);
                  b0->current_data = 0;

                  /* Kernel removes VLAN headers, so reconstruct VLAN */
                  if (PREDICT_FALSE (tph->tp_status & TP_STATUS_VLAN_VALID))
                    {
                      if (PREDICT_TRUE (offset == 0))
                        {
                          clib_memcpy_fast (vlib_buffer_get_current (b0),
                                            (u8 *) tph + tph->tp_mac,
                                            sizeof (ethernet_header_t));
                          ethernet_header_t *eth =
                            vlib_buffer_get_current (b0);
                          ethernet_vlan_header_t *vlan =
                            (ethernet_vlan_header_t *) (eth + 1);
                          vlan->priority_cfi_and_id =
                            clib_host_to_net_u16 (tph->hv1.tp_vlan_tci);
                          vlan->type = eth->type;
                          eth->type =
                            clib_host_to_net_u16 (ETHERNET_TYPE_VLAN);
                          vlan_len = sizeof (ethernet_vlan_header_t);
                          bytes_copied = sizeof (ethernet_header_t);
                        }
                    }
                  clib_memcpy_fast (((u8 *) vlib_buffer_get_current (b0)) +
                                      bytes_copied + vlan_len,
                                    (u8 *) tph + tph->tp_mac + offset +
                                      bytes_copied,
                                    (bytes_to_copy - bytes_copied));

                  /* fill buffer header */
                  b0->current_length = bytes_to_copy + vlan_len;

                  if (offset == 0)
                    {
                      b0->total_length_not_including_first_buffer = 0;
                      b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;
                      vnet_buffer (b0)->sw_if_index[VLIB_RX] =
                        apif->sw_if_index;
                      vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~0;
                      first_b0 = b0;
                      first_bi0 = bi0;
                      if (is_cksum_gso_enabled)
                        {
                          if (vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)
                            fill_cksum_offload (first_b0, &l4_hdr_sz, is_ip);
                          if (vnet_hdr->gso_type & (VIRTIO_NET_HDR_GSO_TCPV4 |
                                                    VIRTIO_NET_HDR_GSO_TCPV6))
                            fill_gso_offload (first_b0, vnet_hdr->gso_size,
                                              l4_hdr_sz);
                        }
                    }
                  else
                    buffer_add_to_chain (b0, first_b0, prev_b0, bi0);

                  prev_b0 = b0;
                  offset += bytes_to_copy;
                  data_len -= bytes_to_copy;
                }
              n_rx_packets++;
              n_rx_bytes += tph->tp_snaplen;
              to_next[0] = first_bi0;
              to_next += 1;
              n_left_to_next--;

              /* drop partial packets */
              if (PREDICT_FALSE (tph->tp_len != tph->tp_snaplen))
                {
                  next0 = VNET_DEVICE_INPUT_NEXT_DROP;
                  first_b0->error =
                    node->errors[AF_PACKET_INPUT_ERROR_PARTIAL_PKT];
                }
              else
                {
                  if (PREDICT_FALSE (apif->mode == AF_PACKET_IF_MODE_IP))
                    {
                      switch (first_b0->data[0] & 0xf0)
                        {
                        case 0x40:
                          next0 = VNET_DEVICE_INPUT_NEXT_IP4_INPUT;
                          break;
                        case 0x60:
                          next0 = VNET_DEVICE_INPUT_NEXT_IP6_INPUT;
                          break;
                        default:
                          next0 = VNET_DEVICE_INPUT_NEXT_DROP;
                          break;
                        }
                      if (PREDICT_FALSE (apif->per_interface_next_index != ~0))
                        next0 = apif->per_interface_next_index;
                    }
                  else
                    {
                      /* copy feature arc data from template */
                      first_b0->current_config_index = bt.current_config_index;
                      vnet_buffer (first_b0)->feature_arc_index =
                        vnet_buffer (&bt)->feature_arc_index;
                    }
                }

              /* trace */
              if (PREDICT_FALSE (n_trace > 0 &&
                                 vlib_trace_buffer (vm, node, next0, first_b0,
                                                    /* follow_chain */ 0)))
                {
                  af_packet_input_trace_t *tr;
                  vlib_set_trace_count (vm, node, --n_trace);
                  tr = vlib_add_trace (vm, node, first_b0, sizeof (*tr));
                  tr->is_v3 = 1;
                  tr->next_index = next0;
                  tr->hw_if_index = apif->hw_if_index;
                  tr->queue_id = queue_id;
                  tr->block = block;
                  tr->block_start = bd;
                  tr->pkt_num = bd->hdr.bh1.num_pkts - num_pkts;
                  clib_memcpy_fast (&tr->bd, bd, sizeof (block_desc_t));
                  clib_memcpy_fast (&tr->tph3, tph, sizeof (tpacket3_hdr_t));
                  if (is_cksum_gso_enabled)
                    clib_memcpy_fast (&tr->vnet_hdr, vnet_hdr,
                                      sizeof (vnet_virtio_net_hdr_t));
                  else
                    clib_memset_u8 (&tr->vnet_hdr, 0,
                                    sizeof (vnet_virtio_net_hdr_t));
                }

              /* enqueue and take next packet */
              vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                               n_left_to_next, first_bi0,
                                               next0);

              /* next packet */
              num_pkts--;
              rx_frame_offset += tph->tp_next_offset;
            }

          vlib_put_next_frame (vm, node, next_index, n_left_to_next);
        }

      if (PREDICT_TRUE (num_pkts == 0))
        {
          bd->hdr.bh1.block_status = TP_STATUS_KERNEL;
          block = (block + 1) % block_nr;
        }
      else
        {
          /* ran out of buffers: remember where to resume in this block */
          rx_queue->rx_frame_offset = rx_frame_offset;
          rx_queue->num_rx_pkts = num_pkts;
          rx_queue->is_rx_pending = 1;
        }
    }

  rx_queue->next_rx_block = block;

done:

  if (apm->polling_count == 0)
    {
      if ((((block_desc_t *) (block_start = rx_queue->rx_ring[block]))
             ->hdr.bh1.block_status &
           TP_STATUS_USER) != 0)
        vlib_node_set_state (vm, node->node_index, VLIB_NODE_STATE_POLLING);
      else
        vlib_node_set_state (vm, node->node_index, VLIB_NODE_STATE_INTERRUPT);
    }

  vlib_error_count (vm, node->node_index, AF_PACKET_INPUT_ERROR_TOTAL_RECV_BLK,
                    total);
  vlib_error_count (vm, node->node_index, AF_PACKET_INPUT_ERROR_TIMEDOUT_BLK,
                    timedout_blk);

  vlib_increment_combined_counter
    (vnet_get_main ()->interface_main.combined_sw_if_counters
     + VNET_INTERFACE_COUNTER_RX,
     vlib_get_thread_index (), apif->hw_if_index, n_rx_packets, n_rx_bytes);

  vnet_device_increment_rx_packets (thread_index, n_rx_packets);
  return n_rx_packets;
}
```
In short: compute next_index_template once per dispatch and use it as the per-packet default, removing the mandatory assignment to VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT.
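
Reduced to its essence (see the full function above), the per-packet default becomes:

```c
/* next0 now starts from a template computed once per dispatch; the
   template already honors apif->per_interface_next_index (e.g.
   bond-input for a bond member) and the device-input feature arc */
u32 next0 = next_index_template;
```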
