1414 * limitations under the License.
1515 */
1616
17+ #include <linux/tcp.h>
1718#include <stdint.h>
19+ #include <stdlib.h>
20+ #include <sys/mman.h>
1821#include <time.h>
22+ #include <unistd.h>
1923
2024#include "common.h"
2125#include "flow.h"
26+ #include "logging.h"
2227#include "socket.h"
23- #include "thread.h"
2428#include "stats.h"
29+ #include "thread.h"
2530
2631/*
2732 * We define the flow struct locally to this file to force outside users to go
@@ -41,6 +46,10 @@ struct flow {
4146 uint32_t f_events ; /* pending epoll events */
4247
4348 struct neper_stat * f_stat ;
49+
50+ /* TCP RX zerocopy state. */
51+ void * f_rx_zerocopy_buffer ;
52+ size_t f_rx_zerocopy_buffer_sz ;
4453};
4554
4655int flow_fd (const struct flow * f )
@@ -120,7 +129,24 @@ void flow_reconnect(struct flow *f, flow_handler fh, uint32_t events)
120129 flow_ctl (f , EPOLL_CTL_ADD , fh , events , true);
121130}
122131
123- void flow_create (const struct flow_create_args * args )
132+ void flow_init_rx_zerocopy (struct flow * f , int buffer_size , struct callbacks * cb )
133+ {
134+ // Use RCVLOWAT to reduce syscall overhead.
135+ int rcvlowat = buffer_size ;
136+ if (setsockopt (f -> f_fd , SOL_SOCKET , SO_RCVLOWAT , & rcvlowat ,
137+ sizeof (rcvlowat )) == -1 )
138+ PLOG_FATAL (cb , "setsockopt(SO_RCVLOWAT)" );
139+
140+ // Zerocopy requires mmap'd pages. Each flow has its own pages.
141+ f -> f_rx_zerocopy_buffer = mmap (NULL , buffer_size , PROT_READ ,
142+ MAP_SHARED , f -> f_fd , 0 );
143+ if (f -> f_rx_zerocopy_buffer == (void * )-1 )
144+ PLOG_FATAL (cb , "failed to map RX zerocopy buffer" );
145+
146+ f -> f_rx_zerocopy_buffer_sz = buffer_size ;
147+ }
148+
149+ struct flow * flow_create (const struct flow_create_args * args )
124150{
125151 struct thread * t = args -> thread ;
126152 struct flow * f = calloc_or_die (1 , sizeof (struct flow ), t -> cb );
@@ -164,6 +190,7 @@ void flow_create(const struct flow_create_args *args)
164190 events &= (f -> f_id & 1 ) ? EPOLLOUT : EPOLLIN ;
165191
166192 flow_ctl (f , EPOLL_CTL_ADD , args -> handler , events , true);
193+ return f ;
167194}
168195
169196/* Returns true if the deadline for the flow has expired.
@@ -292,10 +319,58 @@ void flow_delete(struct flow *f)
292319 */
293320 if (f -> f_mbuf != f -> f_thread -> f_mbuf )
294321 free (f -> f_mbuf );
322+
323+ /* Cleanup TCP RX zerocopy. */
324+ if (f -> f_rx_zerocopy_buffer )
325+ munmap (f -> f_rx_zerocopy_buffer , f -> f_rx_zerocopy_buffer_sz );
326+
295327 free (f );
296328}
297329
298330void flow_update_next_event (struct flow * f , uint64_t duration )
299331{
300332 f -> f_next_event += duration ;
301333}
334+
335+ ssize_t flow_recv_zerocopy (struct flow * f , void * copybuf , size_t copybuf_len ) {
336+ struct tcp_zerocopy_receive zc = {0 };
337+ socklen_t zc_len = sizeof (zc );
338+ int result ;
339+
340+ /* Setup both the mmap address and extra buffer for bytes that aren't
341+ * zerocopy-able.
342+ */
343+ zc .address = (__u64 )f -> f_rx_zerocopy_buffer ;
344+ zc .length = copybuf_len ; /* Same size used as zerocopy buffer. */
345+
346+ /* The kernel will effectively use copybuf_len as a hint as to what the
347+ * cutoff point between zerocopy and recv is. So passing a large copybuf
348+ * causes less zerocopy. Thus we pass just under a page to maximize
349+ * zerocopying.
350+ */
351+ zc .copybuf_address = (__u64 )copybuf ;
352+ zc .copybuf_len = copybuf_len < 4096 ? copybuf_len : 4095 ;
353+
354+ result = getsockopt (f -> f_fd , IPPROTO_TCP , TCP_ZEROCOPY_RECEIVE , & zc ,
355+ & zc_len );
356+ if (result == -1 )
357+ return result ;
358+
359+ /* Handle overflow data, i.e. bytes that couldn't be zerocopied. */
360+ if (zc .recv_skip_hint ) {
361+ int read_len = zc .recv_skip_hint < copybuf_len ?
362+ zc .recv_skip_hint : copybuf_len ;
363+ result = read (f -> f_fd , copybuf , read_len );
364+ if (result < 0 )
365+ PLOG_FATAL (f -> f_thread -> cb , "failed to read extra "
366+ "bytes" );
367+ }
368+
369+ /* Handle zerocopy data. */
370+ if (zc .length ) {
371+ flow_thread (f )-> io_stats .rx_zc_bytes += zc .length ;
372+ madvise (f -> f_rx_zerocopy_buffer , zc .length , MADV_DONTNEED );
373+ }
374+
375+ return zc .recv_skip_hint + zc .length + zc .copybuf_len ;
376+ }
0 commit comments