From cfc0969ad4417f17f8c5226441ade35b2ba4ad3e Mon Sep 17 00:00:00 2001
From: Scott Long
Date: Fri, 31 Aug 2012 10:07:38 +0000
Subject: [PATCH] Heavily optimize the case of small RX packets of 160 bytes or less.

For this case, allocate a plain mbuf and copy the frame into it, then
send the copy up the stack, leaving the original mbuf+cluster in place
in the receive ring for immediate re-use.  This saves a trip through
2 of the 3 zones of the compound mbuf allocator, a trip through busdma,
and a trip through 1 of the 3 mbuf destructors.  For our load at
Netflix, this can lower CPU consumption by as much as 20%.

The copy algorithm is based on investigative work from Luigi Rizzo
earlier in the year.

Reviewed by:	jfv
Obtained from:	Netflix
---
 sys/dev/ixgbe/ixgbe.c       | 68 ++++++++++++++++++++++++++++---------
 sys/dev/ixgbe/ixgbe.h       | 17 ++++++++++
 sys/dev/ixgbe/ixgbe_osdep.h | 19 +++++++++++
 3 files changed, 88 insertions(+), 16 deletions(-)

diff --git a/sys/dev/ixgbe/ixgbe.c b/sys/dev/ixgbe/ixgbe.c
index 5da5fec651ed..911249da8c30 100644
--- a/sys/dev/ixgbe/ixgbe.c
+++ b/sys/dev/ixgbe/ixgbe.c
@@ -3734,21 +3734,30 @@ ixgbe_refresh_mbufs(struct rx_ring *rxr, int limit)
 		mp = rxbuf->m_pack;
 		mp->m_pkthdr.len = mp->m_len = adapter->rx_mbuf_sz;
-		/* Get the memory mapping */
-		error = bus_dmamap_load_mbuf_sg(rxr->ptag,
-		    rxbuf->pmap, mp, pseg, &nsegs, BUS_DMA_NOWAIT);
-		if (error != 0) {
-			printf("Refresh mbufs: payload dmamap load"
-			    " failure - %d\n", error);
-			m_free(mp);
-			rxbuf->m_pack = NULL;
-			goto update;
+
+		/* If we're dealing with an mbuf that was copied rather
+		 * than replaced, there's no need to go through busdma.
+		 */
+		if ((rxbuf->flags & IXGBE_RX_COPY) == 0) {
+			/* Get the memory mapping */
+			error = bus_dmamap_load_mbuf_sg(rxr->ptag,
+			    rxbuf->pmap, mp, pseg, &nsegs, BUS_DMA_NOWAIT);
+			if (error != 0) {
+				printf("Refresh mbufs: payload dmamap load"
+				    " failure - %d\n", error);
+				m_free(mp);
+				rxbuf->m_pack = NULL;
+				goto update;
+			}
+			rxbuf->m_pack = mp;
+			bus_dmamap_sync(rxr->ptag, rxbuf->pmap,
+			    BUS_DMASYNC_PREREAD);
+			rxbuf->paddr = rxr->rx_base[i].read.pkt_addr =
+			    htole64(pseg[0].ds_addr);
+		} else {
+			rxr->rx_base[i].read.pkt_addr = rxbuf->paddr;
+			rxbuf->flags &= ~IXGBE_RX_COPY;
 		}
-		rxbuf->m_pack = mp;
-		bus_dmamap_sync(rxr->ptag, rxbuf->pmap,
-		    BUS_DMASYNC_PREREAD);
-		rxr->rx_base[i].read.pkt_addr =
-		    htole64(pseg[0].ds_addr);
 		refreshed = TRUE;
 		/* Next is precalculated */
@@ -4061,6 +4070,7 @@ ixgbe_setup_receive_ring(struct rx_ring *rxr)
 	rxr->next_to_refresh = 0;
 	rxr->lro_enabled = FALSE;
 	rxr->rx_split_packets = 0;
+	rxr->rx_copies = 0;
 	rxr->rx_bytes = 0;
 	rxr->discard = FALSE;
 	rxr->vtag_strip = FALSE;
@@ -4618,14 +4628,37 @@ ixgbe_rxeof(struct ix_queue *que, int count)
 		** that determines what we are
 		*/
 		sendmp = rbuf->fmp;
-		rbuf->m_pack = rbuf->fmp = NULL;
 
 		if (sendmp != NULL) {  /* secondary frag */
+			rbuf->m_pack = rbuf->fmp = NULL;
 			mp->m_flags &= ~M_PKTHDR;
 			sendmp->m_pkthdr.len += mp->m_len;
 		} else {
+			/*
+			 * Optimize. This might be a small packet,
+			 * maybe just a TCP ACK. Do a fast copy that
+			 * is cache aligned into a new mbuf, and
+			 * leave the old mbuf+cluster for re-use.
+			 */
+			if (eop && plen <= IXGBE_RX_COPY_LEN) {
+				prefetch(mp->m_data);
+				sendmp = m_gethdr(M_DONTWAIT, MT_DATA);
+				if (sendmp != NULL) {
+					sendmp->m_data +=
+					    IXGBE_RX_COPY_ALIGN;
+					ixgbe_bcopy(mp->m_data,
+					    sendmp->m_data, plen);
+					sendmp->m_len = plen;
+					rxr->rx_copies++;
+					rbuf->flags |= IXGBE_RX_COPY;
+				}
+			}
+			if (sendmp == NULL) {
+				rbuf->m_pack = rbuf->fmp = NULL;
+				sendmp = mp;
+			}
+
 			/* first desc of a non-ps chain */
-			sendmp = mp;
 			sendmp->m_flags |= M_PKTHDR;
 			sendmp->m_pkthdr.len = mp->m_len;
 			if (staterr & IXGBE_RXD_STAT_VP) {
@@ -5476,6 +5509,9 @@ ixgbe_add_hw_stats(struct adapter *adapter)
 		SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "rx_bytes",
 				CTLFLAG_RD, &rxr->rx_bytes,
 				"Queue Bytes Received");
+		SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "rx_copies",
+				CTLFLAG_RD, &rxr->rx_copies,
+				"Copied RX Frames");
 		SYSCTL_ADD_INT(ctx, queue_list, OID_AUTO, "lro_queued",
 				CTLFLAG_RD, &lro->lro_queued, 0,
 				"LRO Queued");
diff --git a/sys/dev/ixgbe/ixgbe.h b/sys/dev/ixgbe/ixgbe.h
index 8e8b24dc44a2..d167306172f1 100644
--- a/sys/dev/ixgbe/ixgbe.h
+++ b/sys/dev/ixgbe/ixgbe.h
@@ -154,6 +154,19 @@
 #define IXGBE_FC_HI		0x20000
 #define IXGBE_FC_LO		0x10000
 
+/*
+ * Used for optimizing small rx mbufs. Effort is made to keep the copy
+ * small and aligned for the CPU L1 cache.
+ *
+ * MHLEN is typically 168 bytes, giving us 8-byte alignment. Getting
+ * 32 byte alignment needed for the fast bcopy results in 8 bytes being
+ * wasted. Getting 64 byte alignment, which _should_ be ideal for
+ * modern Intel CPUs, results in 40 bytes wasted and a significant drop
+ * in observed efficiency of the optimization, 97.9% -> 81.8%.
+ */
+#define IXGBE_RX_COPY_LEN	160
+#define IXGBE_RX_COPY_ALIGN	(MHLEN - IXGBE_RX_COPY_LEN)
+
 /* Keep older OS drivers building... */
 #if !defined(SYSCTL_ADD_UQUAD)
 #define SYSCTL_ADD_UQUAD SYSCTL_ADD_QUAD
@@ -245,6 +258,9 @@ struct ixgbe_rx_buf {
 	struct mbuf	*fmp;
 	bus_dmamap_t	hmap;
 	bus_dmamap_t	pmap;
+	u_int		flags;
+#define IXGBE_RX_COPY 0x01
+	uint64_t	paddr;
 };
 
 /*
@@ -339,6 +355,7 @@ struct rx_ring {
 	/* Soft stats */
 	u64			rx_irq;
 	u64			rx_split_packets;
+	u64			rx_copies;
 	u64			rx_packets;
 	u64			rx_bytes;
 	u64			rx_discarded;
diff --git a/sys/dev/ixgbe/ixgbe_osdep.h b/sys/dev/ixgbe/ixgbe_osdep.h
index 0b0c177ca154..524c3548465f 100644
--- a/sys/dev/ixgbe/ixgbe_osdep.h
+++ b/sys/dev/ixgbe/ixgbe_osdep.h
@@ -143,6 +143,25 @@ void prefetch(void *x)
 #define prefetch(x)
 #endif
 
+/*
+ * Optimized bcopy thanks to Luigi Rizzo's investigative work. Assumes
+ * non-overlapping regions and 32-byte padding on both src and dst.
+ */
+static __inline int
+ixgbe_bcopy(void *_src, void *_dst, int l)
+{
+	uint64_t *src = _src;
+	uint64_t *dst = _dst;
+
+	for (; l > 0; l -= 32) {
+		*dst++ = *src++;
+		*dst++ = *src++;
+		*dst++ = *src++;
+		*dst++ = *src++;
+	}
+	return (0);
+}
+
 struct ixgbe_osdep
 {
 	bus_space_tag_t    mem_bus_space_tag;
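
As a standalone illustration of the IXGBE_RX_COPY_ALIGN arithmetic above, the sketch below (not part of the patch; MHLEN_ASSUMED, RX_COPY_LEN, copy32, hdr, and cluster are hypothetical stand-ins) shows why a copy of at most IXGBE_RX_COPY_LEN bytes, rounded up to a multiple of 32 by the four-word loop, still ends inside an MHLEN-sized header mbuf once the destination pointer is advanced by MHLEN - IXGBE_RX_COPY_LEN. It assumes MHLEN == 168, the value the ixgbe.h comment cites as typical; the 32-byte cache alignment of the real destination follows from the mbuf header layout, which the sketch does not reproduce.

/*
 * Standalone sketch, not part of the driver; all names are hypothetical
 * stand-ins and MHLEN is assumed to be 168 as in the ixgbe.h comment.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MHLEN_ASSUMED	168		/* "typically 168 bytes" per the comment */
#define RX_COPY_LEN	160		/* mirrors IXGBE_RX_COPY_LEN */
#define RX_COPY_ALIGN	(MHLEN_ASSUMED - RX_COPY_LEN)	/* == 8 */

/*
 * Same shape as ixgbe_bcopy(): four 64-bit words per iteration, so the
 * amount actually moved is the length rounded up to a multiple of 32.
 */
static void
copy32(const void *_src, void *_dst, int l)
{
	const uint64_t *src = _src;
	uint64_t *dst = _dst;

	for (; l > 0; l -= 32) {
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
	}
}

int
main(void)
{
	static uint64_t cluster[2048 / 8];	/* stand-in for the rx cluster */
	static uint64_t hdr[MHLEN_ASSUMED / 8];	/* stand-in for sendmp's data area */
	int plen = 52;				/* e.g. a bare TCP ACK */
	int moved = (plen + 31) & ~31;		/* what the loop really copies */

	memset(cluster, 0xab, plen);
	/* The driver offsets the destination first: sendmp->m_data += IXGBE_RX_COPY_ALIGN */
	copy32(cluster, (uint8_t *)hdr + RX_COPY_ALIGN, plen);

	/* RX_COPY_ALIGN + moved <= MHLEN_ASSUMED for any plen <= RX_COPY_LEN */
	printf("plen %d, moved %d, end offset %d of %d\n",
	    plen, moved, RX_COPY_ALIGN + moved, MHLEN_ASSUMED);
	return (0);
}

With plen at its maximum of 160, the loop moves exactly 160 bytes and the copy ends at byte offset 168, so nothing spills past the header mbuf's data area; this works out because IXGBE_RX_COPY_LEN is itself a multiple of 32 and the offset is derived from MHLEN rather than hard-coded.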