From cfc0969ad4417f17f8c5226441ade35b2ba4ad3e Mon Sep 17 00:00:00 2001
From: Scott Long
Date: Fri, 31 Aug 2012 10:07:38 +0000
Subject: [PATCH] Heavily optimize the case of small RX packets of 160 bytes or less.

For this case, allocate a plain mbuf and copy the frame into it, then
send the copy up the stack, leaving the original mbuf+cluster in place
in the receive ring for immediate re-use.  This saves a trip through
2 of the 3 zones of the compound mbuf allocator, a trip through busdma,
and a trip through 1 of the 3 mbuf destructors.  For our load at
Netflix, this can lower CPU consumption by as much as 20%.

The copy algorithm is based on investigative work from Luigi Rizzo
earlier in the year.

Reviewed by:	jfv
Obtained from:	Netflix
---
 sys/dev/ixgbe/ixgbe.c       | 68 ++++++++++++++++++++++++++++---------
 sys/dev/ixgbe/ixgbe.h       | 17 ++++++++++
 sys/dev/ixgbe/ixgbe_osdep.h | 19 +++++++++++
 3 files changed, 88 insertions(+), 16 deletions(-)

diff --git a/sys/dev/ixgbe/ixgbe.c b/sys/dev/ixgbe/ixgbe.c
index 5da5fec651ed..911249da8c30 100644
--- a/sys/dev/ixgbe/ixgbe.c
+++ b/sys/dev/ixgbe/ixgbe.c
@@ -3734,21 +3734,30 @@ ixgbe_refresh_mbufs(struct rx_ring *rxr, int limit)
 		mp = rxbuf->m_pack;
 		mp->m_pkthdr.len = mp->m_len = adapter->rx_mbuf_sz;
-		/* Get the memory mapping */
-		error = bus_dmamap_load_mbuf_sg(rxr->ptag,
-		    rxbuf->pmap, mp, pseg, &nsegs, BUS_DMA_NOWAIT);
-		if (error != 0) {
-			printf("Refresh mbufs: payload dmamap load"
-			    " failure - %d\n", error);
-			m_free(mp);
-			rxbuf->m_pack = NULL;
-			goto update;
+
+		/* If we're dealing with an mbuf that was copied rather
+		 * than replaced, there's no need to go through busdma.
+		 */
+		if ((rxbuf->flags & IXGBE_RX_COPY) == 0) {
+			/* Get the memory mapping */
+			error = bus_dmamap_load_mbuf_sg(rxr->ptag,
+			    rxbuf->pmap, mp, pseg, &nsegs, BUS_DMA_NOWAIT);
+			if (error != 0) {
+				printf("Refresh mbufs: payload dmamap load"
+				    " failure - %d\n", error);
+				m_free(mp);
+				rxbuf->m_pack = NULL;
+				goto update;
+			}
+			rxbuf->m_pack = mp;
+			bus_dmamap_sync(rxr->ptag, rxbuf->pmap,
+			    BUS_DMASYNC_PREREAD);
+			rxbuf->paddr = rxr->rx_base[i].read.pkt_addr =
+			    htole64(pseg[0].ds_addr);
+		} else {
+			rxr->rx_base[i].read.pkt_addr = rxbuf->paddr;
+			rxbuf->flags &= ~IXGBE_RX_COPY;
 		}
-		rxbuf->m_pack = mp;
-		bus_dmamap_sync(rxr->ptag, rxbuf->pmap,
-		    BUS_DMASYNC_PREREAD);
-		rxr->rx_base[i].read.pkt_addr =
-		    htole64(pseg[0].ds_addr);
 		refreshed = TRUE;
 		/* Next is precalculated */
@@ -4061,6 +4070,7 @@ ixgbe_setup_receive_ring(struct rx_ring *rxr)
 	rxr->next_to_refresh = 0;
 	rxr->lro_enabled = FALSE;
 	rxr->rx_split_packets = 0;
+	rxr->rx_copies = 0;
 	rxr->rx_bytes = 0;
 	rxr->discard = FALSE;
 	rxr->vtag_strip = FALSE;
@@ -4618,14 +4628,37 @@ ixgbe_rxeof(struct ix_queue *que, int count)
 		** that determines what we are
 		*/
 		sendmp = rbuf->fmp;
-		rbuf->m_pack = rbuf->fmp = NULL;
 
 		if (sendmp != NULL) {  /* secondary frag */
+			rbuf->m_pack = rbuf->fmp = NULL;
 			mp->m_flags &= ~M_PKTHDR;
 			sendmp->m_pkthdr.len += mp->m_len;
 		} else {
+			/*
+			 * Optimize. This might be a small packet,
+			 * maybe just a TCP ACK. Do a fast copy that
+			 * is cache aligned into a new mbuf, and
+			 * leave the old mbuf+cluster for re-use.
+			 */
+			if (eop && plen <= IXGBE_RX_COPY_LEN) {
+				prefetch(mp->m_data);
+				sendmp = m_gethdr(M_DONTWAIT, MT_DATA);
+				if (sendmp != NULL) {
+					sendmp->m_data +=
+					    IXGBE_RX_COPY_ALIGN;
+					ixgbe_bcopy(mp->m_data,
+					    sendmp->m_data, plen);
+					sendmp->m_len = plen;
+					rxr->rx_copies++;
+					rbuf->flags |= IXGBE_RX_COPY;
+				}
+			}
+			if (sendmp == NULL) {
+				rbuf->m_pack = rbuf->fmp = NULL;
+				sendmp = mp;
+			}
+
 			/* first desc of a non-ps chain */
-			sendmp = mp;
 			sendmp->m_flags |= M_PKTHDR;
 			sendmp->m_pkthdr.len = mp->m_len;
 			if (staterr & IXGBE_RXD_STAT_VP) {
@@ -5476,6 +5509,9 @@ ixgbe_add_hw_stats(struct adapter *adapter)
 		SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "rx_bytes",
 				CTLFLAG_RD, &rxr->rx_bytes,
 				"Queue Bytes Received");
+		SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "rx_copies",
+				CTLFLAG_RD, &rxr->rx_copies,
+				"Copied RX Frames");
 		SYSCTL_ADD_INT(ctx, queue_list, OID_AUTO, "lro_queued",
 				CTLFLAG_RD, &lro->lro_queued, 0,
 				"LRO Queued");
diff --git a/sys/dev/ixgbe/ixgbe.h b/sys/dev/ixgbe/ixgbe.h
index 8e8b24dc44a2..d167306172f1 100644
--- a/sys/dev/ixgbe/ixgbe.h
+++ b/sys/dev/ixgbe/ixgbe.h
@@ -154,6 +154,19 @@
 #define IXGBE_FC_HI		0x20000
 #define IXGBE_FC_LO		0x10000
 
+/*
+ * Used for optimizing small rx mbufs. Effort is made to keep the copy
+ * small and aligned for the CPU L1 cache.
+ *
+ * MHLEN is typically 168 bytes, giving us 8-byte alignment. Getting
+ * 32 byte alignment needed for the fast bcopy results in 8 bytes being
+ * wasted. Getting 64 byte alignment, which _should_ be ideal for
+ * modern Intel CPUs, results in 40 bytes wasted and a significant drop
+ * in observed efficiency of the optimization, 97.9% -> 81.8%.
+ */
+#define IXGBE_RX_COPY_LEN	160
+#define IXGBE_RX_COPY_ALIGN	(MHLEN - IXGBE_RX_COPY_LEN)
+
 /* Keep older OS drivers building... */
 #if !defined(SYSCTL_ADD_UQUAD)
 #define SYSCTL_ADD_UQUAD SYSCTL_ADD_QUAD
@@ -245,6 +258,9 @@ struct ixgbe_rx_buf {
 	struct mbuf	*fmp;
 	bus_dmamap_t	hmap;
 	bus_dmamap_t	pmap;
+	u_int		flags;
+#define IXGBE_RX_COPY 0x01
+	uint64_t	paddr;
 };
 
 /*
@@ -339,6 +355,7 @@ struct rx_ring {
 	/* Soft stats */
 	u64			rx_irq;
 	u64			rx_split_packets;
+	u64			rx_copies;
 	u64			rx_packets;
 	u64			rx_bytes;
 	u64			rx_discarded;
diff --git a/sys/dev/ixgbe/ixgbe_osdep.h b/sys/dev/ixgbe/ixgbe_osdep.h
index 0b0c177ca154..524c3548465f 100644
--- a/sys/dev/ixgbe/ixgbe_osdep.h
+++ b/sys/dev/ixgbe/ixgbe_osdep.h
@@ -143,6 +143,25 @@ void prefetch(void *x)
 #define prefetch(x)
 #endif
 
+/*
+ * Optimized bcopy thanks to Luigi Rizzo's investigative work. Assumes
+ * non-overlapping regions and 32-byte padding on both src and dst.
+ */
+static __inline int
+ixgbe_bcopy(void *_src, void *_dst, int l)
+{
+	uint64_t *src = _src;
+	uint64_t *dst = _dst;
+
+	for (; l > 0; l -= 32) {
+		*dst++ = *src++;
+		*dst++ = *src++;
+		*dst++ = *src++;
+		*dst++ = *src++;
+	}
+	return (0);
+}
+
 struct ixgbe_osdep
 {
 	bus_space_tag_t    mem_bus_space_tag;
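
As a standalone illustration of the IXGBE_RX_COPY_ALIGN arithmetic above, the sketch below (not part of the patch; MHLEN_ASSUMED, RX_COPY_LEN, copy32, hdr, and cluster are hypothetical stand-ins) shows why a copy of at most IXGBE_RX_COPY_LEN bytes, rounded up to a multiple of 32 by the four-word loop, still ends inside an MHLEN-sized header mbuf once the destination pointer is advanced by MHLEN - IXGBE_RX_COPY_LEN. It assumes MHLEN == 168, the value the ixgbe.h comment cites as typical; the 32-byte cache alignment of the real destination follows from the mbuf header layout, which the sketch does not reproduce.

/*
 * Standalone sketch, not part of the driver; all names are hypothetical
 * stand-ins and MHLEN is assumed to be 168 as in the ixgbe.h comment.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MHLEN_ASSUMED	168		/* "typically 168 bytes" per the comment */
#define RX_COPY_LEN	160		/* mirrors IXGBE_RX_COPY_LEN */
#define RX_COPY_ALIGN	(MHLEN_ASSUMED - RX_COPY_LEN)	/* == 8 */

/*
 * Same shape as ixgbe_bcopy(): four 64-bit words per iteration, so the
 * amount actually moved is the length rounded up to a multiple of 32.
 */
static void
copy32(const void *_src, void *_dst, int l)
{
	const uint64_t *src = _src;
	uint64_t *dst = _dst;

	for (; l > 0; l -= 32) {
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
	}
}

int
main(void)
{
	static uint64_t cluster[2048 / 8];	/* stand-in for the rx cluster */
	static uint64_t hdr[MHLEN_ASSUMED / 8];	/* stand-in for sendmp's data area */
	int plen = 52;				/* e.g. a bare TCP ACK */
	int moved = (plen + 31) & ~31;		/* what the loop really copies */

	memset(cluster, 0xab, plen);
	/* The driver offsets the destination first: sendmp->m_data += IXGBE_RX_COPY_ALIGN */
	copy32(cluster, (uint8_t *)hdr + RX_COPY_ALIGN, plen);

	/* RX_COPY_ALIGN + moved <= MHLEN_ASSUMED for any plen <= RX_COPY_LEN */
	printf("plen %d, moved %d, end offset %d of %d\n",
	    plen, moved, RX_COPY_ALIGN + moved, MHLEN_ASSUMED);
	return (0);
}

With plen at its maximum of 160, the loop moves exactly 160 bytes and the copy ends at byte offset 168, so nothing spills past the header mbuf's data area; this works out because IXGBE_RX_COPY_LEN is itself a multiple of 32 and the offset is derived from MHLEN rather than hard-coded.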