1
0
mirror of https://git.FreeBSD.org/src.git synced 2025-01-01 12:19:28 +00:00
freebsd/sys/dev/sfxge/sfxge_rx.c
Philip Paeps e948693ed7 Add the sfxge(4) device driver, providing support for 10Gb Ethernet adapters
based on Solarflare SFC9000 family controllers.  The driver supports jumbo
frames, transmit/receive checksum offload, TCP Segmentation Offload (TSO),
Large Receive Offload (LRO), VLAN checksum offload, VLAN TSO, and Receive Side
Scaling (RSS) using MSI-X interrupts.

This work was sponsored by Solarflare Communications, Inc.

My sincere thanks to Ben Hutchings for doing a lot of the hard work!

Sponsored by:	Solarflare Communications, Inc.
MFC after:	3 weeks
2011-11-16 17:11:13 +00:00

1234 lines
30 KiB
C

/*-
* Copyright (c) 2010-2011 Solarflare Communications, Inc.
* All rights reserved.
*
* This software was developed in part by Philip Paeps under contract for
* Solarflare Communications, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/limits.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <machine/in_cksum.h>
#include "common/efx.h"
#include "sfxge.h"
#include "sfxge_rx.h"
/* Fill level (in descriptors) below which sfxge_rx_qcomplete() tops the
 * receive ring back up: 90% of the usable ring size.
 */
#define RX_REFILL_THRESHOLD (EFX_RXQ_LIMIT(SFXGE_NDESCS) * 9 / 10)
/* Half of RX_REFILL_THRESHOLD. */
#define RX_REFILL_THRESHOLD_2 (RX_REFILL_THRESHOLD / 2)
/* Size of the LRO hash table. Must be a power of 2 (asserted in
 * sfxge_lro_init()). A larger table means we can accelerate a larger
 * number of streams.
 */
static unsigned lro_table_size = 128;
/* Maximum length of a hash chain. If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO. The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;
/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO. The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;
/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK.
 * The low bits of l2_id carry the VLAN ID (see sfxge_lro()).
 */
#define SFXGE_LRO_L2_ID_VLAN 0x4000
#define SFXGE_LRO_L2_ID_IPV6 0x8000
/* Non-zero when the connection's packets carry a VLAN tag. */
#define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
/* Non-zero when the connection is TCP/IPv4 (i.e. the IPv6 flag is clear). */
#define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
/*
 * Compare two IPv6 addresses for equality without a data-dependent
 * sequence of comparisons.
 *
 * Returns zero when the two addresses are identical and a non-zero value
 * otherwise.  Only the zero/non-zero distinction is meaningful; the
 * callers in this file OR the results for source and destination together.
 *
 * The addresses normally live inside received packet data, where they are
 * not guaranteed to be 64-bit aligned, so the bytes are copied out with
 * memcpy() instead of being read through a cast pointer.  Fixed-size
 * memcpy() calls like these are typically lowered to plain loads.
 */
static __inline unsigned long ipv6_addr_cmp(const struct in6_addr *left,
    const struct in6_addr *right)
{
	uint64_t lhs[2];
	uint64_t rhs[2];

	/* An IPv6 address is 128 bits, i.e. exactly two 64-bit words. */
	memcpy(lhs, left, sizeof(lhs));
	memcpy(rhs, right, sizeof(rhs));

	/* Fold to 0/1 so no difference bits are lost where long is 32 bits. */
	return (((lhs[0] ^ rhs[0]) | (lhs[1] ^ rhs[1])) != 0);
}
/*
 * Record that the flush of this receive queue has completed.
 * sfxge_rx_qstop() polls rxq->flush_state while waiting for a flush.
 */
void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{
rxq->flush_state = SFXGE_FLUSH_DONE;
}
/*
 * Record that the flush of this receive queue has failed.
 * sfxge_rx_qstop() re-issues the flush when it observes this state.
 */
void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{
rxq->flush_state = SFXGE_FLUSH_FAILED;
}
/*
 * 40-byte Toeplitz hash key.  sfxge_rx_start() passes it to
 * efx_rx_scale_toeplitz_ipv4_key_set() when configuring receive scaling.
 */
static uint8_t toep_key[] = {
0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
/*
 * Callout handler armed by sfxge_rx_schedule_refill().
 *
 * 'arg' is the receive queue.  Posts a software "refill" event, tagged
 * with the queue index, to the event queue that shares that index.
 */
static void
sfxge_rx_post_refill(void *arg)
{
	struct sfxge_rxq *rxq = arg;
	unsigned int qid = rxq->index;
	struct sfxge_evq *evq = rxq->sc->evq[qid];
	uint16_t event = SFXGE_MAGIC_RX_QREFILL | qid;

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	efx_ev_qpost(evq->common, event);
}
/*
 * Arm the refill callout for this receive queue.
 *
 * A first attempt (retrying == false) is scheduled 100 ms out.  Each
 * repeated attempt doubles the previous delay, capped at 10 seconds,
 * since a persistent shortage probably needs the administrator to raise
 * the pool limit.
 */
static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
	if (!retrying) {
		rxq->refill_delay = hz / 10;
	} else {
		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	}

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
	    sfxge_rx_post_refill, rxq);
}
/*
 * Allocate a packet-header mbuf together with a receive buffer taken from
 * sc->rx_buffer_zone.  Neither allocation sleeps.
 *
 * Returns the mbuf, or NULL if either allocation fails (in which case
 * nothing is left allocated).
 */
static inline struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
{
	struct mb_args args;
	struct mbuf *m;

	/* Allocate mbuf structure */
	args.flags = M_PKTHDR;
	args.type = MT_DATA;
	m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_DONTWAIT);
	if (m == NULL)
		return NULL;

	/* Allocate (and attach) packet buffer */
	if (uma_zalloc_arg(sc->rx_buffer_zone, m, M_DONTWAIT) == NULL) {
		uma_zfree(zone_mbuf, m);
		return NULL;
	}

	return m;
}
/* Maximum number of buffer addresses passed to efx_rx_qpost() in one call. */
#define SFXGE_REFILL_BATCH 64

/*
 * Add receive buffers to a started receive queue.
 *
 * rxq      - queue to fill; the matching event queue lock must be held
 *            (asserted).
 * target   - maximum number of buffers to add; the count is further
 *            limited by the free space left in the ring.
 * retrying - passed on to sfxge_rx_schedule_refill() to select its
 *            back-off behaviour if an allocation fails.
 *
 * Buffers are posted to the common code in batches of up to
 * SFXGE_REFILL_BATCH.  If an mbuf allocation fails the fill stops early
 * and a delayed retry is scheduled; a retry is scheduled only in that
 * case.
 */
static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;
	int ntodo;
	boolean_t alloc_failed;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	mtx_assert(&evq->lock, MA_OWNED);

	if (rxq->init_state != SFXGE_RXQ_STARTED)
		return;

	/* Number of descriptors currently owned by the hardware. */
	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(SFXGE_NDESCS),
	    ("rxfill > EFX_RXQ_LIMIT(SFXGE_NDESCS)"));
	ntodo = min(EFX_RXQ_LIMIT(SFXGE_NDESCS) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(SFXGE_NDESCS),
	    ("ntodo > EFX_RQX_LIMIT(SFXGE_NDESCS)"));

	if (ntodo == 0)
		return;

	batch = 0;
	alloc_failed = B_FALSE;
	mblksize = sc->rx_buffer_size;
	while (ntodo-- > 0) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;
		struct mbuf *m;

		id = (rxq->added + batch) & (SFXGE_NDESCS - 1);
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
		if (m == NULL) {
			/*
			 * Remember the failure explicitly: the loop counter
			 * cannot tell "ran to completion" apart from "failed
			 * on the final buffer".
			 */
			alloc_failed = B_TRUE;
			break;
		}
		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;

		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);
			rxq->added += batch;
			batch = 0;
		}
	}

	/* Come back later if the buffer pool ran dry. */
	if (alloc_failed)
		sfxge_rx_schedule_refill(rxq, retrying);

	/* Post whatever is left in the final, partial batch. */
	if (batch != 0) {
		efx_rx_qpost(rxq->common, addr, mblksize, batch,
		    rxq->completed, rxq->added);
		rxq->added += batch;
	}

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
	    BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added);
}
/*
 * Retry filling a receive queue up to its limit.  Does nothing unless
 * the queue is started.  The fill is flagged as a retry, so a further
 * allocation failure backs off the next attempt.
 */
void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{
	/* Make sure the queue is full */
	if (rxq->init_state == SFXGE_RXQ_STARTED)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(SFXGE_NDESCS), B_TRUE);
}
/*
 * Hand a fully prepared mbuf to the network stack through the
 * interface's if_input routine.  Fills in the receiving interface, the
 * header pointer (current start of data) and a checksum value of 0xffff
 * before the hand-off.
 */
static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
	struct ifnet *ifp;

	ifp = sc->ifnet;

	m->m_pkthdr.csum_data = 0xffff;
	m->m_pkthdr.header = m->m_data;
	m->m_pkthdr.rcvif = ifp;

	(*ifp->if_input)(ifp, m);
}
/*
 * Deliver the packet held by a software descriptor straight to the
 * stack (no LRO).
 *
 * Translates the hardware checksum flags, optionally records the
 * hardware flow hash, strips the receive prefix from the front of the
 * data, and passes the mbuf up.  The descriptor is left empty
 * (mbuf == NULL, flags == EFX_DISCARD) afterwards.
 */
static void
sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
{
	struct mbuf *m = rx_desc->mbuf;
	int csum_flags = 0;

	/* Convert checksum flags */
	if (rx_desc->flags & EFX_CKSUM_IPV4)
		csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID;
	if (rx_desc->flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

#ifdef SFXGE_HAVE_MQ
	/* The hash covers a 4-tuple for TCP only */
	if (rx_desc->flags & EFX_PKT_TCP) {
		m->m_pkthdr.flowid = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
		    mtod(m, uint8_t *));
		m->m_flags |= M_FLOWID;
	}
#endif

	/* Skip over the receive prefix; what remains is the frame. */
	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;

	__sfxge_rx_deliver(sc, m);

	/* The stack owns the mbuf now. */
	rx_desc->mbuf = NULL;
	rx_desc->flags = EFX_DISCARD;
}
/*
 * Deliver the coalesced mbuf chain held for connection 'c' to the stack.
 *
 * While a chain is being built the IP length field of its first packet
 * is kept in host byte order (see sfxge_lro_start() and
 * sfxge_lro_merge()); it is converted back to network order here.  The
 * window, ACK number and (when the option length matches) the TCP
 * options are taken from the most recently merged segment, c->th_last.
 * Afterwards the connection holds no chain and is marked as delivered.
 */
static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
struct sfxge_softc *sc = st->sc;
struct mbuf *m = c->mbuf;
struct tcphdr *c_th;
int csum_flags;
KASSERT(m, ("no mbuf to deliver"));
++st->n_bursts;
/* Finish off packet munging and recalculate IP header checksum. */
if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
struct ip *iph = c->nh;
iph->ip_len = htons(iph->ip_len);
/* The length changed, so the IPv4 header checksum must be redone. */
iph->ip_sum = 0;
iph->ip_sum = in_cksum_hdr(iph);
c_th = (struct tcphdr *)(iph + 1);
csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
CSUM_IP_CHECKED | CSUM_IP_VALID);
} else {
struct ip6_hdr *iph = c->nh;
iph->ip6_plen = htons(iph->ip6_plen);
c_th = (struct tcphdr *)(iph + 1);
csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
}
/* Present the newest window and ACK in the coalesced header. */
c_th->th_win = c->th_last->th_win;
c_th->th_ack = c->th_last->th_ack;
if (c_th->th_off == c->th_last->th_off) {
/* Copy TCP options (take care to avoid going negative). */
int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
memcpy(c_th + 1, c->th_last + 1, optlen);
}
#ifdef SFXGE_HAVE_MQ
m->m_pkthdr.flowid = c->conn_hash;
m->m_flags |= M_FLOWID;
#endif
m->m_pkthdr.csum_flags = csum_flags;
__sfxge_rx_deliver(sc, m);
c->mbuf = NULL;
c->delivered = 1;
}
/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
unsigned bucket;
KASSERT(!c->mbuf, ("found orphaned mbuf"));
if (c->next_buf.mbuf) {
sfxge_rx_deliver(rxq->sc, &c->next_buf);
LIST_REMOVE(c, active_link);
}
bucket = c->conn_hash & rxq->lro.conns_mask;
KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
--rxq->lro.conns_n[bucket];
TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}
/* Stop tracking connections that have gone idle in order to keep hash
* chains short.
*/
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
struct sfxge_lro_conn *c;
unsigned i;
KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
("found active connections"));
rxq->lro.last_purge_ticks = now;
for (i = 0; i <= rxq->lro.conns_mask; ++i) {
if (TAILQ_EMPTY(&rxq->lro.conns[i]))
continue;
c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
if (now - c->last_pkt_ticks > lro_idle_ticks) {
++rxq->lro.n_drop_idle;
sfxge_lro_drop(rxq, c);
}
}
}
/*
 * Append a payload-only mbuf to the chain held for connection 'c'.
 *
 * 'mbuf' must already have had its headers trimmed off (the caller
 * adjusts m_data/m_len) and must not be chained.  'th' is the TCP header
 * of the segment being merged; it is remembered as c->th_last so that
 * sfxge_lro_deliver() can pick up its window, ACK and options.  The IP
 * length of the chain's first packet, held in host byte order while the
 * chain is open, grows by the merged payload length.
 */
static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
struct mbuf *mbuf, struct tcphdr *th)
{
struct tcphdr *c_th;
/* Tack the new mbuf onto the chain. */
KASSERT(!mbuf->m_next, ("mbuf already chained"));
c->mbuf_tail->m_next = mbuf;
c->mbuf_tail = mbuf;
/* Increase length appropriately */
c->mbuf->m_pkthdr.len += mbuf->m_len;
/* Update the connection state flags */
if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
struct ip *iph = c->nh;
iph->ip_len += mbuf->m_len;
c_th = (struct tcphdr *)(iph + 1);
} else {
struct ip6_hdr *iph = c->nh;
iph->ip6_plen += mbuf->m_len;
c_th = (struct tcphdr *)(iph + 1);
}
/* Carry a PUSH flag from any merged segment into the chain header. */
c_th->th_flags |= (th->th_flags & TH_PUSH);
c->th_last = th;
++st->n_merges;
/* Pass packet up now if another segment could overflow the IP
 * length.
 * NOTE(review): 9200 is presumably the largest segment expected in
 * one receive buffer -- confirm against the supported MTU.
 */
if (c->mbuf->m_pkthdr.len > 65536 - 9200)
sfxge_lro_deliver(st, c);
}
/*
 * Begin a new coalesced chain for connection 'c' with 'mbuf' as its
 * first (header-carrying) packet.
 *
 * 'nh' points at the packet's IP or IPv6 header and 'th' at its TCP
 * header.  The IP length field is switched to host byte order so that
 * sfxge_lro_merge() can add to it directly; sfxge_lro_deliver() converts
 * it back.
 */
static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
    struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
	/* Start the chain */
	mbuf->m_pkthdr.len = mbuf->m_len;
	c->mbuf = mbuf;
	c->mbuf_tail = mbuf;
	c->nh = nh;
	c->th_last = th;

	/* Mangle header fields for later processing */
	if (!SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip6_hdr *ip6 = nh;

		ip6->ip6_plen = ntohs(ip6->ip6_plen);
	} else {
		struct ip *ip4 = nh;

		ip4->ip_len = ntohs(ip4->ip_len);
	}
}
/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf). Return a flag
 * indicating whether the connection is still active for LRO purposes.
 *
 * Returns 1 when the connection is still being tracked; in that case the
 * buffered packet has either been merged into / started a chain, or been
 * delivered on its own.  Returns 0 when the connection was dropped via
 * sfxge_lro_drop(), which also delivers the buffered packet and removes
 * the connection from the active list.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
char *eh = c->next_eh;
int data_length, hdr_length, dont_merge;
unsigned th_seq, pkt_length;
struct tcphdr *th;
unsigned now;
/* pkt_length: frame length measured from the Ethernet header, as
 * claimed by the IP header.  IPv4's length field covers the IP header
 * itself; IPv6's covers only what follows the fixed header.
 */
if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
struct ip *iph = c->next_nh;
th = (struct tcphdr *)(iph + 1);
pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
} else {
struct ip6_hdr *iph = c->next_nh;
th = (struct tcphdr *)(iph + 1);
pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
}
/* hdr_length: bytes from the Ethernet header to the end of the TCP
 * header.  data_length: TCP payload bytes, bounded by what was
 * actually received in the buffer (minus the receive prefix).
 */
hdr_length = (char *) th + th->th_off * 4 - eh;
data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
hdr_length);
th_seq = ntohl(th->th_seq);
/* Never merge payload-less segments or ones with special flags. */
dont_merge = ((data_length <= 0)
| (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
/* Check for options other than aligned timestamp. */
if (th->th_off != 5) {
const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
if (th->th_off == 8 &&
opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
(TCPOPT_TIMESTAMP << 8) |
TCPOLEN_TIMESTAMP)) {
/* timestamp option -- okay */
} else {
dont_merge = 1;
}
}
if (__predict_false(th_seq != c->next_seq)) {
/* Out-of-order, so start counting again. */
if (c->mbuf)
sfxge_lro_deliver(&rxq->lro, c);
c->n_in_order_pkts -= lro_loss_packets;
c->next_seq = th_seq + data_length;
++rxq->lro.n_misorder;
goto deliver_buf_out;
}
c->next_seq = th_seq + data_length;
/* Give up on connections that have been quiet for too long. */
now = ticks;
if (now - c->last_pkt_ticks > lro_idle_ticks) {
++rxq->lro.n_drop_idle;
if (c->mbuf)
sfxge_lro_deliver(&rxq->lro, c);
sfxge_lro_drop(rxq, c);
return 0;
}
c->last_pkt_ticks = ticks;
if (c->n_in_order_pkts < lro_slow_start_packets) {
/* May be in slow-start, so don't merge. */
++rxq->lro.n_slow_start;
++c->n_in_order_pkts;
goto deliver_buf_out;
}
if (__predict_false(dont_merge)) {
/* Flush what is held so ordering is preserved, then pass this
 * segment up unmerged (or drop the connection if it is closing).
 */
if (c->mbuf)
sfxge_lro_deliver(&rxq->lro, c);
if (th->th_flags & (TH_FIN | TH_RST)) {
++rxq->lro.n_drop_closed;
sfxge_lro_drop(rxq, c);
return 0;
}
goto deliver_buf_out;
}
/* Mergeable: strip the receive prefix from the front of the buffer. */
rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
if (__predict_true(c->mbuf != NULL)) {
/* Remove headers and any padding */
rx_buf->mbuf->m_data += hdr_length;
rx_buf->mbuf->m_len = data_length;
sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
} else {
/* Remove any padding */
rx_buf->mbuf->m_len = pkt_length;
sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
}
/* The mbuf now belongs to the chain, not to the buffered slot. */
rx_buf->mbuf = NULL;
return 1;
deliver_buf_out:
sfxge_rx_deliver(rxq->sc, rx_buf);
return 1;
}
/*
 * Start tracking a new TCP connection for LRO.
 *
 * conn_hash - hardware hash of the packet; selects the hash bucket.
 * l2_id     - VLAN ID plus SFXGE_LRO_L2_ID_* flags.
 * nh        - network header of the triggering packet (not used here).
 * th        - TCP header of the triggering packet; supplies the ports.
 *
 * Silently does nothing when the bucket is already at lro_chain_max or
 * when no connection structure can be obtained.  A structure is reused
 * from the free list when possible, otherwise allocated without
 * sleeping.
 */
static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
    uint16_t l2_id, void *nh, struct tcphdr *th)
{
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {
		++st->n_too_many;
		return;
	}

	c = TAILQ_FIRST(&st->free_conns);
	if (c != NULL) {
		TAILQ_REMOVE(&st->free_conns, c, link);
	} else {
		c = malloc(sizeof(*c), M_SFXGE, M_DONTWAIT);
		if (c == NULL)
			return;
		/* Fresh memory: make sure it holds no packets. */
		c->mbuf = NULL;
		c->next_buf.mbuf = NULL;
	}

	/* Create the connection tracking data */
	c->conn_hash = conn_hash;
	c->l2_id = l2_id;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->delivered = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;
	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has. Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */

	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	++st->conns_n[bucket];
	++st->n_new_stream;
}
/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 *
 * Packets that are not unfragmented TCP over IPv4 (without IP options)
 * or TCP directly over IPv6 are delivered immediately.  For a packet
 * that matches a tracked connection, any packet already buffered for
 * that connection is processed first and this packet then takes its
 * place in c->next_buf; rx_buf is left empty.  A packet that matches no
 * tracked connection is delivered immediately, after an attempt to start
 * tracking its connection for future packets.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
struct sfxge_softc *sc = rxq->sc;
struct mbuf *m = rx_buf->mbuf;
struct ether_header *eh;
struct sfxge_lro_conn *c;
uint16_t l2_id;
uint16_t l3_proto;
void *nh;
struct tcphdr *th;
uint32_t conn_hash;
unsigned bucket;
/* Get the hardware hash */
conn_hash = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
mtod(m, uint8_t *));
/* The Ethernet frame starts after the receive prefix. */
eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
SFXGE_LRO_L2_ID_VLAN;
l3_proto = veh->evl_proto;
nh = veh + 1;
} else {
l2_id = 0;
l3_proto = eh->ether_type;
nh = eh + 1;
}
/* Check whether this is a suitable packet (unfragmented
 * TCP/IPv4 or TCP/IPv6). If so, find the TCP header and
 * length, and compute a hash if necessary. If not, return.
 */
if (l3_proto == htons(ETHERTYPE_IP)) {
struct ip *iph = nh;
/* Non-zero if not TCP, if the header carries IP options, or if
 * the packet is a fragment; the tests are OR-ed together rather
 * than branched on individually.
 */
if ((iph->ip_p - IPPROTO_TCP) |
(iph->ip_hl - (sizeof(*iph) >> 2u)) |
(iph->ip_off & htons(IP_MF | IP_OFFMASK)))
goto deliver_now;
th = (struct tcphdr *)(iph + 1);
} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
struct ip6_hdr *iph = nh;
if (iph->ip6_nxt != IPPROTO_TCP)
goto deliver_now;
l2_id |= SFXGE_LRO_L2_ID_IPV6;
th = (struct tcphdr *)(iph + 1);
} else {
goto deliver_now;
}
bucket = conn_hash & rxq->lro.conns_mask;
TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
continue;
if ((c->source - th->th_sport) | (c->dest - th->th_dport))
continue;
/* The stored addresses are read through c->nh, so they are only
 * compared while the connection holds a chain.
 */
if (c->mbuf) {
if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
struct ip *c_iph, *iph = nh;
c_iph = c->nh;
if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
(c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
continue;
} else {
struct ip6_hdr *c_iph, *iph = nh;
c_iph = c->nh;
if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
continue;
}
}
/* Re-insert at head of list to reduce lookup time. */
TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
if (c->next_buf.mbuf) {
/* Deal with the previously buffered packet first.  A zero
 * return means the connection was dropped, so this packet
 * goes straight to the stack.
 */
if (!sfxge_lro_try_merge(rxq, c))
goto deliver_now;
} else {
LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
active_link);
}
/* Buffer this packet on the connection and empty the caller's
 * descriptor.
 */
c->next_buf = *rx_buf;
c->next_eh = eh;
c->next_nh = nh;
rx_buf->mbuf = NULL;
rx_buf->flags = EFX_DISCARD;
return;
}
/* No match: try to track this connection from now on. */
sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
deliver_now:
sfxge_rx_deliver(sc, rx_buf);
}
static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
struct sfxge_lro_state *st = &rxq->lro;
struct sfxge_lro_conn *c;
unsigned t;
while (!LIST_EMPTY(&st->active_conns)) {
c = LIST_FIRST(&st->active_conns);
if (!c->delivered && c->mbuf)
sfxge_lro_deliver(st, c);
if (sfxge_lro_try_merge(rxq, c)) {
if (c->mbuf)
sfxge_lro_deliver(st, c);
LIST_REMOVE(c, active_link);
}
c->delivered = 0;
}
t = *(volatile int *)&ticks;
if (__predict_false(t != st->last_purge_ticks))
sfxge_lro_purge_idle(rxq, t);
}
/*
 * Process the descriptors completed by the hardware, i.e. those between
 * rxq->completed and rxq->pending.
 *
 * rxq - receive queue; the matching event queue lock must be held
 *       (asserted).
 * eop - true at the end of a poll, in which case all pending LRO state
 *       is flushed.
 *
 * Each good packet is passed to LRO (if enabled on the interface) or
 * directly to the stack.  Delivery is pipelined by one packet: a packet
 * is handed on only once the next one has been examined and prefetched.
 * Packets are freed instead when the queue is not started, when the
 * descriptor is flagged as discard or address mismatch, or when the
 * frame is a driver loopback frame.  Finally the ring is topped up if
 * its fill level has fallen below RX_REFILL_THRESHOLD.
 */
void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
	struct sfxge_softc *sc = rxq->sc;
	int lro_enabled = sc->ifnet->if_capenable & IFCAP_LRO;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int completed;
	unsigned int level;
	struct mbuf *m;
	struct sfxge_rx_sw_desc *prev = NULL;

	index = rxq->index;
	evq = sc->evq[index];

	mtx_assert(&evq->lock, MA_OWNED);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & (SFXGE_NDESCS - 1);
		rx_desc = &rxq->queue[id];
		m = rx_desc->mbuf;

		if (rxq->init_state != SFXGE_RXQ_STARTED)
			goto discard;

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		prefetch_read_many(mtod(m, caddr_t));

		/* Check for loopback packets */
		if (!(rx_desc->flags & EFX_PKT_IPV4) &&
		    !(rx_desc->flags & EFX_PKT_IPV6)) {
			struct ether_header *etherhp;

			/*
			 * The buffer still begins with the receive prefix
			 * at this point (it is stripped on delivery), so
			 * the Ethernet header sits rx_prefix_size bytes in.
			 */
			/*LINTED*/
			etherhp = (struct ether_header *)
			    (mtod(m, caddr_t) + sc->rx_prefix_size);

			if (etherhp->ether_type ==
			    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
				EFSYS_PROBE(loopback);

				rxq->loopback++;
				goto discard;
			}
		}

		/* Pass packet up the stack or into LRO (pipelined) */
		if (prev != NULL) {
			if (lro_enabled)
				sfxge_lro(rxq, prev);
			else
				sfxge_rx_deliver(sc, prev);
		}
		prev = rx_desc;
		continue;

discard:
		/* Return the packet to the pool */
		m_free(m);
		rx_desc->mbuf = NULL;
	}
	rxq->completed = completed;

	/* Fill level is sampled before the last packet is handed on. */
	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	if (prev != NULL) {
		if (lro_enabled)
			sfxge_lro(rxq, prev);
		else
			sfxge_rx_deliver(sc, prev);
	}

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (eop)
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < RX_REFILL_THRESHOLD)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(SFXGE_NDESCS), B_FALSE);
}
/*
 * Stop a started receive queue and release its hardware resources.
 *
 * The queue is moved back to the INITIALIZED state first, so that
 * sfxge_rx_qcomplete() frees rather than delivers anything still in the
 * ring.  The hardware queue is then flushed: the flush state is polled
 * every 100 ms, up to 20 times, with the event queue lock released, and
 * the flush is re-issued whenever it is reported as failed.  If the
 * flush is still pending after the polling loop, the code carries on as
 * if it had completed.  Every buffer that was added is then treated as
 * completed (and therefore freed), the ring counters are reset, and the
 * common-code queue and its buffer table entries are torn down.
 */
static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
struct sfxge_rxq *rxq;
struct sfxge_evq *evq;
unsigned int count;
rxq = sc->rxq[index];
evq = sc->evq[index];
mtx_lock(&evq->lock);
KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
("rxq not started"));
rxq->init_state = SFXGE_RXQ_INITIALIZED;
/* No more delayed refills once the queue is stopping. */
callout_stop(&rxq->refill_callout);
again:
rxq->flush_state = SFXGE_FLUSH_PENDING;
/* Flush the receive queue */
efx_rx_qflush(rxq->common);
/* Drop the lock while waiting for flush_state to change. */
mtx_unlock(&evq->lock);
count = 0;
do {
/* Spin for 100 ms */
DELAY(100000);
if (rxq->flush_state != SFXGE_FLUSH_PENDING)
break;
} while (++count < 20);
mtx_lock(&evq->lock);
if (rxq->flush_state == SFXGE_FLUSH_FAILED)
goto again;
rxq->flush_state = SFXGE_FLUSH_DONE;
/* Treat every posted buffer as completed so that it gets freed. */
rxq->pending = rxq->added;
sfxge_rx_qcomplete(rxq, B_TRUE);
KASSERT(rxq->completed == rxq->pending,
("rxq->completed != rxq->pending"));
rxq->added = 0;
rxq->pending = 0;
rxq->completed = 0;
rxq->loopback = 0;
/* Destroy the common code receive queue. */
efx_rx_qdestroy(rxq->common);
efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
EFX_RXQ_NBUFS(SFXGE_NDESCS));
mtx_unlock(&evq->lock);
}
/*
 * Start an initialized receive queue.
 *
 * Programs the buffer table, creates and enables the common-code receive
 * queue bound to the event queue of the same index, marks the queue as
 * started and fills it with receive buffers.
 *
 * Returns 0 on success or the error from the common code; on failure the
 * buffer table entries are cleared again where they had been set.
 */
static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq = sc->rxq[index];
	struct sfxge_evq *evq = sc->evq[index];
	efsys_mem_t *esmp = &rxq->mem;
	int rc;

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(SFXGE_NDESCS));
	if (rc != 0)
		return rc;

	/* Create the common code receive queue. */
	rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
	    esmp, SFXGE_NDESCS, rxq->buf_base_id, evq->common,
	    &rxq->common);
	if (rc != 0) {
		efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
		    EFX_RXQ_NBUFS(SFXGE_NDESCS));
		return rc;
	}

	mtx_lock(&evq->lock);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);
	rxq->init_state = SFXGE_RXQ_STARTED;

	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(SFXGE_NDESCS), B_FALSE);

	mtx_unlock(&evq->lock);

	return (0);
}
/*
 * Stop all receive queues, highest index first, then clear the cached
 * prefix and buffer sizes and shut down the common-code receive module.
 */
void
sfxge_rx_stop(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr = &sc->intr;
	int index;

	/* Stop the receive queue(s) */
	index = intr->n_alloc;
	while (index > 0) {
		index--;
		sfxge_rx_qstop(sc, index);
	}

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
}
/*
 * Bring up the receive path.
 *
 * Initializes the common-code receive module, derives the receive buffer
 * size from the interface MTU plus the receive prefix, picks the
 * smallest mbuf cluster zone that fits it, programs the RSS indirection
 * table and Toeplitz key, and starts every receive queue.
 *
 * Returns 0 on success.  On failure any queues already started are
 * stopped again, the common-code receive module is shut down, and the
 * error code is returned.
 */
int
sfxge_rx_start(struct sfxge_softc *sc)
{
struct sfxge_intr *intr;
int index;
int rc;
intr = &sc->intr;
/* Initialize the common code receive module. */
if ((rc = efx_rx_init(sc->enp)) != 0)
return (rc);
/* Calculate the receive packet buffer size. */
sc->rx_prefix_size = EFX_RX_PREFIX_SIZE;
sc->rx_buffer_size = (EFX_MAC_PDU(sc->ifnet->if_mtu) +
sc->rx_prefix_size);
/* Select zone for packet buffers */
if (sc->rx_buffer_size <= MCLBYTES)
sc->rx_buffer_zone = zone_clust;
else if (sc->rx_buffer_size <= MJUMPAGESIZE)
sc->rx_buffer_zone = zone_jumbop;
else if (sc->rx_buffer_size <= MJUM9BYTES)
sc->rx_buffer_zone = zone_jumbo9;
else
sc->rx_buffer_zone = zone_jumbo16;
/*
 * Set up the scale table. Enable all hash types and hash insertion.
 * Table entries are assigned to the allocated queues round-robin.
 */
for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
sc->rx_indir_table[index] = index % sc->intr.n_alloc;
if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
SFXGE_RX_SCALE_MAX)) != 0)
goto fail;
/* The result of setting the hash mode is deliberately ignored. */
(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
(1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
(1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);
if ((rc = efx_rx_scale_toeplitz_ipv4_key_set(sc->enp, toep_key,
sizeof(toep_key))) != 0)
goto fail;
/* Start the receive queue(s). */
for (index = 0; index < intr->n_alloc; index++) {
if ((rc = sfxge_rx_qstart(sc, index)) != 0)
goto fail2;
}
return (0);
fail2:
/* Stop only the queues that were started before the failure. */
while (--index >= 0)
sfxge_rx_qstop(sc, index);
fail:
efx_rx_fini(sc->enp);
return (rc);
}
/*
 * Set up the LRO state of a receive queue: allocate the hash table
 * (lro_table_size buckets, which must be a power of two) together with
 * the per-bucket fill counts, and start with empty active and free
 * lists.  The allocations may sleep.
 */
static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	unsigned bucket;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
	    ("lro_table_size must be a power of 2"));
	st->sc = rxq->sc;

	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
	    M_SFXGE, M_WAITOK);
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
	    M_SFXGE, M_WAITOK);

	bucket = 0;
	while (bucket <= st->conns_mask) {
		TAILQ_INIT(&st->conns[bucket]);
		st->conns_n[bucket] = 0;
		++bucket;
	}

	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);
}
/*
 * Release the LRO state of a receive queue.
 *
 * Every tracked connection is dropped onto the free list, the free list
 * is then emptied, and the hash table and fill counts are freed.  Safe
 * to call when sfxge_lro_init() never ran for this queue.  No connection
 * may be active.
 */
static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned bucket;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)
		return;

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (bucket = 0; bucket <= st->conns_mask; ++bucket) {
		while ((c = TAILQ_LAST(&st->conns[bucket],
		    sfxge_lro_tailq)) != NULL)
			sfxge_lro_drop(rxq, c);
	}

	while ((c = TAILQ_FIRST(&st->free_conns)) != NULL) {
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));
		free(c, M_SFXGE);
	}

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);
	/* Marks the state as uninitialized for the check above. */
	st->conns = NULL;
}
/*
 * Destroy an initialized (not started) receive queue: free its software
 * descriptor array, LRO state and DMA memory, clear its slot in the
 * softc, and free the queue structure itself.
 */
static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;
	free(rxq, M_SFXGE);
}
static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
struct sfxge_rxq *rxq;
struct sfxge_evq *evq;
efsys_mem_t *esmp;
int rc;
KASSERT(index < sc->intr.n_alloc, ("index >= %d", sc->intr.n_alloc));
rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
rxq->sc = sc;
rxq->index = index;
sc->rxq[index] = rxq;
esmp = &rxq->mem;
evq = sc->evq[index];
/* Allocate and zero DMA space. */
if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(SFXGE_NDESCS), esmp)) != 0)
return (rc);
(void)memset(esmp->esm_base, 0, EFX_RXQ_SIZE(SFXGE_NDESCS));
/* Allocate buffer table entries. */
sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(SFXGE_NDESCS),
&rxq->buf_base_id);
/* Allocate the context array and the flow table. */
rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * SFXGE_NDESCS,
M_SFXGE, M_WAITOK | M_ZERO);
sfxge_lro_init(rxq);
callout_init(&rxq->refill_callout, B_TRUE);
rxq->init_state = SFXGE_RXQ_INITIALIZED;
return (0);
}
/*
 * Table of per-queue receive statistics exported through sysctl.
 * Each entry pairs the sysctl leaf name with the byte offset of the
 * counter inside struct sfxge_rxq; sfxge_rx_stat_handler() uses the
 * offset to read the counter from every queue.
 */
static const struct {
const char *name;
size_t offset;
} sfxge_rx_stats[] = {
/* Build one entry from a leaf name and a struct sfxge_rxq member. */
#define SFXGE_RX_STAT(name, member) \
{ #name, offsetof(struct sfxge_rxq, member) }
SFXGE_RX_STAT(lro_merges, lro.n_merges),
SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
};
static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
struct sfxge_softc *sc = arg1;
unsigned int id = arg2;
unsigned int sum, index;
/* Sum across all RX queues */
sum = 0;
for (index = 0; index < sc->intr.n_alloc; index++)
sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
sfxge_rx_stats[id].offset);
return SYSCTL_OUT(req, &sum, sizeof(sum));
}
/*
 * Register one read-only unsigned sysctl leaf under the device's
 * statistics node for every entry in sfxge_rx_stats[].  All leaves share
 * sfxge_rx_stat_handler(), distinguished by the table index.
 */
static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list = SYSCTL_CHILDREN(sc->stats_node);
	unsigned int id = 0;

	while (id < sizeof(sfxge_rx_stats) / sizeof(sfxge_rx_stats[0])) {
		SYSCTL_ADD_PROC(
			ctx, stat_list,
			OID_AUTO, sfxge_rx_stats[id].name,
			CTLTYPE_UINT|CTLFLAG_RD,
			sc, id, sfxge_rx_stat_handler, "IU",
			"");
		id++;
	}
}
/*
 * Destroy every receive queue, highest index first.
 */
void
sfxge_rx_fini(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr = &sc->intr;
	int index;

	index = intr->n_alloc;
	while (index > 0) {
		index--;
		sfxge_rx_qfini(sc, index);
	}
}
/*
 * Create the receive queues (one per allocated interrupt) and register
 * the receive statistics.
 *
 * Also gives lro_idle_ticks its default of roughly 100 ms if it has not
 * been set yet.  Returns 0 on success; if a queue cannot be initialized,
 * the queues created before it are destroyed and the error is returned.
 */
int
sfxge_rx_init(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr = &sc->intr;
	int index;
	int rc;

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1; /* 100 ms */

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < intr->n_alloc; index++) {
		rc = sfxge_rx_qinit(sc, index);
		if (rc != 0) {
			/* Tear down the receive queue(s). */
			while (--index >= 0)
				sfxge_rx_qfini(sc, index);
			return (rc);
		}
	}

	sfxge_rx_stat_init(sc);

	return (0);
}