2016-05-18 04:35:58 +00:00
|
|
|
/*-
|
2018-04-12 14:35:37 +00:00
|
|
|
* Copyright (c) 2014-2018, Matthew Macy <mmacy@mattmacy.io>
|
2016-05-18 04:35:58 +00:00
|
|
|
* All rights reserved.
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions are met:
|
|
|
|
*
|
|
|
|
* 1. Redistributions of source code must retain the above copyright notice,
|
|
|
|
* this list of conditions and the following disclaimer.
|
|
|
|
*
|
|
|
|
* 2. Neither the name of Matthew Macy nor the names of its
|
|
|
|
* contributors may be used to endorse or promote products derived from
|
|
|
|
* this software without specific prior written permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <sys/cdefs.h>
|
|
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
|
2016-05-18 14:18:03 +00:00
|
|
|
#include "opt_inet.h"
|
|
|
|
#include "opt_inet6.h"
|
|
|
|
#include "opt_acpi.h"
|
2017-12-20 01:03:34 +00:00
|
|
|
#include "opt_sched.h"
|
2016-05-18 14:18:03 +00:00
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
#include <sys/param.h>
|
|
|
|
#include <sys/types.h>
|
|
|
|
#include <sys/bus.h>
|
|
|
|
#include <sys/eventhandler.h>
|
|
|
|
#include <sys/kernel.h>
|
|
|
|
#include <sys/lock.h>
|
|
|
|
#include <sys/mutex.h>
|
|
|
|
#include <sys/module.h>
|
|
|
|
#include <sys/kobj.h>
|
|
|
|
#include <sys/rman.h>
|
|
|
|
#include <sys/sbuf.h>
|
|
|
|
#include <sys/smp.h>
|
|
|
|
#include <sys/socket.h>
|
2018-05-11 20:08:28 +00:00
|
|
|
#include <sys/sockio.h>
|
2016-05-18 04:35:58 +00:00
|
|
|
#include <sys/sysctl.h>
|
|
|
|
#include <sys/syslog.h>
|
|
|
|
#include <sys/taskqueue.h>
|
2016-08-12 21:29:44 +00:00
|
|
|
#include <sys/limits.h>
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
#include <net/if.h>
|
|
|
|
#include <net/if_var.h>
|
|
|
|
#include <net/if_types.h>
|
|
|
|
#include <net/if_media.h>
|
|
|
|
#include <net/bpf.h>
|
|
|
|
#include <net/ethernet.h>
|
|
|
|
#include <net/mp_ring.h>
|
2019-10-17 16:23:03 +00:00
|
|
|
#include <net/debugnet.h>
|
2019-04-24 13:32:04 +00:00
|
|
|
#include <net/pfil.h>
|
2017-11-06 16:23:21 +00:00
|
|
|
#include <net/vnet.h>
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
#include <netinet/in.h>
|
|
|
|
#include <netinet/in_pcb.h>
|
|
|
|
#include <netinet/tcp_lro.h>
|
|
|
|
#include <netinet/in_systm.h>
|
|
|
|
#include <netinet/if_ether.h>
|
|
|
|
#include <netinet/ip.h>
|
|
|
|
#include <netinet/ip6.h>
|
|
|
|
#include <netinet/tcp.h>
|
2017-11-06 16:23:21 +00:00
|
|
|
#include <netinet/ip_var.h>
|
|
|
|
#include <netinet6/ip6_var.h>
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
#include <machine/bus.h>
|
|
|
|
#include <machine/in_cksum.h>
|
|
|
|
|
|
|
|
#include <vm/vm.h>
|
|
|
|
#include <vm/pmap.h>
|
|
|
|
|
|
|
|
#include <dev/led/led.h>
|
|
|
|
#include <dev/pci/pcireg.h>
|
|
|
|
#include <dev/pci/pcivar.h>
|
|
|
|
#include <dev/pci/pci_private.h>
|
|
|
|
|
|
|
|
#include <net/iflib.h>
|
2018-05-11 20:08:28 +00:00
|
|
|
#include <net/iflib_private.h>
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
#include "ifdi_if.h"
|
|
|
|
|
2018-10-12 22:40:54 +00:00
|
|
|
#ifdef PCI_IOV
|
|
|
|
#include <dev/pci/pci_iov.h>
|
|
|
|
#endif
|
|
|
|
|
2017-07-03 18:23:35 +00:00
|
|
|
#include <sys/bitstring.h>
|
2016-05-18 04:35:58 +00:00
|
|
|
/*
|
2017-03-13 22:53:06 +00:00
|
|
|
* enable accounting of every mbuf as it comes in to and goes out of
|
|
|
|
* iflib's software descriptor references
|
2016-05-18 04:35:58 +00:00
|
|
|
*/
|
|
|
|
#define MEMORY_LOGGING 0
|
|
|
|
/*
|
|
|
|
* Enable mbuf vectors for compressing long mbuf chains
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* NB:
|
|
|
|
* - Prefetching in tx cleaning should perhaps be a tunable. The distance ahead
|
|
|
|
* we prefetch needs to be determined by the time spent in m_free vis a vis
|
|
|
|
* the cost of a prefetch. This will of course vary based on the workload:
|
|
|
|
* - NFLX's m_free path is dominated by vm-based M_EXT manipulation which
|
|
|
|
* is quite expensive, thus suggesting very little prefetch.
|
|
|
|
* - small packet forwarding which is just returning a single mbuf to
|
|
|
|
* UMA will typically be very fast vis a vis the cost of a memory
|
|
|
|
* access.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* File organization:
|
|
|
|
* - private structures
|
|
|
|
* - iflib private utility functions
|
|
|
|
* - ifnet functions
|
|
|
|
* - vlan registry and other exported functions
|
|
|
|
* - iflib public core functions
|
|
|
|
*
|
|
|
|
*
|
|
|
|
*/
|
2018-05-11 20:08:28 +00:00
|
|
|
MALLOC_DEFINE(M_IFLIB, "iflib", "ifnet library");
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2020-02-12 08:30:07 +00:00
|
|
|
/*
 * Independent flag bits: MORE is bit 0 and EMPTY is bit 1.
 * NOTE(review): the names suggest they report the outcome of an rxeof
 * pass; the users are not in this part of the file -- confirm there.
 */
#define	IFLIB_RXEOF_MORE	(1U << 0)
#define	IFLIB_RXEOF_EMPTY	(1U << 1)
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
struct iflib_txq;
|
|
|
|
typedef struct iflib_txq *iflib_txq_t;
|
|
|
|
struct iflib_rxq;
|
|
|
|
typedef struct iflib_rxq *iflib_rxq_t;
|
|
|
|
struct iflib_fl;
|
|
|
|
typedef struct iflib_fl *iflib_fl_t;
|
|
|
|
|
2017-01-15 00:50:10 +00:00
|
|
|
struct iflib_ctx;
|
|
|
|
|
2017-10-30 21:14:31 +00:00
|
|
|
static void iru_init(if_rxd_update_t iru, iflib_rxq_t rxq, uint8_t flid);
|
2018-07-20 17:24:45 +00:00
|
|
|
static void iflib_timer(void *arg);
|
2020-12-07 14:52:57 +00:00
|
|
|
static void iflib_tqg_detach(if_ctx_t ctx);
|
2017-10-30 21:14:31 +00:00
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
typedef struct iflib_filter_info {
|
|
|
|
driver_filter_t *ifi_filter;
|
|
|
|
void *ifi_filter_arg;
|
|
|
|
struct grouptask *ifi_task;
|
2017-03-13 22:53:06 +00:00
|
|
|
void *ifi_ctx;
|
2016-05-18 04:35:58 +00:00
|
|
|
} *iflib_filter_info_t;
|
|
|
|
|
|
|
|
struct iflib_ctx {
|
|
|
|
KOBJ_FIELDS;
|
2018-10-12 22:40:54 +00:00
|
|
|
/*
|
|
|
|
* Pointer to hardware driver's softc
|
|
|
|
*/
|
2016-05-18 04:35:58 +00:00
|
|
|
void *ifc_softc;
|
|
|
|
device_t ifc_dev;
|
|
|
|
if_t ifc_ifp;
|
|
|
|
|
|
|
|
cpuset_t ifc_cpus;
|
|
|
|
if_shared_ctx_t ifc_sctx;
|
|
|
|
struct if_softc_ctx ifc_softc_ctx;
|
|
|
|
|
2018-05-03 17:02:31 +00:00
|
|
|
struct sx ifc_ctx_sx;
|
2018-04-12 14:35:37 +00:00
|
|
|
struct mtx ifc_state_mtx;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
iflib_txq_t ifc_txqs;
|
|
|
|
iflib_rxq_t ifc_rxqs;
|
|
|
|
uint32_t ifc_if_flags;
|
|
|
|
uint32_t ifc_flags;
|
|
|
|
uint32_t ifc_max_fl_buf_size;
|
2019-03-19 17:59:56 +00:00
|
|
|
uint32_t ifc_rx_mbuf_sz;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
int ifc_link_state;
|
|
|
|
int ifc_watchdog_events;
|
|
|
|
struct cdev *ifc_led_dev;
|
|
|
|
struct resource *ifc_msix_mem;
|
|
|
|
|
|
|
|
struct if_irq ifc_legacy_irq;
|
|
|
|
struct grouptask ifc_admin_task;
|
|
|
|
struct grouptask ifc_vflr_task;
|
|
|
|
struct iflib_filter_info ifc_filter_info;
|
|
|
|
struct ifmedia ifc_media;
|
2019-05-03 20:05:31 +00:00
|
|
|
struct ifmedia *ifc_mediap;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
struct sysctl_oid *ifc_sysctl_node;
|
|
|
|
uint16_t ifc_sysctl_ntxqs;
|
|
|
|
uint16_t ifc_sysctl_nrxqs;
|
2016-08-12 21:29:44 +00:00
|
|
|
uint16_t ifc_sysctl_qs_eq_override;
|
2017-09-23 01:37:01 +00:00
|
|
|
uint16_t ifc_sysctl_rx_budget;
|
2018-07-20 17:45:26 +00:00
|
|
|
uint16_t ifc_sysctl_tx_abdicate;
|
2019-04-25 21:24:56 +00:00
|
|
|
uint16_t ifc_sysctl_core_offset;
|
|
|
|
#define CORE_OFFSET_UNSPECIFIED 0xffff
|
|
|
|
uint8_t ifc_sysctl_separate_txrx;
|
iflib: Improve mapping of TX/RX queues to CPUs
iflib now supports mapping each (TX,RX) queue pair to the same CPU
(default), to separate CPUs, or to a pair of physical and logical CPUs
that share the same L2 cache. The mapping mechanism supports unequal
numbers of TX and RX queues, with the excess queues always being
mapped to consecutive physical CPUs. When the platform cannot
distinguish between physical and logical CPUs, all are treated as
physical CPUs. See the comment on get_cpuid_for_queue() for the
entire matrix.
The following device-specific tunables influence the mapping process:
dev.<device>.<unit>.iflib.core_offset (existing)
dev.<device>.<unit>.iflib.separate_txrx (existing)
dev.<device>.<unit>.iflib.use_logical_cores (new)
The following new, read-only sysctls provide visibility of the mapping
results:
dev.<device>.<unit>.iflib.{t,r}xq<n>.cpu
When an iflib driver allocates TX softirqs without providing reference
RX IRQs, iflib now binds those TX softirqs to CPUs using the above
mapping mechanism (that is, treats them as if they were TX IRQs).
Previously, such bindings were left up to the grouptaskqueue code and
thus fell outside of the iflib CPU mapping strategy.
Reviewed by: kbowling
Tested by: olivier, pkelsey
MFC after: 3 weeks
Differential Revision: https://reviews.freebsd.org/D24094
2021-04-26 04:25:59 +00:00
|
|
|
uint8_t ifc_sysctl_use_logical_cores;
|
|
|
|
bool ifc_cpus_are_physical_cores;
|
2016-08-12 21:29:44 +00:00
|
|
|
|
2017-03-13 22:53:06 +00:00
|
|
|
qidx_t ifc_sysctl_ntxds[8];
|
|
|
|
qidx_t ifc_sysctl_nrxds[8];
|
2016-05-18 04:35:58 +00:00
|
|
|
struct if_txrx ifc_txrx;
|
|
|
|
#define isc_txd_encap ifc_txrx.ift_txd_encap
|
|
|
|
#define isc_txd_flush ifc_txrx.ift_txd_flush
|
|
|
|
#define isc_txd_credits_update ifc_txrx.ift_txd_credits_update
|
|
|
|
#define isc_rxd_available ifc_txrx.ift_rxd_available
|
|
|
|
#define isc_rxd_pkt_get ifc_txrx.ift_rxd_pkt_get
|
|
|
|
#define isc_rxd_refill ifc_txrx.ift_rxd_refill
|
|
|
|
#define isc_rxd_flush ifc_txrx.ift_rxd_flush
|
|
|
|
#define isc_legacy_intr ifc_txrx.ift_legacy_intr
|
|
|
|
eventhandler_tag ifc_vlan_attach_event;
|
|
|
|
eventhandler_tag ifc_vlan_detach_event;
|
2019-04-17 17:19:54 +00:00
|
|
|
struct ether_addr ifc_mac;
|
2016-05-18 04:35:58 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
void *
|
|
|
|
iflib_get_softc(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (ctx->ifc_softc);
|
|
|
|
}
|
|
|
|
|
|
|
|
device_t
|
|
|
|
iflib_get_dev(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (ctx->ifc_dev);
|
|
|
|
}
|
|
|
|
|
|
|
|
if_t
|
|
|
|
iflib_get_ifp(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (ctx->ifc_ifp);
|
|
|
|
}
|
|
|
|
|
|
|
|
struct ifmedia *
|
|
|
|
iflib_get_media(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
|
2019-05-03 20:05:31 +00:00
|
|
|
return (ctx->ifc_mediap);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
2018-05-11 20:08:28 +00:00
|
|
|
uint32_t
|
|
|
|
iflib_get_flags(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
return (ctx->ifc_flags);
|
|
|
|
}
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
void
|
|
|
|
iflib_set_mac(if_ctx_t ctx, uint8_t mac[ETHER_ADDR_LEN])
|
|
|
|
{
|
|
|
|
|
2019-04-17 17:19:54 +00:00
|
|
|
bcopy(mac, ctx->ifc_mac.octet, ETHER_ADDR_LEN);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if_softc_ctx_t
|
|
|
|
iflib_get_softc_ctx(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (&ctx->ifc_softc_ctx);
|
|
|
|
}
|
|
|
|
|
|
|
|
if_shared_ctx_t
|
|
|
|
iflib_get_sctx(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (ctx->ifc_sctx);
|
|
|
|
}
|
|
|
|
|
2017-03-13 22:53:06 +00:00
|
|
|
#define IP_ALIGNED(m) ((((uintptr_t)(m)->m_data) & 0x3) == 0x2)
|
2016-05-18 04:35:58 +00:00
|
|
|
#define CACHE_PTR_INCREMENT (CACHE_LINE_SIZE/sizeof(void*))
|
2017-03-14 22:25:07 +00:00
|
|
|
/*
 * Round `ptr` up to a CACHE_LINE_SIZE boundary (an already aligned pointer
 * is returned unchanged).  CACHE_LINE_SIZE must be a power of two.
 *
 * The mask has to clear the low bits (~(CACHE_LINE_SIZE - 1)); keeping only
 * the low bits would yield the offset within the cache line instead of an
 * address.
 */
#define	CACHE_PTR_NEXT(ptr)						\
	((void *)(((uintptr_t)(ptr) + CACHE_LINE_SIZE - 1) &		\
	    ~((uintptr_t)CACHE_LINE_SIZE - 1)))
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
#define LINK_ACTIVE(ctx) ((ctx)->ifc_link_state == LINK_STATE_UP)
|
|
|
|
#define CTX_IS_VF(ctx) ((ctx)->ifc_sctx->isc_flags & IFLIB_IS_VF)
|
|
|
|
|
2017-01-27 23:08:06 +00:00
|
|
|
typedef struct iflib_sw_rx_desc_array {
|
|
|
|
bus_dmamap_t *ifsd_map; /* bus_dma maps for packet */
|
|
|
|
struct mbuf **ifsd_m; /* pkthdr mbufs */
|
|
|
|
caddr_t *ifsd_cl; /* direct cluster pointer for rx */
|
2018-11-27 20:01:05 +00:00
|
|
|
bus_addr_t *ifsd_ba; /* bus addr of cluster for rx */
|
2017-01-27 23:08:06 +00:00
|
|
|
} iflib_rxsd_array_t;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
typedef struct iflib_sw_tx_desc_array {
|
|
|
|
bus_dmamap_t *ifsd_map; /* bus_dma maps for packet */
|
2019-01-16 05:44:14 +00:00
|
|
|
bus_dmamap_t *ifsd_tso_map; /* bus_dma maps for TSO packet */
|
2016-05-18 04:35:58 +00:00
|
|
|
struct mbuf **ifsd_m; /* pkthdr mbufs */
|
2017-03-13 22:53:06 +00:00
|
|
|
} if_txsd_vec_t;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
/* magic number that should be high enough for any hardware */
|
|
|
|
#define IFLIB_MAX_TX_SEGS 128
|
2017-03-13 22:53:06 +00:00
|
|
|
#define IFLIB_RX_COPY_THRESH 128
|
2016-05-18 04:35:58 +00:00
|
|
|
#define IFLIB_MAX_RX_REFRESH 32
|
2017-03-13 22:53:06 +00:00
|
|
|
/* The minimum descriptors per second before we start coalescing */
|
|
|
|
#define IFLIB_MIN_DESC_SEC 16384
|
|
|
|
#define IFLIB_DEFAULT_TX_UPDATE_FREQ 16
|
2016-05-18 04:35:58 +00:00
|
|
|
#define IFLIB_QUEUE_IDLE 0
|
|
|
|
#define IFLIB_QUEUE_HUNG 1
|
|
|
|
#define IFLIB_QUEUE_WORKING 2
|
2017-03-13 22:53:06 +00:00
|
|
|
/* maximum number of txqs that can share an rx interrupt */
|
|
|
|
#define IFLIB_MAX_TX_SHARED_INTR 4
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2017-03-13 22:53:06 +00:00
|
|
|
/* this should really scale with ring size - this is a fairly arbitrary value */
|
|
|
|
#define TX_BATCH_SIZE 32
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
#define IFLIB_RESTART_BUDGET 8
|
|
|
|
|
|
|
|
#define CSUM_OFFLOAD (CSUM_IP_TSO|CSUM_IP6_TSO|CSUM_IP| \
|
|
|
|
CSUM_IP_UDP|CSUM_IP_TCP|CSUM_IP_SCTP| \
|
|
|
|
CSUM_IP6_UDP|CSUM_IP6_TCP|CSUM_IP6_SCTP)
|
2019-05-06 20:56:41 +00:00
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
struct iflib_txq {
|
2017-03-13 22:53:06 +00:00
|
|
|
qidx_t ift_in_use;
|
|
|
|
qidx_t ift_cidx;
|
|
|
|
qidx_t ift_cidx_processed;
|
|
|
|
qidx_t ift_pidx;
|
2016-05-18 04:35:58 +00:00
|
|
|
uint8_t ift_gen;
|
2016-08-12 21:29:44 +00:00
|
|
|
uint8_t ift_br_offset;
|
2017-03-13 22:53:06 +00:00
|
|
|
uint16_t ift_npending;
|
|
|
|
uint16_t ift_db_pending;
|
|
|
|
uint16_t ift_rs_pending;
|
2016-05-18 04:35:58 +00:00
|
|
|
/* implicit pad */
|
2017-03-13 22:53:06 +00:00
|
|
|
uint8_t ift_txd_size[8];
|
2016-05-18 04:35:58 +00:00
|
|
|
uint64_t ift_processed;
|
|
|
|
uint64_t ift_cleaned;
|
2017-03-13 22:53:06 +00:00
|
|
|
uint64_t ift_cleaned_prev;
|
2016-05-18 04:35:58 +00:00
|
|
|
#if MEMORY_LOGGING
|
|
|
|
uint64_t ift_enqueued;
|
|
|
|
uint64_t ift_dequeued;
|
|
|
|
#endif
|
|
|
|
uint64_t ift_no_tx_dma_setup;
|
|
|
|
uint64_t ift_no_desc_avail;
|
|
|
|
uint64_t ift_mbuf_defrag_failed;
|
|
|
|
uint64_t ift_mbuf_defrag;
|
|
|
|
uint64_t ift_map_failed;
|
|
|
|
uint64_t ift_txd_encap_efbig;
|
|
|
|
uint64_t ift_pullups;
|
2018-07-20 17:24:45 +00:00
|
|
|
uint64_t ift_last_timer_tick;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
struct mtx ift_mtx;
|
|
|
|
struct mtx ift_db_mtx;
|
|
|
|
|
|
|
|
/* constant values */
|
|
|
|
if_ctx_t ift_ctx;
|
2017-03-13 22:53:06 +00:00
|
|
|
struct ifmp_ring *ift_br;
|
2016-05-18 04:35:58 +00:00
|
|
|
struct grouptask ift_task;
|
2017-03-13 22:53:06 +00:00
|
|
|
qidx_t ift_size;
|
2016-05-18 04:35:58 +00:00
|
|
|
uint16_t ift_id;
|
|
|
|
struct callout ift_timer;
|
2020-10-27 21:53:33 +00:00
|
|
|
#ifdef DEV_NETMAP
|
|
|
|
struct callout ift_netmap_timer;
|
|
|
|
#endif /* DEV_NETMAP */
|
2017-03-13 22:53:06 +00:00
|
|
|
|
|
|
|
if_txsd_vec_t ift_sds;
|
|
|
|
uint8_t ift_qstatus;
|
|
|
|
uint8_t ift_closed;
|
|
|
|
uint8_t ift_update_freq;
|
2016-05-18 04:35:58 +00:00
|
|
|
struct iflib_filter_info ift_filter_info;
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
bus_dma_tag_t ift_buf_tag;
|
|
|
|
bus_dma_tag_t ift_tso_buf_tag;
|
2016-05-18 04:35:58 +00:00
|
|
|
iflib_dma_info_t ift_ifdi;
|
2020-04-30 15:39:04 +00:00
|
|
|
#define MTX_NAME_LEN 32
|
2016-05-18 04:35:58 +00:00
|
|
|
char ift_mtx_name[MTX_NAME_LEN];
|
|
|
|
bus_dma_segment_t ift_segs[IFLIB_MAX_TX_SEGS] __aligned(CACHE_LINE_SIZE);
|
2017-01-02 00:56:33 +00:00
|
|
|
#ifdef IFLIB_DIAGNOSTICS
|
|
|
|
uint64_t ift_cpu_exec_count[256];
|
|
|
|
#endif
|
2016-05-18 04:35:58 +00:00
|
|
|
} __aligned(CACHE_LINE_SIZE);
|
|
|
|
|
|
|
|
struct iflib_fl {
|
2017-03-13 22:53:06 +00:00
|
|
|
qidx_t ifl_cidx;
|
|
|
|
qidx_t ifl_pidx;
|
|
|
|
qidx_t ifl_credits;
|
2016-05-18 04:35:58 +00:00
|
|
|
uint8_t ifl_gen;
|
2017-03-13 22:53:06 +00:00
|
|
|
uint8_t ifl_rxd_size;
|
2016-05-18 04:35:58 +00:00
|
|
|
#if MEMORY_LOGGING
|
|
|
|
uint64_t ifl_m_enqueued;
|
|
|
|
uint64_t ifl_m_dequeued;
|
|
|
|
uint64_t ifl_cl_enqueued;
|
|
|
|
uint64_t ifl_cl_dequeued;
|
|
|
|
#endif
|
|
|
|
/* implicit pad */
|
2017-07-03 18:23:35 +00:00
|
|
|
bitstr_t *ifl_rx_bitmap;
|
|
|
|
qidx_t ifl_fragidx;
|
2016-05-18 04:35:58 +00:00
|
|
|
/* constant */
|
2017-03-13 22:53:06 +00:00
|
|
|
qidx_t ifl_size;
|
2016-05-18 04:35:58 +00:00
|
|
|
uint16_t ifl_buf_size;
|
|
|
|
uint16_t ifl_cltype;
|
|
|
|
uma_zone_t ifl_zone;
|
2017-01-27 23:08:06 +00:00
|
|
|
iflib_rxsd_array_t ifl_sds;
|
2016-05-18 04:35:58 +00:00
|
|
|
iflib_rxq_t ifl_rxq;
|
|
|
|
uint8_t ifl_id;
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
bus_dma_tag_t ifl_buf_tag;
|
2016-05-18 04:35:58 +00:00
|
|
|
iflib_dma_info_t ifl_ifdi;
|
|
|
|
uint64_t ifl_bus_addrs[IFLIB_MAX_RX_REFRESH] __aligned(CACHE_LINE_SIZE);
|
2020-07-06 14:52:21 +00:00
|
|
|
qidx_t ifl_rxd_idxs[IFLIB_MAX_RX_REFRESH];
|
2016-05-18 04:35:58 +00:00
|
|
|
} __aligned(CACHE_LINE_SIZE);
|
|
|
|
|
2017-03-13 22:53:06 +00:00
|
|
|
static inline qidx_t
|
|
|
|
get_inuse(int size, qidx_t cidx, qidx_t pidx, uint8_t gen)
|
2016-05-18 04:35:58 +00:00
|
|
|
{
|
2017-03-13 22:53:06 +00:00
|
|
|
qidx_t used;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
if (pidx > cidx)
|
|
|
|
used = pidx - cidx;
|
|
|
|
else if (pidx < cidx)
|
|
|
|
used = size - cidx + pidx;
|
|
|
|
else if (gen == 0 && pidx == cidx)
|
|
|
|
used = 0;
|
|
|
|
else if (gen == 1 && pidx == cidx)
|
|
|
|
used = size;
|
|
|
|
else
|
|
|
|
panic("bad state");
|
|
|
|
|
|
|
|
return (used);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Free slots in a TX queue: the ring size minus what get_inuse() reports.
 *
 * `txq` is parenthesized so that any pointer-valued expression (for example
 * `&q` or `base + i`) can be passed.  It is evaluated several times, so it
 * must not have side effects.
 */
#define	TXQ_AVAIL(txq)							\
	((txq)->ift_size - get_inuse((txq)->ift_size, (txq)->ift_cidx,	\
	    (txq)->ift_pidx, (txq)->ift_gen))
|
|
|
|
|
|
|
|
#define IDXDIFF(head, tail, wrap) \
|
|
|
|
((head) >= (tail) ? (head) - (tail) : (wrap) - (tail) + (head))
|
|
|
|
|
|
|
|
struct iflib_rxq {
|
|
|
|
if_ctx_t ifr_ctx;
|
|
|
|
iflib_fl_t ifr_fl;
|
|
|
|
uint64_t ifr_rx_irq;
|
2019-04-24 13:32:04 +00:00
|
|
|
struct pfil_head *pfil;
|
2019-05-06 20:56:41 +00:00
|
|
|
/*
|
|
|
|
* If there is a separate completion queue (IFLIB_HAS_RXCQ), this is
|
2020-08-12 14:45:31 +00:00
|
|
|
* the completion queue consumer index. Otherwise it's unused.
|
2019-05-06 20:56:41 +00:00
|
|
|
*/
|
|
|
|
qidx_t ifr_cq_cidx;
|
2016-05-18 04:35:58 +00:00
|
|
|
uint16_t ifr_id;
|
|
|
|
uint8_t ifr_nfl;
|
2017-03-13 22:53:06 +00:00
|
|
|
uint8_t ifr_ntxqirq;
|
|
|
|
uint8_t ifr_txqid[IFLIB_MAX_TX_SHARED_INTR];
|
2019-05-06 20:56:41 +00:00
|
|
|
uint8_t ifr_fl_offset;
|
2016-05-18 04:35:58 +00:00
|
|
|
struct lro_ctrl ifr_lc;
|
|
|
|
struct grouptask ifr_task;
|
2020-02-12 08:30:07 +00:00
|
|
|
struct callout ifr_watchdog;
|
2016-05-18 04:35:58 +00:00
|
|
|
struct iflib_filter_info ifr_filter_info;
|
|
|
|
iflib_dma_info_t ifr_ifdi;
|
2017-09-16 02:41:38 +00:00
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
/* dynamically allocate if any drivers need a value substantially larger than this */
|
|
|
|
struct if_rxd_frag ifr_frags[IFLIB_MAX_RX_SEGS] __aligned(CACHE_LINE_SIZE);
|
2017-01-02 00:56:33 +00:00
|
|
|
#ifdef IFLIB_DIAGNOSTICS
|
|
|
|
uint64_t ifr_cpu_exec_count[256];
|
|
|
|
#endif
|
2016-05-18 04:35:58 +00:00
|
|
|
} __aligned(CACHE_LINE_SIZE);
|
|
|
|
|
2017-03-13 22:53:06 +00:00
|
|
|
typedef struct if_rxsd {
|
|
|
|
caddr_t *ifsd_cl;
|
|
|
|
iflib_fl_t ifsd_fl;
|
|
|
|
} *if_rxsd_t;
|
|
|
|
|
|
|
|
/* multiple of word size */
|
|
|
|
#ifdef __LP64__
|
2017-09-16 02:41:38 +00:00
|
|
|
#define PKT_INFO_SIZE 6
|
2017-03-13 22:53:06 +00:00
|
|
|
#define RXD_INFO_SIZE 5
|
|
|
|
#define PKT_TYPE uint64_t
|
|
|
|
#else
|
2017-09-16 02:41:38 +00:00
|
|
|
#define PKT_INFO_SIZE 11
|
2017-03-13 22:53:06 +00:00
|
|
|
#define RXD_INFO_SIZE 8
|
|
|
|
#define PKT_TYPE uint32_t
|
|
|
|
#endif
|
|
|
|
#define PKT_LOOP_BOUND ((PKT_INFO_SIZE/3)*3)
|
|
|
|
#define RXD_LOOP_BOUND ((RXD_INFO_SIZE/4)*4)
|
|
|
|
|
|
|
|
typedef struct if_pkt_info_pad {
|
|
|
|
PKT_TYPE pkt_val[PKT_INFO_SIZE];
|
|
|
|
} *if_pkt_info_pad_t;
|
|
|
|
typedef struct if_rxd_info_pad {
|
|
|
|
PKT_TYPE rxd_val[RXD_INFO_SIZE];
|
|
|
|
} *if_rxd_info_pad_t;
|
|
|
|
|
|
|
|
CTASSERT(sizeof(struct if_pkt_info_pad) == sizeof(struct if_pkt_info));
|
|
|
|
CTASSERT(sizeof(struct if_rxd_info_pad) == sizeof(struct if_rxd_info));
|
|
|
|
|
|
|
|
static inline void
|
|
|
|
pkt_info_zero(if_pkt_info_t pi)
|
|
|
|
{
|
|
|
|
if_pkt_info_pad_t pi_pad;
|
|
|
|
|
|
|
|
pi_pad = (if_pkt_info_pad_t)pi;
|
|
|
|
pi_pad->pkt_val[0] = 0; pi_pad->pkt_val[1] = 0; pi_pad->pkt_val[2] = 0;
|
|
|
|
pi_pad->pkt_val[3] = 0; pi_pad->pkt_val[4] = 0; pi_pad->pkt_val[5] = 0;
|
|
|
|
#ifndef __LP64__
|
2017-09-16 02:41:38 +00:00
|
|
|
pi_pad->pkt_val[6] = 0; pi_pad->pkt_val[7] = 0; pi_pad->pkt_val[8] = 0;
|
|
|
|
pi_pad->pkt_val[9] = 0; pi_pad->pkt_val[10] = 0;
|
2017-03-13 22:53:06 +00:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2018-05-11 20:08:28 +00:00
|
|
|
static device_method_t iflib_pseudo_methods[] = {
|
|
|
|
DEVMETHOD(device_attach, noop_attach),
|
|
|
|
DEVMETHOD(device_detach, iflib_pseudo_detach),
|
|
|
|
DEVMETHOD_END
|
|
|
|
};
|
|
|
|
|
|
|
|
driver_t iflib_pseudodriver = {
|
|
|
|
"iflib_pseudo", iflib_pseudo_methods, sizeof(struct iflib_ctx),
|
|
|
|
};
|
|
|
|
|
2017-03-13 22:53:06 +00:00
|
|
|
static inline void
|
|
|
|
rxd_info_zero(if_rxd_info_t ri)
|
|
|
|
{
|
|
|
|
if_rxd_info_pad_t ri_pad;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
ri_pad = (if_rxd_info_pad_t)ri;
|
|
|
|
for (i = 0; i < RXD_LOOP_BOUND; i += 4) {
|
|
|
|
ri_pad->rxd_val[i] = 0;
|
|
|
|
ri_pad->rxd_val[i+1] = 0;
|
|
|
|
ri_pad->rxd_val[i+2] = 0;
|
|
|
|
ri_pad->rxd_val[i+3] = 0;
|
|
|
|
}
|
|
|
|
#ifdef __LP64__
|
|
|
|
ri_pad->rxd_val[RXD_INFO_SIZE-1] = 0;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
/*
|
|
|
|
* Only allow a single packet to take up at most 1/nth of the tx ring
|
|
|
|
*/
|
|
|
|
#define MAX_SINGLE_PACKET_FRACTION 12
|
|
|
|
/*
 * All-ones bus address.  The expansion is fully parenthesized so it behaves
 * as a single operand in any expression (e.g. under sizeof).
 * NOTE(review): the name suggests it marks an invalid DMA address; the
 * users are outside this part of the file.
 */
#define	IF_BAD_DMA	((bus_addr_t)-1)
|
|
|
|
|
2017-09-13 01:18:42 +00:00
|
|
|
#define CTX_ACTIVE(ctx) ((if_getdrvflags((ctx)->ifc_ifp) & IFF_DRV_RUNNING))
|
2017-08-30 18:56:24 +00:00
|
|
|
|
2018-05-03 17:02:31 +00:00
|
|
|
#define CTX_LOCK_INIT(_sc) sx_init(&(_sc)->ifc_ctx_sx, "iflib ctx lock")
|
|
|
|
#define CTX_LOCK(ctx) sx_xlock(&(ctx)->ifc_ctx_sx)
|
|
|
|
#define CTX_UNLOCK(ctx) sx_xunlock(&(ctx)->ifc_ctx_sx)
|
|
|
|
#define CTX_LOCK_DESTROY(ctx) sx_destroy(&(ctx)->ifc_ctx_sx)
|
2018-04-12 14:35:37 +00:00
|
|
|
|
|
|
|
#define STATE_LOCK_INIT(_sc, _name) mtx_init(&(_sc)->ifc_state_mtx, _name, "iflib state lock", MTX_DEF)
|
|
|
|
#define STATE_LOCK(ctx) mtx_lock(&(ctx)->ifc_state_mtx)
|
|
|
|
#define STATE_UNLOCK(ctx) mtx_unlock(&(ctx)->ifc_state_mtx)
|
|
|
|
#define STATE_LOCK_DESTROY(ctx) mtx_destroy(&(ctx)->ifc_state_mtx)
|
2017-09-16 02:41:38 +00:00
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
/*
 * Acquire / release the TX queue mutex (ift_mtx).  `txq` is parenthesized
 * so any pointer-valued expression can be passed.
 */
#define	CALLOUT_LOCK(txq)	mtx_lock(&(txq)->ift_mtx)
#define	CALLOUT_UNLOCK(txq)	mtx_unlock(&(txq)->ift_mtx)
|
|
|
|
|
2018-10-12 22:40:54 +00:00
|
|
|
void
|
|
|
|
iflib_set_detach(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
STATE_LOCK(ctx);
|
|
|
|
ctx->ifc_flags |= IFC_IN_DETACH;
|
|
|
|
STATE_UNLOCK(ctx);
|
|
|
|
}
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
/* Our boot-time initialization hook */
|
|
|
|
static int iflib_module_event_handler(module_t, int, void *);
|
|
|
|
|
|
|
|
static moduledata_t iflib_moduledata = {
|
|
|
|
"iflib",
|
|
|
|
iflib_module_event_handler,
|
|
|
|
NULL
|
|
|
|
};
|
|
|
|
|
|
|
|
DECLARE_MODULE(iflib, iflib_moduledata, SI_SUB_INIT_IF, SI_ORDER_ANY);
|
|
|
|
MODULE_VERSION(iflib, 1);
|
|
|
|
|
|
|
|
MODULE_DEPEND(iflib, pci, 1, 1, 1);
|
|
|
|
MODULE_DEPEND(iflib, ether, 1, 1, 1);
|
|
|
|
|
2017-09-16 02:41:38 +00:00
|
|
|
TASKQGROUP_DEFINE(if_io_tqg, mp_ncpus, 1);
|
|
|
|
TASKQGROUP_DEFINE(if_config_tqg, 1, 1);
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
#ifndef IFLIB_DEBUG_COUNTERS
|
|
|
|
#ifdef INVARIANTS
|
|
|
|
#define IFLIB_DEBUG_COUNTERS 1
|
|
|
|
#else
|
|
|
|
#define IFLIB_DEBUG_COUNTERS 0
|
|
|
|
#endif /* !INVARIANTS */
|
|
|
|
#endif
|
|
|
|
|
2020-02-26 14:26:36 +00:00
|
|
|
static SYSCTL_NODE(_net, OID_AUTO, iflib, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
|
|
|
|
"iflib driver parameters");
|
2017-09-16 02:41:38 +00:00
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
/*
|
|
|
|
* XXX need to ensure that this can't accidentally cause the head to be moved backwards
|
|
|
|
*/
|
|
|
|
static int iflib_min_tx_latency = 0;
|
|
|
|
SYSCTL_INT(_net_iflib, OID_AUTO, min_tx_latency, CTLFLAG_RW,
|
2016-11-18 04:19:21 +00:00
|
|
|
&iflib_min_tx_latency, 0, "minimize transmit latency at the possible expense of throughput");
|
2017-03-13 22:53:06 +00:00
|
|
|
static int iflib_no_tx_batch = 0;
|
|
|
|
SYSCTL_INT(_net_iflib, OID_AUTO, no_tx_batch, CTLFLAG_RW,
|
|
|
|
&iflib_no_tx_batch, 0, "minimize transmit latency at the possible expense of throughput");
|
2020-12-19 01:08:33 +00:00
|
|
|
static int iflib_timer_default = 1000;
|
|
|
|
SYSCTL_INT(_net_iflib, OID_AUTO, timer_default, CTLFLAG_RW,
|
|
|
|
&iflib_timer_default, 0, "number of ticks between iflib_timer calls");
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
#if IFLIB_DEBUG_COUNTERS
/*
 * Optional global event counters.  Each one is a file-local int exported
 * under the net.iflib sysctl node, incremented through DBG_COUNTER_INC()
 * and zeroed by iflib_debug_reset().
 */

/* Transmit, receive-allocation and free-list refill counters. */
static int iflib_tx_seen;
static int iflib_tx_sent;
static int iflib_tx_encap;
static int iflib_rx_allocs;
static int iflib_fl_refills;
static int iflib_fl_refills_large;
static int iflib_tx_frees;

SYSCTL_INT(_net_iflib, OID_AUTO, tx_seen, CTLFLAG_RD, &iflib_tx_seen, 0,
    "# TX mbufs seen");
SYSCTL_INT(_net_iflib, OID_AUTO, tx_sent, CTLFLAG_RD, &iflib_tx_sent, 0,
    "# TX mbufs sent");
SYSCTL_INT(_net_iflib, OID_AUTO, tx_encap, CTLFLAG_RD, &iflib_tx_encap, 0,
    "# TX mbufs encapped");
SYSCTL_INT(_net_iflib, OID_AUTO, tx_frees, CTLFLAG_RD, &iflib_tx_frees, 0,
    "# TX frees");
SYSCTL_INT(_net_iflib, OID_AUTO, rx_allocs, CTLFLAG_RD, &iflib_rx_allocs, 0,
    "# RX allocations");
SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills, CTLFLAG_RD, &iflib_fl_refills, 0,
    "# refills");
SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills_large, CTLFLAG_RD,
    &iflib_fl_refills_large, 0, "# large refills");

/* TX queue drain counters. */
static int iflib_txq_drain_flushing;
static int iflib_txq_drain_oactive;
static int iflib_txq_drain_notready;

SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_flushing, CTLFLAG_RD,
    &iflib_txq_drain_flushing, 0, "# drain flushes");
SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_oactive, CTLFLAG_RD,
    &iflib_txq_drain_oactive, 0, "# drain oactives");
SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_notready, CTLFLAG_RD,
    &iflib_txq_drain_notready, 0, "# drain notready");

/* Encapsulation failure counters. */
static int iflib_encap_load_mbuf_fail;
static int iflib_encap_pad_mbuf_fail;
static int iflib_encap_txq_avail_fail;
static int iflib_encap_txd_encap_fail;

SYSCTL_INT(_net_iflib, OID_AUTO, encap_load_mbuf_fail, CTLFLAG_RD,
    &iflib_encap_load_mbuf_fail, 0, "# busdma load failures");
SYSCTL_INT(_net_iflib, OID_AUTO, encap_pad_mbuf_fail, CTLFLAG_RD,
    &iflib_encap_pad_mbuf_fail, 0, "# runt frame pad failures");
SYSCTL_INT(_net_iflib, OID_AUTO, encap_txq_avail_fail, CTLFLAG_RD,
    &iflib_encap_txq_avail_fail, 0, "# txq avail failures");
SYSCTL_INT(_net_iflib, OID_AUTO, encap_txd_encap_fail, CTLFLAG_RD,
    &iflib_encap_txd_encap_fail, 0, "# driver encap failures");

/* Interrupt and RX task counters. */
static int iflib_task_fn_rxs;
static int iflib_rx_intr_enables;
static int iflib_fast_intrs;
static int iflib_rx_unavail;
static int iflib_rx_ctx_inactive;
static int iflib_rx_if_input;
static int iflib_rxd_flush;

/* Read-write knob (not a counter); iflib_debug_reset() leaves it alone. */
static int iflib_verbose_debug;

SYSCTL_INT(_net_iflib, OID_AUTO, task_fn_rx, CTLFLAG_RD, &iflib_task_fn_rxs, 0,
    "# task_fn_rx calls");
SYSCTL_INT(_net_iflib, OID_AUTO, rx_intr_enables, CTLFLAG_RD,
    &iflib_rx_intr_enables, 0, "# RX intr enables");
SYSCTL_INT(_net_iflib, OID_AUTO, fast_intrs, CTLFLAG_RD, &iflib_fast_intrs, 0,
    "# fast_intr calls");
SYSCTL_INT(_net_iflib, OID_AUTO, rx_unavail, CTLFLAG_RD, &iflib_rx_unavail, 0,
    "# times rxeof called with no available data");
SYSCTL_INT(_net_iflib, OID_AUTO, rx_ctx_inactive, CTLFLAG_RD,
    &iflib_rx_ctx_inactive, 0, "# times rxeof called with inactive context");
SYSCTL_INT(_net_iflib, OID_AUTO, rx_if_input, CTLFLAG_RD, &iflib_rx_if_input,
    0, "# times rxeof called if_input");
SYSCTL_INT(_net_iflib, OID_AUTO, rxd_flush, CTLFLAG_RD, &iflib_rxd_flush, 0,
    "# times rxd_flush called");
SYSCTL_INT(_net_iflib, OID_AUTO, verbose_debug, CTLFLAG_RW,
    &iflib_verbose_debug, 0, "enable verbose debugging");

/*
 * Add one to the counter iflib_<name> using atomic_add_int(); pass only
 * the part of the variable name that follows the "iflib_" prefix.
 */
#define DBG_COUNTER_INC(name) atomic_add_int(&(iflib_ ## name), 1)

/*
 * Zero every debug counter declared above.  The assignments are plain
 * stores (no atomics), and iflib_verbose_debug is not modified.
 */
static void
iflib_debug_reset(void)
{
	/* TX / RX allocation / free-list refill */
	iflib_tx_seen = 0;
	iflib_tx_sent = 0;
	iflib_tx_encap = 0;
	iflib_rx_allocs = 0;
	iflib_fl_refills = 0;
	iflib_fl_refills_large = 0;
	iflib_tx_frees = 0;

	/* TX queue drain */
	iflib_txq_drain_flushing = 0;
	iflib_txq_drain_oactive = 0;
	iflib_txq_drain_notready = 0;

	/* Encapsulation failures */
	iflib_encap_load_mbuf_fail = 0;
	iflib_encap_pad_mbuf_fail = 0;
	iflib_encap_txq_avail_fail = 0;
	iflib_encap_txd_encap_fail = 0;

	/* Interrupts and RX task */
	iflib_task_fn_rxs = 0;
	iflib_rx_intr_enables = 0;
	iflib_fast_intrs = 0;
	iflib_rx_unavail = 0;
	iflib_rx_ctx_inactive = 0;
	iflib_rx_if_input = 0;
	iflib_rxd_flush = 0;
}

#else
/* Counters compiled out: the increment macro and the reset are no-ops. */
#define DBG_COUNTER_INC(name)
static void iflib_debug_reset(void) {}
#endif
|
|
|
|
|
|
|
|
#define IFLIB_DEBUG 0
|
|
|
|
|
|
|
|
static void iflib_tx_structures_free(if_ctx_t ctx);
|
|
|
|
static void iflib_rx_structures_free(if_ctx_t ctx);
|
|
|
|
static int iflib_queues_alloc(if_ctx_t ctx);
|
|
|
|
static int iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq);
|
2017-03-13 22:53:06 +00:00
|
|
|
static int iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, qidx_t cidx, qidx_t budget);
|
2016-05-18 04:35:58 +00:00
|
|
|
static int iflib_qset_structures_setup(if_ctx_t ctx);
|
|
|
|
static int iflib_msix_init(if_ctx_t ctx);
|
2018-05-29 21:56:39 +00:00
|
|
|
static int iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filterarg, int *rid, const char *str);
|
2016-05-18 04:35:58 +00:00
|
|
|
static void iflib_txq_check_drain(iflib_txq_t txq, int budget);
|
|
|
|
static uint32_t iflib_txq_can_drain(struct ifmp_ring *);
|
2018-07-25 22:46:36 +00:00
|
|
|
#ifdef ALTQ
|
|
|
|
static void iflib_altq_if_start(if_t ifp);
|
|
|
|
static int iflib_altq_if_transmit(if_t ifp, struct mbuf *m);
|
|
|
|
#endif
|
2016-05-18 04:35:58 +00:00
|
|
|
static int iflib_register(if_ctx_t);
|
2019-08-16 23:33:44 +00:00
|
|
|
static void iflib_deregister(if_ctx_t);
|
2019-10-23 23:20:49 +00:00
|
|
|
static void iflib_unregister_vlan_handlers(if_ctx_t ctx);
|
2020-03-14 19:56:46 +00:00
|
|
|
static uint16_t iflib_get_mbuf_size_for(unsigned int size);
|
2016-05-18 04:35:58 +00:00
|
|
|
static void iflib_init_locked(if_ctx_t ctx);
|
|
|
|
static void iflib_add_device_sysctl_pre(if_ctx_t ctx);
|
|
|
|
static void iflib_add_device_sysctl_post(if_ctx_t ctx);
|
2016-11-18 04:19:21 +00:00
|
|
|
static void iflib_ifmp_purge(iflib_txq_t txq);
|
2017-01-02 00:56:33 +00:00
|
|
|
static void _iflib_pre_assert(if_softc_ctx_t scctx);
|
2017-03-13 22:53:06 +00:00
|
|
|
static void iflib_if_init_locked(if_ctx_t ctx);
|
2018-10-12 22:40:54 +00:00
|
|
|
static void iflib_free_intr_mem(if_ctx_t ctx);
|
2017-03-13 22:53:06 +00:00
|
|
|
#ifndef __NO_STRICT_ALIGNMENT
|
|
|
|
static struct mbuf * iflib_fixup_rx(struct mbuf *m);
|
|
|
|
#endif
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2019-04-25 21:24:56 +00:00
|
|
|
static SLIST_HEAD(cpu_offset_list, cpu_offset) cpu_offsets =
|
|
|
|
SLIST_HEAD_INITIALIZER(cpu_offsets);
|
|
|
|
struct cpu_offset {
|
|
|
|
SLIST_ENTRY(cpu_offset) entries;
|
|
|
|
cpuset_t set;
|
|
|
|
unsigned int refcount;
|
iflib: Improve mapping of TX/RX queues to CPUs
iflib now supports mapping each (TX,RX) queue pair to the same CPU
(default), to separate CPUs, or to a pair of physical and logical CPUs
that share the same L2 cache. The mapping mechanism supports unequal
numbers of TX and RX queues, with the excess queues always being
mapped to consecutive physical CPUs. When the platform cannot
distinguish between physical and logical CPUs, all are treated as
physical CPUs. See the comment on get_cpuid_for_queue() for the
entire matrix.
The following device-specific tunables influence the mapping process:
dev.<device>.<unit>.iflib.core_offset (existing)
dev.<device>.<unit>.iflib.separate_txrx (existing)
dev.<device>.<unit>.iflib.use_logical_cores (new)
The following new, read-only sysctls provide visibility of the mapping
results:
dev.<device>.<unit>.iflib.{t,r}xq<n>.cpu
When an iflib driver allocates TX softirqs without providing reference
RX IRQs, iflib now binds those TX softirqs to CPUs using the above
mapping mechanism (that is, treats them as if they were TX IRQs).
Previously, such bindings were left up to the grouptaskqueue code and
thus fell outside of the iflib CPU mapping strategy.
Reviewed by: kbowling
Tested by: olivier, pkelsey
MFC after: 3 weeks
Differential Revision: https://reviews.freebsd.org/D24094
2021-04-26 04:25:59 +00:00
|
|
|
uint16_t next_cpuid;
|
2019-04-25 21:24:56 +00:00
|
|
|
};
|
|
|
|
static struct mtx cpu_offset_mtx;
|
|
|
|
MTX_SYSINIT(iflib_cpu_offset, &cpu_offset_mtx, "iflib_cpu_offset lock",
|
|
|
|
MTX_DEF);
|
|
|
|
|
2019-10-17 16:23:03 +00:00
|
|
|
DEBUGNET_DEFINE(iflib);
|
2018-05-06 00:57:52 +00:00
|
|
|
|
2020-07-20 21:08:56 +00:00
|
|
|
static int
|
|
|
|
iflib_num_rx_descs(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
|
|
|
|
if_shared_ctx_t sctx = ctx->ifc_sctx;
|
|
|
|
uint16_t first_rxq = (sctx->isc_flags & IFLIB_HAS_RXCQ) ? 1 : 0;
|
|
|
|
|
|
|
|
return scctx->isc_nrxd[first_rxq];
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
iflib_num_tx_descs(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
|
|
|
|
if_shared_ctx_t sctx = ctx->ifc_sctx;
|
|
|
|
uint16_t first_txq = (sctx->isc_flags & IFLIB_HAS_TXCQ) ? 1 : 0;
|
|
|
|
|
|
|
|
return scctx->isc_ntxd[first_txq];
|
|
|
|
}
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
#ifdef DEV_NETMAP
|
|
|
|
#include <sys/selinfo.h>
|
|
|
|
#include <net/netmap.h>
|
|
|
|
#include <dev/netmap/netmap_kern.h>
|
|
|
|
|
|
|
|
MODULE_DEPEND(iflib, netmap, 1, 1, 1);
|
|
|
|
|
2020-08-24 11:44:20 +00:00
|
|
|
/* Forward declarations for the netmap support code below. */
static int netmap_fl_refill(iflib_rxq_t rxq, struct netmap_kring *kring,
    bool init);
static void iflib_netmap_timer(void *arg);
|
2017-10-30 21:14:31 +00:00
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
/*
|
|
|
|
* device-specific sysctl variables:
|
|
|
|
*
|
2016-07-08 17:04:21 +00:00
|
|
|
* iflib_crcstrip: 0: keep CRC in rx frames, 1: strip it (default).
|
2016-05-18 04:35:58 +00:00
|
|
|
* During regular operations the CRC is stripped, but on some
|
|
|
|
* hardware reception of frames not multiple of 64 is slower,
|
|
|
|
* so using crcstrip=0 helps in benchmarks.
|
|
|
|
*
|
2016-07-08 17:04:21 +00:00
|
|
|
* iflib_rx_miss, iflib_rx_miss_bufs:
|
2016-05-18 04:35:58 +00:00
|
|
|
* count packets that might be missed due to lost interrupts.
|
|
|
|
*/
|
|
|
|
SYSCTL_DECL(_dev_netmap);
|
|
|
|
/*
|
|
|
|
* The xl driver by default strips CRCs and we do not override it.
|
|
|
|
*/
|
|
|
|
|
|
|
|
int iflib_crcstrip = 1;
|
|
|
|
SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_crcstrip,
|
2019-05-06 20:56:41 +00:00
|
|
|
CTLFLAG_RW, &iflib_crcstrip, 1, "strip CRC on RX frames");
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
int iflib_rx_miss, iflib_rx_miss_bufs;
|
|
|
|
SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss,
|
2019-05-06 20:56:41 +00:00
|
|
|
CTLFLAG_RW, &iflib_rx_miss, 0, "potentially missed RX intr");
|
2016-07-08 17:04:21 +00:00
|
|
|
SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss_bufs,
|
2019-05-06 20:56:41 +00:00
|
|
|
CTLFLAG_RW, &iflib_rx_miss_bufs, 0, "potentially missed RX intr bufs");
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Register/unregister. We are already under netmap lock.
|
|
|
|
* Only called on the first register or the last unregister.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
iflib_netmap_register(struct netmap_adapter *na, int onoff)
|
|
|
|
{
|
2019-05-06 20:56:41 +00:00
|
|
|
if_t ifp = na->ifp;
|
2016-05-18 04:35:58 +00:00
|
|
|
if_ctx_t ctx = ifp->if_softc;
|
2017-03-13 22:53:06 +00:00
|
|
|
int status;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
CTX_LOCK(ctx);
|
|
|
|
if (!CTX_IS_VF(ctx))
|
2017-01-02 00:56:33 +00:00
|
|
|
IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip);
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2020-06-14 21:07:12 +00:00
|
|
|
iflib_stop(ctx);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Enable (or disable) netmap flags, and intercept (or restore)
|
|
|
|
* ifp->if_transmit. This is done once the device has been stopped
|
2021-01-10 12:00:30 +00:00
|
|
|
* to prevent race conditions. Also, this must be done after
|
|
|
|
* calling netmap_disable_all_rings() and before calling
|
|
|
|
* netmap_enable_all_rings(), so that these two functions see the
|
|
|
|
* updated state of the NAF_NETMAP_ON bit.
|
2020-06-14 21:07:12 +00:00
|
|
|
*/
|
2016-05-18 04:35:58 +00:00
|
|
|
if (onoff) {
|
|
|
|
nm_set_native_flags(na);
|
|
|
|
} else {
|
|
|
|
nm_clear_native_flags(na);
|
|
|
|
}
|
2020-06-14 21:07:12 +00:00
|
|
|
|
2017-03-13 22:53:06 +00:00
|
|
|
iflib_init_locked(ctx);
|
2017-01-02 00:56:33 +00:00
|
|
|
IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip); // XXX why twice ?
|
2017-03-13 22:53:06 +00:00
|
|
|
status = ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1;
|
|
|
|
if (status)
|
|
|
|
nm_clear_native_flags(na);
|
2016-05-18 04:35:58 +00:00
|
|
|
CTX_UNLOCK(ctx);
|
2017-03-13 22:53:06 +00:00
|
|
|
return (status);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
2021-03-29 09:26:12 +00:00
|
|
|
static int
|
|
|
|
iflib_netmap_config(struct netmap_adapter *na, struct nm_config_info *info)
|
|
|
|
{
|
|
|
|
if_t ifp = na->ifp;
|
|
|
|
if_ctx_t ctx = ifp->if_softc;
|
|
|
|
iflib_rxq_t rxq = &ctx->ifc_rxqs[0];
|
|
|
|
iflib_fl_t fl = &rxq->ifr_fl[0];
|
|
|
|
|
|
|
|
info->num_tx_rings = ctx->ifc_softc_ctx.isc_ntxqsets;
|
|
|
|
info->num_rx_rings = ctx->ifc_softc_ctx.isc_nrxqsets;
|
|
|
|
info->num_tx_descs = iflib_num_tx_descs(ctx);
|
|
|
|
info->num_rx_descs = iflib_num_rx_descs(ctx);
|
|
|
|
info->rx_buf_maxsize = fl->ifl_buf_size;
|
|
|
|
nm_prinf("txr %u rxr %u txd %u rxd %u rbufsz %u",
|
|
|
|
info->num_tx_rings, info->num_rx_rings, info->num_tx_descs,
|
|
|
|
info->num_rx_descs, info->rx_buf_maxsize);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-10-30 21:14:31 +00:00
|
|
|
static int
|
2020-08-24 11:44:20 +00:00
|
|
|
netmap_fl_refill(iflib_rxq_t rxq, struct netmap_kring *kring, bool init)
|
2017-10-30 21:14:31 +00:00
|
|
|
{
|
|
|
|
struct netmap_adapter *na = kring->na;
|
|
|
|
u_int const lim = kring->nkr_num_slots - 1;
|
|
|
|
struct netmap_ring *ring = kring->ring;
|
|
|
|
bus_dmamap_t *map;
|
|
|
|
struct if_rxd_update iru;
|
|
|
|
if_ctx_t ctx = rxq->ifr_ctx;
|
|
|
|
iflib_fl_t fl = &rxq->ifr_fl[0];
|
2020-08-24 11:44:20 +00:00
|
|
|
u_int nic_i_first, nic_i;
|
2021-01-10 22:49:37 +00:00
|
|
|
u_int nm_i;
|
2020-08-25 15:19:45 +00:00
|
|
|
int i, n;
|
2018-09-06 18:51:52 +00:00
|
|
|
#if IFLIB_DEBUG_COUNTERS
|
|
|
|
int rf_count = 0;
|
|
|
|
#endif
|
2017-10-30 21:14:31 +00:00
|
|
|
|
2020-08-24 11:44:20 +00:00
|
|
|
/*
|
2020-08-25 15:19:45 +00:00
|
|
|
* This function is used both at initialization and in rxsync.
|
|
|
|
* At initialization we need to prepare (with isc_rxd_refill())
|
2021-01-10 22:49:37 +00:00
|
|
|
* all the netmap buffers currently owned by the kernel, in
|
|
|
|
* such a way to keep fl->ifl_pidx and kring->nr_hwcur in sync
|
|
|
|
* (except for kring->nkr_hwofs). These may be less than
|
|
|
|
* kring->nkr_num_slots if netmap_reset() was called while
|
|
|
|
* an application using the kring that still owned some
|
|
|
|
* buffers.
|
|
|
|
* At rxsync time, both indexes point to the next buffer to be
|
|
|
|
* refilled.
|
2020-08-25 15:19:45 +00:00
|
|
|
* In any case we publish (with isc_rxd_flush()) up to
|
|
|
|
* (fl->ifl_pidx - 1) % N (included), to avoid the NIC tail/prod
|
|
|
|
* pointer to overrun the head/cons pointer, although this is
|
|
|
|
* not necessary for some NICs (e.g. vmx).
|
2020-08-24 11:44:20 +00:00
|
|
|
*/
|
2021-01-10 22:49:37 +00:00
|
|
|
if (__predict_false(init)) {
|
|
|
|
n = kring->nkr_num_slots - nm_kr_rxspace(kring);
|
|
|
|
} else {
|
|
|
|
n = kring->rhead - kring->nr_hwcur;
|
2020-08-25 15:19:45 +00:00
|
|
|
if (n == 0)
|
|
|
|
return (0); /* Nothing to do. */
|
|
|
|
if (n < 0)
|
|
|
|
n += kring->nkr_num_slots;
|
2020-08-24 11:44:20 +00:00
|
|
|
}
|
2020-08-12 14:17:38 +00:00
|
|
|
|
2017-10-30 21:14:31 +00:00
|
|
|
iru_init(&iru, rxq, 0 /* flid */);
|
|
|
|
map = fl->ifl_sds.ifsd_map;
|
2020-08-25 15:19:45 +00:00
|
|
|
nic_i = fl->ifl_pidx;
|
2021-01-10 22:49:37 +00:00
|
|
|
nm_i = netmap_idx_n2k(kring, nic_i);
|
|
|
|
if (__predict_false(init)) {
|
|
|
|
/*
|
|
|
|
* On init/reset, nic_i must be 0, and we must
|
|
|
|
* start to refill from hwtail (see netmap_reset()).
|
|
|
|
*/
|
|
|
|
MPASS(nic_i == 0);
|
|
|
|
MPASS(nm_i == kring->nr_hwtail);
|
|
|
|
} else
|
|
|
|
MPASS(nm_i == kring->nr_hwcur);
|
2018-09-06 18:51:52 +00:00
|
|
|
DBG_COUNTER_INC(fl_refills);
|
2020-08-25 15:19:45 +00:00
|
|
|
while (n > 0) {
|
2018-09-06 18:51:52 +00:00
|
|
|
#if IFLIB_DEBUG_COUNTERS
|
|
|
|
if (++rf_count == 9)
|
|
|
|
DBG_COUNTER_INC(fl_refills_large);
|
|
|
|
#endif
|
2020-08-12 14:17:38 +00:00
|
|
|
nic_i_first = nic_i;
|
2020-08-25 15:19:45 +00:00
|
|
|
for (i = 0; n > 0 && i < IFLIB_MAX_RX_REFRESH; n--, i++) {
|
2017-10-30 21:14:31 +00:00
|
|
|
struct netmap_slot *slot = &ring->slot[nm_i];
|
2021-04-05 07:54:47 +00:00
|
|
|
uint64_t paddr;
|
|
|
|
void *addr = PNMB(na, slot, &paddr);
|
2017-10-30 21:14:31 +00:00
|
|
|
|
2020-08-12 14:17:38 +00:00
|
|
|
MPASS(i < IFLIB_MAX_RX_REFRESH);
|
2017-10-30 21:14:31 +00:00
|
|
|
|
|
|
|
if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
|
|
|
|
return netmap_ring_reinit(kring);
|
|
|
|
|
2021-04-05 07:54:47 +00:00
|
|
|
fl->ifl_bus_addrs[i] = paddr +
|
|
|
|
nm_get_offset(kring, slot);
|
2020-08-12 14:17:38 +00:00
|
|
|
fl->ifl_rxd_idxs[i] = nic_i;
|
|
|
|
|
Further correct and optimize the bus_dma(9) usage of iflib(4):
o Correct the obvious bugs in the netmap(4) parts:
- No longer check for the existence of DMA maps as bus_dma(9)
is used unconditionally in iflib(4) since r341095.
- Supply the correct DMA tag and map pairs to bus_dma(9)
functions (see also the commit message of r343753).
- In iflib_netmap_timer_adjust(), add synchronization of the
TX descriptors before calling the ift_txd_credits_update
method as the latter evaluates the TX descriptors possibly
updated by the MAC.
- In _task_fn_tx(), wrap the netmap(4)-specific bits in
#ifdef DEV_NETMAP just as done in _task_fn_admin() and
_task_fn_rx() respectively.
o In iflib_fast_intr_rxtx(), synchronize the TX rather than
the RX descriptors before calling the ift_txd_credits_update
method (see also above).
o There's no need to synchronize an RX buffer that is going to
be recycled in iflib_rxd_pkt_get(), yet; it's sufficient to
do that as late as passing RX buffers to the MAC via the
ift_rxd_refill method. Hence, combine that synchronization
with the synchronization of new buffers into a common spot
in _iflib_fl_refill().
o There's no need to synchronize the RX descriptors of a free
list in preparation of the MAC updating their statuses with
every invocation of rxd_frag_to_sd(); it's enough to do this
once before handing control over to the MAC, i. e. before
calling ift_rxd_flush method in _iflib_fl_refill(), which
already performs the necessary synchronization.
o Given that the ift_rxd_available method evaluates the RX
descriptors which possibly have been altered by the MAC,
synchronize as appropriate beforehand. Most notably this
is now done in iflib_rxd_avail(), which in turn means that
we don't need to issue the same synchronization yet again
before calling the ift_rxd_pkt_get method in iflib_rxeof().
o In iflib_txd_db_check(), synchronize the TX descriptors
before handing them over to the MAC for transmission via
the ift_txd_flush method.
o In iflib_encap(), move the TX buffer synchronization after
the invocation of the ift_txd_encap() method. If the MAC
driver fails to encapsulate the packet and we retry with
a defragmented mbuf chain or finally fail, the cycles for
TX buffer synchronization have been wasted. Synchronizing
afterwards matches what non-iflib(4) drivers typically do
and is sufficient as the MAC will not actually start with
the transmission before - in this case - the ift_txd_flush
method is called.
Moreover, for the latter reason the synchronization of the
TX descriptors in iflib_encap() can go as it's enough to
synchronize them before passing control over to the MAC by
issuing the ift_txd_flush() method (see above).
o In iflib_txq_can_drain(), only synchronize TX descriptors
if the ift_txd_credits_update method accessing these is
actually called.
Differential Revision: https://reviews.freebsd.org/D19081
2019-02-12 21:08:44 +00:00
|
|
|
if (__predict_false(init)) {
|
|
|
|
netmap_load_map(na, fl->ifl_buf_tag,
|
|
|
|
map[nic_i], addr);
|
|
|
|
} else if (slot->flags & NS_BUF_CHANGED) {
|
2017-10-30 21:14:31 +00:00
|
|
|
/* buffer has changed, reload map */
|
Further correct and optimize the bus_dma(9) usage of iflib(4):
o Correct the obvious bugs in the netmap(4) parts:
- No longer check for the existence of DMA maps as bus_dma(9)
is used unconditionally in iflib(4) since r341095.
- Supply the correct DMA tag and map pairs to bus_dma(9)
functions (see also the commit message of r343753).
- In iflib_netmap_timer_adjust(), add synchronization of the
TX descriptors before calling the ift_txd_credits_update
method as the latter evaluates the TX descriptors possibly
updated by the MAC.
- In _task_fn_tx(), wrap the netmap(4)-specific bits in
#ifdef DEV_NETMAP just as done in _task_fn_admin() and
_task_fn_rx() respectively.
o In iflib_fast_intr_rxtx(), synchronize the TX rather than
the RX descriptors before calling the ift_txd_credits_update
method (see also above).
o There's no need to synchronize an RX buffer that is going to
be recycled in iflib_rxd_pkt_get(), yet; it's sufficient to
do that as late as passing RX buffers to the MAC via the
ift_rxd_refill method. Hence, combine that synchronization
with the synchronization of new buffers into a common spot
in _iflib_fl_refill().
o There's no need to synchronize the RX descriptors of a free
list in preparation of the MAC updating their statuses with
every invocation of rxd_frag_to_sd(); it's enough to do this
once before handing control over to the MAC, i. e. before
calling ift_rxd_flush method in _iflib_fl_refill(), which
already performs the necessary synchronization.
o Given that the ift_rxd_available method evaluates the RX
descriptors which possibly have been altered by the MAC,
synchronize as appropriate beforehand. Most notably this
is now done in iflib_rxd_avail(), which in turn means that
we don't need to issue the same synchronization yet again
before calling the ift_rxd_pkt_get method in iflib_rxeof().
o In iflib_txd_db_check(), synchronize the TX descriptors
before handing them over to the MAC for transmission via
the ift_txd_flush method.
o In iflib_encap(), move the TX buffer synchronization after
the invocation of the ift_txd_encap() method. If the MAC
driver fails to encapsulate the packet and we retry with
a defragmented mbuf chain or finally fail, the cycles for
TX buffer synchronization have been wasted. Synchronizing
afterwards matches what non-iflib(4) drivers typically do
and is sufficient as the MAC will not actually start with
the transmission before - in this case - the ift_txd_flush
method is called.
Moreover, for the latter reason the synchronization of the
TX descriptors in iflib_encap() can go as it's enough to
synchronize them before passing control over to the MAC by
issuing the ift_txd_flush() method (see above).
o In iflib_txq_can_drain(), only synchronize TX descriptors
if the ift_txd_credits_update method accessing these is
actually called.
Differential Revision: https://reviews.freebsd.org/D19081
2019-02-12 21:08:44 +00:00
|
|
|
netmap_reload_map(na, fl->ifl_buf_tag,
|
|
|
|
map[nic_i], addr);
|
2017-10-30 21:14:31 +00:00
|
|
|
}
|
2020-08-12 14:17:38 +00:00
|
|
|
bus_dmamap_sync(fl->ifl_buf_tag, map[nic_i],
|
|
|
|
BUS_DMASYNC_PREREAD);
|
2017-10-30 21:14:31 +00:00
|
|
|
slot->flags &= ~NS_BUF_CHANGED;
|
|
|
|
|
|
|
|
nm_i = nm_next(nm_i, lim);
|
2020-08-12 14:17:38 +00:00
|
|
|
nic_i = nm_next(nic_i, lim);
|
2017-10-30 21:14:31 +00:00
|
|
|
}
|
2020-08-12 14:17:38 +00:00
|
|
|
|
|
|
|
iru.iru_pidx = nic_i_first;
|
|
|
|
iru.iru_count = i;
|
|
|
|
ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
|
2017-10-30 21:14:31 +00:00
|
|
|
}
|
2020-08-25 15:19:45 +00:00
|
|
|
fl->ifl_pidx = nic_i;
|
2021-01-10 22:49:37 +00:00
|
|
|
/*
|
|
|
|
* At the end of the loop we must have refilled everything
|
|
|
|
* we could possibly refill.
|
|
|
|
*/
|
2020-08-25 15:19:45 +00:00
|
|
|
MPASS(nm_i == kring->rhead);
|
|
|
|
kring->nr_hwcur = nm_i;
|
2017-10-30 21:14:31 +00:00
|
|
|
|
Further correct and optimize the bus_dma(9) usage of iflib(4):
o Correct the obvious bugs in the netmap(4) parts:
- No longer check for the existence of DMA maps as bus_dma(9)
is used unconditionally in iflib(4) since r341095.
- Supply the correct DMA tag and map pairs to bus_dma(9)
functions (see also the commit message of r343753).
- In iflib_netmap_timer_adjust(), add synchronization of the
TX descriptors before calling the ift_txd_credits_update
method as the latter evaluates the TX descriptors possibly
updated by the MAC.
- In _task_fn_tx(), wrap the netmap(4)-specific bits in
#ifdef DEV_NETMAP just as done in _task_fn_admin() and
_task_fn_rx() respectively.
o In iflib_fast_intr_rxtx(), synchronize the TX rather than
the RX descriptors before calling the ift_txd_credits_update
method (see also above).
o There's no need to synchronize an RX buffer that is going to
be recycled in iflib_rxd_pkt_get(), yet; it's sufficient to
do that as late as passing RX buffers to the MAC via the
ift_rxd_refill method. Hence, combine that synchronization
with the synchronization of new buffers into a common spot
in _iflib_fl_refill().
o There's no need to synchronize the RX descriptors of a free
list in preparation of the MAC updating their statuses with
every invocation of rxd_frag_to_sd(); it's enough to do this
once before handing control over to the MAC, i. e. before
calling ift_rxd_flush method in _iflib_fl_refill(), which
already performs the necessary synchronization.
o Given that the ift_rxd_available method evaluates the RX
descriptors which possibly have been altered by the MAC,
synchronize as appropriate beforehand. Most notably this
is now done in iflib_rxd_avail(), which in turn means that
we don't need to issue the same synchronization yet again
before calling the ift_rxd_pkt_get method in iflib_rxeof().
o In iflib_txd_db_check(), synchronize the TX descriptors
before handing them over to the MAC for transmission via
the ift_txd_flush method.
o In iflib_encap(), move the TX buffer synchronization after
the invocation of the ift_txd_encap() method. If the MAC
driver fails to encapsulate the packet and we retry with
a defragmented mbuf chain or finally fail, the cycles for
TX buffer synchronization have been wasted. Synchronizing
afterwards matches what non-iflib(4) drivers typically do
and is sufficient as the MAC will not actually start with
the transmission before - in this case - the ift_txd_flush
method is called.
Moreover, for the latter reason the synchronization of the
TX descriptors in iflib_encap() can go as it's enough to
synchronize them before passing control over to the MAC by
issuing the ift_txd_flush() method (see above).
o In iflib_txq_can_drain(), only synchronize TX descriptors
if the ift_txd_credits_update method accessing these is
actually called.
Differential Revision: https://reviews.freebsd.org/D19081
2019-02-12 21:08:44 +00:00
|
|
|
bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
|
|
|
|
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
|
2020-08-24 11:44:20 +00:00
|
|
|
ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, fl->ifl_id,
|
|
|
|
nm_prev(nic_i, lim));
|
2020-08-12 14:17:38 +00:00
|
|
|
DBG_COUNTER_INC(rxd_flush);
|
|
|
|
|
2017-10-30 21:14:31 +00:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2020-10-27 21:53:33 +00:00
|
|
|
/*
 * NOTE(review): presumably the period, in microseconds, of the netmap TX
 * timer (see iflib_netmap_timer()); its users are not visible here --
 * confirm before changing the value.
 */
#define NETMAP_TX_TIMER_US 90
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
/*
|
|
|
|
* Reconcile kernel and user view of the transmit ring.
|
|
|
|
*
|
|
|
|
* All information is in the kring.
|
|
|
|
* Userspace wants to send packets up to the one before kring->rhead,
|
|
|
|
* kernel knows kring->nr_hwcur is the first unsent packet.
|
|
|
|
*
|
|
|
|
* Here we push packets out (as many as possible), and possibly
|
|
|
|
* reclaim buffers from previously completed transmission.
|
|
|
|
*
|
|
|
|
* The caller (netmap) guarantees that there is only one instance
|
|
|
|
* running at any time. Any interference with other driver
|
|
|
|
* methods should be handled by the individual drivers.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
iflib_netmap_txsync(struct netmap_kring *kring, int flags)
|
|
|
|
{
|
|
|
|
struct netmap_adapter *na = kring->na;
|
2019-05-06 20:56:41 +00:00
|
|
|
if_t ifp = na->ifp;
|
2016-05-18 04:35:58 +00:00
|
|
|
struct netmap_ring *ring = kring->ring;
|
2018-07-20 17:24:45 +00:00
|
|
|
u_int nm_i; /* index into the netmap kring */
|
2016-05-18 04:35:58 +00:00
|
|
|
u_int nic_i; /* index into the NIC ring */
|
|
|
|
u_int n;
|
|
|
|
u_int const lim = kring->nkr_num_slots - 1;
|
|
|
|
u_int const head = kring->rhead;
|
|
|
|
struct if_pkt_info pi;
|
2021-08-18 07:17:43 +00:00
|
|
|
int tx_pkts = 0, tx_bytes = 0;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* interrupts on every tx packet are expensive so request
|
|
|
|
* them every half ring, or where NS_REPORT is set
|
|
|
|
*/
|
|
|
|
u_int report_frequency = kring->nkr_num_slots >> 1;
|
|
|
|
/* device-specific */
|
|
|
|
if_ctx_t ctx = ifp->if_softc;
|
|
|
|
iflib_txq_t txq = &ctx->ifc_txqs[kring->ring_id];
|
|
|
|
|
Further correct and optimize the bus_dma(9) usage of iflib(4):
o Correct the obvious bugs in the netmap(4) parts:
- No longer check for the existence of DMA maps as bus_dma(9)
is used unconditionally in iflib(4) since r341095.
- Supply the correct DMA tag and map pairs to bus_dma(9)
functions (see also the commit message of r343753).
- In iflib_netmap_timer_adjust(), add synchronization of the
TX descriptors before calling the ift_txd_credits_update
method as the latter evaluates the TX descriptors possibly
updated by the MAC.
- In _task_fn_tx(), wrap the netmap(4)-specific bits in
#ifdef DEV_NETMAP just as done in _task_fn_admin() and
_task_fn_rx() respectively.
o In iflib_fast_intr_rxtx(), synchronize the TX rather than
the RX descriptors before calling the ift_txd_credits_update
method (see also above).
o There's no need to synchronize an RX buffer that is going to
be recycled in iflib_rxd_pkt_get(), yet; it's sufficient to
do that as late as passing RX buffers to the MAC via the
ift_rxd_refill method. Hence, combine that synchronization
with the synchronization of new buffers into a common spot
in _iflib_fl_refill().
o There's no need to synchronize the RX descriptors of a free
list in preparation of the MAC updating their statuses with
every invocation of rxd_frag_to_sd(); it's enough to do this
once before handing control over to the MAC, i. e. before
calling ift_rxd_flush method in _iflib_fl_refill(), which
already performs the necessary synchronization.
o Given that the ift_rxd_available method evaluates the RX
descriptors which possibly have been altered by the MAC,
synchronize as appropriate beforehand. Most notably this
is now done in iflib_rxd_avail(), which in turn means that
we don't need to issue the same synchronization yet again
before calling the ift_rxd_pkt_get method in iflib_rxeof().
o In iflib_txd_db_check(), synchronize the TX descriptors
before handing them over to the MAC for transmission via
the ift_txd_flush method.
o In iflib_encap(), move the TX buffer synchronization after
the invocation of the ift_txd_encap() method. If the MAC
driver fails to encapsulate the packet and we retry with
a defragmented mbuf chain or finally fail, the cycles for
TX buffer synchronization have been wasted. Synchronizing
afterwards matches what non-iflib(4) drivers typically do
and is sufficient as the MAC will not actually start with
the transmission before - in this case - the ift_txd_flush
method is called.
Moreover, for the latter reason the synchronization of the
TX descriptors in iflib_encap() can go as it's enough to
synchronize them before passing control over to the MAC by
issuing the ift_txd_flush() method (see above).
o In iflib_txq_can_drain(), only synchronize TX descriptors
if the ift_txd_credits_update method accessing these is
actually called.
Differential Revision: https://reviews.freebsd.org/D19081
2019-02-12 21:08:44 +00:00
|
|
|
bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
|
2018-11-27 20:01:05 +00:00
|
|
|
BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* First part: process new packets to send.
|
2018-07-20 17:24:45 +00:00
|
|
|
* nm_i is the current index in the netmap kring,
|
2016-05-18 04:35:58 +00:00
|
|
|
* nic_i is the corresponding index in the NIC ring.
|
|
|
|
*
|
|
|
|
* If we have packets to send (nm_i != head)
|
|
|
|
* iterate over the netmap ring, fetch length and update
|
|
|
|
* the corresponding slot in the NIC ring. Some drivers also
|
|
|
|
* need to update the buffer's physical address in the NIC slot
|
|
|
|
* even if NS_BUF_CHANGED is not set (PNMB computes the addresses).
|
|
|
|
*
|
|
|
|
* The netmap_reload_map() call is especially expensive,
|
|
|
|
* even when (as in this case) the tag is 0, so do only
|
|
|
|
* when the buffer has actually changed.
|
|
|
|
*
|
|
|
|
* If possible do not set the report/intr bit on all slots,
|
|
|
|
* but only a few times per ring or when NS_REPORT is set.
|
|
|
|
*
|
|
|
|
* Finally, on 10G and faster drivers, it might be useful
|
|
|
|
* to prefetch the next slot and txr entry.
|
|
|
|
*/
|
|
|
|
|
2018-07-20 17:24:45 +00:00
|
|
|
nm_i = kring->nr_hwcur;
|
2016-05-18 04:35:58 +00:00
|
|
|
if (nm_i != head) { /* we have new packets to send */
|
2021-01-24 21:12:41 +00:00
|
|
|
uint32_t pkt_len = 0, seg_idx = 0;
|
|
|
|
int nic_i_start = -1, flags = 0;
|
2018-05-16 21:03:22 +00:00
|
|
|
pkt_info_zero(&pi);
|
|
|
|
pi.ipi_segs = txq->ift_segs;
|
|
|
|
pi.ipi_qsidx = kring->ring_id;
|
2016-05-18 04:35:58 +00:00
|
|
|
nic_i = netmap_idx_k2n(kring, nm_i);
|
|
|
|
|
|
|
|
__builtin_prefetch(&ring->slot[nm_i]);
|
|
|
|
__builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i]);
|
Further correct and optimize the bus_dma(9) usage of iflib(4):
o Correct the obvious bugs in the netmap(4) parts:
- No longer check for the existence of DMA maps as bus_dma(9)
is used unconditionally in iflib(4) since r341095.
- Supply the correct DMA tag and map pairs to bus_dma(9)
functions (see also the commit message of r343753).
- In iflib_netmap_timer_adjust(), add synchronization of the
TX descriptors before calling the ift_txd_credits_update
method as the latter evaluates the TX descriptors possibly
updated by the MAC.
- In _task_fn_tx(), wrap the netmap(4)-specific bits in
#ifdef DEV_NETMAP just as done in _task_fn_admin() and
_task_fn_rx() respectively.
o In iflib_fast_intr_rxtx(), synchronize the TX rather than
the RX descriptors before calling the ift_txd_credits_update
method (see also above).
o There's no need to synchronize an RX buffer that is going to
be recycled in iflib_rxd_pkt_get(), yet; it's sufficient to
do that as late as passing RX buffers to the MAC via the
ift_rxd_refill method. Hence, combine that synchronization
with the synchronization of new buffers into a common spot
in _iflib_fl_refill().
o There's no need to synchronize the RX descriptors of a free
list in preparation of the MAC updating their statuses with
every invocation of rxd_frag_to_sd(); it's enough to do this
once before handing control over to the MAC, i. e. before
calling ift_rxd_flush method in _iflib_fl_refill(), which
already performs the necessary synchronization.
o Given that the ift_rxd_available method evaluates the RX
descriptors which possibly have been altered by the MAC,
synchronize as appropriate beforehand. Most notably this
is now done in iflib_rxd_avail(), which in turn means that
we don't need to issue the same synchronization yet again
before calling the ift_rxd_pkt_get method in iflib_rxeof().
o In iflib_txd_db_check(), synchronize the TX descriptors
before handing them over to the MAC for transmission via
the ift_txd_flush method.
o In iflib_encap(), move the TX buffer synchronization after
the invocation of the ift_txd_encap() method. If the MAC
driver fails to encapsulate the packet and we retry with
a defragmented mbuf chain or finally fail, the cycles for
TX buffer synchronization have been wasted. Synchronizing
afterwards matches what non-iflib(4) drivers typically do
and is sufficient as the MAC will not actually start with
the transmission before - in this case - the ift_txd_flush
method is called.
Moreover, for the latter reason the synchronization of the
TX descriptors in iflib_encap() can go as it's enough to
synchronize them before passing control over to the MAC by
issuing the ift_txd_flush() method (see above).
o In iflib_txq_can_drain(), only synchronize TX descriptors
if the ift_txd_credits_update method accessing these is
actually called.
Differential Revision: https://reviews.freebsd.org/D19081
2019-02-12 21:08:44 +00:00
|
|
|
__builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i]);
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
for (n = 0; nm_i != head; n++) {
|
|
|
|
struct netmap_slot *slot = &ring->slot[nm_i];
|
2021-04-05 07:54:47 +00:00
|
|
|
uint64_t offset = nm_get_offset(kring, slot);
|
2016-05-18 04:35:58 +00:00
|
|
|
u_int len = slot->len;
|
2017-03-14 15:08:56 +00:00
|
|
|
uint64_t paddr;
|
2016-05-18 04:35:58 +00:00
|
|
|
void *addr = PNMB(na, slot, &paddr);
|
2021-01-24 21:12:41 +00:00
|
|
|
|
|
|
|
flags |= (slot->flags & NS_REPORT ||
|
2016-05-18 04:35:58 +00:00
|
|
|
nic_i == 0 || nic_i == report_frequency) ?
|
|
|
|
IPI_TX_INTR : 0;
|
|
|
|
|
2021-01-24 21:12:41 +00:00
|
|
|
/*
|
|
|
|
* If this is the first packet fragment, save the
|
|
|
|
* index of the first NIC slot for later.
|
|
|
|
*/
|
|
|
|
if (nic_i_start < 0)
|
|
|
|
nic_i_start = nic_i;
|
|
|
|
|
2021-04-05 07:54:47 +00:00
|
|
|
pi.ipi_segs[seg_idx].ds_addr = paddr + offset;
|
2021-01-24 21:12:41 +00:00
|
|
|
pi.ipi_segs[seg_idx].ds_len = len;
|
|
|
|
if (len) {
|
|
|
|
pkt_len += len;
|
|
|
|
seg_idx++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!(slot->flags & NS_MOREFRAG)) {
|
|
|
|
pi.ipi_len = pkt_len;
|
|
|
|
pi.ipi_nsegs = seg_idx;
|
|
|
|
pi.ipi_pidx = nic_i_start;
|
|
|
|
pi.ipi_ndescs = 0;
|
|
|
|
pi.ipi_flags = flags;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2021-01-24 21:12:41 +00:00
|
|
|
/* Prepare the NIC TX ring. */
|
|
|
|
ctx->isc_txd_encap(ctx->ifc_softc, &pi);
|
|
|
|
DBG_COUNTER_INC(tx_encap);
|
|
|
|
|
2021-08-18 07:17:43 +00:00
|
|
|
/* Update transmit counters */
|
|
|
|
tx_bytes += pi.ipi_len;
|
|
|
|
tx_pkts++;
|
|
|
|
|
2021-01-24 21:12:41 +00:00
|
|
|
/* Reinit per-packet info for the next one. */
|
|
|
|
flags = seg_idx = pkt_len = 0;
|
|
|
|
nic_i_start = -1;
|
|
|
|
}
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
/* prefetch for next round */
|
|
|
|
__builtin_prefetch(&ring->slot[nm_i + 1]);
|
|
|
|
__builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i + 1]);
|
Further correct and optimize the bus_dma(9) usage of iflib(4):
o Correct the obvious bugs in the netmap(4) parts:
- No longer check for the existence of DMA maps as bus_dma(9)
is used unconditionally in iflib(4) since r341095.
- Supply the correct DMA tag and map pairs to bus_dma(9)
functions (see also the commit message of r343753).
- In iflib_netmap_timer_adjust(), add synchronization of the
TX descriptors before calling the ift_txd_credits_update
method as the latter evaluates the TX descriptors possibly
updated by the MAC.
- In _task_fn_tx(), wrap the netmap(4)-specific bits in
#ifdef DEV_NETMAP just as done in _task_fn_admin() and
_task_fn_rx() respectively.
o In iflib_fast_intr_rxtx(), synchronize the TX rather than
the RX descriptors before calling the ift_txd_credits_update
method (see also above).
o There's no need to synchronize an RX buffer that is going to
be recycled in iflib_rxd_pkt_get(), yet; it's sufficient to
do that as late as passing RX buffers to the MAC via the
ift_rxd_refill method. Hence, combine that synchronization
with the synchronization of new buffers into a common spot
in _iflib_fl_refill().
o There's no need to synchronize the RX descriptors of a free
list in preparation of the MAC updating their statuses with
every invocation of rxd_frag_to_sd(); it's enough to do this
once before handing control over to the MAC, i. e. before
calling ift_rxd_flush method in _iflib_fl_refill(), which
already performs the necessary synchronization.
o Given that the ift_rxd_available method evaluates the RX
descriptors which possibly have been altered by the MAC,
synchronize as appropriate beforehand. Most notably this
is now done in iflib_rxd_avail(), which in turn means that
we don't need to issue the same synchronization yet again
before calling the ift_rxd_pkt_get method in iflib_rxeof().
o In iflib_txd_db_check(), synchronize the TX descriptors
before handing them over to the MAC for transmission via
the ift_txd_flush method.
o In iflib_encap(), move the TX buffer synchronization after
the invocation of the ift_txd_encap() method. If the MAC
driver fails to encapsulate the packet and we retry with
a defragmented mbuf chain or finally fail, the cycles for
TX buffer synchronization have been wasted. Synchronizing
afterwards matches what non-iflib(4) drivers typically do
and is sufficient as the MAC will not actually start with
the transmission before - in this case - the ift_txd_flush
method is called.
Moreover, for the latter reason the synchronization of the
TX descriptors in iflib_encap() can go as it's enough to
synchronize them before passing control over to the MAC by
issuing the ift_txd_flush() method (see above).
o In iflib_txq_can_drain(), only synchronize TX descriptors
if the ift_txd_credits_update method accessing these is
actually called.
Differential Revision: https://reviews.freebsd.org/D19081
2019-02-12 21:08:44 +00:00
|
|
|
__builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i + 1]);
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2021-04-05 07:54:47 +00:00
|
|
|
NM_CHECK_ADDR_LEN_OFF(na, len, offset);
|
2016-05-18 04:35:58 +00:00
|
|
|
|
Further correct and optimize the bus_dma(9) usage of iflib(4):
o Correct the obvious bugs in the netmap(4) parts:
- No longer check for the existence of DMA maps as bus_dma(9)
is used unconditionally in iflib(4) since r341095.
- Supply the correct DMA tag and map pairs to bus_dma(9)
functions (see also the commit message of r343753).
- In iflib_netmap_timer_adjust(), add synchronization of the
TX descriptors before calling the ift_txd_credits_update
method as the latter evaluates the TX descriptors possibly
updated by the MAC.
- In _task_fn_tx(), wrap the netmap(4)-specific bits in
#ifdef DEV_NETMAP just as done in _task_fn_admin() and
_task_fn_rx() respectively.
o In iflib_fast_intr_rxtx(), synchronize the TX rather than
the RX descriptors before calling the ift_txd_credits_update
method (see also above).
o There's no need to synchronize an RX buffer that is going to
be recycled in iflib_rxd_pkt_get(), yet; it's sufficient to
do that as late as passing RX buffers to the MAC via the
ift_rxd_refill method. Hence, combine that synchronization
with the synchronization of new buffers into a common spot
in _iflib_fl_refill().
o There's no need to synchronize the RX descriptors of a free
list in preparation of the MAC updating their statuses with
every invocation of rxd_frag_to_sd(); it's enough to do this
once before handing control over to the MAC, i. e. before
calling ift_rxd_flush method in _iflib_fl_refill(), which
already performs the necessary synchronization.
o Given that the ift_rxd_available method evaluates the RX
descriptors which possibly have been altered by the MAC,
synchronize as appropriate beforehand. Most notably this
is now done in iflib_rxd_avail(), which in turn means that
we don't need to issue the same synchronization yet again
before calling the ift_rxd_pkt_get method in iflib_rxeof().
o In iflib_txd_db_check(), synchronize the TX descriptors
before handing them over to the MAC for transmission via
the ift_txd_flush method.
o In iflib_encap(), move the TX buffer synchronization after
the invocation of the ift_txd_encap() method. If the MAC
driver fails to encapsulate the packet and we retry with
a defragmented mbuf chain or finally fail, the cycles for
TX buffer synchronization have been wasted. Synchronizing
afterwards matches what non-iflib(4) drivers typically do
and is sufficient as the MAC will not actually start with
the transmission before - in this case - the ift_txd_flush
method is called.
Moreover, for the latter reason the synchronization of the
TX descriptors in iflib_encap() can go as it's enough to
synchronize them before passing control over to the MAC by
issuing the ift_txd_flush() method (see above).
o In iflib_txq_can_drain(), only synchronize TX descriptors
if the ift_txd_credits_update method accessing these is
actually called.
Differential Revision: https://reviews.freebsd.org/D19081
2019-02-12 21:08:44 +00:00
|
|
|
if (slot->flags & NS_BUF_CHANGED) {
|
|
|
|
/* buffer has changed, reload map */
|
|
|
|
netmap_reload_map(na, txq->ift_buf_tag,
|
|
|
|
txq->ift_sds.ifsd_map[nic_i], addr);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
Further correct and optimize the bus_dma(9) usage of iflib(4):
o Correct the obvious bugs in the netmap(4) parts:
- No longer check for the existence of DMA maps as bus_dma(9)
is used unconditionally in iflib(4) since r341095.
- Supply the correct DMA tag and map pairs to bus_dma(9)
functions (see also the commit message of r343753).
- In iflib_netmap_timer_adjust(), add synchronization of the
TX descriptors before calling the ift_txd_credits_update
method as the latter evaluates the TX descriptors possibly
updated by the MAC.
- In _task_fn_tx(), wrap the netmap(4)-specific bits in
#ifdef DEV_NETMAP just as done in _task_fn_admin() and
_task_fn_rx() respectively.
o In iflib_fast_intr_rxtx(), synchronize the TX rather than
the RX descriptors before calling the ift_txd_credits_update
method (see also above).
o There's no need to synchronize an RX buffer that is going to
be recycled in iflib_rxd_pkt_get(), yet; it's sufficient to
do that as late as passing RX buffers to the MAC via the
ift_rxd_refill method. Hence, combine that synchronization
with the synchronization of new buffers into a common spot
in _iflib_fl_refill().
o There's no need to synchronize the RX descriptors of a free
list in preparation of the MAC updating their statuses with
every invocation of rxd_frag_to_sd(); it's enough to do this
once before handing control over to the MAC, i. e. before
calling ift_rxd_flush method in _iflib_fl_refill(), which
already performs the necessary synchronization.
o Given that the ift_rxd_available method evaluates the RX
descriptors which possibly have been altered by the MAC,
synchronize as appropriate beforehand. Most notably this
is now done in iflib_rxd_avail(), which in turn means that
we don't need to issue the same synchronization yet again
before calling the ift_rxd_pkt_get method in iflib_rxeof().
o In iflib_txd_db_check(), synchronize the TX descriptors
before handing them over to the MAC for transmission via
the ift_txd_flush method.
o In iflib_encap(), move the TX buffer synchronization after
the invocation of the ift_txd_encap() method. If the MAC
driver fails to encapsulate the packet and we retry with
a defragmented mbuf chain or finally fail, the cycles for
TX buffer synchronization have been wasted. Synchronizing
afterwards matches what non-iflib(4) drivers typically do
and is sufficient as the MAC will not actually start with
the transmission before - in this case - the ift_txd_flush
method is called.
Moreover, for the latter reason the synchronization of the
TX descriptors in iflib_encap() can go as it's enough to
synchronize them before passing control over to the MAC by
issuing the ift_txd_flush() method (see above).
o In iflib_txq_can_drain(), only synchronize TX descriptors
if the ift_txd_credits_update method accessing these is
actually called.
Differential Revision: https://reviews.freebsd.org/D19081
2019-02-12 21:08:44 +00:00
|
|
|
/* make sure changes to the buffer are synced */
|
|
|
|
bus_dmamap_sync(txq->ift_buf_tag,
|
|
|
|
txq->ift_sds.ifsd_map[nic_i],
|
|
|
|
BUS_DMASYNC_PREWRITE);
|
|
|
|
|
2021-01-24 21:12:41 +00:00
|
|
|
slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED | NS_MOREFRAG);
|
2016-05-18 04:35:58 +00:00
|
|
|
nm_i = nm_next(nm_i, lim);
|
|
|
|
nic_i = nm_next(nic_i, lim);
|
|
|
|
}
|
2018-07-20 17:24:45 +00:00
|
|
|
kring->nr_hwcur = nm_i;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
/* synchronize the NIC ring */
|
Further correct and optimize the bus_dma(9) usage of iflib(4):
o Correct the obvious bugs in the netmap(4) parts:
- No longer check for the existence of DMA maps as bus_dma(9)
is used unconditionally in iflib(4) since r341095.
- Supply the correct DMA tag and map pairs to bus_dma(9)
functions (see also the commit message of r343753).
- In iflib_netmap_timer_adjust(), add synchronization of the
TX descriptors before calling the ift_txd_credits_update
method as the latter evaluates the TX descriptors possibly
updated by the MAC.
- In _task_fn_tx(), wrap the netmap(4)-specific bits in
#ifdef DEV_NETMAP just as done in _task_fn_admin() and
_task_fn_rx() respectively.
o In iflib_fast_intr_rxtx(), synchronize the TX rather than
the RX descriptors before calling the ift_txd_credits_update
method (see also above).
o There's no need to synchronize an RX buffer that is going to
be recycled in iflib_rxd_pkt_get(), yet; it's sufficient to
do that as late as passing RX buffers to the MAC via the
ift_rxd_refill method. Hence, combine that synchronization
with the synchronization of new buffers into a common spot
in _iflib_fl_refill().
o There's no need to synchronize the RX descriptors of a free
list in preparation of the MAC updating their statuses with
every invocation of rxd_frag_to_sd(); it's enough to do this
once before handing control over to the MAC, i. e. before
calling ift_rxd_flush method in _iflib_fl_refill(), which
already performs the necessary synchronization.
o Given that the ift_rxd_available method evaluates the RX
descriptors which possibly have been altered by the MAC,
synchronize as appropriate beforehand. Most notably this
is now done in iflib_rxd_avail(), which in turn means that
we don't need to issue the same synchronization yet again
before calling the ift_rxd_pkt_get method in iflib_rxeof().
o In iflib_txd_db_check(), synchronize the TX descriptors
before handing them over to the MAC for transmission via
the ift_txd_flush method.
o In iflib_encap(), move the TX buffer synchronization after
the invocation of the ift_txd_encap() method. If the MAC
driver fails to encapsulate the packet and we retry with
a defragmented mbuf chain or finally fail, the cycles for
TX buffer synchronization have been wasted. Synchronizing
afterwards matches what non-iflib(4) drivers typically do
and is sufficient as the MAC will not actually start with
the transmission before - in this case - the ift_txd_flush
method is called.
Moreover, for the latter reason the synchronization of the
TX descriptors in iflib_encap() can go as it's enough to
synchronize them before passing control over to the MAC by
issuing the ift_txd_flush() method (see above).
o In iflib_txq_can_drain(), only synchronize TX descriptors
if the ift_txd_credits_update method accessing these is
actually called.
Differential Revision: https://reviews.freebsd.org/D19081
2019-02-12 21:08:44 +00:00
|
|
|
bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
|
2018-11-27 20:01:05 +00:00
|
|
|
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
/* (re)start the tx unit up to slot nic_i (excluded) */
|
|
|
|
ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, nic_i);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Second part: reclaim buffers for completed transmissions.
|
2018-05-16 21:03:22 +00:00
|
|
|
*
|
|
|
|
* If there are unclaimed buffers, attempt to reclaim them.
|
2020-10-27 21:53:33 +00:00
|
|
|
* If we don't manage to reclaim them all, and TX IRQs are not in use,
|
|
|
|
* trigger a per-tx-queue timer to try again later.
|
2016-05-18 04:35:58 +00:00
|
|
|
*/
|
2018-07-20 17:24:45 +00:00
|
|
|
if (kring->nr_hwtail != nm_prev(kring->nr_hwcur, lim)) {
|
2018-05-16 21:03:22 +00:00
|
|
|
if (iflib_tx_credits_update(ctx, txq)) {
|
|
|
|
/* some tx completed, increment avail */
|
|
|
|
nic_i = txq->ift_cidx_processed;
|
|
|
|
kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim);
|
|
|
|
}
|
2018-07-20 17:24:45 +00:00
|
|
|
}
|
2020-10-27 21:53:33 +00:00
|
|
|
|
2018-07-20 17:24:45 +00:00
|
|
|
if (!(ctx->ifc_flags & IFC_NETMAP_TX_IRQ))
|
|
|
|
if (kring->nr_hwtail != nm_prev(kring->nr_hwcur, lim)) {
|
2020-10-28 21:06:17 +00:00
|
|
|
callout_reset_sbt_on(&txq->ift_netmap_timer,
|
2020-10-27 21:53:33 +00:00
|
|
|
NETMAP_TX_TIMER_US * SBT_1US, SBT_1US,
|
2020-10-28 21:06:17 +00:00
|
|
|
iflib_netmap_timer, txq,
|
|
|
|
txq->ift_netmap_timer.c_cpu, 0);
|
2020-10-27 21:53:33 +00:00
|
|
|
}
|
2021-08-18 07:17:43 +00:00
|
|
|
|
|
|
|
if_inc_counter(ifp, IFCOUNTER_OBYTES, tx_bytes);
|
|
|
|
if_inc_counter(ifp, IFCOUNTER_OPACKETS, tx_pkts);
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Reconcile kernel and user view of the receive ring.
|
|
|
|
* Same as for the txsync, this routine must be efficient.
|
|
|
|
* The caller guarantees a single invocation, but races against
|
|
|
|
* the rest of the driver should be handled here.
|
|
|
|
*
|
|
|
|
* On call, kring->rhead is the first packet that userspace wants
|
|
|
|
* to keep, and kring->rcur is the wakeup point.
|
|
|
|
* The kernel has previously reported packets up to kring->rtail.
|
|
|
|
*
|
|
|
|
* If (flags & NAF_FORCE_READ) also check for incoming packets irrespective
|
|
|
|
* of whether or not we received an interrupt.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
iflib_netmap_rxsync(struct netmap_kring *kring, int flags)
|
|
|
|
{
|
|
|
|
struct netmap_adapter *na = kring->na;
|
|
|
|
struct netmap_ring *ring = kring->ring;
|
2019-05-06 20:56:41 +00:00
|
|
|
if_t ifp = na->ifp;
|
2017-03-13 22:53:06 +00:00
|
|
|
uint32_t nm_i; /* index into the netmap ring */
|
2017-10-30 21:14:31 +00:00
|
|
|
uint32_t nic_i; /* index into the NIC ring */
|
2020-08-06 21:32:25 +00:00
|
|
|
u_int n;
|
2016-05-18 04:35:58 +00:00
|
|
|
u_int const lim = kring->nkr_num_slots - 1;
|
|
|
|
int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
|
2021-08-18 07:17:43 +00:00
|
|
|
int i = 0, rx_bytes = 0, rx_pkts = 0;
|
2017-03-13 22:53:06 +00:00
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
if_ctx_t ctx = ifp->if_softc;
|
2020-08-12 14:45:31 +00:00
|
|
|
if_shared_ctx_t sctx = ctx->ifc_sctx;
|
|
|
|
if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
|
2016-05-18 04:35:58 +00:00
|
|
|
iflib_rxq_t rxq = &ctx->ifc_rxqs[kring->ring_id];
|
2020-08-06 21:32:25 +00:00
|
|
|
iflib_fl_t fl = &rxq->ifr_fl[0];
|
|
|
|
struct if_rxd_info ri;
|
2020-08-12 14:45:31 +00:00
|
|
|
qidx_t *cidxp;
|
2020-08-06 21:32:25 +00:00
|
|
|
|
Further correct and optimize the bus_dma(9) usage of iflib(4):
o Correct the obvious bugs in the netmap(4) parts:
- No longer check for the existence of DMA maps as bus_dma(9)
is used unconditionally in iflib(4) since r341095.
- Supply the correct DMA tag and map pairs to bus_dma(9)
functions (see also the commit message of r343753).
- In iflib_netmap_timer_adjust(), add synchronization of the
TX descriptors before calling the ift_txd_credits_update
method as the latter evaluates the TX descriptors possibly
updated by the MAC.
- In _task_fn_tx(), wrap the netmap(4)-specific bits in
#ifdef DEV_NETMAP just as done in _task_fn_admin() and
_task_fn_rx() respectively.
o In iflib_fast_intr_rxtx(), synchronize the TX rather than
the RX descriptors before calling the ift_txd_credits_update
method (see also above).
o There's no need to synchronize an RX buffer that is going to
be recycled in iflib_rxd_pkt_get(), yet; it's sufficient to
do that as late as passing RX buffers to the MAC via the
ift_rxd_refill method. Hence, combine that synchronization
with the synchronization of new buffers into a common spot
in _iflib_fl_refill().
o There's no need to synchronize the RX descriptors of a free
list in preparation of the MAC updating their statuses with
every invocation of rxd_frag_to_sd(); it's enough to do this
once before handing control over to the MAC, i. e. before
calling ift_rxd_flush method in _iflib_fl_refill(), which
already performs the necessary synchronization.
o Given that the ift_rxd_available method evaluates the RX
descriptors which possibly have been altered by the MAC,
synchronize as appropriate beforehand. Most notably this
is now done in iflib_rxd_avail(), which in turn means that
we don't need to issue the same synchronization yet again
before calling the ift_rxd_pkt_get method in iflib_rxeof().
o In iflib_txd_db_check(), synchronize the TX descriptors
before handing them over to the MAC for transmission via
the ift_txd_flush method.
o In iflib_encap(), move the TX buffer synchronization after
the invocation of the ift_txd_encap() method. If the MAC
driver fails to encapsulate the packet and we retry with
a defragmented mbuf chain or finally fail, the cycles for
TX buffer synchronization have been wasted. Synchronizing
afterwards matches what non-iflib(4) drivers typically do
and is sufficient as the MAC will not actually start with
the transmission before - in this case - the ift_txd_flush
method is called.
Moreover, for the latter reason the synchronization of the
TX descriptors in iflib_encap() can go as it's enough to
synchronize them before passing control over to the MAC by
issuing the ift_txd_flush() method (see above).
o In iflib_txq_can_drain(), only synchronize TX descriptors
if the ift_txd_credits_update method accessing these is
actually called.
Differential Revision: https://reviews.freebsd.org/D19081
2019-02-12 21:08:44 +00:00
|
|
|
/*
|
2020-08-06 21:32:25 +00:00
|
|
|
* netmap only uses free list 0, to avoid out of order consumption
|
|
|
|
* of receive buffers
|
Further correct and optimize the bus_dma(9) usage of iflib(4):
o Correct the obvious bugs in the netmap(4) parts:
- No longer check for the existence of DMA maps as bus_dma(9)
is used unconditionally in iflib(4) since r341095.
- Supply the correct DMA tag and map pairs to bus_dma(9)
functions (see also the commit message of r343753).
- In iflib_netmap_timer_adjust(), add synchronization of the
TX descriptors before calling the ift_txd_credits_update
method as the latter evaluates the TX descriptors possibly
updated by the MAC.
- In _task_fn_tx(), wrap the netmap(4)-specific bits in
#ifdef DEV_NETMAP just as done in _task_fn_admin() and
_task_fn_rx() respectively.
o In iflib_fast_intr_rxtx(), synchronize the TX rather than
the RX descriptors before calling the ift_txd_credits_update
method (see also above).
o There's no need to synchronize an RX buffer that is going to
be recycled in iflib_rxd_pkt_get(), yet; it's sufficient to
do that as late as passing RX buffers to the MAC via the
ift_rxd_refill method. Hence, combine that synchronization
with the synchronization of new buffers into a common spot
in _iflib_fl_refill().
o There's no need to synchronize the RX descriptors of a free
list in preparation of the MAC updating their statuses with
every invocation of rxd_frag_to_sd(); it's enough to do this
once before handing control over to the MAC, i. e. before
calling ift_rxd_flush method in _iflib_fl_refill(), which
already performs the necessary synchronization.
o Given that the ift_rxd_available method evaluates the RX
descriptors which possibly have been altered by the MAC,
synchronize as appropriate beforehand. Most notably this
is now done in iflib_rxd_avail(), which in turn means that
we don't need to issue the same synchronization yet again
before calling the ift_rxd_pkt_get method in iflib_rxeof().
o In iflib_txd_db_check(), synchronize the TX descriptors
before handing them over to the MAC for transmission via
the ift_txd_flush method.
o In iflib_encap(), move the TX buffer synchronization after
the invocation of the ift_txd_encap() method. If the MAC
driver fails to encapsulate the packet and we retry with
a defragmented mbuf chain or finally fail, the cycles for
TX buffer synchronization have been wasted. Synchronizing
afterwards matches what non-iflib(4) drivers typically do
and is sufficient as the MAC will not actually start with
the transmission before - in this case - the ift_txd_flush
method is called.
Moreover, for the latter reason the synchronization of the
TX descriptors in iflib_encap() can go as it's enough to
synchronize them before passing control over to the MAC by
issuing the ift_txd_flush() method (see above).
o In iflib_txq_can_drain(), only synchronize TX descriptors
if the ift_txd_credits_update method accessing these is
actually called.
Differential Revision: https://reviews.freebsd.org/D19081
2019-02-12 21:08:44 +00:00
|
|
|
*/
|
|
|
|
|
2020-08-06 21:32:25 +00:00
|
|
|
bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
|
|
|
|
BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
|
Further correct and optimize the bus_dma(9) usage of iflib(4):
o Correct the obvious bugs in the netmap(4) parts:
- No longer check for the existence of DMA maps as bus_dma(9)
is used unconditionally in iflib(4) since r341095.
- Supply the correct DMA tag and map pairs to bus_dma(9)
functions (see also the commit message of r343753).
- In iflib_netmap_timer_adjust(), add synchronization of the
TX descriptors before calling the ift_txd_credits_update
method as the latter evaluates the TX descriptors possibly
updated by the MAC.
- In _task_fn_tx(), wrap the netmap(4)-specific bits in
#ifdef DEV_NETMAP just as done in _task_fn_admin() and
_task_fn_rx() respectively.
o In iflib_fast_intr_rxtx(), synchronize the TX rather than
the RX descriptors before calling the ift_txd_credits_update
method (see also above).
o There's no need to synchronize an RX buffer that is going to
be recycled in iflib_rxd_pkt_get(), yet; it's sufficient to
do that as late as passing RX buffers to the MAC via the
ift_rxd_refill method. Hence, combine that synchronization
with the synchronization of new buffers into a common spot
in _iflib_fl_refill().
o There's no need to synchronize the RX descriptors of a free
list in preparation of the MAC updating their statuses with
every invocation of rxd_frag_to_sd(); it's enough to do this
once before handing control over to the MAC, i. e. before
calling ift_rxd_flush method in _iflib_fl_refill(), which
already performs the necessary synchronization.
o Given that the ift_rxd_available method evaluates the RX
descriptors which possibly have been altered by the MAC,
synchronize as appropriate beforehand. Most notably this
is now done in iflib_rxd_avail(), which in turn means that
we don't need to issue the same synchronization yet again
before calling the ift_rxd_pkt_get method in iflib_rxeof().
o In iflib_txd_db_check(), synchronize the TX descriptors
before handing them over to the MAC for transmission via
the ift_txd_flush method.
o In iflib_encap(), move the TX buffer synchronization after
the invocation of the ift_txd_encap() method. If the MAC
driver fails to encapsulate the packet and we retry with
a defragmented mbuf chain or finally fail, the cycles for
TX buffer synchronization have been wasted. Synchronizing
afterwards matches what non-iflib(4) drivers typically do
and is sufficient as the MAC will not actually start with
the transmission before - in this case - the ift_txd_flush
method is called.
Moreover, for the latter reason the synchronization of the
TX descriptors in iflib_encap() can go as it's enough to
synchronize them before passing control over to the MAC by
issuing the ift_txd_flush() method (see above).
o In iflib_txq_can_drain(), only synchronize TX descriptors
if the ift_txd_credits_update method accessing these is
actually called.
Differential Revision: https://reviews.freebsd.org/D19081
2019-02-12 21:08:44 +00:00
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
/*
|
|
|
|
* First part: import newly received packets.
|
|
|
|
*
|
|
|
|
* nm_i is the index of the next free slot in the netmap ring,
|
2020-08-12 14:45:31 +00:00
|
|
|
* nic_i is the index of the next received packet in the NIC ring
|
|
|
|
* (or in the free list 0 if IFLIB_HAS_RXCQ is set), and they may
|
|
|
|
* differ in case if_init() has been called while
|
2016-05-18 04:35:58 +00:00
|
|
|
* in netmap mode. For the receive ring we have
|
|
|
|
*
|
2020-08-12 14:45:31 +00:00
|
|
|
* nic_i = fl->ifl_cidx;
|
2016-05-18 04:35:58 +00:00
|
|
|
* nm_i = kring->nr_hwtail (previous)
|
|
|
|
* and
|
|
|
|
* nm_i == (nic_i + kring->nkr_hwofs) % ring_size
|
|
|
|
*
|
2020-08-12 14:45:31 +00:00
|
|
|
* fl->ifl_cidx is set to 0 on a ring reinit
|
2016-05-18 04:35:58 +00:00
|
|
|
*/
|
|
|
|
if (netmap_no_pendintr || force_update) {
|
2020-06-23 20:23:56 +00:00
|
|
|
uint32_t hwtail_lim = nm_prev(kring->nr_hwcur, lim);
|
2020-08-12 14:45:31 +00:00
|
|
|
bool have_rxcq = sctx->isc_flags & IFLIB_HAS_RXCQ;
|
2016-05-18 04:35:58 +00:00
|
|
|
int crclen = iflib_crcstrip ? 0 : 4;
|
|
|
|
int error, avail;
|
|
|
|
|
2020-08-12 14:45:31 +00:00
|
|
|
/*
|
|
|
|
* For the free list consumer index, we use the same
|
|
|
|
* logic as in iflib_rxeof().
|
|
|
|
*/
|
|
|
|
if (have_rxcq)
|
|
|
|
cidxp = &rxq->ifr_cq_cidx;
|
|
|
|
else
|
|
|
|
cidxp = &fl->ifl_cidx;
|
|
|
|
avail = ctx->isc_rxd_available(ctx->ifc_softc,
|
|
|
|
rxq->ifr_id, *cidxp, USHRT_MAX);
|
|
|
|
|
2020-08-06 21:32:25 +00:00
|
|
|
nic_i = fl->ifl_cidx;
|
|
|
|
nm_i = netmap_idx_n2k(kring, nic_i);
|
2020-08-24 11:44:20 +00:00
|
|
|
MPASS(nm_i == kring->nr_hwtail);
|
2020-08-06 21:32:25 +00:00
|
|
|
for (n = 0; avail > 0 && nm_i != hwtail_lim; n++, avail--) {
|
|
|
|
rxd_info_zero(&ri);
|
|
|
|
ri.iri_frags = rxq->ifr_frags;
|
|
|
|
ri.iri_qsidx = kring->ring_id;
|
|
|
|
ri.iri_ifp = ctx->ifc_ifp;
|
2020-08-12 14:45:31 +00:00
|
|
|
ri.iri_cidx = *cidxp;
|
2020-08-06 21:32:25 +00:00
|
|
|
|
|
|
|
error = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri);
|
2021-01-24 21:12:41 +00:00
|
|
|
for (i = 0; i < ri.iri_nfrags; i++) {
|
|
|
|
if (error) {
|
|
|
|
ring->slot[nm_i].len = 0;
|
|
|
|
ring->slot[nm_i].flags = 0;
|
|
|
|
} else {
|
|
|
|
ring->slot[nm_i].len = ri.iri_frags[i].irf_len;
|
|
|
|
if (i == (ri.iri_nfrags - 1)) {
|
|
|
|
ring->slot[nm_i].len -= crclen;
|
|
|
|
ring->slot[nm_i].flags = 0;
|
2021-08-18 07:17:43 +00:00
|
|
|
|
|
|
|
/* Update receive counters */
|
|
|
|
rx_bytes += ri.iri_len;
|
|
|
|
rx_pkts++;
|
2021-01-24 21:12:41 +00:00
|
|
|
} else
|
|
|
|
ring->slot[nm_i].flags = NS_MOREFRAG;
|
|
|
|
}
|
|
|
|
|
|
|
|
bus_dmamap_sync(fl->ifl_buf_tag,
|
|
|
|
fl->ifl_sds.ifsd_map[nic_i], BUS_DMASYNC_POSTREAD);
|
|
|
|
nm_i = nm_next(nm_i, lim);
|
|
|
|
fl->ifl_cidx = nic_i = nm_next(nic_i, lim);
|
2020-08-12 14:45:31 +00:00
|
|
|
}
|
2021-01-24 21:38:59 +00:00
|
|
|
|
|
|
|
if (have_rxcq) {
|
|
|
|
*cidxp = ri.iri_cidx;
|
|
|
|
while (*cidxp >= scctx->isc_nrxd[0])
|
|
|
|
*cidxp -= scctx->isc_nrxd[0];
|
|
|
|
}
|
|
|
|
|
2020-08-06 21:32:25 +00:00
|
|
|
}
|
|
|
|
if (n) { /* update the state variables */
|
|
|
|
if (netmap_no_pendintr && !force_update) {
|
|
|
|
/* diagnostics */
|
|
|
|
iflib_rx_miss ++;
|
|
|
|
iflib_rx_miss_bufs += n;
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
2020-08-06 21:32:25 +00:00
|
|
|
kring->nr_hwtail = nm_i;
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
2020-08-06 21:32:25 +00:00
|
|
|
kring->nr_kflags &= ~NKR_PENDINTR;
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Second part: skip past packets that userspace has released.
|
|
|
|
* (kring->nr_hwcur to head excluded),
|
|
|
|
* and make the buffers available for reception.
|
|
|
|
* As usual nm_i is the index in the netmap ring,
|
|
|
|
* nic_i is the index in the NIC ring, and
|
|
|
|
* nm_i == (nic_i + kring->nkr_hwofs) % ring_size
|
|
|
|
*/
|
2020-08-24 11:44:20 +00:00
|
|
|
netmap_fl_refill(rxq, kring, false);
|
2017-09-16 02:41:38 +00:00
|
|
|
|
2021-08-18 07:17:43 +00:00
|
|
|
if_inc_counter(ifp, IFCOUNTER_IBYTES, rx_bytes);
|
|
|
|
if_inc_counter(ifp, IFCOUNTER_IPACKETS, rx_pkts);
|
|
|
|
|
2020-08-24 11:44:20 +00:00
|
|
|
return (0);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
2017-03-13 22:53:06 +00:00
|
|
|
static void
|
|
|
|
iflib_netmap_intr(struct netmap_adapter *na, int onoff)
|
|
|
|
{
|
2019-05-06 20:56:41 +00:00
|
|
|
if_ctx_t ctx = na->ifp->if_softc;
|
2017-03-13 22:53:06 +00:00
|
|
|
|
2017-09-16 02:41:38 +00:00
|
|
|
CTX_LOCK(ctx);
|
2017-03-13 22:53:06 +00:00
|
|
|
if (onoff) {
|
|
|
|
IFDI_INTR_ENABLE(ctx);
|
|
|
|
} else {
|
|
|
|
IFDI_INTR_DISABLE(ctx);
|
|
|
|
}
|
2017-09-16 02:41:38 +00:00
|
|
|
CTX_UNLOCK(ctx);
|
2017-03-13 22:53:06 +00:00
|
|
|
}
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
static int
|
|
|
|
iflib_netmap_attach(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
struct netmap_adapter na;
|
|
|
|
|
|
|
|
bzero(&na, sizeof(na));
|
|
|
|
|
|
|
|
na.ifp = ctx->ifc_ifp;
|
2021-04-05 07:54:47 +00:00
|
|
|
na.na_flags = NAF_BDG_MAYSLEEP | NAF_MOREFRAG | NAF_OFFSETS;
|
2016-05-18 04:35:58 +00:00
|
|
|
MPASS(ctx->ifc_softc_ctx.isc_ntxqsets);
|
|
|
|
MPASS(ctx->ifc_softc_ctx.isc_nrxqsets);
|
|
|
|
|
2020-07-20 21:08:56 +00:00
|
|
|
na.num_tx_desc = iflib_num_tx_descs(ctx);
|
|
|
|
na.num_rx_desc = iflib_num_rx_descs(ctx);
|
2016-05-18 04:35:58 +00:00
|
|
|
na.nm_txsync = iflib_netmap_txsync;
|
|
|
|
na.nm_rxsync = iflib_netmap_rxsync;
|
|
|
|
na.nm_register = iflib_netmap_register;
|
2017-03-13 22:53:06 +00:00
|
|
|
na.nm_intr = iflib_netmap_intr;
|
2021-03-29 09:26:12 +00:00
|
|
|
na.nm_config = iflib_netmap_config;
|
2016-05-18 04:35:58 +00:00
|
|
|
na.num_tx_rings = ctx->ifc_softc_ctx.isc_ntxqsets;
|
|
|
|
na.num_rx_rings = ctx->ifc_softc_ctx.isc_nrxqsets;
|
|
|
|
return (netmap_attach(&na));
|
|
|
|
}
|
|
|
|
|
2020-06-25 19:44:24 +00:00
|
|
|
static int
|
2016-05-18 04:35:58 +00:00
|
|
|
iflib_netmap_txq_init(if_ctx_t ctx, iflib_txq_t txq)
|
|
|
|
{
|
|
|
|
struct netmap_adapter *na = NA(ctx->ifc_ifp);
|
|
|
|
struct netmap_slot *slot;
|
|
|
|
|
|
|
|
slot = netmap_reset(na, NR_TX, txq->ift_id, 0);
|
2017-02-22 02:35:59 +00:00
|
|
|
if (slot == NULL)
|
2020-06-25 19:44:24 +00:00
|
|
|
return (0);
|
2016-08-12 21:29:44 +00:00
|
|
|
for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxd[0]; i++) {
|
2016-05-18 04:35:58 +00:00
|
|
|
/*
|
|
|
|
* In netmap mode, set the map for the packet buffer.
|
|
|
|
* NOTE: Some drivers (not this one) also need to set
|
|
|
|
* the physical buffer address in the NIC ring.
|
|
|
|
* netmap_idx_n2k() maps a nic index, i, into the corresponding
|
|
|
|
* netmap slot index, si
|
|
|
|
*/
|
netmap: align codebase to the current upstream (commit id 3fb001303718146)
Changelist:
- Turn tx_rings and rx_rings arrays into arrays of pointers to kring
structs. This patch includes fixes for ixv, ixl, ix, re, cxgbe, iflib,
vtnet and ptnet drivers to cope with the change.
- Generalize the nm_config() callback to accept a struct containing many
parameters.
- Introduce NKR_FAKERING to support buffers sharing (used for netmap
pipes)
- Improved API for external VALE modules.
- Various bug fixes and improvements to the netmap memory allocator,
including support for externally (userspace) allocated memory.
- Refactoring of netmap pipes: now linked rings share the same netmap
buffers, with a separate set of kring pointers (rhead, rcur, rtail).
Buffer swapping does not need to happen anymore.
- Large refactoring of the control API towards an extensible solution;
the goal is to allow the addition of more commands and extension of
existing ones (with new options) without the need of hacks or the
risk of running out of configuration space.
A new NIOCCTRL ioctl has been added to handle all the requests of the
new control API, which cover all the functionalities so far supported.
The netmap API bumps from 11 to 12 with this patch. Full backward
compatibility is provided for the old control command (NIOCREGIF), by
means of a new netmap_legacy module. Many parts of the old netmap.h
header has now been moved to netmap_legacy.h (included by netmap.h).
Approved by: hrs (mentor)
2018-04-12 07:20:50 +00:00
|
|
|
int si = netmap_idx_n2k(na->tx_rings[txq->ift_id], i);
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
netmap_load_map(na, txq->ift_buf_tag, txq->ift_sds.ifsd_map[i],
|
|
|
|
NMB(na, slot + si));
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
2020-06-25 19:44:24 +00:00
|
|
|
return (1);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
2017-10-30 21:14:31 +00:00
|
|
|
|
2020-06-25 19:44:24 +00:00
|
|
|
static int
|
2016-05-18 04:35:58 +00:00
|
|
|
iflib_netmap_rxq_init(if_ctx_t ctx, iflib_rxq_t rxq)
|
|
|
|
{
|
|
|
|
struct netmap_adapter *na = NA(ctx->ifc_ifp);
|
2020-06-25 19:44:24 +00:00
|
|
|
struct netmap_kring *kring;
|
2016-05-18 04:35:58 +00:00
|
|
|
struct netmap_slot *slot;
|
|
|
|
|
|
|
|
slot = netmap_reset(na, NR_RX, rxq->ifr_id, 0);
|
2017-02-22 02:35:59 +00:00
|
|
|
if (slot == NULL)
|
2020-06-25 19:44:24 +00:00
|
|
|
return (0);
|
|
|
|
kring = na->rx_rings[rxq->ifr_id];
|
2020-08-24 11:44:20 +00:00
|
|
|
netmap_fl_refill(rxq, kring, true);
|
2020-06-25 19:44:24 +00:00
|
|
|
return (1);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
2018-07-20 17:24:45 +00:00
|
|
|
static void
|
2020-10-27 21:53:33 +00:00
|
|
|
iflib_netmap_timer(void *arg)
|
2018-07-20 17:24:45 +00:00
|
|
|
{
|
2020-10-27 21:53:33 +00:00
|
|
|
iflib_txq_t txq = arg;
|
|
|
|
if_ctx_t ctx = txq->ift_ctx;
|
2018-07-20 17:24:45 +00:00
|
|
|
|
2020-10-27 21:53:33 +00:00
|
|
|
/*
|
|
|
|
* Wake up the netmap application, to give it a chance to
|
|
|
|
* call txsync and reclaim more completed TX buffers.
|
|
|
|
*/
|
|
|
|
netmap_tx_irq(ctx->ifc_ifp, txq->ift_id);
|
2018-07-20 17:24:45 +00:00
|
|
|
}
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
#define iflib_netmap_detach(ifp) netmap_detach(ifp)
|
|
|
|
|
|
|
|
#else
|
2020-06-25 19:44:24 +00:00
|
|
|
#define iflib_netmap_txq_init(ctx, txq) (0)
|
|
|
|
#define iflib_netmap_rxq_init(ctx, rxq) (0)
|
2016-05-18 04:35:58 +00:00
|
|
|
#define iflib_netmap_detach(ifp)
|
2021-01-10 14:42:49 +00:00
|
|
|
#define netmap_enable_all_rings(ifp)
|
|
|
|
#define netmap_disable_all_rings(ifp)
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
#define iflib_netmap_attach(ctx) (0)
|
|
|
|
#define netmap_rx_irq(ifp, qid, budget) (0)
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#if defined(__i386__) || defined(__amd64__)
|
|
|
|
static __inline void
|
|
|
|
prefetch(void *x)
|
|
|
|
{
|
|
|
|
__asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
|
|
|
|
}
|
2017-10-23 20:50:08 +00:00
|
|
|
static __inline void
|
|
|
|
prefetch2cachelines(void *x)
|
|
|
|
{
|
|
|
|
__asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
|
|
|
|
#if (CACHE_LINE_SIZE < 128)
|
|
|
|
__asm volatile("prefetcht0 %0" :: "m" (*(((unsigned long *)x)+CACHE_LINE_SIZE/(sizeof(unsigned long)))));
|
|
|
|
#endif
|
|
|
|
}
|
2016-05-18 04:35:58 +00:00
|
|
|
#else
|
|
|
|
#define prefetch(x)
|
2017-10-23 20:50:08 +00:00
|
|
|
#define prefetch2cachelines(x)
|
2016-05-18 04:35:58 +00:00
|
|
|
#endif
|
|
|
|
|
2017-10-31 02:49:28 +00:00
|
|
|
static void
|
|
|
|
iru_init(if_rxd_update_t iru, iflib_rxq_t rxq, uint8_t flid)
|
|
|
|
{
|
|
|
|
iflib_fl_t fl;
|
|
|
|
|
|
|
|
fl = &rxq->ifr_fl[flid];
|
|
|
|
iru->iru_paddrs = fl->ifl_bus_addrs;
|
|
|
|
iru->iru_idxs = fl->ifl_rxd_idxs;
|
|
|
|
iru->iru_qsidx = rxq->ifr_id;
|
|
|
|
iru->iru_buf_size = fl->ifl_buf_size;
|
|
|
|
iru->iru_flidx = fl->ifl_id;
|
|
|
|
}
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
static void
|
|
|
|
_iflib_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int err)
|
|
|
|
{
|
|
|
|
if (err)
|
|
|
|
return;
|
|
|
|
*(bus_addr_t *) arg = segs[0].ds_addr;
|
|
|
|
}
|
|
|
|
|
2021-02-24 22:56:45 +00:00
|
|
|
/*
 * Translate a DMA address width in bits into a bus_dma(9) lowaddr.
 *
 * A width of 0, or any width at or beyond the number of significant bits in
 * BUS_SPACE_MAXADDR, selects BUS_SPACE_MAXADDR.  Using ">=" rather than
 * "==" keeps over-wide values (e.g. 64 where BUS_SPACE_MAXADDR has only
 * 32 bits) away from the shift below, which is undefined for a count of 64.
 * Note that 'width' is evaluated more than once.
 */
#define	DMA_WIDTH_TO_BUS_LOWADDR(width)				\
	(((width) == 0) || (width) >= flsll(BUS_SPACE_MAXADDR) ?	\
	    BUS_SPACE_MAXADDR : (1ULL << (width)) - 1ULL)
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
int
|
2019-01-22 01:11:17 +00:00
|
|
|
iflib_dma_alloc_align(if_ctx_t ctx, int size, int align, iflib_dma_info_t dma, int mapflags)
|
2016-05-18 04:35:58 +00:00
|
|
|
{
|
|
|
|
int err;
|
|
|
|
device_t dev = ctx->ifc_dev;
|
2021-02-24 22:56:45 +00:00
|
|
|
bus_addr_t lowaddr;
|
|
|
|
|
|
|
|
lowaddr = DMA_WIDTH_TO_BUS_LOWADDR(ctx->ifc_softc_ctx.isc_dma_width);
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2019-01-22 01:11:17 +00:00
|
|
|
err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */
|
|
|
|
align, 0, /* alignment, bounds */
|
2021-02-24 22:56:45 +00:00
|
|
|
lowaddr, /* lowaddr */
|
2016-05-18 04:35:58 +00:00
|
|
|
BUS_SPACE_MAXADDR, /* highaddr */
|
|
|
|
NULL, NULL, /* filter, filterarg */
|
|
|
|
size, /* maxsize */
|
|
|
|
1, /* nsegments */
|
|
|
|
size, /* maxsegsize */
|
|
|
|
BUS_DMA_ALLOCNOW, /* flags */
|
|
|
|
NULL, /* lockfunc */
|
|
|
|
NULL, /* lockarg */
|
|
|
|
&dma->idi_tag);
|
|
|
|
if (err) {
|
|
|
|
device_printf(dev,
|
|
|
|
"%s: bus_dma_tag_create failed: %d\n",
|
|
|
|
__func__, err);
|
|
|
|
goto fail_0;
|
|
|
|
}
|
|
|
|
|
|
|
|
err = bus_dmamem_alloc(dma->idi_tag, (void**) &dma->idi_vaddr,
|
|
|
|
BUS_DMA_NOWAIT | BUS_DMA_COHERENT | BUS_DMA_ZERO, &dma->idi_map);
|
|
|
|
if (err) {
|
|
|
|
device_printf(dev,
|
|
|
|
"%s: bus_dmamem_alloc(%ju) failed: %d\n",
|
|
|
|
__func__, (uintmax_t)size, err);
|
|
|
|
goto fail_1;
|
|
|
|
}
|
|
|
|
|
|
|
|
dma->idi_paddr = IF_BAD_DMA;
|
|
|
|
err = bus_dmamap_load(dma->idi_tag, dma->idi_map, dma->idi_vaddr,
|
|
|
|
size, _iflib_dmamap_cb, &dma->idi_paddr, mapflags | BUS_DMA_NOWAIT);
|
|
|
|
if (err || dma->idi_paddr == IF_BAD_DMA) {
|
|
|
|
device_printf(dev,
|
|
|
|
"%s: bus_dmamap_load failed: %d\n",
|
|
|
|
__func__, err);
|
|
|
|
goto fail_2;
|
|
|
|
}
|
|
|
|
|
|
|
|
dma->idi_size = size;
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
fail_2:
|
|
|
|
bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map);
|
|
|
|
fail_1:
|
|
|
|
bus_dma_tag_destroy(dma->idi_tag);
|
|
|
|
fail_0:
|
|
|
|
dma->idi_tag = NULL;
|
|
|
|
|
|
|
|
return (err);
|
|
|
|
}
|
|
|
|
|
2019-01-22 01:11:17 +00:00
|
|
|
int
|
|
|
|
iflib_dma_alloc(if_ctx_t ctx, int size, iflib_dma_info_t dma, int mapflags)
|
|
|
|
{
|
|
|
|
if_shared_ctx_t sctx = ctx->ifc_sctx;
|
|
|
|
|
|
|
|
KASSERT(sctx->isc_q_align != 0, ("alignment value not initialized"));
|
|
|
|
|
|
|
|
return (iflib_dma_alloc_align(ctx, size, sctx->isc_q_align, dma, mapflags));
|
|
|
|
}
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
int
|
|
|
|
iflib_dma_alloc_multi(if_ctx_t ctx, int *sizes, iflib_dma_info_t *dmalist, int mapflags, int count)
|
|
|
|
{
|
|
|
|
int i, err;
|
|
|
|
iflib_dma_info_t *dmaiter;
|
|
|
|
|
|
|
|
dmaiter = dmalist;
|
|
|
|
for (i = 0; i < count; i++, dmaiter++) {
|
|
|
|
if ((err = iflib_dma_alloc(ctx, sizes[i], *dmaiter, mapflags)) != 0)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (err)
|
|
|
|
iflib_dma_free_multi(dmalist, i);
|
|
|
|
return (err);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
iflib_dma_free(iflib_dma_info_t dma)
|
|
|
|
{
|
|
|
|
if (dma->idi_tag == NULL)
|
|
|
|
return;
|
|
|
|
if (dma->idi_paddr != IF_BAD_DMA) {
|
|
|
|
bus_dmamap_sync(dma->idi_tag, dma->idi_map,
|
|
|
|
BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
|
|
|
|
bus_dmamap_unload(dma->idi_tag, dma->idi_map);
|
|
|
|
dma->idi_paddr = IF_BAD_DMA;
|
|
|
|
}
|
|
|
|
if (dma->idi_vaddr != NULL) {
|
|
|
|
bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map);
|
|
|
|
dma->idi_vaddr = NULL;
|
|
|
|
}
|
|
|
|
bus_dma_tag_destroy(dma->idi_tag);
|
|
|
|
dma->idi_tag = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
iflib_dma_free_multi(iflib_dma_info_t *dmalist, int count)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
iflib_dma_info_t *dmaiter = dmalist;
|
|
|
|
|
|
|
|
for (i = 0; i < count; i++, dmaiter++)
|
|
|
|
iflib_dma_free(*dmaiter);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
iflib_fast_intr(void *arg)
|
2017-03-13 22:53:06 +00:00
|
|
|
{
|
|
|
|
iflib_filter_info_t info = arg;
|
|
|
|
struct grouptask *gtask = info->ifi_task;
|
2019-02-15 18:51:43 +00:00
|
|
|
int result;
|
|
|
|
|
2017-03-13 22:53:06 +00:00
|
|
|
DBG_COUNTER_INC(fast_intrs);
|
2019-02-15 18:51:43 +00:00
|
|
|
if (info->ifi_filter != NULL) {
|
|
|
|
result = info->ifi_filter(info->ifi_filter_arg);
|
|
|
|
if ((result & FILTER_SCHEDULE_THREAD) == 0)
|
|
|
|
return (result);
|
|
|
|
}
|
2017-03-13 22:53:06 +00:00
|
|
|
|
|
|
|
GROUPTASK_ENQUEUE(gtask);
|
|
|
|
return (FILTER_HANDLED);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
iflib_fast_intr_rxtx(void *arg)
|
|
|
|
{
|
|
|
|
iflib_filter_info_t info = arg;
|
|
|
|
struct grouptask *gtask = info->ifi_task;
|
Further correct and optimize the bus_dma(9) usage of iflib(4):
o Correct the obvious bugs in the netmap(4) parts:
- No longer check for the existence of DMA maps as bus_dma(9)
is used unconditionally in iflib(4) since r341095.
- Supply the correct DMA tag and map pairs to bus_dma(9)
functions (see also the commit message of r343753).
- In iflib_netmap_timer_adjust(), add synchronization of the
TX descriptors before calling the ift_txd_credits_update
method as the latter evaluates the TX descriptors possibly
updated by the MAC.
- In _task_fn_tx(), wrap the netmap(4)-specific bits in
#ifdef DEV_NETMAP just as done in _task_fn_admin() and
_task_fn_rx() respectively.
o In iflib_fast_intr_rxtx(), synchronize the TX rather than
the RX descriptors before calling the ift_txd_credits_update
method (see also above).
o There's no need to synchronize an RX buffer that is going to
be recycled in iflib_rxd_pkt_get(), yet; it's sufficient to
do that as late as passing RX buffers to the MAC via the
ift_rxd_refill method. Hence, combine that synchronization
with the synchronization of new buffers into a common spot
in _iflib_fl_refill().
o There's no need to synchronize the RX descriptors of a free
list in preparation of the MAC updating their statuses with
every invocation of rxd_frag_to_sd(); it's enough to do this
once before handing control over to the MAC, i. e. before
calling ift_rxd_flush method in _iflib_fl_refill(), which
already performs the necessary synchronization.
o Given that the ift_rxd_available method evaluates the RX
descriptors which possibly have been altered by the MAC,
synchronize as appropriate beforehand. Most notably this
is now done in iflib_rxd_avail(), which in turn means that
we don't need to issue the same synchronization yet again
before calling the ift_rxd_pkt_get method in iflib_rxeof().
o In iflib_txd_db_check(), synchronize the TX descriptors
before handing them over to the MAC for transmission via
the ift_txd_flush method.
o In iflib_encap(), move the TX buffer synchronization after
the invocation of the ift_txd_encap() method. If the MAC
driver fails to encapsulate the packet and we retry with
a defragmented mbuf chain or finally fail, the cycles for
TX buffer synchronization have been wasted. Synchronizing
afterwards matches what non-iflib(4) drivers typically do
and is sufficient as the MAC will not actually start with
the transmission before - in this case - the ift_txd_flush
method is called.
Moreover, for the latter reason the synchronization of the
TX descriptors in iflib_encap() can go as it's enough to
synchronize them before passing control over to the MAC by
issuing the ift_txd_flush() method (see above).
o In iflib_txq_can_drain(), only synchronize TX descriptors
if the ift_txd_credits_update method accessing these is
actually called.
Differential Revision: https://reviews.freebsd.org/D19081
2019-02-12 21:08:44 +00:00
|
|
|
if_ctx_t ctx;
|
2017-03-13 22:53:06 +00:00
|
|
|
iflib_rxq_t rxq = (iflib_rxq_t)info->ifi_ctx;
|
Further correct and optimize the bus_dma(9) usage of iflib(4):
o Correct the obvious bugs in the netmap(4) parts:
- No longer check for the existence of DMA maps as bus_dma(9)
is used unconditionally in iflib(4) since r341095.
- Supply the correct DMA tag and map pairs to bus_dma(9)
functions (see also the commit message of r343753).
- In iflib_netmap_timer_adjust(), add synchronization of the
TX descriptors before calling the ift_txd_credits_update
method as the latter evaluates the TX descriptors possibly
updated by the MAC.
- In _task_fn_tx(), wrap the netmap(4)-specific bits in
#ifdef DEV_NETMAP just as done in _task_fn_admin() and
_task_fn_rx() respectively.
o In iflib_fast_intr_rxtx(), synchronize the TX rather than
the RX descriptors before calling the ift_txd_credits_update
method (see also above).
o There's no need to synchronize an RX buffer that is going to
be recycled in iflib_rxd_pkt_get(), yet; it's sufficient to
do that as late as passing RX buffers to the MAC via the
ift_rxd_refill method. Hence, combine that synchronization
with the synchronization of new buffers into a common spot
in _iflib_fl_refill().
o There's no need to synchronize the RX descriptors of a free
list in preparation of the MAC updating their statuses with
every invocation of rxd_frag_to_sd(); it's enough to do this
once before handing control over to the MAC, i. e. before
calling ift_rxd_flush method in _iflib_fl_refill(), which
already performs the necessary synchronization.
o Given that the ift_rxd_available method evaluates the RX
descriptors which possibly have been altered by the MAC,
synchronize as appropriate beforehand. Most notably this
is now done in iflib_rxd_avail(), which in turn means that
we don't need to issue the same synchronization yet again
before calling the ift_rxd_pkt_get method in iflib_rxeof().
o In iflib_txd_db_check(), synchronize the TX descriptors
before handing them over to the MAC for transmission via
the ift_txd_flush method.
o In iflib_encap(), move the TX buffer synchronization after
the invocation of the ift_txd_encap() method. If the MAC
driver fails to encapsulate the packet and we retry with
a defragmented mbuf chain or finally fail, the cycles for
TX buffer synchronization have been wasted. Synchronizing
afterwards matches what non-iflib(4) drivers typically do
and is sufficient as the MAC will not actually start with
the transmission before - in this case - the ift_txd_flush
method is called.
Moreover, for the latter reason the synchronization of the
TX descriptors in iflib_encap() can go as it's enough to
synchronize them before passing control over to the MAC by
issuing the ift_txd_flush() method (see above).
o In iflib_txq_can_drain(), only synchronize TX descriptors
if the ift_txd_credits_update method accessing these is
actually called.
Differential Revision: https://reviews.freebsd.org/D19081
2019-02-12 21:08:44 +00:00
|
|
|
iflib_txq_t txq;
|
|
|
|
void *sc;
|
2019-02-15 18:51:43 +00:00
|
|
|
int i, cidx, result;
|
Further correct and optimize the bus_dma(9) usage of iflib(4):
o Correct the obvious bugs in the netmap(4) parts:
- No longer check for the existence of DMA maps as bus_dma(9)
is used unconditionally in iflib(4) since r341095.
- Supply the correct DMA tag and map pairs to bus_dma(9)
functions (see also the commit message of r343753).
- In iflib_netmap_timer_adjust(), add synchronization of the
TX descriptors before calling the ift_txd_credits_update
method as the latter evaluates the TX descriptors possibly
updated by the MAC.
- In _task_fn_tx(), wrap the netmap(4)-specific bits in
#ifdef DEV_NETMAP just as done in _task_fn_admin() and
_task_fn_rx() respectively.
o In iflib_fast_intr_rxtx(), synchronize the TX rather than
the RX descriptors before calling the ift_txd_credits_update
method (see also above).
o There's no need to synchronize an RX buffer that is going to
be recycled in iflib_rxd_pkt_get(), yet; it's sufficient to
do that as late as passing RX buffers to the MAC via the
ift_rxd_refill method. Hence, combine that synchronization
with the synchronization of new buffers into a common spot
in _iflib_fl_refill().
o There's no need to synchronize the RX descriptors of a free
list in preparation of the MAC updating their statuses with
every invocation of rxd_frag_to_sd(); it's enough to do this
once before handing control over to the MAC, i. e. before
calling ift_rxd_flush method in _iflib_fl_refill(), which
already performs the necessary synchronization.
o Given that the ift_rxd_available method evaluates the RX
descriptors which possibly have been altered by the MAC,
synchronize as appropriate beforehand. Most notably this
is now done in iflib_rxd_avail(), which in turn means that
we don't need to issue the same synchronization yet again
before calling the ift_rxd_pkt_get method in iflib_rxeof().
o In iflib_txd_db_check(), synchronize the TX descriptors
before handing them over to the MAC for transmission via
the ift_txd_flush method.
o In iflib_encap(), move the TX buffer synchronization after
the invocation of the ift_txd_encap() method. If the MAC
driver fails to encapsulate the packet and we retry with
a defragmented mbuf chain or finally fail, the cycles for
TX buffer synchronization have been wasted. Synchronizing
afterwards matches what non-iflib(4) drivers typically do
and is sufficient as the MAC will not actually start with
the transmission before - in this case - the ift_txd_flush
method is called.
Moreover, for the latter reason the synchronization of the
TX descriptors in iflib_encap() can go as it's enough to
synchronize them before passing control over to the MAC by
issuing the ift_txd_flush() method (see above).
o In iflib_txq_can_drain(), only synchronize TX descriptors
if the ift_txd_credits_update method accessing these is
actually called.
Differential Revision: https://reviews.freebsd.org/D19081
2019-02-12 21:08:44 +00:00
|
|
|
qidx_t txqid;
|
o Use iflib_fast_intr_rxtx() also for "legacy" interrupts, i. e. INTx and
MSI. Unlike as with iflib_fast_intr_ctx(), the former will also enqueue
_task_fn_tx() in addition to _task_fn_rx() if appropriate, bringing TCP
TX throughput of EM-class devices on par with the MSI-X case and, thus,
close to wirespeed/pre-iflib(4) times again. [1]
Note that independently of the interrupt type, the UDP performance with
these MACs still is abysmal and nowhere near to where it was before the
conversion of em(4) to iflib(4).
o In iflib_init_locked(), announce which free list failed to set up.
o In _task_fn_tx() when running netmap(4), issue ifdi_intr_enable instead
of the ifdi_tx_queue_intr_enable method in case of a "legacy" interrupt
as the latter is valid with MSI-X only.
o Instead of adding the missing - and apparently convoluted enough that a
DBG_COUNTER_INC was put into a wrong spot in _task_fn_rx() - checks for
ifdi_{r,t}x_queue_intr_enable being available in the MSI-X case also to
iflib_fast_intr_rxtx(), factor these out to iflib_device_register() and
make the checks fail gracefully rather than panic. This avoids invoking
the checks at runtime over and over again in iflib_fast_intr_rxtx() and
_task_fn_{r,t}x() - even if it's just in case of INVARIANTS - and makes
these functions more readable.
o In iflib_rx_structures_setup(), only initialize LRO resources if device
and driver have LRO capability in order to not waste memory. Also, free
the LRO resources again if setting them up fails for one of the queues.
However, don't bother invoking iflib_rx_sds_free() in that case because
iflib_rx_structures_setup() doesn't call iflib_rxsd_alloc() either (and
iflib_{device,pseudo}_register() will issue iflib_rx_sds_free() in case
of failure via iflib_rx_structures_free(), but there definitely is some
asymmetry left to be fixed, though).
o Similarly, free LRO resources again in iflib_rx_structures_free().
o In iflib_irq_set_affinity(), handle get_core_offset() errors gracefully
instead of panicking (but only in case of INVARIANTS). This is a follow-
up to r344132, as such driver bugs shouldn't be fatal.
o Likewise, handle unknown iflib_intr_type_t in iflib_irq_alloc_generic()
gracefully, too.
o Bring yet more sanity to iflib_msix_init():
- If the device doesn't provide enough MSI-X vectors or not all vectors
can be allocated so the expected number of queues in addition to admin
interrupts can't be supported, try MSI next (and then INTx) as proper
MSI-X vector distribution can't be assured in such cases. In essence,
this change brings r254008 forward to iflib(4). Also, this is the fix
alluded to in the commit message of r343934.
- If the MSI-X allocation has failed, don't prematurely announce MSI is
going to be used as the latter in fact may not be available either.
- When falling back to MSI, only release the MSI-X table resource again
if it was allocated in iflib_msix_init(), i. e. isn't supplied by the
driver, in the first place.
o In mp_ndesc_handler(), handle unknown type arguments gracefully, too.
PR: 235031 (likely) [1]
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D20175
2019-05-07 08:28:35 +00:00
|
|
|
bool intr_enable, intr_legacy;
|
2017-03-13 22:53:06 +00:00
|
|
|
|
|
|
|
DBG_COUNTER_INC(fast_intrs);
|
2019-02-15 18:51:43 +00:00
|
|
|
if (info->ifi_filter != NULL) {
|
|
|
|
result = info->ifi_filter(info->ifi_filter_arg);
|
|
|
|
if ((result & FILTER_SCHEDULE_THREAD) == 0)
|
|
|
|
return (result);
|
|
|
|
}
|
2017-03-13 22:53:06 +00:00
|
|
|
|
Further correct and optimize the bus_dma(9) usage of iflib(4):
o Correct the obvious bugs in the netmap(4) parts:
- No longer check for the existence of DMA maps as bus_dma(9)
is used unconditionally in iflib(4) since r341095.
- Supply the correct DMA tag and map pairs to bus_dma(9)
functions (see also the commit message of r343753).
- In iflib_netmap_timer_adjust(), add synchronization of the
TX descriptors before calling the ift_txd_credits_update
method as the latter evaluates the TX descriptors possibly
updated by the MAC.
- In _task_fn_tx(), wrap the netmap(4)-specific bits in
#ifdef DEV_NETMAP just as done in _task_fn_admin() and
_task_fn_rx() respectively.
o In iflib_fast_intr_rxtx(), synchronize the TX rather than
the RX descriptors before calling the ift_txd_credits_update
method (see also above).
o There's no need to synchronize an RX buffer that is going to
be recycled in iflib_rxd_pkt_get(), yet; it's sufficient to
do that as late as passing RX buffers to the MAC via the
ift_rxd_refill method. Hence, combine that synchronization
with the synchronization of new buffers into a common spot
in _iflib_fl_refill().
o There's no need to synchronize the RX descriptors of a free
list in preparation of the MAC updating their statuses with
every invocation of rxd_frag_to_sd(); it's enough to do this
once before handing control over to the MAC, i. e. before
calling ift_rxd_flush method in _iflib_fl_refill(), which
already performs the necessary synchronization.
o Given that the ift_rxd_available method evaluates the RX
descriptors which possibly have been altered by the MAC,
synchronize as appropriate beforehand. Most notably this
is now done in iflib_rxd_avail(), which in turn means that
we don't need to issue the same synchronization yet again
before calling the ift_rxd_pkt_get method in iflib_rxeof().
o In iflib_txd_db_check(), synchronize the TX descriptors
before handing them over to the MAC for transmission via
the ift_txd_flush method.
o In iflib_encap(), move the TX buffer synchronization after
the invocation of the ift_txd_encap() method. If the MAC
driver fails to encapsulate the packet and we retry with
a defragmented mbuf chain or finally fail, the cycles for
TX buffer synchronization have been wasted. Synchronizing
afterwards matches what non-iflib(4) drivers typically do
and is sufficient as the MAC will not actually start with
the transmission before - in this case - the ift_txd_flush
method is called.
Moreover, for the latter reason the synchronization of the
TX descriptors in iflib_encap() can go as it's enough to
synchronize them before passing control over to the MAC by
issuing the ift_txd_flush() method (see above).
o In iflib_txq_can_drain(), only synchronize TX descriptors
if the ift_txd_credits_update method accessing these is
actually called.
Differential Revision: https://reviews.freebsd.org/D19081
2019-02-12 21:08:44 +00:00
|
|
|
ctx = rxq->ifr_ctx;
|
|
|
|
sc = ctx->ifc_softc;
|
o Use iflib_fast_intr_rxtx() also for "legacy" interrupts, i. e. INTx and
MSI. Unlike as with iflib_fast_intr_ctx(), the former will also enqueue
_task_fn_tx() in addition to _task_fn_rx() if appropriate, bringing TCP
TX throughput of EM-class devices on par with the MSI-X case and, thus,
close to wirespeed/pre-iflib(4) times again. [1]
Note that independently of the interrupt type, the UDP performance with
these MACs still is abysmal and nowhere near to where it was before the
conversion of em(4) to iflib(4).
o In iflib_init_locked(), announce which free list failed to set up.
o In _task_fn_tx() when running netmap(4), issue ifdi_intr_enable instead
of the ifdi_tx_queue_intr_enable method in case of a "legacy" interrupt
as the latter is valid with MSI-X only.
o Instead of adding the missing - and apparently convoluted enough that a
DBG_COUNTER_INC was put into a wrong spot in _task_fn_rx() - checks for
ifdi_{r,t}x_queue_intr_enable being available in the MSI-X case also to
iflib_fast_intr_rxtx(), factor these out to iflib_device_register() and
make the checks fail gracefully rather than panic. This avoids invoking
the checks at runtime over and over again in iflib_fast_intr_rxtx() and
_task_fn_{r,t}x() - even if it's just in case of INVARIANTS - and makes
these functions more readable.
o In iflib_rx_structures_setup(), only initialize LRO resources if device
and driver have LRO capability in order to not waste memory. Also, free
the LRO resources again if setting them up fails for one of the queues.
However, don't bother invoking iflib_rx_sds_free() in that case because
iflib_rx_structures_setup() doesn't call iflib_rxsd_alloc() either (and
iflib_{device,pseudo}_register() will issue iflib_rx_sds_free() in case
of failure via iflib_rx_structures_free(), but there definitely is some
asymmetry left to be fixed, though).
o Similarly, free LRO resources again in iflib_rx_structures_free().
o In iflib_irq_set_affinity(), handle get_core_offset() errors gracefully
instead of panicking (but only in case of INVARIANTS). This is a follow-
up to r344132, as such driver bugs shouldn't be fatal.
o Likewise, handle unknown iflib_intr_type_t in iflib_irq_alloc_generic()
gracefully, too.
o Bring yet more sanity to iflib_msix_init():
- If the device doesn't provide enough MSI-X vectors or not all vectors
can be allocated so the expected number of queues in addition to admin
interrupts can't be supported, try MSI next (and then INTx) as proper
MSI-X vector distribution can't be assured in such cases. In essence,
this change brings r254008 forward to iflib(4). Also, this is the fix
alluded to in the commit message of r343934.
- If the MSI-X allocation has failed, don't prematurely announce MSI is
going to be used as the latter in fact may not be available either.
- When falling back to MSI, only release the MSI-X table resource again
if it was allocated in iflib_msix_init(), i. e. isn't supplied by the
driver, in the first place.
o In mp_ndesc_handler(), handle unknown type arguments gracefully, too.
PR: 235031 (likely) [1]
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D20175
2019-05-07 08:28:35 +00:00
|
|
|
intr_enable = false;
|
|
|
|
intr_legacy = !!(ctx->ifc_flags & IFC_LEGACY);
|
2018-05-04 18:57:05 +00:00
|
|
|
MPASS(rxq->ifr_ntxqirq);
|
2017-03-13 22:53:06 +00:00
|
|
|
for (i = 0; i < rxq->ifr_ntxqirq; i++) {
|
Further correct and optimize the bus_dma(9) usage of iflib(4):
o Correct the obvious bugs in the netmap(4) parts:
- No longer check for the existence of DMA maps as bus_dma(9)
is used unconditionally in iflib(4) since r341095.
- Supply the correct DMA tag and map pairs to bus_dma(9)
functions (see also the commit message of r343753).
- In iflib_netmap_timer_adjust(), add synchronization of the
TX descriptors before calling the ift_txd_credits_update
method as the latter evaluates the TX descriptors possibly
updated by the MAC.
- In _task_fn_tx(), wrap the netmap(4)-specific bits in
#ifdef DEV_NETMAP just as done in _task_fn_admin() and
_task_fn_rx() respectively.
o In iflib_fast_intr_rxtx(), synchronize the TX rather than
the RX descriptors before calling the ift_txd_credits_update
method (see also above).
o There's no need to synchronize an RX buffer that is going to
be recycled in iflib_rxd_pkt_get(), yet; it's sufficient to
do that as late as passing RX buffers to the MAC via the
ift_rxd_refill method. Hence, combine that synchronization
with the synchronization of new buffers into a common spot
in _iflib_fl_refill().
o There's no need to synchronize the RX descriptors of a free
list in preparation of the MAC updating their statuses with
every invocation of rxd_frag_to_sd(); it's enough to do this
once before handing control over to the MAC, i. e. before
calling ift_rxd_flush method in _iflib_fl_refill(), which
already performs the necessary synchronization.
o Given that the ift_rxd_available method evaluates the RX
descriptors which possibly have been altered by the MAC,
synchronize as appropriate beforehand. Most notably this
is now done in iflib_rxd_avail(), which in turn means that
we don't need to issue the same synchronization yet again
before calling the ift_rxd_pkt_get method in iflib_rxeof().
o In iflib_txd_db_check(), synchronize the TX descriptors
before handing them over to the MAC for transmission via
the ift_txd_flush method.
o In iflib_encap(), move the TX buffer synchronization after
the invocation of the ift_txd_encap() method. If the MAC
driver fails to encapsulate the packet and we retry with
a defragmented mbuf chain or finally fail, the cycles for
TX buffer synchronization have been wasted. Synchronizing
afterwards matches what non-iflib(4) drivers typically do
and is sufficient as the MAC will not actually start with
the transmission before - in this case - the ift_txd_flush
method is called.
Moreover, for the latter reason the synchronization of the
TX descriptors in iflib_encap() can go as it's enough to
synchronize them before passing control over to the MAC by
issuing the ift_txd_flush() method (see above).
o In iflib_txq_can_drain(), only synchronize TX descriptors
if the ift_txd_credits_update method accessing these is
actually called.
Differential Revision: https://reviews.freebsd.org/D19081
2019-02-12 21:08:44 +00:00
|
|
|
txqid = rxq->ifr_txqid[i];
|
|
|
|
txq = &ctx->ifc_txqs[txqid];
|
|
|
|
bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
|
2019-01-16 05:44:14 +00:00
|
|
|
BUS_DMASYNC_POSTREAD);
|
Further correct and optimize the bus_dma(9) usage of iflib(4):
o Correct the obvious bugs in the netmap(4) parts:
- No longer check for the existence of DMA maps as bus_dma(9)
is used unconditionally in iflib(4) since r341095.
- Supply the correct DMA tag and map pairs to bus_dma(9)
functions (see also the commit message of r343753).
- In iflib_netmap_timer_adjust(), add synchronization of the
TX descriptors before calling the ift_txd_credits_update
method as the latter evaluates the TX descriptors possibly
updated by the MAC.
- In _task_fn_tx(), wrap the netmap(4)-specific bits in
#ifdef DEV_NETMAP just as done in _task_fn_admin() and
_task_fn_rx() respectively.
o In iflib_fast_intr_rxtx(), synchronize the TX rather than
the RX descriptors before calling the ift_txd_credits_update
method (see also above).
o There's no need to synchronize an RX buffer that is going to
be recycled in iflib_rxd_pkt_get(), yet; it's sufficient to
do that as late as passing RX buffers to the MAC via the
ift_rxd_refill method. Hence, combine that synchronization
with the synchronization of new buffers into a common spot
in _iflib_fl_refill().
o There's no need to synchronize the RX descriptors of a free
list in preparation of the MAC updating their statuses with
every invocation of rxd_frag_to_sd(); it's enough to do this
once before handing control over to the MAC, i. e. before
calling ift_rxd_flush method in _iflib_fl_refill(), which
already performs the necessary synchronization.
o Given that the ift_rxd_available method evaluates the RX
descriptors which possibly have been altered by the MAC,
synchronize as appropriate beforehand. Most notably this
is now done in iflib_rxd_avail(), which in turn means that
we don't need to issue the same synchronization yet again
before calling the ift_rxd_pkt_get method in iflib_rxeof().
o In iflib_txd_db_check(), synchronize the TX descriptors
before handing them over to the MAC for transmission via
the ift_txd_flush method.
o In iflib_encap(), move the TX buffer synchronization after
the invocation of the ift_txd_encap() method. If the MAC
driver fails to encapsulate the packet and we retry with
a defragmented mbuf chain or finally fail, the cycles for
TX buffer synchronization have been wasted. Synchronizing
afterwards matches what non-iflib(4) drivers typically do
and is sufficient as the MAC will not actually start with
the transmission before - in this case - the ift_txd_flush
method is called.
Moreover, for the latter reason the synchronization of the
TX descriptors in iflib_encap() can go as it's enough to
synchronize them before passing control over to the MAC by
issuing the ift_txd_flush() method (see above).
o In iflib_txq_can_drain(), only synchronize TX descriptors
if the ift_txd_credits_update method accessing these is
actually called.
Differential Revision: https://reviews.freebsd.org/D19081
2019-02-12 21:08:44 +00:00
|
|
|
if (!ctx->isc_txd_credits_update(sc, txqid, false)) {
|
o Use iflib_fast_intr_rxtx() also for "legacy" interrupts, i. e. INTx and
MSI. Unlike as with iflib_fast_intr_ctx(), the former will also enqueue
_task_fn_tx() in addition to _task_fn_rx() if appropriate, bringing TCP
TX throughput of EM-class devices on par with the MSI-X case and, thus,
close to wirespeed/pre-iflib(4) times again. [1]
Note that independently of the interrupt type, the UDP performance with
these MACs still is abysmal and nowhere near to where it was before the
conversion of em(4) to iflib(4).
o In iflib_init_locked(), announce which free list failed to set up.
o In _task_fn_tx() when running netmap(4), issue ifdi_intr_enable instead
of the ifdi_tx_queue_intr_enable method in case of a "legacy" interrupt
as the latter is valid with MSI-X only.
o Instead of adding the missing - and apparently convoluted enough that a
DBG_COUNTER_INC was put into a wrong spot in _task_fn_rx() - checks for
ifdi_{r,t}x_queue_intr_enable being available in the MSI-X case also to
iflib_fast_intr_rxtx(), factor these out to iflib_device_register() and
make the checks fail gracefully rather than panic. This avoids invoking
the checks at runtime over and over again in iflib_fast_intr_rxtx() and
_task_fn_{r,t}x() - even if it's just in case of INVARIANTS - and makes
these functions more readable.
o In iflib_rx_structures_setup(), only initialize LRO resources if device
and driver have LRO capability in order to not waste memory. Also, free
the LRO resources again if setting them up fails for one of the queues.
However, don't bother invoking iflib_rx_sds_free() in that case because
iflib_rx_structures_setup() doesn't call iflib_rxsd_alloc() either (and
iflib_{device,pseudo}_register() will issue iflib_rx_sds_free() in case
of failure via iflib_rx_structures_free(), but there definitely is some
asymmetry left to be fixed, though).
o Similarly, free LRO resources again in iflib_rx_structures_free().
o In iflib_irq_set_affinity(), handle get_core_offset() errors gracefully
instead of panicking (but only in case of INVARIANTS). This is a follow-
up to r344132, as such driver bugs shouldn't be fatal.
o Likewise, handle unknown iflib_intr_type_t in iflib_irq_alloc_generic()
gracefully, too.
o Bring yet more sanity to iflib_msix_init():
- If the device doesn't provide enough MSI-X vectors or not all vectors
can be allocated so the expected number of queues in addition to admin
interrupts can't be supported, try MSI next (and then INTx) as proper
MSI-X vector distribution can't be assured in such cases. In essence,
this change brings r254008 forward to iflib(4). Also, this is the fix
alluded to in the commit message of r343934.
- If the MSI-X allocation has failed, don't prematurely announce MSI is
going to be used as the latter in fact may not be available either.
- When falling back to MSI, only release the MSI-X table resource again
if it was allocated in iflib_msix_init(), i. e. isn't supplied by the
driver, in the first place.
o In mp_ndesc_handler(), handle unknown type arguments gracefully, too.
PR: 235031 (likely) [1]
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D20175
2019-05-07 08:28:35 +00:00
|
|
|
if (intr_legacy)
|
|
|
|
intr_enable = true;
|
|
|
|
else
|
|
|
|
IFDI_TX_QUEUE_INTR_ENABLE(ctx, txqid);
|
2017-03-13 22:53:06 +00:00
|
|
|
continue;
|
|
|
|
}
|
Further correct and optimize the bus_dma(9) usage of iflib(4):
o Correct the obvious bugs in the netmap(4) parts:
- No longer check for the existence of DMA maps as bus_dma(9)
is used unconditionally in iflib(4) since r341095.
- Supply the correct DMA tag and map pairs to bus_dma(9)
functions (see also the commit message of r343753).
- In iflib_netmap_timer_adjust(), add synchronization of the
TX descriptors before calling the ift_txd_credits_update
method as the latter evaluates the TX descriptors possibly
updated by the MAC.
- In _task_fn_tx(), wrap the netmap(4)-specific bits in
#ifdef DEV_NETMAP just as done in _task_fn_admin() and
_task_fn_rx() respectively.
o In iflib_fast_intr_rxtx(), synchronize the TX rather than
the RX descriptors before calling the ift_txd_credits_update
method (see also above).
o There's no need to synchronize an RX buffer that is going to
be recycled in iflib_rxd_pkt_get(), yet; it's sufficient to
do that as late as passing RX buffers to the MAC via the
ift_rxd_refill method. Hence, combine that synchronization
with the synchronization of new buffers into a common spot
in _iflib_fl_refill().
o There's no need to synchronize the RX descriptors of a free
list in preparation of the MAC updating their statuses with
every invocation of rxd_frag_to_sd(); it's enough to do this
once before handing control over to the MAC, i. e. before
calling ift_rxd_flush method in _iflib_fl_refill(), which
already performs the necessary synchronization.
o Given that the ift_rxd_available method evaluates the RX
descriptors which possibly have been altered by the MAC,
synchronize as appropriate beforehand. Most notably this
is now done in iflib_rxd_avail(), which in turn means that
we don't need to issue the same synchronization yet again
before calling the ift_rxd_pkt_get method in iflib_rxeof().
o In iflib_txd_db_check(), synchronize the TX descriptors
before handing them over to the MAC for transmission via
the ift_txd_flush method.
o In iflib_encap(), move the TX buffer synchronization after
the invocation of the ift_txd_encap() method. If the MAC
driver fails to encapsulate the packet and we retry with
a defragmented mbuf chain or finally fail, the cycles for
TX buffer synchronization have been wasted. Synchronizing
afterwards matches what non-iflib(4) drivers typically do
and is sufficient as the MAC will not actually start with
the transmission before - in this case - the ift_txd_flush
method is called.
Moreover, for the latter reason the synchronization of the
TX descriptors in iflib_encap() can go as it's enough to
synchronize them before passing control over to the MAC by
issuing the ift_txd_flush() method (see above).
o In iflib_txq_can_drain(), only synchronize TX descriptors
if the ift_txd_credits_update method accessing these is
actually called.
Differential Revision: https://reviews.freebsd.org/D19081
2019-02-12 21:08:44 +00:00
|
|
|
GROUPTASK_ENQUEUE(&txq->ift_task);
|
2017-03-13 22:53:06 +00:00
|
|
|
}
|
|
|
|
if (ctx->ifc_sctx->isc_flags & IFLIB_HAS_RXCQ)
|
|
|
|
cidx = rxq->ifr_cq_cidx;
|
|
|
|
else
|
|
|
|
cidx = rxq->ifr_fl[0].ifl_cidx;
|
|
|
|
if (iflib_rxd_avail(ctx, rxq, cidx, 1))
|
|
|
|
GROUPTASK_ENQUEUE(gtask);
|
2018-09-06 18:51:52 +00:00
|
|
|
else {
|
o Use iflib_fast_intr_rxtx() also for "legacy" interrupts, i. e. INTx and
MSI. Unlike as with iflib_fast_intr_ctx(), the former will also enqueue
_task_fn_tx() in addition to _task_fn_rx() if appropriate, bringing TCP
TX throughput of EM-class devices on par with the MSI-X case and, thus,
close to wirespeed/pre-iflib(4) times again. [1]
Note that independently of the interrupt type, the UDP performance with
these MACs still is abysmal and nowhere near to where it was before the
conversion of em(4) to iflib(4).
o In iflib_init_locked(), announce which free list failed to set up.
o In _task_fn_tx() when running netmap(4), issue ifdi_intr_enable instead
of the ifdi_tx_queue_intr_enable method in case of a "legacy" interrupt
as the latter is valid with MSI-X only.
o Instead of adding the missing - and apparently convoluted enough that a
DBG_COUNTER_INC was put into a wrong spot in _task_fn_rx() - checks for
ifdi_{r,t}x_queue_intr_enable being available in the MSI-X case also to
iflib_fast_intr_rxtx(), factor these out to iflib_device_register() and
make the checks fail gracefully rather than panic. This avoids invoking
the checks at runtime over and over again in iflib_fast_intr_rxtx() and
_task_fn_{r,t}x() - even if it's just in case of INVARIANTS - and makes
these functions more readable.
o In iflib_rx_structures_setup(), only initialize LRO resources if device
and driver have LRO capability in order to not waste memory. Also, free
the LRO resources again if setting them up fails for one of the queues.
However, don't bother invoking iflib_rx_sds_free() in that case because
iflib_rx_structures_setup() doesn't call iflib_rxsd_alloc() either (and
iflib_{device,pseudo}_register() will issue iflib_rx_sds_free() in case
of failure via iflib_rx_structures_free(), but there definitely is some
asymmetry left to be fixed, though).
o Similarly, free LRO resources again in iflib_rx_structures_free().
o In iflib_irq_set_affinity(), handle get_core_offset() errors gracefully
instead of panicking (but only in case of INVARIANTS). This is a follow-
up to r344132, as such driver bugs shouldn't be fatal.
o Likewise, handle unknown iflib_intr_type_t in iflib_irq_alloc_generic()
gracefully, too.
o Bring yet more sanity to iflib_msix_init():
- If the device doesn't provide enough MSI-X vectors or not all vectors
can be allocated so the expected number of queues in addition to admin
interrupts can't be supported, try MSI next (and then INTx) as proper
MSI-X vector distribution can't be assured in such cases. In essence,
this change brings r254008 forward to iflib(4). Also, this is the fix
alluded to in the commit message of r343934.
- If the MSI-X allocation has failed, don't prematurely announce MSI is
going to be used as the latter in fact may not be available either.
- When falling back to MSI, only release the MSI-X table resource again
if it was allocated in iflib_msix_init(), i. e. isn't supplied by the
driver, in the first place.
o In mp_ndesc_handler(), handle unknown type arguments gracefully, too.
PR: 235031 (likely) [1]
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D20175
2019-05-07 08:28:35 +00:00
|
|
|
if (intr_legacy)
|
|
|
|
intr_enable = true;
|
|
|
|
else
|
|
|
|
IFDI_RX_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id);
|
2018-09-06 18:51:52 +00:00
|
|
|
DBG_COUNTER_INC(rx_intr_enables);
|
|
|
|
}
|
o Use iflib_fast_intr_rxtx() also for "legacy" interrupts, i. e. INTx and
MSI. Unlike as with iflib_fast_intr_ctx(), the former will also enqueue
_task_fn_tx() in addition to _task_fn_rx() if appropriate, bringing TCP
TX throughput of EM-class devices on par with the MSI-X case and, thus,
close to wirespeed/pre-iflib(4) times again. [1]
Note that independently of the interrupt type, the UDP performance with
these MACs still is abysmal and nowhere near to where it was before the
conversion of em(4) to iflib(4).
o In iflib_init_locked(), announce which free list failed to set up.
o In _task_fn_tx() when running netmap(4), issue ifdi_intr_enable instead
of the ifdi_tx_queue_intr_enable method in case of a "legacy" interrupt
as the latter is valid with MSI-X only.
o Instead of adding the missing - and apparently convoluted enough that a
DBG_COUNTER_INC was put into a wrong spot in _task_fn_rx() - checks for
ifdi_{r,t}x_queue_intr_enable being available in the MSI-X case also to
iflib_fast_intr_rxtx(), factor these out to iflib_device_register() and
make the checks fail gracefully rather than panic. This avoids invoking
the checks at runtime over and over again in iflib_fast_intr_rxtx() and
_task_fn_{r,t}x() - even if it's just in case of INVARIANTS - and makes
these functions more readable.
o In iflib_rx_structures_setup(), only initialize LRO resources if device
and driver have LRO capability in order to not waste memory. Also, free
the LRO resources again if setting them up fails for one of the queues.
However, don't bother invoking iflib_rx_sds_free() in that case because
iflib_rx_structures_setup() doesn't call iflib_rxsd_alloc() either (and
iflib_{device,pseudo}_register() will issue iflib_rx_sds_free() in case
of failure via iflib_rx_structures_free(), but there definitely is some
asymmetry left to be fixed, though).
o Similarly, free LRO resources again in iflib_rx_structures_free().
o In iflib_irq_set_affinity(), handle get_core_offset() errors gracefully
instead of panicking (but only in case of INVARIANTS). This is a follow-
up to r344132, as such driver bugs shouldn't be fatal.
o Likewise, handle unknown iflib_intr_type_t in iflib_irq_alloc_generic()
gracefully, too.
o Bring yet more sanity to iflib_msix_init():
- If the device doesn't provide enough MSI-X vectors or not all vectors
can be allocated so the expected number of queues in addition to admin
interrupts can't be supported, try MSI next (and then INTx) as proper
MSI-X vector distribution can't be assured in such cases. In essence,
this change brings r254008 forward to iflib(4). Also, this is the fix
alluded to in the commit message of r343934.
- If the MSI-X allocation has failed, don't prematurely announce MSI is
going to be used as the latter in fact may not be available either.
- When falling back to MSI, only release the MSI-X table resource again
if it was allocated in iflib_msix_init(), i. e. isn't supplied by the
driver, in the first place.
o In mp_ndesc_handler(), handle unknown type arguments gracefully, too.
PR: 235031 (likely) [1]
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D20175
2019-05-07 08:28:35 +00:00
|
|
|
if (intr_enable)
|
|
|
|
IFDI_INTR_ENABLE(ctx);
|
2017-03-13 22:53:06 +00:00
|
|
|
return (FILTER_HANDLED);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
iflib_fast_intr_ctx(void *arg)
|
2016-05-18 04:35:58 +00:00
|
|
|
{
|
|
|
|
iflib_filter_info_t info = arg;
|
|
|
|
struct grouptask *gtask = info->ifi_task;
|
2019-02-15 18:51:43 +00:00
|
|
|
int result;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
DBG_COUNTER_INC(fast_intrs);
|
2019-02-15 18:51:43 +00:00
|
|
|
if (info->ifi_filter != NULL) {
|
|
|
|
result = info->ifi_filter(info->ifi_filter_arg);
|
|
|
|
if ((result & FILTER_SCHEDULE_THREAD) == 0)
|
|
|
|
return (result);
|
|
|
|
}
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
GROUPTASK_ENQUEUE(gtask);
|
|
|
|
return (FILTER_HANDLED);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
_iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid,
|
2018-05-29 21:56:39 +00:00
|
|
|
driver_filter_t filter, driver_intr_t handler, void *arg,
|
|
|
|
const char *name)
|
2016-05-18 04:35:58 +00:00
|
|
|
{
|
|
|
|
struct resource *res;
|
2017-03-30 16:54:01 +00:00
|
|
|
void *tag = NULL;
|
2016-05-18 04:35:58 +00:00
|
|
|
device_t dev = ctx->ifc_dev;
|
2019-06-15 11:07:41 +00:00
|
|
|
int flags, i, rc;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2017-03-30 16:54:01 +00:00
|
|
|
flags = RF_ACTIVE;
|
|
|
|
if (ctx->ifc_flags & IFC_LEGACY)
|
|
|
|
flags |= RF_SHAREABLE;
|
2016-05-18 04:35:58 +00:00
|
|
|
MPASS(rid < 512);
|
2019-06-15 11:07:41 +00:00
|
|
|
i = rid;
|
|
|
|
res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &i, flags);
|
2016-05-18 04:35:58 +00:00
|
|
|
if (res == NULL) {
|
|
|
|
device_printf(dev,
|
|
|
|
"failed to allocate IRQ for rid %d, name %s.\n", rid, name);
|
|
|
|
return (ENOMEM);
|
|
|
|
}
|
|
|
|
irq->ii_res = res;
|
|
|
|
KASSERT(filter == NULL || handler == NULL, ("filter and handler can't both be non-NULL"));
|
|
|
|
rc = bus_setup_intr(dev, res, INTR_MPSAFE | INTR_TYPE_NET,
|
|
|
|
filter, handler, arg, &tag);
|
|
|
|
if (rc != 0) {
|
|
|
|
device_printf(dev,
|
|
|
|
"failed to setup interrupt for rid %d, name %s: %d\n",
|
|
|
|
rid, name ? name : "unknown", rc);
|
|
|
|
return (rc);
|
|
|
|
} else if (name)
|
2016-08-04 18:29:16 +00:00
|
|
|
bus_describe_intr(dev, res, tag, "%s", name);
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
irq->ii_tag = tag;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*********************************************************************
|
|
|
|
*
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
* Allocate DMA resources for TX buffers as well as memory for the TX
|
|
|
|
* mbuf map. TX DMA maps (non-TSO/TSO) and TX mbuf map are kept in a
|
|
|
|
* iflib_sw_tx_desc_array structure, storing all the information that
|
|
|
|
* is needed to transmit a packet on the wire. This is called only
|
|
|
|
* once at attach, setup is done every reset.
|
2016-05-18 04:35:58 +00:00
|
|
|
*
|
|
|
|
**********************************************************************/
|
|
|
|
static int
|
|
|
|
iflib_txsd_alloc(iflib_txq_t txq)
|
|
|
|
{
|
|
|
|
if_ctx_t ctx = txq->ift_ctx;
|
|
|
|
if_shared_ctx_t sctx = ctx->ifc_sctx;
|
|
|
|
if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
|
|
|
|
device_t dev = ctx->ifc_dev;
|
Assorted TSO fixes for em(4)/iflib(9) and dead code removal:
- Ever since the workaround for the silicon bug of TSO4 causing MAC hangs
was committed in r295133, CSUM_TSO always got disabled unconditionally
by em(4) on the first invocation of em_init_locked(). However, even with
that problem fixed, it turned out that for at least e. g. 82579 not all
necessary TSO workarounds are in place, still causing MAC hangs even at
Gigabit speed. Thus, for stable/11, TSO usage was deliberately disabled
in r323292 (r323293 for stable/10) for the EM-class by default, allowing
users to turn it on if it happens to work with their particular EM MAC
in a Gigabit-only environment.
In head, the TSO workaround for speeds other than Gigabit was lost with
the conversion to iflib(9) in r311849 (possibly along with another one
or two TSO workarounds). Yet at the same time, for EM-class MACs TSO4
got enabled by default again, causing device hangs. Therefore, change the
default for this hardware class back to have TSO4 off, allowing users
to turn it on manually if it happens to work in their environment as
we do in stable/{10,11}. An alternative would be to add a whitelist of
EM-class devices where TSO4 actually is reliable with the workarounds in
place, but given that the advantage of TSO at Gigabit speed is rather
limited - especially with the overhead of these workarounds -, that's
really not worth it. [1]
This change includes the addition of an isc_capabilities to struct
if_softc_ctx so iflib(9) can also handle interface capabilities that
shouldn't be enabled by default which is used to handle the default-off
capabilities of e1000 as suggested by shurd@ and moving their handling
from em_setup_interface() to em_if_attach_pre() accordingly.
- Although 82543 supports TSO4 in theory, the former lem(4) didn't have
support for TSO4, presumably because TSO4 is even more broken in the
LEM-class of MACs than the later EM ones. Still, TSO4 for LEM-class
devices was enabled as part of the conversion to iflib(9) in r311849,
causing device hangs. So revert back to the pre-r311849 behavior of
not supporting TSO4 for LEM-class at all, which includes not creating
a TSO DMA tag in iflib(9) for devices not having IFCAP_TSO4 set. [2]
- In fact, the FreeBSD TCP stack can handle a TSO size of IP_MAXPACKET
(65535) rather than FREEBSD_TSO_SIZE_MAX (65518). However, the TSO
DMA must have a maxsize of the maximum TSO size plus the size of a
VLAN header for software VLAN tagging. The iflib(9) converted em(4),
thus, first correctly sets scctx->isc_tx_tso_size_max to EM_TSO_SIZE
in em_if_attach_pre(), but later on overrides it with IP_MAXPACKET
in em_setup_interface() (apparently, left-over from pre-iflib(9)
times). So remove the latter and correct iflib(9) to correctly cap
the maximum TSO size reported to the stack at IP_MAXPACKET. While at
it, let iflib(9) use if_sethwtsomax*().
This change includes the addition of isc_tso_max{seg,}size DMA engine
constraints for the TSO DMA tag to struct if_shared_ctx and letting
iflib_txsd_alloc() automatically adjust the maxsize of that tag in case
IFCAP_VLAN_MTU is supported as requested by shurd@.
- Move the if_setifheaderlen(9) call for adjusting the maximum Ethernet
header length from {ixgbe,ixl,ixlv,ixv,em}_setup_interface() to iflib(9)
so adjustment is automatically done in case IFCAP_VLAN_MTU is supported.
As a consequence, this adjustment now is also done in case of bnxt(4)
which missed it previously.
- Move the reduction of the maximum TSO segment count reported to the
stack by the number of m_pullup(9) calls (which in the worst case,
can add another mbuf and, thus, the requirement for another DMA
segment each) in the transmit path for performance reasons from
em_setup_interface() to iflib_txsd_alloc() as these pull-ups are now
done in iflib_parse_header() rather than in the no longer existing
em_xmit(). Moreover, this optimization applies to all drivers using
iflib(9) and not just em(4); all in-tree iflib(9) consumers still
have enough room to handle full size TSO packets. Also, reduce the
adjustment to the maximum number of m_pullup(9)'s now performed in
iflib_parse_header().
- Prior to the conversion of em(4)/igb(4)/lem(4) and ixl(4) to iflib(9)
in r311849 and r335338 respectively, these drivers didn't enable
IFCAP_VLAN_HWFILTER by default due to VLAN events not being passed
through by lagg(4). With iflib(9), IFCAP_VLAN_HWFILTER was turned on
by default but also lagg(4) was fixed in that regard in r203548. So
just remove the now redundant and defunct IFCAP_VLAN_HWFILTER handling
in {em,ixl,ixlv}_setup_interface().
- Nuke other redundant IFCAP_* setting in {em,ixl,ixlv}_setup_interface()
which is (more completely) already done in {em,ixl,ixlv}_if_attach_pre()
now.
- Remove some redundant/dead setting of scctx->isc_tx_csum_flags in
em_if_attach_pre().
- Remove some IFCAP_* duplicated either directly or indirectly (e. g.
via IFCAP_HWCSUM) in {EM,IGB,IXL}_CAPS.
- Don't bother to fiddle with IFCAP_HWSTATS in ixgbe(4)/ixgbev(4) as
iflib(9) adds that capability unconditionally.
- Remove some unused macros from em(4).
- Bump __FreeBSD_version as some of the above changes require the modules
of drivers using iflib(9) to be recompiled.
Okayed by: sbruno@ at 201806 DevSummit Transport Working Group [1]
Reviewed by: sbruno (earlier version), erj
PR: 219428 (part of; comment #10) [1], 220997 (part of; comment #3) [2]
Differential Revision: https://reviews.freebsd.org/D15720
2018-07-15 19:04:23 +00:00
|
|
|
bus_size_t tsomaxsize;
|
2021-02-24 22:56:45 +00:00
|
|
|
bus_addr_t lowaddr;
|
2016-05-18 04:35:58 +00:00
|
|
|
int err, nsegments, ntsosegments;
|
2019-01-16 05:44:14 +00:00
|
|
|
bool tso;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
nsegments = scctx->isc_tx_nsegments;
|
|
|
|
ntsosegments = scctx->isc_tx_tso_segments_max;
|
Assorted TSO fixes for em(4)/iflib(9) and dead code removal:
- Ever since the workaround for the silicon bug of TSO4 causing MAC hangs
was committed in r295133, CSUM_TSO always got disabled unconditionally
by em(4) on the first invocation of em_init_locked(). However, even with
that problem fixed, it turned out that for at least e. g. 82579 not all
necessary TSO workarounds are in place, still causing MAC hangs even at
Gigabit speed. Thus, for stable/11, TSO usage was deliberately disabled
in r323292 (r323293 for stable/10) for the EM-class by default, allowing
users to turn it on if it happens to work with their particular EM MAC
in a Gigabit-only environment.
In head, the TSO workaround for speeds other than Gigabit was lost with
the conversion to iflib(9) in r311849 (possibly along with another one
or two TSO workarounds). Yet at the same time, for EM-class MACs TSO4
got enabled by default again, causing device hangs. Therefore, change the
default for this hardware class back to have TSO4 off, allowing users
to turn it on manually if it happens to work in their environment as
we do in stable/{10,11}. An alternative would be to add a whitelist of
EM-class devices where TSO4 actually is reliable with the workarounds in
place, but given that the advantage of TSO at Gigabit speed is rather
limited - especially with the overhead of these workarounds -, that's
really not worth it. [1]
This change includes the addition of an isc_capabilities to struct
if_softc_ctx so iflib(9) can also handle interface capabilities that
shouldn't be enabled by default which is used to handle the default-off
capabilities of e1000 as suggested by shurd@ and moving their handling
from em_setup_interface() to em_if_attach_pre() accordingly.
- Although 82543 supports TSO4 in theory, the former lem(4) didn't have
support for TSO4, presumably because TSO4 is even more broken in the
LEM-class of MACs than the later EM ones. Still, TSO4 for LEM-class
devices was enabled as part of the conversion to iflib(9) in r311849,
causing device hangs. So revert back to the pre-r311849 behavior of
not supporting TSO4 for LEM-class at all, which includes not creating
a TSO DMA tag in iflib(9) for devices not having IFCAP_TSO4 set. [2]
- In fact, the FreeBSD TCP stack can handle a TSO size of IP_MAXPACKET
(65535) rather than FREEBSD_TSO_SIZE_MAX (65518). However, the TSO
DMA must have a maxsize of the maximum TSO size plus the size of a
VLAN header for software VLAN tagging. The iflib(9) converted em(4),
thus, first correctly sets scctx->isc_tx_tso_size_max to EM_TSO_SIZE
in em_if_attach_pre(), but later on overrides it with IP_MAXPACKET
in em_setup_interface() (apparently, left-over from pre-iflib(9)
times). So remove the latter and correct iflib(9) to correctly cap
the maximum TSO size reported to the stack at IP_MAXPACKET. While at
it, let iflib(9) use if_sethwtsomax*().
This change includes the addition of isc_tso_max{seg,}size DMA engine
constraints for the TSO DMA tag to struct if_shared_ctx and letting
iflib_txsd_alloc() automatically adjust the maxsize of that tag in case
IFCAP_VLAN_MTU is supported as requested by shurd@.
- Move the if_setifheaderlen(9) call for adjusting the maximum Ethernet
header length from {ixgbe,ixl,ixlv,ixv,em}_setup_interface() to iflib(9)
so adjustment is automatically done in case IFCAP_VLAN_MTU is supported.
As a consequence, this adjustment now is also done in case of bnxt(4)
which missed it previously.
- Move the reduction of the maximum TSO segment count reported to the
stack by the number of m_pullup(9) calls (which in the worst case,
can add another mbuf and, thus, the requirement for another DMA
segment each) in the transmit path for performance reasons from
em_setup_interface() to iflib_txsd_alloc() as these pull-ups are now
done in iflib_parse_header() rather than in the no longer existing
em_xmit(). Moreover, this optimization applies to all drivers using
iflib(9) and not just em(4); all in-tree iflib(9) consumers still
have enough room to handle full size TSO packets. Also, reduce the
adjustment to the maximum number of m_pullup(9)'s now performed in
iflib_parse_header().
- Prior to the conversion of em(4)/igb(4)/lem(4) and ixl(4) to iflib(9)
in r311849 and r335338 respectively, these drivers didn't enable
IFCAP_VLAN_HWFILTER by default due to VLAN events not being passed
through by lagg(4). With iflib(9), IFCAP_VLAN_HWFILTER was turned on
by default but also lagg(4) was fixed in that regard in r203548. So
just remove the now redundant and defunct IFCAP_VLAN_HWFILTER handling
in {em,ixl,ixlv}_setup_interface().
- Nuke other redundant IFCAP_* setting in {em,ixl,ixlv}_setup_interface()
which is (more completely) already done in {em,ixl,ixlv}_if_attach_pre()
now.
- Remove some redundant/dead setting of scctx->isc_tx_csum_flags in
em_if_attach_pre().
- Remove some IFCAP_* duplicated either directly or indirectly (e. g.
via IFCAP_HWCSUM) in {EM,IGB,IXL}_CAPS.
- Don't bother to fiddle with IFCAP_HWSTATS in ixgbe(4)/ixgbev(4) as
iflib(9) adds that capability unconditionally.
- Remove some unused macros from em(4).
- Bump __FreeBSD_version as some of the above changes require the modules
of drivers using iflib(9) to be recompiled.
Okayed by: sbruno@ at 201806 DevSummit Transport Working Group [1]
Reviewed by: sbruno (earlier version), erj
PR: 219428 (part of; comment #10) [1], 220997 (part of; comment #3) [2]
Differential Revision: https://reviews.freebsd.org/D15720
2018-07-15 19:04:23 +00:00
|
|
|
tsomaxsize = scctx->isc_tx_tso_size_max;
|
|
|
|
if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_VLAN_MTU)
|
|
|
|
tsomaxsize += sizeof(struct ether_vlan_header);
|
2016-08-12 21:29:44 +00:00
|
|
|
MPASS(scctx->isc_ntxd[0] > 0);
|
|
|
|
MPASS(scctx->isc_ntxd[txq->ift_br_offset] > 0);
|
2016-05-18 04:35:58 +00:00
|
|
|
MPASS(nsegments > 0);
|
Assorted TSO fixes for em(4)/iflib(9) and dead code removal:
- Ever since the workaround for the silicon bug of TSO4 causing MAC hangs
was committed in r295133, CSUM_TSO always got disabled unconditionally
by em(4) on the first invocation of em_init_locked(). However, even with
that problem fixed, it turned out that for at least e. g. 82579 not all
necessary TSO workarounds are in place, still causing MAC hangs even at
Gigabit speed. Thus, for stable/11, TSO usage was deliberately disabled
in r323292 (r323293 for stable/10) for the EM-class by default, allowing
users to turn it on if it happens to work with their particular EM MAC
in a Gigabit-only environment.
In head, the TSO workaround for speeds other than Gigabit was lost with
the conversion to iflib(9) in r311849 (possibly along with another one
or two TSO workarounds). Yet at the same time, for EM-class MACs TSO4
got enabled by default again, causing device hangs. Therefore, change the
default for this hardware class back to have TSO4 off, allowing users
to turn it on manually if it happens to work in their environment as
we do in stable/{10,11}. An alternative would be to add a whitelist of
EM-class devices where TSO4 actually is reliable with the workarounds in
place, but given that the advantage of TSO at Gigabit speed is rather
limited - especially with the overhead of these workarounds -, that's
really not worth it. [1]
This change includes the addition of an isc_capabilities to struct
if_softc_ctx so iflib(9) can also handle interface capabilities that
shouldn't be enabled by default which is used to handle the default-off
capabilities of e1000 as suggested by shurd@ and moving their handling
from em_setup_interface() to em_if_attach_pre() accordingly.
- Although 82543 supports TSO4 in theory, the former lem(4) didn't have
support for TSO4, presumably because TSO4 is even more broken in the
LEM-class of MACs than the later EM ones. Still, TSO4 for LEM-class
devices was enabled as part of the conversion to iflib(9) in r311849,
causing device hangs. So revert back to the pre-r311849 behavior of
not supporting TSO4 for LEM-class at all, which includes not creating
a TSO DMA tag in iflib(9) for devices not having IFCAP_TSO4 set. [2]
- In fact, the FreeBSD TCP stack can handle a TSO size of IP_MAXPACKET
(65535) rather than FREEBSD_TSO_SIZE_MAX (65518). However, the TSO
DMA must have a maxsize of the maximum TSO size plus the size of a
VLAN header for software VLAN tagging. The iflib(9) converted em(4),
thus, first correctly sets scctx->isc_tx_tso_size_max to EM_TSO_SIZE
in em_if_attach_pre(), but later on overrides it with IP_MAXPACKET
in em_setup_interface() (apparently, left-over from pre-iflib(9)
times). So remove the latter and correct iflib(9) to correctly cap
the maximum TSO size reported to the stack at IP_MAXPACKET. While at
it, let iflib(9) use if_sethwtsomax*().
This change includes the addition of isc_tso_max{seg,}size DMA engine
constraints for the TSO DMA tag to struct if_shared_ctx and letting
iflib_txsd_alloc() automatically adjust the maxsize of that tag in case
IFCAP_VLAN_MTU is supported as requested by shurd@.
- Move the if_setifheaderlen(9) call for adjusting the maximum Ethernet
header length from {ixgbe,ixl,ixlv,ixv,em}_setup_interface() to iflib(9)
so adjustment is automatically done in case IFCAP_VLAN_MTU is supported.
As a consequence, this adjustment now is also done in case of bnxt(4)
which missed it previously.
- Move the reduction of the maximum TSO segment count reported to the
stack by the number of m_pullup(9) calls (which in the worst case,
can add another mbuf and, thus, the requirement for another DMA
segment each) in the transmit path for performance reasons from
em_setup_interface() to iflib_txsd_alloc() as these pull-ups are now
done in iflib_parse_header() rather than in the no longer existing
em_xmit(). Moreover, this optimization applies to all drivers using
iflib(9) and not just em(4); all in-tree iflib(9) consumers still
have enough room to handle full size TSO packets. Also, reduce the
adjustment to the maximum number of m_pullup(9)'s now performed in
iflib_parse_header().
- Prior to the conversion of em(4)/igb(4)/lem(4) and ixl(4) to iflib(9)
in r311849 and r335338 respectively, these drivers didn't enable
IFCAP_VLAN_HWFILTER by default due to VLAN events not being passed
through by lagg(4). With iflib(9), IFCAP_VLAN_HWFILTER was turned on
by default but also lagg(4) was fixed in that regard in r203548. So
just remove the now redundant and defunct IFCAP_VLAN_HWFILTER handling
in {em,ixl,ixlv}_setup_interface().
- Nuke other redundant IFCAP_* setting in {em,ixl,ixlv}_setup_interface()
which is (more completely) already done in {em,ixl,ixlv}_if_attach_pre()
now.
- Remove some redundant/dead setting of scctx->isc_tx_csum_flags in
em_if_attach_pre().
- Remove some IFCAP_* duplicated either directly or indirectly (e. g.
via IFCAP_HWCSUM) in {EM,IGB,IXL}_CAPS.
- Don't bother to fiddle with IFCAP_HWSTATS in ixgbe(4)/ixgbev(4) as
iflib(9) adds that capability unconditionally.
- Remove some unused macros from em(4).
- Bump __FreeBSD_version as some of the above changes require the modules
of drivers using iflib(9) to be recompiled.
Okayed by: sbruno@ at 201806 DevSummit Transport Working Group [1]
Reviewed by: sbruno (earlier version), erj
PR: 219428 (part of; comment #10) [1], 220997 (part of; comment #3) [2]
Differential Revision: https://reviews.freebsd.org/D15720
2018-07-15 19:04:23 +00:00
|
|
|
if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_TSO) {
|
|
|
|
MPASS(ntsosegments > 0);
|
|
|
|
MPASS(sctx->isc_tso_maxsize >= tsomaxsize);
|
|
|
|
}
|
|
|
|
|
2021-02-24 22:56:45 +00:00
|
|
|
lowaddr = DMA_WIDTH_TO_BUS_LOWADDR(scctx->isc_dma_width);
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
/*
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
* Set up DMA tags for TX buffers.
|
2016-05-18 04:35:58 +00:00
|
|
|
*/
|
|
|
|
if ((err = bus_dma_tag_create(bus_get_dma_tag(dev),
|
|
|
|
1, 0, /* alignment, bounds */
|
2021-02-24 22:56:45 +00:00
|
|
|
lowaddr, /* lowaddr */
|
2016-05-18 04:35:58 +00:00
|
|
|
BUS_SPACE_MAXADDR, /* highaddr */
|
|
|
|
NULL, NULL, /* filter, filterarg */
|
|
|
|
sctx->isc_tx_maxsize, /* maxsize */
|
|
|
|
nsegments, /* nsegments */
|
|
|
|
sctx->isc_tx_maxsegsize, /* maxsegsize */
|
|
|
|
0, /* flags */
|
|
|
|
NULL, /* lockfunc */
|
|
|
|
NULL, /* lockfuncarg */
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
&txq->ift_buf_tag))) {
|
2016-05-18 04:35:58 +00:00
|
|
|
device_printf(dev,"Unable to allocate TX DMA tag: %d\n", err);
|
2017-07-20 20:28:31 +00:00
|
|
|
device_printf(dev,"maxsize: %ju nsegments: %d maxsegsize: %ju\n",
|
|
|
|
(uintmax_t)sctx->isc_tx_maxsize, nsegments, (uintmax_t)sctx->isc_tx_maxsegsize);
|
2016-05-18 04:35:58 +00:00
|
|
|
goto fail;
|
|
|
|
}
|
2019-01-16 05:44:14 +00:00
|
|
|
tso = (if_getcapabilities(ctx->ifc_ifp) & IFCAP_TSO) != 0;
|
|
|
|
if (tso && (err = bus_dma_tag_create(bus_get_dma_tag(dev),
|
2016-05-18 04:35:58 +00:00
|
|
|
1, 0, /* alignment, bounds */
|
2021-02-24 22:56:45 +00:00
|
|
|
lowaddr, /* lowaddr */
|
2016-05-18 04:35:58 +00:00
|
|
|
BUS_SPACE_MAXADDR, /* highaddr */
|
|
|
|
NULL, NULL, /* filter, filterarg */
|
Assorted TSO fixes for em(4)/iflib(9) and dead code removal:
- Ever since the workaround for the silicon bug of TSO4 causing MAC hangs
was committed in r295133, CSUM_TSO always got disabled unconditionally
by em(4) on the first invocation of em_init_locked(). However, even with
that problem fixed, it turned out that for at least e. g. 82579 not all
necessary TSO workarounds are in place, still causing MAC hangs even at
Gigabit speed. Thus, for stable/11, TSO usage was deliberately disabled
in r323292 (r323293 for stable/10) for the EM-class by default, allowing
users to turn it on if it happens to work with their particular EM MAC
in a Gigabit-only environment.
In head, the TSO workaround for speeds other than Gigabit was lost with
the conversion to iflib(9) in r311849 (possibly along with another one
or two TSO workarounds). Yet at the same time, for EM-class MACs TSO4
got enabled by default again, causing device hangs. Therefore, change the
default for this hardware class back to have TSO4 off, allowing users
to turn it on manually if it happens to work in their environment as
we do in stable/{10,11}. An alternative would be to add a whitelist of
EM-class devices where TSO4 actually is reliable with the workarounds in
place, but given that the advantage of TSO at Gigabit speed is rather
limited - especially with the overhead of these workarounds -, that's
really not worth it. [1]
This change includes the addition of an isc_capabilities to struct
if_softc_ctx so iflib(9) can also handle interface capabilities that
shouldn't be enabled by default which is used to handle the default-off
capabilities of e1000 as suggested by shurd@ and moving their handling
from em_setup_interface() to em_if_attach_pre() accordingly.
- Although 82543 supports TSO4 in theory, the former lem(4) didn't have
support for TSO4, presumably because TSO4 is even more broken in the
LEM-class of MACs than the later EM ones. Still, TSO4 for LEM-class
devices was enabled as part of the conversion to iflib(9) in r311849,
causing device hangs. So revert back to the pre-r311849 behavior of
not supporting TSO4 for LEM-class at all, which includes not creating
a TSO DMA tag in iflib(9) for devices not having IFCAP_TSO4 set. [2]
- In fact, the FreeBSD TCP stack can handle a TSO size of IP_MAXPACKET
(65535) rather than FREEBSD_TSO_SIZE_MAX (65518). However, the TSO
DMA must have a maxsize of the maximum TSO size plus the size of a
VLAN header for software VLAN tagging. The iflib(9) converted em(4),
thus, first correctly sets scctx->isc_tx_tso_size_max to EM_TSO_SIZE
in em_if_attach_pre(), but later on overrides it with IP_MAXPACKET
in em_setup_interface() (apparently, left-over from pre-iflib(9)
times). So remove the latter and correct iflib(9) to correctly cap
the maximum TSO size reported to the stack at IP_MAXPACKET. While at
it, let iflib(9) use if_sethwtsomax*().
This change includes the addition of isc_tso_max{seg,}size DMA engine
constraints for the TSO DMA tag to struct if_shared_ctx and letting
iflib_txsd_alloc() automatically adjust the maxsize of that tag in case
IFCAP_VLAN_MTU is supported as requested by shurd@.
- Move the if_setifheaderlen(9) call for adjusting the maximum Ethernet
header length from {ixgbe,ixl,ixlv,ixv,em}_setup_interface() to iflib(9)
so adjustment is automatically done in case IFCAP_VLAN_MTU is supported.
As a consequence, this adjustment now is also done in case of bnxt(4)
which missed it previously.
- Move the reduction of the maximum TSO segment count reported to the
stack by the number of m_pullup(9) calls (which in the worst case,
can add another mbuf and, thus, the requirement for another DMA
segment each) in the transmit path for performance reasons from
em_setup_interface() to iflib_txsd_alloc() as these pull-ups are now
done in iflib_parse_header() rather than in the no longer existing
em_xmit(). Moreover, this optimization applies to all drivers using
iflib(9) and not just em(4); all in-tree iflib(9) consumers still
have enough room to handle full size TSO packets. Also, reduce the
adjustment to the maximum number of m_pullup(9)'s now performed in
iflib_parse_header().
- Prior to the conversion of em(4)/igb(4)/lem(4) and ixl(4) to iflib(9)
in r311849 and r335338 respectively, these drivers didn't enable
IFCAP_VLAN_HWFILTER by default due to VLAN events not being passed
through by lagg(4). With iflib(9), IFCAP_VLAN_HWFILTER was turned on
by default but also lagg(4) was fixed in that regard in r203548. So
just remove the now redundant and defunct IFCAP_VLAN_HWFILTER handling
in {em,ixl,ixlv}_setup_interface().
- Nuke other redundant IFCAP_* setting in {em,ixl,ixlv}_setup_interface()
which is (more completely) already done in {em,ixl,ixlv}_if_attach_pre()
now.
- Remove some redundant/dead setting of scctx->isc_tx_csum_flags in
em_if_attach_pre().
- Remove some IFCAP_* duplicated either directly or indirectly (e. g.
via IFCAP_HWCSUM) in {EM,IGB,IXL}_CAPS.
- Don't bother to fiddle with IFCAP_HWSTATS in ixgbe(4)/ixgbev(4) as
iflib(9) adds that capability unconditionally.
- Remove some unused macros from em(4).
- Bump __FreeBSD_version as some of the above changes require the modules
of drivers using iflib(9) to be recompiled.
Okayed by: sbruno@ at 201806 DevSummit Transport Working Group [1]
Reviewed by: sbruno (earlier version), erj
PR: 219428 (part of; comment #10) [1], 220997 (part of; comment #3) [2]
Differential Revision: https://reviews.freebsd.org/D15720
2018-07-15 19:04:23 +00:00
|
|
|
tsomaxsize, /* maxsize */
|
2016-05-18 04:35:58 +00:00
|
|
|
ntsosegments, /* nsegments */
|
Assorted TSO fixes for em(4)/iflib(9) and dead code removal:
- Ever since the workaround for the silicon bug of TSO4 causing MAC hangs
was committed in r295133, CSUM_TSO always got disabled unconditionally
by em(4) on the first invocation of em_init_locked(). However, even with
that problem fixed, it turned out that for at least e. g. 82579 not all
necessary TSO workarounds are in place, still causing MAC hangs even at
Gigabit speed. Thus, for stable/11, TSO usage was deliberately disabled
in r323292 (r323293 for stable/10) for the EM-class by default, allowing
users to turn it on if it happens to work with their particular EM MAC
in a Gigabit-only environment.
In head, the TSO workaround for speeds other than Gigabit was lost with
the conversion to iflib(9) in r311849 (possibly along with another one
or two TSO workarounds). Yet at the same time, for EM-class MACs TSO4
got enabled by default again, causing device hangs. Therefore, change the
default for this hardware class back to have TSO4 off, allowing users
to turn it on manually if it happens to work in their environment as
we do in stable/{10,11}. An alternative would be to add a whitelist of
EM-class devices where TSO4 actually is reliable with the workarounds in
place, but given that the advantage of TSO at Gigabit speed is rather
limited - especially with the overhead of these workarounds -, that's
really not worth it. [1]
This change includes the addition of an isc_capabilities to struct
if_softc_ctx so iflib(9) can also handle interface capabilities that
shouldn't be enabled by default which is used to handle the default-off
capabilities of e1000 as suggested by shurd@ and moving their handling
from em_setup_interface() to em_if_attach_pre() accordingly.
- Although 82543 supports TSO4 in theory, the former lem(4) didn't have
support for TSO4, presumably because TSO4 is even more broken in the
LEM-class of MACs than the later EM ones. Still, TSO4 for LEM-class
devices was enabled as part of the conversion to iflib(9) in r311849,
causing device hangs. So revert back to the pre-r311849 behavior of
not supporting TSO4 for LEM-class at all, which includes not creating
a TSO DMA tag in iflib(9) for devices not having IFCAP_TSO4 set. [2]
- In fact, the FreeBSD TCP stack can handle a TSO size of IP_MAXPACKET
(65535) rather than FREEBSD_TSO_SIZE_MAX (65518). However, the TSO
DMA must have a maxsize of the maximum TSO size plus the size of a
VLAN header for software VLAN tagging. The iflib(9) converted em(4),
thus, first correctly sets scctx->isc_tx_tso_size_max to EM_TSO_SIZE
in em_if_attach_pre(), but later on overrides it with IP_MAXPACKET
in em_setup_interface() (apparently, left-over from pre-iflib(9)
times). So remove the latter and correct iflib(9) to correctly cap
the maximum TSO size reported to the stack at IP_MAXPACKET. While at
it, let iflib(9) use if_sethwtsomax*().
This change includes the addition of isc_tso_max{seg,}size DMA engine
constraints for the TSO DMA tag to struct if_shared_ctx and letting
iflib_txsd_alloc() automatically adjust the maxsize of that tag in case
IFCAP_VLAN_MTU is supported as requested by shurd@.
- Move the if_setifheaderlen(9) call for adjusting the maximum Ethernet
header length from {ixgbe,ixl,ixlv,ixv,em}_setup_interface() to iflib(9)
so adjustment is automatically done in case IFCAP_VLAN_MTU is supported.
As a consequence, this adjustment now is also done in case of bnxt(4)
which missed it previously.
- Move the reduction of the maximum TSO segment count reported to the
stack by the number of m_pullup(9) calls (which in the worst case,
can add another mbuf and, thus, the requirement for another DMA
segment each) in the transmit path for performance reasons from
em_setup_interface() to iflib_txsd_alloc() as these pull-ups are now
done in iflib_parse_header() rather than in the no longer existing
em_xmit(). Moreover, this optimization applies to all drivers using
iflib(9) and not just em(4); all in-tree iflib(9) consumers still
have enough room to handle full size TSO packets. Also, reduce the
adjustment to the maximum number of m_pullup(9)'s now performed in
iflib_parse_header().
- Prior to the conversion of em(4)/igb(4)/lem(4) and ixl(4) to iflib(9)
in r311849 and r335338 respectively, these drivers didn't enable
IFCAP_VLAN_HWFILTER by default due to VLAN events not being passed
through by lagg(4). With iflib(9), IFCAP_VLAN_HWFILTER was turned on
by default but also lagg(4) was fixed in that regard in r203548. So
just remove the now redundant and defunct IFCAP_VLAN_HWFILTER handling
in {em,ixl,ixlv}_setup_interface().
- Nuke other redundant IFCAP_* setting in {em,ixl,ixlv}_setup_interface()
which is (more completely) already done in {em,ixl,ixlv}_if_attach_pre()
now.
- Remove some redundant/dead setting of scctx->isc_tx_csum_flags in
em_if_attach_pre().
- Remove some IFCAP_* duplicated either directly or indirectly (e. g.
via IFCAP_HWCSUM) in {EM,IGB,IXL}_CAPS.
- Don't bother to fiddle with IFCAP_HWSTATS in ixgbe(4)/ixgbev(4) as
iflib(9) adds that capability unconditionally.
- Remove some unused macros from em(4).
- Bump __FreeBSD_version as some of the above changes require the modules
of drivers using iflib(9) to be recompiled.
Okayed by: sbruno@ at 201806 DevSummit Transport Working Group [1]
Reviewed by: sbruno (earlier version), erj
PR: 219428 (part of; comment #10) [1], 220997 (part of; comment #3) [2]
Differential Revision: https://reviews.freebsd.org/D15720
2018-07-15 19:04:23 +00:00
|
|
|
sctx->isc_tso_maxsegsize,/* maxsegsize */
|
2016-05-18 04:35:58 +00:00
|
|
|
0, /* flags */
|
|
|
|
NULL, /* lockfunc */
|
|
|
|
NULL, /* lockfuncarg */
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
&txq->ift_tso_buf_tag))) {
|
|
|
|
device_printf(dev, "Unable to allocate TSO TX DMA tag: %d\n",
|
|
|
|
err);
|
2016-05-18 04:35:58 +00:00
|
|
|
goto fail;
|
|
|
|
}
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
|
|
|
|
/* Allocate memory for the TX mbuf map. */
|
2016-05-18 04:35:58 +00:00
|
|
|
if (!(txq->ift_sds.ifsd_m =
|
2018-01-21 15:42:36 +00:00
|
|
|
(struct mbuf **) malloc(sizeof(struct mbuf *) *
|
|
|
|
scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
device_printf(dev, "Unable to allocate TX mbuf map memory\n");
|
2016-05-18 04:35:58 +00:00
|
|
|
err = ENOMEM;
|
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
/*
|
|
|
|
* Create the DMA maps for TX buffers.
|
|
|
|
*/
|
2019-01-16 05:44:14 +00:00
|
|
|
if ((txq->ift_sds.ifsd_map = (bus_dmamap_t *)malloc(
|
|
|
|
sizeof(bus_dmamap_t) * scctx->isc_ntxd[txq->ift_br_offset],
|
|
|
|
M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
device_printf(dev,
|
|
|
|
"Unable to allocate TX buffer DMA map memory\n");
|
2016-05-18 04:35:58 +00:00
|
|
|
err = ENOMEM;
|
|
|
|
goto fail;
|
|
|
|
}
|
2019-01-16 05:44:14 +00:00
|
|
|
if (tso && (txq->ift_sds.ifsd_tso_map = (bus_dmamap_t *)malloc(
|
|
|
|
sizeof(bus_dmamap_t) * scctx->isc_ntxd[txq->ift_br_offset],
|
|
|
|
M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
device_printf(dev,
|
|
|
|
"Unable to allocate TSO TX buffer map memory\n");
|
2019-01-16 05:44:14 +00:00
|
|
|
err = ENOMEM;
|
|
|
|
goto fail;
|
|
|
|
}
|
2016-08-12 21:29:44 +00:00
|
|
|
for (int i = 0; i < scctx->isc_ntxd[txq->ift_br_offset]; i++) {
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
err = bus_dmamap_create(txq->ift_buf_tag, 0,
|
2019-01-16 05:44:14 +00:00
|
|
|
&txq->ift_sds.ifsd_map[i]);
|
2016-05-18 04:35:58 +00:00
|
|
|
if (err != 0) {
|
|
|
|
device_printf(dev, "Unable to create TX DMA map\n");
|
|
|
|
goto fail;
|
|
|
|
}
|
2019-01-16 05:44:14 +00:00
|
|
|
if (!tso)
|
|
|
|
continue;
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
err = bus_dmamap_create(txq->ift_tso_buf_tag, 0,
|
2019-01-16 05:44:14 +00:00
|
|
|
&txq->ift_sds.ifsd_tso_map[i]);
|
|
|
|
if (err != 0) {
|
|
|
|
device_printf(dev, "Unable to create TSO TX DMA map\n");
|
|
|
|
goto fail;
|
|
|
|
}
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
return (0);
|
|
|
|
fail:
|
|
|
|
/* We free all, it handles case where we are in the middle */
|
|
|
|
iflib_tx_structures_free(ctx);
|
|
|
|
return (err);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
iflib_txsd_destroy(if_ctx_t ctx, iflib_txq_t txq, int i)
|
|
|
|
{
|
|
|
|
bus_dmamap_t map;
|
|
|
|
|
iflib: properly release memory allocated for DMA
DMA memory allocations using the bus_dma.h interface are not properly
released in all cases for both Tx and Rx. This causes ~448 bytes of
M_DEVBUF allocations to be leaked.
First, the DMA maps for Rx are not properly destroyed. A slight attempt
is made in iflib_fl_bufs_free to destroy the maps if we're detaching.
However, this function may not be reliably called during detach. Indeed,
there is a comment "asking" if this should be moved out.
Fix this by moving the bus_dmamap_destroy call into iflib_rx_sds_free,
where we already sync and unload the DMA.
Second, the DMA tag associated with the ifr_ifdi descriptor DMA is not
released properly anywhere. Add a call to iflib_dma_free in
iflib_rx_structures_free.
Third, use of NULL as a canary value on the map pointer returned by
bus_dmamap_create is not valid. On some platforms, notably x86, this
value may be NULL. In this case, we fail to properly release the related
resources.
Remove the NULL checks on map values in both iflib_fl_bufs_free and
iflib_txsd_destroy.
With all of these fixes applied, the leaks to M_DEVBUF are squelched,
and iflib drivers now seem to properly cleanup when detaching.
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Submitted by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed by: erj@, gallatin@
MFC after: 1 week
Sponsored by: Intel Corporation
Differential Revision: https://reviews.freebsd.org/D22203
2019-11-04 23:06:57 +00:00
|
|
|
if (txq->ift_sds.ifsd_map != NULL) {
|
2016-05-18 04:35:58 +00:00
|
|
|
map = txq->ift_sds.ifsd_map[i];
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
bus_dmamap_sync(txq->ift_buf_tag, map, BUS_DMASYNC_POSTWRITE);
|
|
|
|
bus_dmamap_unload(txq->ift_buf_tag, map);
|
|
|
|
bus_dmamap_destroy(txq->ift_buf_tag, map);
|
2016-05-18 04:35:58 +00:00
|
|
|
txq->ift_sds.ifsd_map[i] = NULL;
|
|
|
|
}
|
2019-01-16 05:44:14 +00:00
|
|
|
|
iflib: properly release memory allocated for DMA
DMA memory allocations using the bus_dma.h interface are not properly
released in all cases for both Tx and Rx. This causes ~448 bytes of
M_DEVBUF allocations to be leaked.
First, the DMA maps for Rx are not properly destroyed. A slight attempt
is made in iflib_fl_bufs_free to destroy the maps if we're detaching.
However, this function may not be reliably called during detach. Indeed,
there is a comment "asking" if this should be moved out.
Fix this by moving the bus_dmamap_destroy call into iflib_rx_sds_free,
where we already sync and unload the DMA.
Second, the DMA tag associated with the ifr_ifdi descriptor DMA is not
released properly anywhere. Add a call to iflib_dma_free in
iflib_rx_structures_free.
Third, use of NULL as a canary value on the map pointer returned by
bus_dmamap_create is not valid. On some platforms, notably x86, this
value may be NULL. In this case, we fail to properly release the related
resources.
Remove the NULL checks on map values in both iflib_fl_bufs_free and
iflib_txsd_destroy.
With all of these fixes applied, the leaks to M_DEVBUF are squelched,
and iflib drivers now seem to properly cleanup when detaching.
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Submitted by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed by: erj@, gallatin@
MFC after: 1 week
Sponsored by: Intel Corporation
Differential Revision: https://reviews.freebsd.org/D22203
2019-11-04 23:06:57 +00:00
|
|
|
if (txq->ift_sds.ifsd_tso_map != NULL) {
|
2019-01-16 05:44:14 +00:00
|
|
|
map = txq->ift_sds.ifsd_tso_map[i];
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
bus_dmamap_sync(txq->ift_tso_buf_tag, map,
|
2019-01-16 05:44:14 +00:00
|
|
|
BUS_DMASYNC_POSTWRITE);
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
bus_dmamap_unload(txq->ift_tso_buf_tag, map);
|
|
|
|
bus_dmamap_destroy(txq->ift_tso_buf_tag, map);
|
2019-01-16 05:44:14 +00:00
|
|
|
txq->ift_sds.ifsd_tso_map[i] = NULL;
|
|
|
|
}
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
iflib_txq_destroy(iflib_txq_t txq)
|
|
|
|
{
|
|
|
|
if_ctx_t ctx = txq->ift_ctx;
|
|
|
|
|
2016-08-12 21:29:44 +00:00
|
|
|
for (int i = 0; i < txq->ift_size; i++)
|
2016-05-18 04:35:58 +00:00
|
|
|
iflib_txsd_destroy(ctx, txq, i);
|
2019-10-30 20:45:12 +00:00
|
|
|
|
|
|
|
if (txq->ift_br != NULL) {
|
|
|
|
ifmp_ring_free(txq->ift_br);
|
|
|
|
txq->ift_br = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
mtx_destroy(&txq->ift_mtx);
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
if (txq->ift_sds.ifsd_map != NULL) {
|
|
|
|
free(txq->ift_sds.ifsd_map, M_IFLIB);
|
|
|
|
txq->ift_sds.ifsd_map = NULL;
|
|
|
|
}
|
2019-01-16 05:44:14 +00:00
|
|
|
if (txq->ift_sds.ifsd_tso_map != NULL) {
|
|
|
|
free(txq->ift_sds.ifsd_tso_map, M_IFLIB);
|
|
|
|
txq->ift_sds.ifsd_tso_map = NULL;
|
|
|
|
}
|
2016-05-18 04:35:58 +00:00
|
|
|
if (txq->ift_sds.ifsd_m != NULL) {
|
|
|
|
free(txq->ift_sds.ifsd_m, M_IFLIB);
|
|
|
|
txq->ift_sds.ifsd_m = NULL;
|
|
|
|
}
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
if (txq->ift_buf_tag != NULL) {
|
|
|
|
bus_dma_tag_destroy(txq->ift_buf_tag);
|
|
|
|
txq->ift_buf_tag = NULL;
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
if (txq->ift_tso_buf_tag != NULL) {
|
|
|
|
bus_dma_tag_destroy(txq->ift_tso_buf_tag);
|
|
|
|
txq->ift_tso_buf_tag = NULL;
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
2019-10-30 20:45:12 +00:00
|
|
|
if (txq->ift_ifdi != NULL) {
|
|
|
|
free(txq->ift_ifdi, M_IFLIB);
|
|
|
|
}
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
iflib_txsd_free(if_ctx_t ctx, iflib_txq_t txq, int i)
|
|
|
|
{
|
|
|
|
struct mbuf **mp;
|
|
|
|
|
|
|
|
mp = &txq->ift_sds.ifsd_m[i];
|
|
|
|
if (*mp == NULL)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (txq->ift_sds.ifsd_map != NULL) {
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
bus_dmamap_sync(txq->ift_buf_tag,
|
2019-01-16 05:44:14 +00:00
|
|
|
txq->ift_sds.ifsd_map[i], BUS_DMASYNC_POSTWRITE);
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
bus_dmamap_unload(txq->ift_buf_tag, txq->ift_sds.ifsd_map[i]);
|
2019-01-16 05:44:14 +00:00
|
|
|
}
|
|
|
|
if (txq->ift_sds.ifsd_tso_map != NULL) {
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
bus_dmamap_sync(txq->ift_tso_buf_tag,
|
2019-01-16 05:44:14 +00:00
|
|
|
txq->ift_sds.ifsd_tso_map[i], BUS_DMASYNC_POSTWRITE);
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
bus_dmamap_unload(txq->ift_tso_buf_tag,
|
2019-01-16 05:44:14 +00:00
|
|
|
txq->ift_sds.ifsd_tso_map[i]);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
2020-11-11 18:00:06 +00:00
|
|
|
m_freem(*mp);
|
2016-05-18 04:35:58 +00:00
|
|
|
DBG_COUNTER_INC(tx_frees);
|
|
|
|
*mp = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
iflib_txq_setup(iflib_txq_t txq)
|
|
|
|
{
|
|
|
|
if_ctx_t ctx = txq->ift_ctx;
|
2016-08-12 21:29:44 +00:00
|
|
|
if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
|
2018-11-14 15:16:45 +00:00
|
|
|
if_shared_ctx_t sctx = ctx->ifc_sctx;
|
2016-05-18 04:35:58 +00:00
|
|
|
iflib_dma_info_t di;
|
|
|
|
int i;
|
|
|
|
|
2017-01-02 00:56:33 +00:00
|
|
|
/* Set number of descriptors available */
|
2016-05-18 04:35:58 +00:00
|
|
|
txq->ift_qstatus = IFLIB_QUEUE_IDLE;
|
2017-03-13 22:53:06 +00:00
|
|
|
/* XXX make configurable */
|
|
|
|
txq->ift_update_freq = IFLIB_DEFAULT_TX_UPDATE_FREQ;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
/* Reset indices */
|
2017-03-13 22:53:06 +00:00
|
|
|
txq->ift_cidx_processed = 0;
|
|
|
|
txq->ift_pidx = txq->ift_cidx = txq->ift_npending = 0;
|
2016-08-12 21:29:44 +00:00
|
|
|
txq->ift_size = scctx->isc_ntxd[txq->ift_br_offset];
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2018-11-14 15:16:45 +00:00
|
|
|
for (i = 0, di = txq->ift_ifdi; i < sctx->isc_ntxqs; i++, di++)
|
2016-05-18 04:35:58 +00:00
|
|
|
bzero((void *)di->idi_vaddr, di->idi_size);
|
|
|
|
|
|
|
|
IFDI_TXQ_SETUP(ctx, txq->ift_id);
|
2018-11-14 15:16:45 +00:00
|
|
|
for (i = 0, di = txq->ift_ifdi; i < sctx->isc_ntxqs; i++, di++)
|
2016-05-18 04:35:58 +00:00
|
|
|
bus_dmamap_sync(di->idi_tag, di->idi_map,
|
Further correct and optimize the bus_dma(9) usage of iflib(4):
o Correct the obvious bugs in the netmap(4) parts:
- No longer check for the existence of DMA maps as bus_dma(9)
is used unconditionally in iflib(4) since r341095.
- Supply the correct DMA tag and map pairs to bus_dma(9)
functions (see also the commit message of r343753).
- In iflib_netmap_timer_adjust(), add synchronization of the
TX descriptors before calling the ift_txd_credits_update
method as the latter evaluates the TX descriptors possibly
updated by the MAC.
- In _task_fn_tx(), wrap the netmap(4)-specific bits in
#ifdef DEV_NETMAP just as done in _task_fn_admin() and
_task_fn_rx() respectively.
o In iflib_fast_intr_rxtx(), synchronize the TX rather than
the RX descriptors before calling the ift_txd_credits_update
method (see also above).
o There's no need to synchronize an RX buffer that is going to
be recycled in iflib_rxd_pkt_get(), yet; it's sufficient to
do that as late as passing RX buffers to the MAC via the
ift_rxd_refill method. Hence, combine that synchronization
with the synchronization of new buffers into a common spot
in _iflib_fl_refill().
o There's no need to synchronize the RX descriptors of a free
list in preparation of the MAC updating their statuses with
every invocation of rxd_frag_to_sd(); it's enough to do this
once before handing control over to the MAC, i. e. before
calling ift_rxd_flush method in _iflib_fl_refill(), which
already performs the necessary synchronization.
o Given that the ift_rxd_available method evaluates the RX
descriptors which possibly have been altered by the MAC,
synchronize as appropriate beforehand. Most notably this
is now done in iflib_rxd_avail(), which in turn means that
we don't need to issue the same synchronization yet again
before calling the ift_rxd_pkt_get method in iflib_rxeof().
o In iflib_txd_db_check(), synchronize the TX descriptors
before handing them over to the MAC for transmission via
the ift_txd_flush method.
o In iflib_encap(), move the TX buffer synchronization after
the invocation of the ift_txd_encap() method. If the MAC
driver fails to encapsulate the packet and we retry with
a defragmented mbuf chain or finally fail, the cycles for
TX buffer synchronization have been wasted. Synchronizing
afterwards matches what non-iflib(4) drivers typically do
and is sufficient as the MAC will not actually start with
the transmission before - in this case - the ift_txd_flush
method is called.
Moreover, for the latter reason the synchronization of the
TX descriptors in iflib_encap() can go as it's enough to
synchronize them before passing control over to the MAC by
issuing the ift_txd_flush() method (see above).
o In iflib_txq_can_drain(), only synchronize TX descriptors
if the ift_txd_credits_update method accessing these is
actually called.
Differential Revision: https://reviews.freebsd.org/D19081
2019-02-12 21:08:44 +00:00
|
|
|
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
|
2016-05-18 04:35:58 +00:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*********************************************************************
|
|
|
|
*
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
* Allocate DMA resources for RX buffers as well as memory for the RX
|
|
|
|
* mbuf map, direct RX cluster pointer map and RX cluster bus address
|
|
|
|
* map. RX DMA map, RX mbuf map, direct RX cluster pointer map and
|
|
|
|
* RX cluster map are kept in an iflib_sw_rx_desc_array structure.
|
|
|
|
* Since we use one entry in iflib_sw_rx_desc_array per received
|
|
|
|
* packet, the maximum number of entries we'll need is equal to the
|
|
|
|
* number of hardware receive descriptors that we've allocated.
|
2016-05-18 04:35:58 +00:00
|
|
|
*
|
|
|
|
**********************************************************************/
|
|
|
|
static int
|
|
|
|
iflib_rxsd_alloc(iflib_rxq_t rxq)
|
|
|
|
{
|
|
|
|
if_ctx_t ctx = rxq->ifr_ctx;
|
|
|
|
if_shared_ctx_t sctx = ctx->ifc_sctx;
|
2016-08-12 21:29:44 +00:00
|
|
|
if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
|
2016-05-18 04:35:58 +00:00
|
|
|
device_t dev = ctx->ifc_dev;
|
|
|
|
iflib_fl_t fl;
|
2021-02-24 22:56:45 +00:00
|
|
|
bus_addr_t lowaddr;
|
2016-05-18 04:35:58 +00:00
|
|
|
int err;
|
|
|
|
|
2016-08-12 21:29:44 +00:00
|
|
|
MPASS(scctx->isc_nrxd[0] > 0);
|
|
|
|
MPASS(scctx->isc_nrxd[rxq->ifr_fl_offset] > 0);
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2021-02-24 22:56:45 +00:00
|
|
|
lowaddr = DMA_WIDTH_TO_BUS_LOWADDR(scctx->isc_dma_width);
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
fl = rxq->ifr_fl;
|
|
|
|
for (int i = 0; i < rxq->ifr_nfl; i++, fl++) {
|
2016-08-12 21:29:44 +00:00
|
|
|
fl->ifl_size = scctx->isc_nrxd[rxq->ifr_fl_offset]; /* this isn't necessarily the same */
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
/* Set up DMA tag for RX buffers. */
|
2016-05-18 04:35:58 +00:00
|
|
|
err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */
|
|
|
|
1, 0, /* alignment, bounds */
|
2021-02-24 22:56:45 +00:00
|
|
|
lowaddr, /* lowaddr */
|
2016-05-18 04:35:58 +00:00
|
|
|
BUS_SPACE_MAXADDR, /* highaddr */
|
|
|
|
NULL, NULL, /* filter, filterarg */
|
|
|
|
sctx->isc_rx_maxsize, /* maxsize */
|
|
|
|
sctx->isc_rx_nsegments, /* nsegments */
|
|
|
|
sctx->isc_rx_maxsegsize, /* maxsegsize */
|
|
|
|
0, /* flags */
|
|
|
|
NULL, /* lockfunc */
|
|
|
|
NULL, /* lockarg */
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
&fl->ifl_buf_tag);
|
2016-05-18 04:35:58 +00:00
|
|
|
if (err) {
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
device_printf(dev,
|
|
|
|
"Unable to allocate RX DMA tag: %d\n", err);
|
2016-05-18 04:35:58 +00:00
|
|
|
goto fail;
|
|
|
|
}
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
|
|
|
|
/* Allocate memory for the RX mbuf map. */
|
2017-01-27 23:08:06 +00:00
|
|
|
if (!(fl->ifl_sds.ifsd_m =
|
2018-01-21 15:42:36 +00:00
|
|
|
(struct mbuf **) malloc(sizeof(struct mbuf *) *
|
|
|
|
scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
device_printf(dev,
|
|
|
|
"Unable to allocate RX mbuf map memory\n");
|
2017-01-27 23:08:06 +00:00
|
|
|
err = ENOMEM;
|
|
|
|
goto fail;
|
|
|
|
}
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
|
|
|
|
/* Allocate memory for the direct RX cluster pointer map. */
|
2017-01-27 23:08:06 +00:00
|
|
|
if (!(fl->ifl_sds.ifsd_cl =
|
2018-01-21 15:42:36 +00:00
|
|
|
(caddr_t *) malloc(sizeof(caddr_t) *
|
|
|
|
scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
device_printf(dev,
|
|
|
|
"Unable to allocate RX cluster map memory\n");
|
2017-01-27 23:08:06 +00:00
|
|
|
err = ENOMEM;
|
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
/* Allocate memory for the RX cluster bus address map. */
|
2018-11-27 20:01:05 +00:00
|
|
|
if (!(fl->ifl_sds.ifsd_ba =
|
|
|
|
(bus_addr_t *) malloc(sizeof(bus_addr_t) *
|
|
|
|
scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
device_printf(dev,
|
|
|
|
"Unable to allocate RX bus address map memory\n");
|
2018-11-27 20:01:05 +00:00
|
|
|
err = ENOMEM;
|
|
|
|
goto fail;
|
|
|
|
}
|
2017-01-27 23:08:06 +00:00
|
|
|
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
/*
|
|
|
|
* Create the DMA maps for RX buffers.
|
|
|
|
*/
|
2017-01-27 23:08:06 +00:00
|
|
|
if (!(fl->ifl_sds.ifsd_map =
|
2018-01-21 15:42:36 +00:00
|
|
|
(bus_dmamap_t *) malloc(sizeof(bus_dmamap_t) * scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
device_printf(dev,
|
|
|
|
"Unable to allocate RX buffer DMA map memory\n");
|
2017-01-27 23:08:06 +00:00
|
|
|
err = ENOMEM;
|
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
for (int i = 0; i < scctx->isc_nrxd[rxq->ifr_fl_offset]; i++) {
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
err = bus_dmamap_create(fl->ifl_buf_tag, 0,
|
|
|
|
&fl->ifl_sds.ifsd_map[i]);
|
2017-01-27 23:08:06 +00:00
|
|
|
if (err != 0) {
|
2017-03-13 22:53:06 +00:00
|
|
|
device_printf(dev, "Unable to create RX buffer DMA map\n");
|
2016-05-18 04:35:58 +00:00
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
}
|
2017-01-28 15:44:14 +00:00
|
|
|
}
|
2016-05-18 04:35:58 +00:00
|
|
|
return (0);
|
|
|
|
|
|
|
|
fail:
|
|
|
|
iflib_rx_structures_free(ctx);
|
|
|
|
return (err);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Internal service routines
|
|
|
|
*/
|
|
|
|
|
|
|
|
struct rxq_refill_cb_arg {
|
|
|
|
int error;
|
|
|
|
bus_dma_segment_t seg;
|
|
|
|
int nseg;
|
|
|
|
};
|
|
|
|
|
|
|
|
static void
|
|
|
|
_rxq_refill_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
|
|
|
|
{
|
|
|
|
struct rxq_refill_cb_arg *cb_arg = arg;
|
|
|
|
|
|
|
|
cb_arg->error = error;
|
|
|
|
cb_arg->seg = segs[0];
|
|
|
|
cb_arg->nseg = nseg;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2020-07-06 14:52:21 +00:00
|
|
|
* iflib_fl_refill - refill an rxq free-buffer list
|
2019-05-06 20:56:41 +00:00
|
|
|
* @ctx: the iflib context
|
|
|
|
* @fl: the free list to refill
|
|
|
|
* @count: the number of new buffers to allocate
|
2016-05-18 04:35:58 +00:00
|
|
|
*
|
2019-05-06 20:56:41 +00:00
|
|
|
* (Re)populate an rxq free-buffer list with up to @count new packet buffers.
|
iflib: leave only 1 receive descriptor unused
The pidx argument of isc_rxd_flush() indicates which is the last valid
receive descriptor to be used by the NIC. However, current code has
multiple issues:
- Intel drivers write pidx to their RDT register, which means that
NICs will only use the descriptors up to pidx-1 (modulo ring size N),
and won't actually use the one pointed by pidx. This does not break
reception, but it is anyway confusing and suboptimal (the NIC will
actually see only N-2 descriptors as available, rather than N-1).
Other drivers (if_vmx, if_bnxt, if_mgb) adhere to this semantic).
- The semantic used by Intel (RDT is one descriptor past the last
valid one) is used by most (if not all) NICs, and it is also used
on the TX side (also in iflib). Since iflib is not currently
using this semantic for RX, it must decrement fl->ifl_pidx
(modulo N) before calling isc_rxd_flush(), and then the
per-driver callback implementation must increment the index
again (to match the real semantic). This is confusing and suboptimal.
- The iflib refill function is also called at initialization.
However, in case the ring size is smaller than 128 (e.g. if_mgb),
the refill function will actually prepare all the receive
descriptors (N), without leaving one unused, as most of NICs assume
(e.g. to avoid RDT to overrun RDH). I can speculate that the code
looks like this right now because this issue showed up during
testing (e.g. with if_mgb), and it was easy to workaround by
decrementing pidx before isc_rxd_flush().
The goal of this change is to simplify the code (removing a bunch
of instructions from the RX fast path), and to make the semantic of
isc_rxd_flush() consistent across drivers. To achieve this, we:
- change the semantics of the pidx argument to the usual one (that
is the index one past the last valid one), so that both iflib and
drivers avoid the decrement/increment dance.
- fix the initialization code to prepare at most N-1 descriptors.
Reviewed by: markj
MFC after: 2 weeks
Differential Revision: https://reviews.freebsd.org/D26191
2020-09-01 20:41:47 +00:00
|
|
|
* The caller must ensure that @count does not exceed the queue's capacity
|
|
|
|
* minus one (since we always leave a descriptor unavailable).
|
2016-05-18 04:35:58 +00:00
|
|
|
*/
|
2020-02-12 08:30:07 +00:00
|
|
|
static uint8_t
|
2020-07-06 14:52:21 +00:00
|
|
|
iflib_fl_refill(if_ctx_t ctx, iflib_fl_t fl, int count)
|
2016-05-18 04:35:58 +00:00
|
|
|
{
|
2019-01-26 21:35:51 +00:00
|
|
|
struct if_rxd_update iru;
|
|
|
|
struct rxq_refill_cb_arg cb_arg;
|
2016-05-18 04:35:58 +00:00
|
|
|
struct mbuf *m;
|
2017-01-27 23:08:06 +00:00
|
|
|
caddr_t cl, *sd_cl;
|
|
|
|
struct mbuf **sd_m;
|
|
|
|
bus_dmamap_t *sd_map;
|
2018-11-27 20:01:05 +00:00
|
|
|
bus_addr_t bus_addr, *sd_ba;
|
2019-01-26 21:35:51 +00:00
|
|
|
int err, frag_idx, i, idx, n, pidx;
|
2017-10-31 17:50:42 +00:00
|
|
|
qidx_t credits;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
iflib: leave only 1 receive descriptor unused
The pidx argument of isc_rxd_flush() indicates which is the last valid
receive descriptor to be used by the NIC. However, current code has
multiple issues:
- Intel drivers write pidx to their RDT register, which means that
NICs will only use the descriptors up to pidx-1 (modulo ring size N),
and won't actually use the one pointed by pidx. This does not break
reception, but it is anyway confusing and suboptimal (the NIC will
actually see only N-2 descriptors as available, rather than N-1).
Other drivers (if_vmx, if_bnxt, if_mgb) adhere to this semantic).
- The semantic used by Intel (RDT is one descriptor past the last
valid one) is used by most (if not all) NICs, and it is also used
on the TX side (also in iflib). Since iflib is not currently
using this semantic for RX, it must decrement fl->ifl_pidx
(modulo N) before calling isc_rxd_flush(), and then the
per-driver callback implementation must increment the index
again (to match the real semantic). This is confusing and suboptimal.
- The iflib refill function is also called at initialization.
However, in case the ring size is smaller than 128 (e.g. if_mgb),
the refill function will actually prepare all the receive
descriptors (N), without leaving one unused, as most of NICs assume
(e.g. to avoid RDT to overrun RDH). I can speculate that the code
looks like this right now because this issue showed up during
testing (e.g. with if_mgb), and it was easy to workaround by
decrementing pidx before isc_rxd_flush().
The goal of this change is to simplify the code (removing a bunch
of instructions from the RX fast path), and to make the semantic of
isc_rxd_flush() consistent across drivers. To achieve this, we:
- change the semantics of the pidx argument to the usual one (that
is the index one past the last valid one), so that both iflib and
drivers avoid the decrement/increment dance.
- fix the initialization code to prepare at most N-1 descriptors.
Reviewed by: markj
MFC after: 2 weeks
Differential Revision: https://reviews.freebsd.org/D26191
2020-09-01 20:41:47 +00:00
|
|
|
MPASS(count <= fl->ifl_size - fl->ifl_credits - 1);
|
|
|
|
|
2017-01-27 23:08:06 +00:00
|
|
|
sd_m = fl->ifl_sds.ifsd_m;
|
|
|
|
sd_map = fl->ifl_sds.ifsd_map;
|
|
|
|
sd_cl = fl->ifl_sds.ifsd_cl;
|
2018-11-27 20:01:05 +00:00
|
|
|
sd_ba = fl->ifl_sds.ifsd_ba;
|
2019-01-26 21:35:51 +00:00
|
|
|
pidx = fl->ifl_pidx;
|
2017-01-27 23:08:06 +00:00
|
|
|
idx = pidx;
|
2019-01-26 21:35:51 +00:00
|
|
|
frag_idx = fl->ifl_fragidx;
|
2017-10-31 17:50:42 +00:00
|
|
|
credits = fl->ifl_credits;
|
2017-01-27 23:08:06 +00:00
|
|
|
|
2019-01-26 21:35:51 +00:00
|
|
|
i = 0;
|
|
|
|
n = count;
|
2016-05-18 04:35:58 +00:00
|
|
|
MPASS(n > 0);
|
2017-10-31 17:50:42 +00:00
|
|
|
MPASS(credits + n <= fl->ifl_size);
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
if (pidx < fl->ifl_cidx)
|
|
|
|
MPASS(pidx + n <= fl->ifl_cidx);
|
2017-10-31 17:50:42 +00:00
|
|
|
if (pidx == fl->ifl_cidx && (credits < fl->ifl_size))
|
2016-05-18 04:35:58 +00:00
|
|
|
MPASS(fl->ifl_gen == 0);
|
|
|
|
if (pidx > fl->ifl_cidx)
|
|
|
|
MPASS(n <= fl->ifl_size - pidx + fl->ifl_cidx);
|
|
|
|
|
|
|
|
DBG_COUNTER_INC(fl_refills);
|
|
|
|
if (n > 8)
|
|
|
|
DBG_COUNTER_INC(fl_refills_large);
|
2017-10-30 21:14:31 +00:00
|
|
|
iru_init(&iru, fl->ifl_rxq, fl->ifl_id);
|
2020-07-06 14:52:21 +00:00
|
|
|
while (n-- > 0) {
|
2016-05-18 04:35:58 +00:00
|
|
|
/*
|
|
|
|
* We allocate an uninitialized mbuf + cluster, mbuf is
|
|
|
|
* initialized after rx.
|
|
|
|
*
|
2020-07-06 14:52:21 +00:00
|
|
|
* If the cluster is still set then we know a minimum sized
|
|
|
|
* packet was received
|
2016-05-18 04:35:58 +00:00
|
|
|
*/
|
2019-01-26 21:35:51 +00:00
|
|
|
bit_ffc_at(fl->ifl_rx_bitmap, frag_idx, fl->ifl_size,
|
|
|
|
&frag_idx);
|
|
|
|
if (frag_idx < 0)
|
|
|
|
bit_ffc(fl->ifl_rx_bitmap, fl->ifl_size, &frag_idx);
|
|
|
|
MPASS(frag_idx >= 0);
|
2017-07-03 18:23:35 +00:00
|
|
|
if ((cl = sd_cl[frag_idx]) == NULL) {
|
2020-07-06 14:52:21 +00:00
|
|
|
cl = uma_zalloc(fl->ifl_zone, M_NOWAIT);
|
2020-07-06 14:52:09 +00:00
|
|
|
if (__predict_false(cl == NULL))
|
2016-05-18 04:35:58 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
cb_arg.error = 0;
|
2017-03-13 22:53:06 +00:00
|
|
|
MPASS(sd_map != NULL);
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
err = bus_dmamap_load(fl->ifl_buf_tag, sd_map[frag_idx],
|
2019-01-16 05:44:14 +00:00
|
|
|
cl, fl->ifl_buf_size, _rxq_refill_cb, &cb_arg,
|
|
|
|
BUS_DMA_NOWAIT);
|
2020-07-06 14:52:09 +00:00
|
|
|
if (__predict_false(err != 0 || cb_arg.error)) {
|
|
|
|
uma_zfree(fl->ifl_zone, cl);
|
2018-11-27 20:01:05 +00:00
|
|
|
break;
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
2018-11-27 20:01:05 +00:00
|
|
|
|
2020-07-06 14:52:21 +00:00
|
|
|
sd_ba[frag_idx] = bus_addr = cb_arg.seg.ds_addr;
|
2018-11-27 20:01:05 +00:00
|
|
|
sd_cl[frag_idx] = cl;
|
|
|
|
#if MEMORY_LOGGING
|
|
|
|
fl->ifl_cl_enqueued++;
|
|
|
|
#endif
|
|
|
|
} else {
|
|
|
|
bus_addr = sd_ba[frag_idx];
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
Further correct and optimize the bus_dma(9) usage of iflib(4):
o Correct the obvious bugs in the netmap(4) parts:
- No longer check for the existence of DMA maps as bus_dma(9)
is used unconditionally in iflib(4) since r341095.
- Supply the correct DMA tag and map pairs to bus_dma(9)
functions (see also the commit message of r343753).
- In iflib_netmap_timer_adjust(), add synchronization of the
TX descriptors before calling the ift_txd_credits_update
method as the latter evaluates the TX descriptors possibly
updated by the MAC.
- In _task_fn_tx(), wrap the netmap(4)-specific bits in
#ifdef DEV_NETMAP just as done in _task_fn_admin() and
_task_fn_rx() respectively.
o In iflib_fast_intr_rxtx(), synchronize the TX rather than
the RX descriptors before calling the ift_txd_credits_update
method (see also above).
o There's no need to synchronize an RX buffer that is going to
be recycled in iflib_rxd_pkt_get(), yet; it's sufficient to
do that as late as passing RX buffers to the MAC via the
ift_rxd_refill method. Hence, combine that synchronization
with the synchronization of new buffers into a common spot
in _iflib_fl_refill().
o There's no need to synchronize the RX descriptors of a free
list in preparation of the MAC updating their statuses with
every invocation of rxd_frag_to_sd(); it's enough to do this
once before handing control over to the MAC, i. e. before
calling ift_rxd_flush method in _iflib_fl_refill(), which
already performs the necessary synchronization.
o Given that the ift_rxd_available method evaluates the RX
descriptors which possibly have been altered by the MAC,
synchronize as appropriate beforehand. Most notably this
is now done in iflib_rxd_avail(), which in turn means that
we don't need to issue the same synchronization yet again
before calling the ift_rxd_pkt_get method in iflib_rxeof().
o In iflib_txd_db_check(), synchronize the TX descriptors
before handing them over to the MAC for transmission via
the ift_txd_flush method.
o In iflib_encap(), move the TX buffer synchronization after
the invocation of the ift_txd_encap() method. If the MAC
driver fails to encapsulate the packet and we retry with
a defragmented mbuf chain or finally fail, the cycles for
TX buffer synchronization have been wasted. Synchronizing
afterwards matches what non-iflib(4) drivers typically do
and is sufficient as the MAC will not actually start with
the transmission before - in this case - the ift_txd_flush
method is called.
Moreover, for the latter reason the synchronization of the
TX descriptors in iflib_encap() can go as it's enough to
synchronize them before passing control over to the MAC by
issuing the ift_txd_flush() method (see above).
o In iflib_txq_can_drain(), only synchronize TX descriptors
if the ift_txd_credits_update method accessing these is
actually called.
Differential Revision: https://reviews.freebsd.org/D19081
2019-02-12 21:08:44 +00:00
|
|
|
bus_dmamap_sync(fl->ifl_buf_tag, sd_map[frag_idx],
|
|
|
|
BUS_DMASYNC_PREREAD);
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2019-04-24 13:32:04 +00:00
|
|
|
if (sd_m[frag_idx] == NULL) {
|
2021-07-06 18:52:48 +00:00
|
|
|
m = m_gethdr_raw(M_NOWAIT, 0);
|
2020-07-06 14:52:09 +00:00
|
|
|
if (__predict_false(m == NULL))
|
2019-04-24 13:32:04 +00:00
|
|
|
break;
|
|
|
|
sd_m[frag_idx] = m;
|
2018-11-27 20:01:05 +00:00
|
|
|
}
|
2019-01-26 21:35:51 +00:00
|
|
|
bit_set(fl->ifl_rx_bitmap, frag_idx);
|
2018-11-27 20:01:05 +00:00
|
|
|
#if MEMORY_LOGGING
|
|
|
|
fl->ifl_m_enqueued++;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
DBG_COUNTER_INC(rx_allocs);
|
2017-07-03 18:23:35 +00:00
|
|
|
fl->ifl_rxd_idxs[i] = frag_idx;
|
2016-05-18 04:35:58 +00:00
|
|
|
fl->ifl_bus_addrs[i] = bus_addr;
|
2017-10-31 17:50:42 +00:00
|
|
|
credits++;
|
2016-05-18 04:35:58 +00:00
|
|
|
i++;
|
2017-10-31 17:50:42 +00:00
|
|
|
MPASS(credits <= fl->ifl_size);
|
2017-01-27 23:08:06 +00:00
|
|
|
if (++idx == fl->ifl_size) {
|
2020-07-06 14:52:21 +00:00
|
|
|
#ifdef INVARIANTS
|
2016-05-18 04:35:58 +00:00
|
|
|
fl->ifl_gen = 1;
|
2020-07-06 14:52:21 +00:00
|
|
|
#endif
|
2017-01-27 23:08:06 +00:00
|
|
|
idx = 0;
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
if (n == 0 || i == IFLIB_MAX_RX_REFRESH) {
|
2017-03-13 22:53:06 +00:00
|
|
|
iru.iru_pidx = pidx;
|
|
|
|
iru.iru_count = i;
|
|
|
|
ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
|
2017-07-03 18:23:35 +00:00
|
|
|
fl->ifl_pidx = idx;
|
2017-10-31 17:50:42 +00:00
|
|
|
fl->ifl_credits = credits;
|
2020-07-06 14:52:21 +00:00
|
|
|
pidx = idx;
|
|
|
|
i = 0;
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
}
|
2018-11-27 20:01:05 +00:00
|
|
|
|
2020-07-06 14:52:09 +00:00
|
|
|
if (n < count - 1) {
|
|
|
|
if (i != 0) {
|
|
|
|
iru.iru_pidx = pidx;
|
|
|
|
iru.iru_count = i;
|
|
|
|
ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
|
|
|
|
fl->ifl_pidx = idx;
|
|
|
|
fl->ifl_credits = credits;
|
|
|
|
}
|
|
|
|
DBG_COUNTER_INC(rxd_flush);
|
|
|
|
bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
|
|
|
|
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
|
|
|
|
ctx->isc_rxd_flush(ctx->ifc_softc, fl->ifl_rxq->ifr_id,
|
iflib: leave only 1 receive descriptor unused
The pidx argument of isc_rxd_flush() indicates which is the last valid
receive descriptor to be used by the NIC. However, current code has
multiple issues:
- Intel drivers write pidx to their RDT register, which means that
NICs will only use the descriptors up to pidx-1 (modulo ring size N),
and won't actually use the one pointed by pidx. This does not break
reception, but it is anyway confusing and suboptimal (the NIC will
actually see only N-2 descriptors as available, rather than N-1).
Other drivers (if_vmx, if_bnxt, if_mgb) adhere to this semantic).
- The semantic used by Intel (RDT is one descriptor past the last
valid one) is used by most (if not all) NICs, and it is also used
on the TX side (also in iflib). Since iflib is not currently
using this semantic for RX, it must decrement fl->ifl_pidx
(modulo N) before calling isc_rxd_flush(), and then the
per-driver callback implementation must increment the index
again (to match the real semantic). This is confusing and suboptimal.
- The iflib refill function is also called at initialization.
However, in case the ring size is smaller than 128 (e.g. if_mgb),
the refill function will actually prepare all the receive
descriptors (N), without leaving one unused, as most of NICs assume
(e.g. to avoid RDT to overrun RDH). I can speculate that the code
looks like this right now because this issue showed up during
testing (e.g. with if_mgb), and it was easy to workaround by
decrementing pidx before isc_rxd_flush().
The goal of this change is to simplify the code (removing a bunch
of instructions from the RX fast path), and to make the semantic of
isc_rxd_flush() consistent across drivers. To achieve this, we:
- change the semantics of the pidx argument to the usual one (that
is the index one past the last valid one), so that both iflib and
drivers avoid the decrement/increment dance.
- fix the initialization code to prepare at most N-1 descriptors.
Reviewed by: markj
MFC after: 2 weeks
Differential Revision: https://reviews.freebsd.org/D26191
2020-09-01 20:41:47 +00:00
|
|
|
fl->ifl_id, fl->ifl_pidx);
|
2020-07-06 14:52:09 +00:00
|
|
|
if (__predict_true(bit_test(fl->ifl_rx_bitmap, frag_idx))) {
|
|
|
|
fl->ifl_fragidx = frag_idx + 1;
|
|
|
|
if (fl->ifl_fragidx == fl->ifl_size)
|
|
|
|
fl->ifl_fragidx = 0;
|
|
|
|
} else {
|
|
|
|
fl->ifl_fragidx = frag_idx;
|
|
|
|
}
|
|
|
|
}
|
2020-02-12 08:30:07 +00:00
|
|
|
|
|
|
|
return (n == -1 ? 0 : IFLIB_RXEOF_EMPTY);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
2020-07-06 14:52:21 +00:00
|
|
|
static inline uint8_t
|
|
|
|
iflib_fl_refill_all(if_ctx_t ctx, iflib_fl_t fl)
|
2016-05-18 04:35:58 +00:00
|
|
|
{
|
iflib: leave only 1 receive descriptor unused
The pidx argument of isc_rxd_flush() indicates which is the last valid
receive descriptor to be used by the NIC. However, current code has
multiple issues:
- Intel drivers write pidx to their RDT register, which means that
NICs will only use the descriptors up to pidx-1 (modulo ring size N),
and won't actually use the one pointed by pidx. This does not break
reception, but it is anyway confusing and suboptimal (the NIC will
actually see only N-2 descriptors as available, rather than N-1).
Other drivers (if_vmx, if_bnxt, if_mgb) adhere to this semantic).
- The semantic used by Intel (RDT is one descriptor past the last
valid one) is used by most (if not all) NICs, and it is also used
on the TX side (also in iflib). Since iflib is not currently
using this semantic for RX, it must decrement fl->ifl_pidx
(modulo N) before calling isc_rxd_flush(), and then the
per-driver callback implementation must increment the index
again (to match the real semantic). This is confusing and suboptimal.
- The iflib refill function is also called at initialization.
However, in case the ring size is smaller than 128 (e.g. if_mgb),
the refill function will actually prepare all the receive
descriptors (N), without leaving one unused, as most of NICs assume
(e.g. to avoid RDT to overrun RDH). I can speculate that the code
looks like this right now because this issue showed up during
testing (e.g. with if_mgb), and it was easy to workaround by
decrementing pidx before isc_rxd_flush().
The goal of this change is to simplify the code (removing a bunch
of instructions from the RX fast path), and to make the semantic of
isc_rxd_flush() consistent across drivers. To achieve this, we:
- change the semantics of the pidx argument to the usual one (that
is the index one past the last valid one), so that both iflib and
drivers avoid the decrement/increment dance.
- fix the initialization code to prepare at most N-1 descriptors.
Reviewed by: markj
MFC after: 2 weeks
Differential Revision: https://reviews.freebsd.org/D26191
2020-09-01 20:41:47 +00:00
|
|
|
/*
|
|
|
|
* We leave an unused descriptor to avoid pidx to catch up with cidx.
|
|
|
|
* This is important as it confuses most NICs. For instance,
|
|
|
|
* Intel NICs have (per receive ring) RDH and RDT registers, where
|
|
|
|
* RDH points to the next receive descriptor to be used by the NIC,
|
|
|
|
* and RDT for the next receive descriptor to be published by the
|
|
|
|
* driver to the NIC (RDT - 1 is thus the last valid one).
|
|
|
|
* The condition RDH == RDT means no descriptors are available to
|
|
|
|
* the NIC, and thus it would be ambiguous if it also meant that
|
|
|
|
* all the descriptors are available to the NIC.
|
|
|
|
*/
|
2016-05-18 04:35:58 +00:00
|
|
|
int32_t reclaimable = fl->ifl_size - fl->ifl_credits - 1;
|
|
|
|
#ifdef INVARIANTS
|
|
|
|
int32_t delta = fl->ifl_size - get_inuse(fl->ifl_size, fl->ifl_cidx, fl->ifl_pidx, fl->ifl_gen) - 1;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
MPASS(fl->ifl_credits <= fl->ifl_size);
|
|
|
|
MPASS(reclaimable == delta);
|
|
|
|
|
|
|
|
if (reclaimable > 0)
|
2020-07-06 14:52:21 +00:00
|
|
|
return (iflib_fl_refill(ctx, fl, reclaimable));
|
2020-02-12 08:30:07 +00:00
|
|
|
return (0);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
2018-10-12 22:40:54 +00:00
|
|
|
uint8_t
|
|
|
|
iflib_in_detach(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
bool in_detach;
|
2019-05-06 20:56:41 +00:00
|
|
|
|
2018-10-12 22:40:54 +00:00
|
|
|
STATE_LOCK(ctx);
|
|
|
|
in_detach = !!(ctx->ifc_flags & IFC_IN_DETACH);
|
|
|
|
STATE_UNLOCK(ctx);
|
|
|
|
return (in_detach);
|
|
|
|
}
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
static void
|
|
|
|
iflib_fl_bufs_free(iflib_fl_t fl)
|
|
|
|
{
|
|
|
|
iflib_dma_info_t idi = fl->ifl_ifdi;
|
2019-01-16 05:44:14 +00:00
|
|
|
bus_dmamap_t sd_map;
|
2016-05-18 04:35:58 +00:00
|
|
|
uint32_t i;
|
|
|
|
|
|
|
|
for (i = 0; i < fl->ifl_size; i++) {
|
2017-01-27 23:08:06 +00:00
|
|
|
struct mbuf **sd_m = &fl->ifl_sds.ifsd_m[i];
|
|
|
|
caddr_t *sd_cl = &fl->ifl_sds.ifsd_cl[i];
|
|
|
|
|
2018-11-27 20:01:05 +00:00
|
|
|
if (*sd_cl != NULL) {
|
2019-01-16 05:44:14 +00:00
|
|
|
sd_map = fl->ifl_sds.ifsd_map[i];
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
bus_dmamap_sync(fl->ifl_buf_tag, sd_map,
|
2019-01-16 05:44:14 +00:00
|
|
|
BUS_DMASYNC_POSTREAD);
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
bus_dmamap_unload(fl->ifl_buf_tag, sd_map);
|
2020-07-06 14:52:21 +00:00
|
|
|
uma_zfree(fl->ifl_zone, *sd_cl);
|
|
|
|
*sd_cl = NULL;
|
2017-01-27 23:08:06 +00:00
|
|
|
if (*sd_m != NULL) {
|
|
|
|
m_init(*sd_m, M_NOWAIT, MT_DATA, 0);
|
2021-06-30 14:17:29 +00:00
|
|
|
m_free_raw(*sd_m);
|
2020-07-06 14:52:21 +00:00
|
|
|
*sd_m = NULL;
|
2017-01-27 23:08:06 +00:00
|
|
|
}
|
2016-05-18 04:35:58 +00:00
|
|
|
} else {
|
2017-01-27 23:08:06 +00:00
|
|
|
MPASS(*sd_m == NULL);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
#if MEMORY_LOGGING
|
2017-09-16 02:41:38 +00:00
|
|
|
fl->ifl_m_dequeued++;
|
|
|
|
fl->ifl_cl_dequeued++;
|
2016-05-18 04:35:58 +00:00
|
|
|
#endif
|
|
|
|
}
|
2017-03-13 22:53:06 +00:00
|
|
|
#ifdef INVARIANTS
|
|
|
|
for (i = 0; i < fl->ifl_size; i++) {
|
|
|
|
MPASS(fl->ifl_sds.ifsd_cl[i] == NULL);
|
|
|
|
MPASS(fl->ifl_sds.ifsd_m[i] == NULL);
|
|
|
|
}
|
|
|
|
#endif
|
2016-05-18 04:35:58 +00:00
|
|
|
/*
|
|
|
|
* Reset free list values
|
|
|
|
*/
|
2017-07-03 18:23:35 +00:00
|
|
|
fl->ifl_credits = fl->ifl_cidx = fl->ifl_pidx = fl->ifl_gen = fl->ifl_fragidx = 0;
|
2016-05-18 04:35:58 +00:00
|
|
|
bzero(idi->idi_vaddr, idi->idi_size);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*********************************************************************
|
|
|
|
*
|
2019-05-06 20:56:41 +00:00
|
|
|
* Initialize a free list and its buffers.
|
2016-05-18 04:35:58 +00:00
|
|
|
*
|
|
|
|
**********************************************************************/
|
|
|
|
static int
|
|
|
|
iflib_fl_setup(iflib_fl_t fl)
|
|
|
|
{
|
|
|
|
iflib_rxq_t rxq = fl->ifl_rxq;
|
|
|
|
if_ctx_t ctx = rxq->ifr_ctx;
|
2020-03-14 19:56:46 +00:00
|
|
|
if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
|
|
|
|
int qidx;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2017-11-20 21:57:04 +00:00
|
|
|
bit_nclear(fl->ifl_rx_bitmap, 0, fl->ifl_size - 1);
|
2016-05-18 04:35:58 +00:00
|
|
|
/*
|
|
|
|
** Free current RX buffer structs and their mbufs
|
|
|
|
*/
|
|
|
|
iflib_fl_bufs_free(fl);
|
|
|
|
/* Now replenish the mbufs */
|
|
|
|
MPASS(fl->ifl_credits == 0);
|
2020-03-14 19:56:46 +00:00
|
|
|
qidx = rxq->ifr_fl_offset + fl->ifl_id;
|
|
|
|
if (scctx->isc_rxd_buf_size[qidx] != 0)
|
|
|
|
fl->ifl_buf_size = scctx->isc_rxd_buf_size[qidx];
|
|
|
|
else
|
|
|
|
fl->ifl_buf_size = ctx->ifc_rx_mbuf_sz;
|
|
|
|
/*
|
|
|
|
* ifl_buf_size may be a driver-supplied value, so pull it up
|
|
|
|
* to the selected mbuf size.
|
|
|
|
*/
|
|
|
|
fl->ifl_buf_size = iflib_get_mbuf_size_for(fl->ifl_buf_size);
|
2016-05-18 04:35:58 +00:00
|
|
|
if (fl->ifl_buf_size > ctx->ifc_max_fl_buf_size)
|
|
|
|
ctx->ifc_max_fl_buf_size = fl->ifl_buf_size;
|
|
|
|
fl->ifl_cltype = m_gettype(fl->ifl_buf_size);
|
|
|
|
fl->ifl_zone = m_getzone(fl->ifl_buf_size);
|
|
|
|
|
iflib: leave only 1 receive descriptor unused
The pidx argument of isc_rxd_flush() indicates which is the last valid
receive descriptor to be used by the NIC. However, current code has
multiple issues:
- Intel drivers write pidx to their RDT register, which means that
NICs will only use the descriptors up to pidx-1 (modulo ring size N),
and won't actually use the one pointed by pidx. This does not break
reception, but it is anyway confusing and suboptimal (the NIC will
actually see only N-2 descriptors as available, rather than N-1).
Other drivers (if_vmx, if_bnxt, if_mgb) adhere to this semantic).
- The semantic used by Intel (RDT is one descriptor past the last
valid one) is used by most (if not all) NICs, and it is also used
on the TX side (also in iflib). Since iflib is not currently
using this semantic for RX, it must decrement fl->ifl_pidx
(modulo N) before calling isc_rxd_flush(), and then the
per-driver callback implementation must increment the index
again (to match the real semantic). This is confusing and suboptimal.
- The iflib refill function is also called at initialization.
However, in case the ring size is smaller than 128 (e.g. if_mgb),
the refill function will actually prepare all the receive
descriptors (N), without leaving one unused, as most of NICs assume
(e.g. to avoid RDT to overrun RDH). I can speculate that the code
looks like this right now because this issue showed up during
testing (e.g. with if_mgb), and it was easy to workaround by
decrementing pidx before isc_rxd_flush().
The goal of this change is to simplify the code (removing a bunch
of instructions from the RX fast path), and to make the semantic of
isc_rxd_flush() consistent across drivers. To achieve this, we:
- change the semantics of the pidx argument to the usual one (that
is the index one past the last valid one), so that both iflib and
drivers avoid the decrement/increment dance.
- fix the initialization code to prepare at most N-1 descriptors.
Reviewed by: markj
MFC after: 2 weeks
Differential Revision: https://reviews.freebsd.org/D26191
2020-09-01 20:41:47 +00:00
|
|
|
/*
|
|
|
|
* Avoid pre-allocating zillions of clusters to an idle card
|
|
|
|
* potentially speeding up attach. In any case make sure
|
|
|
|
* to leave a descriptor unavailable. See the comment in
|
|
|
|
* iflib_fl_refill_all().
|
2016-05-18 04:35:58 +00:00
|
|
|
*/
|
iflib: leave only 1 receive descriptor unused
The pidx argument of isc_rxd_flush() indicates which is the last valid
receive descriptor to be used by the NIC. However, current code has
multiple issues:
- Intel drivers write pidx to their RDT register, which means that
NICs will only use the descriptors up to pidx-1 (modulo ring size N),
and won't actually use the one pointed by pidx. This does not break
reception, but it is anyway confusing and suboptimal (the NIC will
actually see only N-2 descriptors as available, rather than N-1).
Other drivers (if_vmx, if_bnxt, if_mgb) adhere to this semantic).
- The semantic used by Intel (RDT is one descriptor past the last
valid one) is used by most (if not all) NICs, and it is also used
on the TX side (also in iflib). Since iflib is not currently
using this semantic for RX, it must decrement fl->ifl_pidx
(modulo N) before calling isc_rxd_flush(), and then the
per-driver callback implementation must increment the index
again (to match the real semantic). This is confusing and suboptimal.
- The iflib refill function is also called at initialization.
However, in case the ring size is smaller than 128 (e.g. if_mgb),
the refill function will actually prepare all the receive
descriptors (N), without leaving one unused, as most of NICs assume
(e.g. to avoid RDT to overrun RDH). I can speculate that the code
looks like this right now because this issue showed up during
testing (e.g. with if_mgb), and it was easy to workaround by
decrementing pidx before isc_rxd_flush().
The goal of this change is to simplify the code (removing a bunch
of instructions from the RX fast path), and to make the semantic of
isc_rxd_flush() consistent across drivers. To achieve this, we:
- change the semantics of the pidx argument to the usual one (that
is the index one past the last valid one), so that both iflib and
drivers avoid the decrement/increment dance.
- fix the initialization code to prepare at most N-1 descriptors.
Reviewed by: markj
MFC after: 2 weeks
Differential Revision: https://reviews.freebsd.org/D26191
2020-09-01 20:41:47 +00:00
|
|
|
MPASS(fl->ifl_size > 0);
|
|
|
|
(void)iflib_fl_refill(ctx, fl, min(128, fl->ifl_size - 1));
|
|
|
|
if (min(128, fl->ifl_size - 1) != fl->ifl_credits)
|
2016-05-18 04:35:58 +00:00
|
|
|
return (ENOBUFS);
|
|
|
|
/*
|
|
|
|
* handle failure
|
|
|
|
*/
|
|
|
|
MPASS(rxq != NULL);
|
|
|
|
MPASS(fl->ifl_ifdi != NULL);
|
|
|
|
bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
|
|
|
|
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*********************************************************************
|
|
|
|
*
|
|
|
|
* Free receive ring data structures
|
|
|
|
*
|
|
|
|
**********************************************************************/
|
|
|
|
static void
|
|
|
|
iflib_rx_sds_free(iflib_rxq_t rxq)
|
|
|
|
{
|
|
|
|
iflib_fl_t fl;
|
2019-01-16 05:44:14 +00:00
|
|
|
int i, j;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
if (rxq->ifr_fl != NULL) {
|
|
|
|
for (i = 0; i < rxq->ifr_nfl; i++) {
|
|
|
|
fl = &rxq->ifr_fl[i];
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
if (fl->ifl_buf_tag != NULL) {
|
2019-01-16 05:44:14 +00:00
|
|
|
if (fl->ifl_sds.ifsd_map != NULL) {
|
2019-01-25 15:02:18 +00:00
|
|
|
for (j = 0; j < fl->ifl_size; j++) {
|
2019-01-16 05:44:14 +00:00
|
|
|
bus_dmamap_sync(
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
fl->ifl_buf_tag,
|
2019-01-25 15:02:18 +00:00
|
|
|
fl->ifl_sds.ifsd_map[j],
|
2019-01-16 05:44:14 +00:00
|
|
|
BUS_DMASYNC_POSTREAD);
|
|
|
|
bus_dmamap_unload(
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
fl->ifl_buf_tag,
|
2019-01-25 15:02:18 +00:00
|
|
|
fl->ifl_sds.ifsd_map[j]);
|
iflib: properly release memory allocated for DMA
DMA memory allocations using the bus_dma.h interface are not properly
released in all cases for both Tx and Rx. This causes ~448 bytes of
M_DEVBUF allocations to be leaked.
First, the DMA maps for Rx are not properly destroyed. A slight attempt
is made in iflib_fl_bufs_free to destroy the maps if we're detaching.
However, this function may not be reliably called during detach. Indeed,
there is a comment "asking" if this should be moved out.
Fix this by moving the bus_dmamap_destroy call into iflib_rx_sds_free,
where we already sync and unload the DMA.
Second, the DMA tag associated with the ifr_ifdi descriptor DMA is not
released properly anywhere. Add a call to iflib_dma_free in
iflib_rx_structures_free.
Third, use of NULL as a canary value on the map pointer returned by
bus_dmamap_create is not valid. On some platforms, notably x86, this
value may be NULL. In this case, we fail to properly release the related
resources.
Remove the NULL checks on map values in both iflib_fl_bufs_free and
iflib_txsd_destroy.
With all of these fixes applied, the leaks to M_DEVBUF are squelched,
and iflib drivers now seem to properly cleanup when detaching.
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Submitted by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed by: erj@, gallatin@
MFC after: 1 week
Sponsored by: Intel Corporation
Differential Revision: https://reviews.freebsd.org/D22203
2019-11-04 23:06:57 +00:00
|
|
|
bus_dmamap_destroy(
|
|
|
|
fl->ifl_buf_tag,
|
|
|
|
fl->ifl_sds.ifsd_map[j]);
|
2019-01-16 05:44:14 +00:00
|
|
|
}
|
|
|
|
}
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
bus_dma_tag_destroy(fl->ifl_buf_tag);
|
|
|
|
fl->ifl_buf_tag = NULL;
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
2017-01-27 23:08:06 +00:00
|
|
|
free(fl->ifl_sds.ifsd_m, M_IFLIB);
|
|
|
|
free(fl->ifl_sds.ifsd_cl, M_IFLIB);
|
2018-11-27 20:01:05 +00:00
|
|
|
free(fl->ifl_sds.ifsd_ba, M_IFLIB);
|
2017-01-27 23:08:06 +00:00
|
|
|
free(fl->ifl_sds.ifsd_map, M_IFLIB);
|
2020-12-07 14:53:14 +00:00
|
|
|
free(fl->ifl_rx_bitmap, M_IFLIB);
|
2017-01-27 23:08:06 +00:00
|
|
|
fl->ifl_sds.ifsd_m = NULL;
|
|
|
|
fl->ifl_sds.ifsd_cl = NULL;
|
2018-11-27 20:01:05 +00:00
|
|
|
fl->ifl_sds.ifsd_ba = NULL;
|
2017-01-27 23:08:06 +00:00
|
|
|
fl->ifl_sds.ifsd_map = NULL;
|
2020-12-07 14:53:14 +00:00
|
|
|
fl->ifl_rx_bitmap = NULL;
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
free(rxq->ifr_fl, M_IFLIB);
|
|
|
|
rxq->ifr_fl = NULL;
|
2019-10-30 20:45:12 +00:00
|
|
|
free(rxq->ifr_ifdi, M_IFLIB);
|
|
|
|
rxq->ifr_ifdi = NULL;
|
2019-05-06 20:56:41 +00:00
|
|
|
rxq->ifr_cq_cidx = 0;
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2019-05-06 20:56:41 +00:00
|
|
|
* Timer routine
|
2016-05-18 04:35:58 +00:00
|
|
|
*/
|
|
|
|
static void
|
|
|
|
iflib_timer(void *arg)
|
|
|
|
{
|
2017-09-16 02:41:38 +00:00
|
|
|
iflib_txq_t txq = arg;
|
2016-05-18 04:35:58 +00:00
|
|
|
if_ctx_t ctx = txq->ift_ctx;
|
2017-09-16 02:41:38 +00:00
|
|
|
if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
|
2018-07-20 17:24:45 +00:00
|
|
|
uint64_t this_tick = ticks;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))
|
|
|
|
return;
|
2019-05-06 20:56:41 +00:00
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
/*
|
|
|
|
** Check on the state of the TX queue(s), this
|
|
|
|
** can be done without the lock because its RO
|
|
|
|
** and the HUNG state will be static if set.
|
|
|
|
*/
|
2020-12-19 01:08:33 +00:00
|
|
|
if (this_tick - txq->ift_last_timer_tick >= iflib_timer_default) {
|
2018-07-20 17:24:45 +00:00
|
|
|
txq->ift_last_timer_tick = this_tick;
|
|
|
|
IFDI_TIMER(ctx, txq->ift_id);
|
|
|
|
if ((txq->ift_qstatus == IFLIB_QUEUE_HUNG) &&
|
|
|
|
((txq->ift_cleaned_prev == txq->ift_cleaned) ||
|
|
|
|
(sctx->isc_pause_frames == 0)))
|
|
|
|
goto hung;
|
|
|
|
|
2020-01-02 23:35:06 +00:00
|
|
|
if (txq->ift_qstatus != IFLIB_QUEUE_IDLE &&
|
|
|
|
ifmp_ring_is_stalled(txq->ift_br)) {
|
2020-12-19 01:08:33 +00:00
|
|
|
KASSERT(ctx->ifc_link_state == LINK_STATE_UP,
|
|
|
|
("queue can't be marked as hung if interface is down"));
|
2018-07-20 17:24:45 +00:00
|
|
|
txq->ift_qstatus = IFLIB_QUEUE_HUNG;
|
2020-01-02 23:35:06 +00:00
|
|
|
}
|
2018-07-20 17:24:45 +00:00
|
|
|
txq->ift_cleaned_prev = txq->ift_cleaned;
|
|
|
|
}
|
2017-09-16 02:41:38 +00:00
|
|
|
/* handle any laggards */
|
|
|
|
if (txq->ift_db_pending)
|
|
|
|
GROUPTASK_ENQUEUE(&txq->ift_task);
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2017-09-16 02:41:38 +00:00
|
|
|
sctx->isc_pause_frames = 0;
|
2016-05-18 04:35:58 +00:00
|
|
|
if (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)
|
2020-12-19 01:08:33 +00:00
|
|
|
callout_reset_on(&txq->ift_timer, iflib_timer_default, iflib_timer,
|
|
|
|
txq, txq->ift_timer.c_cpu);
|
2017-09-16 02:41:38 +00:00
|
|
|
return;
|
2019-05-06 20:56:41 +00:00
|
|
|
|
2018-04-12 14:35:37 +00:00
|
|
|
hung:
|
2019-05-06 20:56:41 +00:00
|
|
|
device_printf(ctx->ifc_dev,
|
|
|
|
"Watchdog timeout (TX: %d desc avail: %d pidx: %d) -- resetting\n",
|
|
|
|
txq->ift_id, TXQ_AVAIL(txq), txq->ift_pidx);
|
2018-04-12 14:35:37 +00:00
|
|
|
STATE_LOCK(ctx);
|
|
|
|
if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
|
|
|
|
ctx->ifc_flags |= (IFC_DO_WATCHDOG|IFC_DO_RESET);
|
2018-10-23 04:37:29 +00:00
|
|
|
iflib_admin_intr_deferred(ctx);
|
2018-10-23 17:06:36 +00:00
|
|
|
STATE_UNLOCK(ctx);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
2020-03-14 19:56:46 +00:00
|
|
|
static uint16_t
|
|
|
|
iflib_get_mbuf_size_for(unsigned int size)
|
|
|
|
{
|
|
|
|
|
|
|
|
if (size <= MCLBYTES)
|
|
|
|
return (MCLBYTES);
|
|
|
|
else
|
|
|
|
return (MJUMPAGESIZE);
|
|
|
|
}
|
|
|
|
|
2019-03-19 17:59:56 +00:00
|
|
|
static void
|
|
|
|
iflib_calc_rx_mbuf_sz(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* XXX don't set the max_frame_size to larger
|
|
|
|
* than the hardware can handle
|
|
|
|
*/
|
2020-03-14 19:56:46 +00:00
|
|
|
ctx->ifc_rx_mbuf_sz =
|
|
|
|
iflib_get_mbuf_size_for(sctx->isc_max_frame_size);
|
2019-03-19 17:59:56 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
uint32_t
|
|
|
|
iflib_get_rx_mbuf_sz(if_ctx_t ctx)
|
|
|
|
{
|
2019-05-06 20:56:41 +00:00
|
|
|
|
2019-03-19 17:59:56 +00:00
|
|
|
return (ctx->ifc_rx_mbuf_sz);
|
|
|
|
}
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
static void
|
|
|
|
iflib_init_locked(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
|
2017-01-02 00:56:33 +00:00
|
|
|
if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
|
2016-05-18 04:35:58 +00:00
|
|
|
if_t ifp = ctx->ifc_ifp;
|
|
|
|
iflib_fl_t fl;
|
|
|
|
iflib_txq_t txq;
|
|
|
|
iflib_rxq_t rxq;
|
2017-09-16 02:41:38 +00:00
|
|
|
int i, j, tx_ip_csum_flags, tx_ip6_csum_flags;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
|
|
|
|
IFDI_INTR_DISABLE(ctx);
|
|
|
|
|
2021-01-10 12:00:30 +00:00
|
|
|
/*
|
|
|
|
* See iflib_stop(). Useful in case iflib_init_locked() is
|
|
|
|
* called without first calling iflib_stop().
|
|
|
|
*/
|
|
|
|
netmap_disable_all_rings(ifp);
|
|
|
|
|
2017-01-02 00:56:33 +00:00
|
|
|
tx_ip_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP);
|
|
|
|
tx_ip6_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_IP6_SCTP);
|
2016-05-18 04:35:58 +00:00
|
|
|
/* Set hardware offload abilities */
|
|
|
|
if_clearhwassist(ifp);
|
|
|
|
if (if_getcapenable(ifp) & IFCAP_TXCSUM)
|
2017-01-02 00:56:33 +00:00
|
|
|
if_sethwassistbits(ifp, tx_ip_csum_flags, 0);
|
2016-05-18 04:35:58 +00:00
|
|
|
if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6)
|
2017-01-02 00:56:33 +00:00
|
|
|
if_sethwassistbits(ifp, tx_ip6_csum_flags, 0);
|
2016-05-18 04:35:58 +00:00
|
|
|
if (if_getcapenable(ifp) & IFCAP_TSO4)
|
|
|
|
if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
|
|
|
|
if (if_getcapenable(ifp) & IFCAP_TSO6)
|
|
|
|
if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
|
|
|
|
|
|
|
|
for (i = 0, txq = ctx->ifc_txqs; i < sctx->isc_ntxqsets; i++, txq++) {
|
|
|
|
CALLOUT_LOCK(txq);
|
|
|
|
callout_stop(&txq->ift_timer);
|
2020-10-27 21:53:33 +00:00
|
|
|
#ifdef DEV_NETMAP
|
|
|
|
callout_stop(&txq->ift_netmap_timer);
|
|
|
|
#endif /* DEV_NETMAP */
|
2016-05-18 04:35:58 +00:00
|
|
|
CALLOUT_UNLOCK(txq);
|
2021-02-19 20:52:05 +00:00
|
|
|
(void)iflib_netmap_txq_init(ctx, txq);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
2019-03-19 17:59:56 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Calculate a suitable Rx mbuf size prior to calling IFDI_INIT, so
|
|
|
|
* that drivers can use the value when setting up the hardware receive
|
|
|
|
* buffers.
|
|
|
|
*/
|
|
|
|
iflib_calc_rx_mbuf_sz(ctx);
|
|
|
|
|
2016-08-12 21:29:44 +00:00
|
|
|
#ifdef INVARIANTS
|
|
|
|
i = if_getdrvflags(ifp);
|
|
|
|
#endif
|
2016-05-18 04:35:58 +00:00
|
|
|
IFDI_INIT(ctx);
|
2016-08-12 21:29:44 +00:00
|
|
|
MPASS(if_getdrvflags(ifp) == i);
|
2016-05-18 04:35:58 +00:00
|
|
|
for (i = 0, rxq = ctx->ifc_rxqs; i < sctx->isc_nrxqsets; i++, rxq++) {
|
2020-06-25 19:44:24 +00:00
|
|
|
if (iflib_netmap_rxq_init(ctx, rxq) > 0) {
|
|
|
|
/* This rxq is in netmap mode. Skip normal init. */
|
2017-03-13 22:53:06 +00:00
|
|
|
continue;
|
2017-09-20 20:40:49 +00:00
|
|
|
}
|
2016-05-18 04:35:58 +00:00
|
|
|
for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) {
|
|
|
|
if (iflib_fl_setup(fl)) {
|
o Use iflib_fast_intr_rxtx() also for "legacy" interrupts, i. e. INTx and
MSI. Unlike as with iflib_fast_intr_ctx(), the former will also enqueue
_task_fn_tx() in addition to _task_fn_rx() if appropriate, bringing TCP
TX throughput of EM-class devices on par with the MSI-X case and, thus,
close to wirespeed/pre-iflib(4) times again. [1]
Note that independently of the interrupt type, the UDP performance with
these MACs still is abysmal and nowhere near to where it was before the
conversion of em(4) to iflib(4).
o In iflib_init_locked(), announce which free list failed to set up.
o In _task_fn_tx() when running netmap(4), issue ifdi_intr_enable instead
of the ifdi_tx_queue_intr_enable method in case of a "legacy" interrupt
as the latter is valid with MSI-X only.
o Instead of adding the missing - and apparently convoluted enough that a
DBG_COUNTER_INC was put into a wrong spot in _task_fn_rx() - checks for
ifdi_{r,t}x_queue_intr_enable being available in the MSI-X case also to
iflib_fast_intr_rxtx(), factor these out to iflib_device_register() and
make the checks fail gracefully rather than panic. This avoids invoking
the checks at runtime over and over again in iflib_fast_intr_rxtx() and
_task_fn_{r,t}x() - even if it's just in case of INVARIANTS - and makes
these functions more readable.
o In iflib_rx_structures_setup(), only initialize LRO resources if device
and driver have LRO capability in order to not waste memory. Also, free
the LRO resources again if setting them up fails for one of the queues.
However, don't bother invoking iflib_rx_sds_free() in that case because
iflib_rx_structures_setup() doesn't call iflib_rxsd_alloc() either (and
iflib_{device,pseudo}_register() will issue iflib_rx_sds_free() in case
of failure via iflib_rx_structures_free(), but there definitely is some
asymmetry left to be fixed, though).
o Similarly, free LRO resources again in iflib_rx_structures_free().
o In iflib_irq_set_affinity(), handle get_core_offset() errors gracefully
instead of panicing (but only in case of INVARIANTS). This is a follow-
up to r344132, as such driver bugs shouldn't be fatal.
o Likewise, handle unknown iflib_intr_type_t in iflib_irq_alloc_generic()
gracefully, too.
o Bring yet more sanity to iflib_msix_init():
- If the device doesn't provide enough MSI-X vectors or not all vectors
can be allocate so the expected number of queues in addition to admin
interrupts can't be supported, try MSI next (and then INTx) as proper
MSI-X vector distribution can't be assured in such cases. In essence,
this change brings r254008 forward to iflib(4). Also, this is the fix
alluded to in the commit message of r343934.
- If the MSI-X allocation has failed, don't prematurely announce MSI is
going to be used as the latter in fact may not be available either.
- When falling back to MSI, only release the MSI-X table resource again
if it was allocated in iflib_msix_init(), i. e. isn't supplied by the
driver, in the first place.
o In mp_ndesc_handler(), handle unknown type arguments gracefully, too.
PR: 235031 (likely) [1]
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D20175
2019-05-07 08:28:35 +00:00
|
|
|
device_printf(ctx->ifc_dev,
|
|
|
|
"setting up free list %d failed - "
|
|
|
|
"check cluster settings\n", j);
|
2016-05-18 04:35:58 +00:00
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2018-06-18 17:27:43 +00:00
|
|
|
done:
|
2016-05-18 04:35:58 +00:00
|
|
|
if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE);
|
|
|
|
IFDI_INTR_ENABLE(ctx);
|
|
|
|
txq = ctx->ifc_txqs;
|
|
|
|
for (i = 0; i < sctx->isc_ntxqsets; i++, txq++)
|
2020-12-19 01:08:33 +00:00
|
|
|
callout_reset_on(&txq->ift_timer, iflib_timer_default, iflib_timer, txq,
|
2017-09-16 02:41:38 +00:00
|
|
|
txq->ift_timer.c_cpu);
|
2021-01-10 12:00:30 +00:00
|
|
|
|
|
|
|
/* Re-enable txsync/rxsync. */
|
|
|
|
netmap_enable_all_rings(ifp);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
iflib_media_change(if_t ifp)
|
|
|
|
{
|
|
|
|
if_ctx_t ctx = if_getsoftc(ifp);
|
|
|
|
int err;
|
|
|
|
|
|
|
|
CTX_LOCK(ctx);
|
|
|
|
if ((err = IFDI_MEDIA_CHANGE(ctx)) == 0)
|
2021-02-14 18:39:09 +00:00
|
|
|
iflib_if_init_locked(ctx);
|
2016-05-18 04:35:58 +00:00
|
|
|
CTX_UNLOCK(ctx);
|
|
|
|
return (err);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
iflib_media_status(if_t ifp, struct ifmediareq *ifmr)
|
|
|
|
{
|
|
|
|
if_ctx_t ctx = if_getsoftc(ifp);
|
|
|
|
|
|
|
|
CTX_LOCK(ctx);
|
2017-09-16 02:41:38 +00:00
|
|
|
IFDI_UPDATE_ADMIN_STATUS(ctx);
|
2016-05-18 04:35:58 +00:00
|
|
|
IFDI_MEDIA_STATUS(ctx, ifmr);
|
|
|
|
CTX_UNLOCK(ctx);
|
|
|
|
}
|
|
|
|
|
2018-05-11 20:08:28 +00:00
|
|
|
void
|
2016-05-18 04:35:58 +00:00
|
|
|
iflib_stop(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
iflib_txq_t txq = ctx->ifc_txqs;
|
|
|
|
iflib_rxq_t rxq = ctx->ifc_rxqs;
|
|
|
|
if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
|
2018-11-14 15:16:45 +00:00
|
|
|
if_shared_ctx_t sctx = ctx->ifc_sctx;
|
2016-05-18 04:35:58 +00:00
|
|
|
iflib_dma_info_t di;
|
|
|
|
iflib_fl_t fl;
|
|
|
|
int i, j;
|
|
|
|
|
|
|
|
/* Tell the stack that the interface is no longer active */
|
|
|
|
if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
|
|
|
|
|
|
|
|
IFDI_INTR_DISABLE(ctx);
|
2017-09-16 02:41:38 +00:00
|
|
|
DELAY(1000);
|
2016-11-18 04:19:21 +00:00
|
|
|
IFDI_STOP(ctx);
|
2017-09-16 02:41:38 +00:00
|
|
|
DELAY(1000);
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2021-01-10 12:00:30 +00:00
|
|
|
/*
|
|
|
|
* Stop any pending txsync/rxsync and prevent new ones
|
|
|
|
* form starting. Processes blocked in poll() will get
|
|
|
|
* POLLERR.
|
|
|
|
*/
|
|
|
|
netmap_disable_all_rings(ctx->ifc_ifp);
|
|
|
|
|
2016-11-18 04:19:21 +00:00
|
|
|
iflib_debug_reset();
|
2016-05-18 04:35:58 +00:00
|
|
|
/* Wait for current tx queue users to exit to disarm watchdog timer. */
|
|
|
|
for (i = 0; i < scctx->isc_ntxqsets; i++, txq++) {
|
|
|
|
/* make sure all transmitters have completed before proceeding XXX */
|
|
|
|
|
2018-03-02 18:48:07 +00:00
|
|
|
CALLOUT_LOCK(txq);
|
|
|
|
callout_stop(&txq->ift_timer);
|
2020-10-27 21:53:33 +00:00
|
|
|
#ifdef DEV_NETMAP
|
|
|
|
callout_stop(&txq->ift_netmap_timer);
|
|
|
|
#endif /* DEV_NETMAP */
|
2018-03-02 18:48:07 +00:00
|
|
|
CALLOUT_UNLOCK(txq);
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
/* clean any enqueued buffers */
|
2016-11-18 04:19:21 +00:00
|
|
|
iflib_ifmp_purge(txq);
|
2016-05-18 04:35:58 +00:00
|
|
|
/* Free any existing tx buffers. */
|
2016-08-12 21:29:44 +00:00
|
|
|
for (j = 0; j < txq->ift_size; j++) {
|
2016-05-18 04:35:58 +00:00
|
|
|
iflib_txsd_free(ctx, txq, j);
|
|
|
|
}
|
2017-09-16 02:41:38 +00:00
|
|
|
txq->ift_processed = txq->ift_cleaned = txq->ift_cidx_processed = 0;
|
2021-05-19 09:09:16 +00:00
|
|
|
txq->ift_in_use = txq->ift_gen = txq->ift_no_desc_avail = 0;
|
|
|
|
if (sctx->isc_flags & IFLIB_PRESERVE_TX_INDICES)
|
|
|
|
txq->ift_cidx = txq->ift_pidx;
|
|
|
|
else
|
|
|
|
txq->ift_cidx = txq->ift_pidx = 0;
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
txq->ift_closed = txq->ift_mbuf_defrag = txq->ift_mbuf_defrag_failed = 0;
|
|
|
|
txq->ift_no_tx_dma_setup = txq->ift_txd_encap_efbig = txq->ift_map_failed = 0;
|
2017-09-16 02:41:38 +00:00
|
|
|
txq->ift_pullups = 0;
|
2017-03-13 22:53:06 +00:00
|
|
|
ifmp_ring_reset_stats(txq->ift_br);
|
2018-11-14 15:16:45 +00:00
|
|
|
for (j = 0, di = txq->ift_ifdi; j < sctx->isc_ntxqs; j++, di++)
|
2016-05-18 04:35:58 +00:00
|
|
|
bzero((void *)di->idi_vaddr, di->idi_size);
|
|
|
|
}
|
|
|
|
for (i = 0; i < scctx->isc_nrxqsets; i++, rxq++) {
|
2021-11-19 07:56:30 +00:00
|
|
|
gtaskqueue_drain(rxq->ifr_task.gt_taskqueue,
|
|
|
|
&rxq->ifr_task.gt_task);
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2019-05-06 20:56:41 +00:00
|
|
|
rxq->ifr_cq_cidx = 0;
|
2018-11-14 15:16:45 +00:00
|
|
|
for (j = 0, di = rxq->ifr_ifdi; j < sctx->isc_nrxqs; j++, di++)
|
2016-05-18 04:35:58 +00:00
|
|
|
bzero((void *)di->idi_vaddr, di->idi_size);
|
|
|
|
/* also resets the free lists pidx/cidx */
|
|
|
|
for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++)
|
|
|
|
iflib_fl_bufs_free(fl);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-03-13 22:53:06 +00:00
|
|
|
static inline caddr_t
|
|
|
|
calc_next_rxd(iflib_fl_t fl, int cidx)
|
|
|
|
{
|
|
|
|
qidx_t size;
|
|
|
|
int nrxd;
|
|
|
|
caddr_t start, end, cur, next;
|
|
|
|
|
|
|
|
nrxd = fl->ifl_size;
|
|
|
|
size = fl->ifl_rxd_size;
|
|
|
|
start = fl->ifl_ifdi->idi_vaddr;
|
|
|
|
|
|
|
|
if (__predict_false(size == 0))
|
|
|
|
return (start);
|
|
|
|
cur = start + size*cidx;
|
|
|
|
end = start + size*nrxd;
|
|
|
|
next = CACHE_PTR_NEXT(cur);
|
|
|
|
return (next < end ? next : start);
|
|
|
|
}
|
|
|
|
|
2017-01-27 23:08:06 +00:00
|
|
|
static inline void
|
|
|
|
prefetch_pkts(iflib_fl_t fl, int cidx)
|
|
|
|
{
|
|
|
|
int nextptr;
|
|
|
|
int nrxd = fl->ifl_size;
|
2017-03-13 22:53:06 +00:00
|
|
|
caddr_t next_rxd;
|
|
|
|
|
2017-01-27 23:08:06 +00:00
|
|
|
nextptr = (cidx + CACHE_PTR_INCREMENT) & (nrxd-1);
|
|
|
|
prefetch(&fl->ifl_sds.ifsd_m[nextptr]);
|
|
|
|
prefetch(&fl->ifl_sds.ifsd_cl[nextptr]);
|
2017-03-13 22:53:06 +00:00
|
|
|
next_rxd = calc_next_rxd(fl, cidx);
|
|
|
|
prefetch(next_rxd);
|
2017-01-27 23:08:06 +00:00
|
|
|
prefetch(fl->ifl_sds.ifsd_m[(cidx + 1) & (nrxd-1)]);
|
|
|
|
prefetch(fl->ifl_sds.ifsd_m[(cidx + 2) & (nrxd-1)]);
|
|
|
|
prefetch(fl->ifl_sds.ifsd_m[(cidx + 3) & (nrxd-1)]);
|
|
|
|
prefetch(fl->ifl_sds.ifsd_m[(cidx + 4) & (nrxd-1)]);
|
|
|
|
prefetch(fl->ifl_sds.ifsd_cl[(cidx + 1) & (nrxd-1)]);
|
|
|
|
prefetch(fl->ifl_sds.ifsd_cl[(cidx + 2) & (nrxd-1)]);
|
|
|
|
prefetch(fl->ifl_sds.ifsd_cl[(cidx + 3) & (nrxd-1)]);
|
|
|
|
prefetch(fl->ifl_sds.ifsd_cl[(cidx + 4) & (nrxd-1)]);
|
|
|
|
}
|
|
|
|
|
2019-04-24 13:32:04 +00:00
|
|
|
static struct mbuf *
|
|
|
|
rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, bool unload, if_rxsd_t sd,
|
|
|
|
int *pf_rv, if_rxd_info_t ri)
|
2016-05-18 04:35:58 +00:00
|
|
|
{
|
2017-01-27 23:08:06 +00:00
|
|
|
bus_dmamap_t map;
|
2016-05-18 04:35:58 +00:00
|
|
|
iflib_fl_t fl;
|
2019-04-24 13:32:04 +00:00
|
|
|
caddr_t payload;
|
|
|
|
struct mbuf *m;
|
|
|
|
int flid, cidx, len, next;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2017-03-13 22:53:06 +00:00
|
|
|
map = NULL;
|
2016-05-18 04:35:58 +00:00
|
|
|
flid = irf->irf_flid;
|
|
|
|
cidx = irf->irf_idx;
|
|
|
|
fl = &rxq->ifr_fl[flid];
|
2017-03-13 22:53:06 +00:00
|
|
|
sd->ifsd_fl = fl;
|
2019-04-24 13:32:04 +00:00
|
|
|
m = fl->ifl_sds.ifsd_m[cidx];
|
2017-03-13 22:53:06 +00:00
|
|
|
sd->ifsd_cl = &fl->ifl_sds.ifsd_cl[cidx];
|
2016-05-18 04:35:58 +00:00
|
|
|
fl->ifl_credits--;
|
|
|
|
#if MEMORY_LOGGING
|
|
|
|
fl->ifl_m_dequeued++;
|
|
|
|
#endif
|
2017-03-13 22:53:06 +00:00
|
|
|
if (rxq->ifr_ctx->ifc_flags & IFC_PREFETCH)
|
|
|
|
prefetch_pkts(fl, cidx);
|
2018-11-27 20:01:05 +00:00
|
|
|
next = (cidx + CACHE_PTR_INCREMENT) & (fl->ifl_size-1);
|
|
|
|
prefetch(&fl->ifl_sds.ifsd_map[next]);
|
|
|
|
map = fl->ifl_sds.ifsd_map[cidx];
|
2016-05-18 04:35:58 +00:00
|
|
|
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
bus_dmamap_sync(fl->ifl_buf_tag, map, BUS_DMASYNC_POSTREAD);
|
2019-04-24 13:32:04 +00:00
|
|
|
|
2020-03-14 19:51:55 +00:00
|
|
|
if (rxq->pfil != NULL && PFIL_HOOKED_IN(rxq->pfil) && pf_rv != NULL &&
|
|
|
|
irf->irf_len != 0) {
|
2019-04-24 13:32:04 +00:00
|
|
|
payload = *sd->ifsd_cl;
|
|
|
|
payload += ri->iri_pad;
|
|
|
|
len = ri->iri_len - ri->iri_pad;
|
|
|
|
*pf_rv = pfil_run_hooks(rxq->pfil, payload, ri->iri_ifp,
|
|
|
|
len | PFIL_MEMPTR | PFIL_IN, NULL);
|
|
|
|
switch (*pf_rv) {
|
|
|
|
case PFIL_DROPPED:
|
|
|
|
case PFIL_CONSUMED:
|
|
|
|
/*
|
|
|
|
* The filter ate it. Everything is recycled.
|
|
|
|
*/
|
|
|
|
m = NULL;
|
|
|
|
unload = 0;
|
|
|
|
break;
|
|
|
|
case PFIL_REALLOCED:
|
|
|
|
/*
|
|
|
|
* The filter copied it. Everything is recycled.
|
|
|
|
*/
|
|
|
|
m = pfil_mem2mbuf(payload);
|
|
|
|
unload = 0;
|
|
|
|
break;
|
|
|
|
case PFIL_PASS:
|
|
|
|
/*
|
|
|
|
* Filter said it was OK, so receive like
|
|
|
|
* normal
|
|
|
|
*/
|
|
|
|
fl->ifl_sds.ifsd_m[cidx] = NULL;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
MPASS(0);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
fl->ifl_sds.ifsd_m[cidx] = NULL;
|
2021-01-21 14:45:15 +00:00
|
|
|
if (pf_rv != NULL)
|
|
|
|
*pf_rv = PFIL_PASS;
|
2019-04-24 13:32:04 +00:00
|
|
|
}
|
|
|
|
|
2020-03-14 19:51:55 +00:00
|
|
|
if (unload && irf->irf_len != 0)
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
bus_dmamap_unload(fl->ifl_buf_tag, map);
|
2017-03-13 22:53:06 +00:00
|
|
|
fl->ifl_cidx = (fl->ifl_cidx + 1) & (fl->ifl_size-1);
|
|
|
|
if (__predict_false(fl->ifl_cidx == 0))
|
2016-05-18 04:35:58 +00:00
|
|
|
fl->ifl_gen = 0;
|
2019-01-16 05:44:14 +00:00
|
|
|
bit_clear(fl->ifl_rx_bitmap, cidx);
|
2019-04-24 13:32:04 +00:00
|
|
|
return (m);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static struct mbuf *
|
2019-04-24 13:32:04 +00:00
|
|
|
assemble_segments(iflib_rxq_t rxq, if_rxd_info_t ri, if_rxsd_t sd, int *pf_rv)
|
2016-05-18 04:35:58 +00:00
|
|
|
{
|
2017-03-13 22:53:06 +00:00
|
|
|
struct mbuf *m, *mh, *mt;
|
|
|
|
caddr_t cl;
|
2019-04-24 13:32:04 +00:00
|
|
|
int *pf_rv_ptr, flags, i, padlen;
|
|
|
|
bool consumed;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
i = 0;
|
2016-08-12 21:29:44 +00:00
|
|
|
mh = NULL;
|
2019-04-24 13:32:04 +00:00
|
|
|
consumed = false;
|
|
|
|
*pf_rv = PFIL_PASS;
|
|
|
|
pf_rv_ptr = pf_rv;
|
2016-05-18 04:35:58 +00:00
|
|
|
do {
|
2019-04-24 13:32:04 +00:00
|
|
|
m = rxd_frag_to_sd(rxq, &ri->iri_frags[i], !consumed, sd,
|
|
|
|
pf_rv_ptr, ri);
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2017-03-13 22:53:06 +00:00
|
|
|
MPASS(*sd->ifsd_cl != NULL);
|
2016-08-12 21:29:44 +00:00
|
|
|
|
2019-04-24 13:32:04 +00:00
|
|
|
/*
|
|
|
|
* Exclude zero-length frags & frags from
|
|
|
|
* packets the filter has consumed or dropped
|
|
|
|
*/
|
|
|
|
if (ri->iri_frags[i].irf_len == 0 || consumed ||
|
|
|
|
*pf_rv == PFIL_CONSUMED || *pf_rv == PFIL_DROPPED) {
|
|
|
|
if (mh == NULL) {
|
|
|
|
/* everything saved here */
|
|
|
|
consumed = true;
|
|
|
|
pf_rv_ptr = NULL;
|
|
|
|
continue;
|
|
|
|
}
|
2016-08-12 21:29:44 +00:00
|
|
|
/* XXX we can save the cluster here, but not the mbuf */
|
2019-04-24 13:32:04 +00:00
|
|
|
m_init(m, M_NOWAIT, MT_DATA, 0);
|
|
|
|
m_free(m);
|
2016-08-12 21:29:44 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (mh == NULL) {
|
2016-05-18 04:35:58 +00:00
|
|
|
flags = M_PKTHDR|M_EXT;
|
|
|
|
mh = mt = m;
|
|
|
|
padlen = ri->iri_pad;
|
|
|
|
} else {
|
|
|
|
flags = M_EXT;
|
|
|
|
mt->m_next = m;
|
|
|
|
mt = m;
|
|
|
|
/* assuming padding is only on the first fragment */
|
|
|
|
padlen = 0;
|
|
|
|
}
|
2017-03-13 22:53:06 +00:00
|
|
|
cl = *sd->ifsd_cl;
|
|
|
|
*sd->ifsd_cl = NULL;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
/* Can these two be made one ? */
|
|
|
|
m_init(m, M_NOWAIT, MT_DATA, flags);
|
2017-03-13 22:53:06 +00:00
|
|
|
m_cljset(m, cl, sd->ifsd_fl->ifl_cltype);
|
2016-05-18 04:35:58 +00:00
|
|
|
/*
|
|
|
|
* These must follow m_init and m_cljset
|
|
|
|
*/
|
|
|
|
m->m_data += padlen;
|
|
|
|
ri->iri_len -= padlen;
|
2016-08-12 21:29:44 +00:00
|
|
|
m->m_len = ri->iri_frags[i].irf_len;
|
2016-05-18 04:35:58 +00:00
|
|
|
} while (++i < ri->iri_nfrags);
|
|
|
|
|
|
|
|
return (mh);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Process one software descriptor
|
|
|
|
*/
|
|
|
|
static struct mbuf *
|
|
|
|
iflib_rxd_pkt_get(iflib_rxq_t rxq, if_rxd_info_t ri)
|
|
|
|
{
|
2017-03-13 22:53:06 +00:00
|
|
|
struct if_rxsd sd;
|
2016-05-18 04:35:58 +00:00
|
|
|
struct mbuf *m;
|
2019-04-24 13:32:04 +00:00
|
|
|
int pf_rv;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
/* should I merge this back in now that the two paths are basically duplicated? */
|
2016-08-12 21:29:44 +00:00
|
|
|
if (ri->iri_nfrags == 1 &&
|
2020-03-14 19:51:55 +00:00
|
|
|
ri->iri_frags[0].irf_len != 0 &&
|
2018-03-25 23:23:19 +00:00
|
|
|
ri->iri_frags[0].irf_len <= MIN(IFLIB_RX_COPY_THRESH, MHLEN)) {
|
2019-04-24 13:32:04 +00:00
|
|
|
m = rxd_frag_to_sd(rxq, &ri->iri_frags[0], false, &sd,
|
|
|
|
&pf_rv, ri);
|
|
|
|
if (pf_rv != PFIL_PASS && pf_rv != PFIL_REALLOCED)
|
|
|
|
return (m);
|
|
|
|
if (pf_rv == PFIL_PASS) {
|
|
|
|
m_init(m, M_NOWAIT, MT_DATA, M_PKTHDR);
|
2017-03-13 22:53:06 +00:00
|
|
|
#ifndef __NO_STRICT_ALIGNMENT
|
2021-04-27 09:00:15 +00:00
|
|
|
if (!IP_ALIGNED(m) && ri->iri_pad == 0)
|
2019-04-24 13:32:04 +00:00
|
|
|
m->m_data += 2;
|
2017-03-13 22:53:06 +00:00
|
|
|
#endif
|
2019-04-24 13:32:04 +00:00
|
|
|
memcpy(m->m_data, *sd.ifsd_cl, ri->iri_len);
|
|
|
|
m->m_len = ri->iri_frags[0].irf_len;
|
2021-04-27 09:00:15 +00:00
|
|
|
m->m_data += ri->iri_pad;
|
|
|
|
ri->iri_len -= ri->iri_pad;
|
2019-04-24 13:32:04 +00:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
m = assemble_segments(rxq, ri, &sd, &pf_rv);
|
2020-03-14 19:51:55 +00:00
|
|
|
if (m == NULL)
|
|
|
|
return (NULL);
|
2019-04-24 13:32:04 +00:00
|
|
|
if (pf_rv != PFIL_PASS && pf_rv != PFIL_REALLOCED)
|
|
|
|
return (m);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
m->m_pkthdr.len = ri->iri_len;
|
|
|
|
m->m_pkthdr.rcvif = ri->iri_ifp;
|
|
|
|
m->m_flags |= ri->iri_flags;
|
|
|
|
m->m_pkthdr.ether_vtag = ri->iri_vtag;
|
|
|
|
m->m_pkthdr.flowid = ri->iri_flowid;
|
|
|
|
M_HASHTYPE_SET(m, ri->iri_rsstype);
|
|
|
|
m->m_pkthdr.csum_flags = ri->iri_csum_flags;
|
|
|
|
m->m_pkthdr.csum_data = ri->iri_csum_data;
|
|
|
|
return (m);
|
|
|
|
}
|
|
|
|
|
2017-11-06 16:23:21 +00:00
|
|
|
#if defined(INET6) || defined(INET)
|
2017-12-05 20:43:24 +00:00
|
|
|
static void
|
|
|
|
iflib_get_ip_forwarding(struct lro_ctrl *lc, bool *v4, bool *v6)
|
|
|
|
{
|
|
|
|
CURVNET_SET(lc->ifp->if_vnet);
|
|
|
|
#if defined(INET6)
|
2019-06-19 08:49:24 +00:00
|
|
|
*v6 = V_ip6_forwarding;
|
2017-12-05 20:43:24 +00:00
|
|
|
#endif
|
|
|
|
#if defined(INET)
|
2019-06-19 08:49:24 +00:00
|
|
|
*v4 = V_ipforwarding;
|
2017-12-05 20:43:24 +00:00
|
|
|
#endif
|
|
|
|
CURVNET_RESTORE();
|
|
|
|
}
|
|
|
|
|
2017-11-06 16:23:21 +00:00
|
|
|
/*
|
|
|
|
* Returns true if it's possible this packet could be LROed.
|
|
|
|
* if it returns false, it is guaranteed that tcp_lro_rx()
|
|
|
|
* would not return zero.
|
|
|
|
*/
|
|
|
|
static bool
|
2017-12-05 20:43:24 +00:00
|
|
|
iflib_check_lro_possible(struct mbuf *m, bool v4_forwarding, bool v6_forwarding)
|
2017-11-06 16:23:21 +00:00
|
|
|
{
|
|
|
|
struct ether_header *eh;
|
|
|
|
|
|
|
|
eh = mtod(m, struct ether_header *);
|
2019-06-19 08:39:19 +00:00
|
|
|
switch (eh->ether_type) {
|
2017-11-06 19:54:25 +00:00
|
|
|
#if defined(INET6)
|
2019-06-19 08:39:19 +00:00
|
|
|
case htons(ETHERTYPE_IPV6):
|
|
|
|
return (!v6_forwarding);
|
2017-11-06 19:54:25 +00:00
|
|
|
#endif
|
|
|
|
#if defined (INET)
|
2019-06-19 08:39:19 +00:00
|
|
|
case htons(ETHERTYPE_IP):
|
|
|
|
return (!v4_forwarding);
|
2017-11-06 19:54:25 +00:00
|
|
|
#endif
|
2017-11-06 16:23:21 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
2017-12-05 20:43:24 +00:00
|
|
|
#else
|
|
|
|
static void
|
|
|
|
iflib_get_ip_forwarding(struct lro_ctrl *lc __unused, bool *v4 __unused, bool *v6 __unused)
|
|
|
|
{
|
|
|
|
}
|
2017-11-06 16:23:21 +00:00
|
|
|
#endif
|
|
|
|
|
2020-02-12 08:30:07 +00:00
|
|
|
static void
|
|
|
|
_task_fn_rx_watchdog(void *context)
|
|
|
|
{
|
|
|
|
iflib_rxq_t rxq = context;
|
|
|
|
|
|
|
|
GROUPTASK_ENQUEUE(&rxq->ifr_task);
|
|
|
|
}
|
|
|
|
|
|
|
|
static uint8_t
|
2017-03-13 22:53:06 +00:00
|
|
|
iflib_rxeof(iflib_rxq_t rxq, qidx_t budget)
|
2016-05-18 04:35:58 +00:00
|
|
|
{
|
2019-05-06 20:56:41 +00:00
|
|
|
if_t ifp;
|
2016-05-18 04:35:58 +00:00
|
|
|
if_ctx_t ctx = rxq->ifr_ctx;
|
|
|
|
if_shared_ctx_t sctx = ctx->ifc_sctx;
|
2016-08-12 21:29:44 +00:00
|
|
|
if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
|
2016-05-18 04:35:58 +00:00
|
|
|
int avail, i;
|
2017-03-13 22:53:06 +00:00
|
|
|
qidx_t *cidxp;
|
2016-05-18 04:35:58 +00:00
|
|
|
struct if_rxd_info ri;
|
|
|
|
int err, budget_left, rx_bytes, rx_pkts;
|
|
|
|
iflib_fl_t fl;
|
|
|
|
int lro_enabled;
|
2018-05-19 19:00:04 +00:00
|
|
|
bool v4_forwarding, v6_forwarding, lro_possible;
|
2020-02-12 08:30:07 +00:00
|
|
|
uint8_t retval = 0;
|
2017-03-13 22:53:06 +00:00
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
/*
|
|
|
|
* XXX early demux data packets so that if_input processing only handles
|
|
|
|
* acks in interrupt context
|
|
|
|
*/
|
2017-09-23 01:35:14 +00:00
|
|
|
struct mbuf *m, *mh, *mt, *mf;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2020-01-23 01:27:58 +00:00
|
|
|
NET_EPOCH_ASSERT();
|
|
|
|
|
2018-05-19 19:00:04 +00:00
|
|
|
lro_possible = v4_forwarding = v6_forwarding = false;
|
2017-03-13 22:53:06 +00:00
|
|
|
ifp = ctx->ifc_ifp;
|
2016-05-18 04:35:58 +00:00
|
|
|
mh = mt = NULL;
|
|
|
|
MPASS(budget > 0);
|
2017-09-16 02:41:38 +00:00
|
|
|
rx_pkts = rx_bytes = 0;
|
2016-08-12 21:29:44 +00:00
|
|
|
if (sctx->isc_flags & IFLIB_HAS_RXCQ)
|
2016-05-18 04:35:58 +00:00
|
|
|
cidxp = &rxq->ifr_cq_cidx;
|
|
|
|
else
|
|
|
|
cidxp = &rxq->ifr_fl[0].ifl_cidx;
|
2016-08-12 21:29:44 +00:00
|
|
|
if ((avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget)) == 0) {
|
2016-05-18 04:35:58 +00:00
|
|
|
for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++)
|
2020-07-06 14:52:21 +00:00
|
|
|
retval |= iflib_fl_refill_all(ctx, fl);
|
2016-05-18 04:35:58 +00:00
|
|
|
DBG_COUNTER_INC(rx_unavail);
|
2020-02-12 08:30:07 +00:00
|
|
|
return (retval);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
2019-04-24 13:32:04 +00:00
|
|
|
/* pfil needs the vnet to be set */
|
|
|
|
CURVNET_SET_QUIET(ifp->if_vnet);
|
2018-07-22 17:45:44 +00:00
|
|
|
for (budget_left = budget; budget_left > 0 && avail > 0;) {
|
2016-05-18 04:35:58 +00:00
|
|
|
if (__predict_false(!CTX_ACTIVE(ctx))) {
|
|
|
|
DBG_COUNTER_INC(rx_ctx_inactive);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Reset client set fields to their default values
|
|
|
|
*/
|
2017-03-13 22:53:06 +00:00
|
|
|
rxd_info_zero(&ri);
|
2016-05-18 04:35:58 +00:00
|
|
|
ri.iri_qsidx = rxq->ifr_id;
|
|
|
|
ri.iri_cidx = *cidxp;
|
2017-03-13 22:53:06 +00:00
|
|
|
ri.iri_ifp = ifp;
|
2016-05-18 04:35:58 +00:00
|
|
|
ri.iri_frags = rxq->ifr_frags;
|
|
|
|
err = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri);
|
|
|
|
|
2017-03-13 22:53:06 +00:00
|
|
|
if (err)
|
|
|
|
goto err;
|
2019-04-24 13:32:04 +00:00
|
|
|
rx_pkts += 1;
|
|
|
|
rx_bytes += ri.iri_len;
|
2016-08-12 21:29:44 +00:00
|
|
|
if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
|
|
|
|
*cidxp = ri.iri_cidx;
|
|
|
|
/* Update our consumer index */
|
2017-03-13 22:53:06 +00:00
|
|
|
/* XXX NB: shurd - check if this is still safe */
|
2019-05-06 20:56:41 +00:00
|
|
|
while (rxq->ifr_cq_cidx >= scctx->isc_nrxd[0])
|
2016-08-12 21:29:44 +00:00
|
|
|
rxq->ifr_cq_cidx -= scctx->isc_nrxd[0];
|
2016-05-18 04:35:58 +00:00
|
|
|
/* was this only a completion queue message? */
|
|
|
|
if (__predict_false(ri.iri_nfrags == 0))
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
MPASS(ri.iri_nfrags != 0);
|
|
|
|
MPASS(ri.iri_len != 0);
|
|
|
|
|
|
|
|
/* will advance the cidx on the corresponding free lists */
|
|
|
|
m = iflib_rxd_pkt_get(rxq, &ri);
|
2018-07-22 17:45:44 +00:00
|
|
|
avail--;
|
|
|
|
budget_left--;
|
2016-05-18 04:35:58 +00:00
|
|
|
if (avail == 0 && budget_left)
|
2016-08-12 21:29:44 +00:00
|
|
|
avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget_left);
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2019-04-24 13:32:04 +00:00
|
|
|
if (__predict_false(m == NULL))
|
2016-05-18 04:35:58 +00:00
|
|
|
continue;
|
2019-04-24 13:32:04 +00:00
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
/* imm_pkt: -- cxgb */
|
|
|
|
if (mh == NULL)
|
|
|
|
mh = mt = m;
|
|
|
|
else {
|
|
|
|
mt->m_nextpkt = m;
|
|
|
|
mt = m;
|
|
|
|
}
|
|
|
|
}
|
2019-04-24 13:32:04 +00:00
|
|
|
CURVNET_RESTORE();
|
2016-05-18 04:35:58 +00:00
|
|
|
/* make sure that we can refill faster than drain */
|
|
|
|
for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++)
|
2020-07-06 14:52:21 +00:00
|
|
|
retval |= iflib_fl_refill_all(ctx, fl);
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
lro_enabled = (if_getcapenable(ifp) & IFCAP_LRO);
|
2017-12-05 20:43:24 +00:00
|
|
|
if (lro_enabled)
|
|
|
|
iflib_get_ip_forwarding(&rxq->ifr_lc, &v4_forwarding, &v6_forwarding);
|
2017-09-23 01:35:14 +00:00
|
|
|
mt = mf = NULL;
|
2016-05-18 04:35:58 +00:00
|
|
|
while (mh != NULL) {
|
|
|
|
m = mh;
|
|
|
|
mh = mh->m_nextpkt;
|
|
|
|
m->m_nextpkt = NULL;
|
2017-03-13 22:53:06 +00:00
|
|
|
#ifndef __NO_STRICT_ALIGNMENT
|
|
|
|
if (!IP_ALIGNED(m) && (m = iflib_fixup_rx(m)) == NULL)
|
|
|
|
continue;
|
|
|
|
#endif
|
2016-05-18 14:18:03 +00:00
|
|
|
#if defined(INET6) || defined(INET)
|
2017-11-06 16:23:21 +00:00
|
|
|
if (lro_enabled) {
|
|
|
|
if (!lro_possible) {
|
2017-12-05 20:43:24 +00:00
|
|
|
lro_possible = iflib_check_lro_possible(m, v4_forwarding, v6_forwarding);
|
2017-11-06 16:23:21 +00:00
|
|
|
if (lro_possible && mf != NULL) {
|
|
|
|
ifp->if_input(ifp, mf);
|
|
|
|
DBG_COUNTER_INC(rx_if_input);
|
|
|
|
mt = mf = NULL;
|
|
|
|
}
|
|
|
|
}
|
2017-12-21 01:22:36 +00:00
|
|
|
if ((m->m_pkthdr.csum_flags & (CSUM_L4_CALC|CSUM_L4_VALID)) ==
|
|
|
|
(CSUM_L4_CALC|CSUM_L4_VALID)) {
|
|
|
|
if (lro_possible && tcp_lro_rx(&rxq->ifr_lc, m, 0) == 0)
|
2017-12-27 19:12:32 +00:00
|
|
|
continue;
|
2017-12-21 01:22:36 +00:00
|
|
|
}
|
2017-09-23 01:35:14 +00:00
|
|
|
}
|
2016-05-18 14:18:03 +00:00
|
|
|
#endif
|
2017-11-06 16:23:21 +00:00
|
|
|
if (lro_possible) {
|
|
|
|
ifp->if_input(ifp, m);
|
|
|
|
DBG_COUNTER_INC(rx_if_input);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (mf == NULL)
|
|
|
|
mf = m;
|
2017-09-23 01:35:14 +00:00
|
|
|
if (mt != NULL)
|
|
|
|
mt->m_nextpkt = m;
|
|
|
|
mt = m;
|
|
|
|
}
|
|
|
|
if (mf != NULL) {
|
|
|
|
ifp->if_input(ifp, mf);
|
2016-05-18 04:35:58 +00:00
|
|
|
DBG_COUNTER_INC(rx_if_input);
|
|
|
|
}
|
2016-08-12 21:29:44 +00:00
|
|
|
|
2017-09-16 02:41:38 +00:00
|
|
|
if_inc_counter(ifp, IFCOUNTER_IBYTES, rx_bytes);
|
|
|
|
if_inc_counter(ifp, IFCOUNTER_IPACKETS, rx_pkts);
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Flush any outstanding LRO work
|
|
|
|
*/
|
2016-05-18 14:18:03 +00:00
|
|
|
#if defined(INET6) || defined(INET)
|
2016-08-12 21:29:44 +00:00
|
|
|
tcp_lro_flush_all(&rxq->ifr_lc);
|
2016-05-18 14:18:03 +00:00
|
|
|
#endif
|
2020-02-12 08:30:07 +00:00
|
|
|
if (avail != 0 || iflib_rxd_avail(ctx, rxq, *cidxp, 1) != 0)
|
|
|
|
retval |= IFLIB_RXEOF_MORE;
|
|
|
|
return (retval);
|
2017-03-13 22:53:06 +00:00
|
|
|
err:
|
2018-04-12 14:35:37 +00:00
|
|
|
STATE_LOCK(ctx);
|
2017-09-16 02:41:38 +00:00
|
|
|
ctx->ifc_flags |= IFC_DO_RESET;
|
2018-10-23 04:37:29 +00:00
|
|
|
iflib_admin_intr_deferred(ctx);
|
2018-10-23 17:06:36 +00:00
|
|
|
STATE_UNLOCK(ctx);
|
2020-02-12 08:30:07 +00:00
|
|
|
return (0);
|
2017-03-13 22:53:06 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
#define TXD_NOTIFY_COUNT(txq) (((txq)->ift_size / (txq)->ift_update_freq)-1)
|
|
|
|
static inline qidx_t
|
|
|
|
txq_max_db_deferred(iflib_txq_t txq, qidx_t in_use)
|
|
|
|
{
|
|
|
|
qidx_t notify_count = TXD_NOTIFY_COUNT(txq);
|
|
|
|
qidx_t minthresh = txq->ift_size / 8;
|
|
|
|
if (in_use > 4*minthresh)
|
|
|
|
return (notify_count);
|
|
|
|
if (in_use > 2*minthresh)
|
|
|
|
return (notify_count >> 1);
|
|
|
|
if (in_use > minthresh)
|
|
|
|
return (notify_count >> 3);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline qidx_t
|
|
|
|
txq_max_rs_deferred(iflib_txq_t txq)
|
|
|
|
{
|
|
|
|
qidx_t notify_count = TXD_NOTIFY_COUNT(txq);
|
|
|
|
qidx_t minthresh = txq->ift_size / 8;
|
|
|
|
if (txq->ift_in_use > 4*minthresh)
|
|
|
|
return (notify_count);
|
|
|
|
if (txq->ift_in_use > 2*minthresh)
|
|
|
|
return (notify_count >> 1);
|
|
|
|
if (txq->ift_in_use > minthresh)
|
|
|
|
return (notify_count >> 2);
|
2017-03-30 16:54:01 +00:00
|
|
|
return (2);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* mbuf accessors */
#define M_CSUM_FLAGS(m) ((m)->m_pkthdr.csum_flags)
#define M_HAS_VLANTAG(m) ((m)->m_flags & M_VLANTAG)

/* Deferral limits; arguments are parenthesized so expressions are safe. */
#define TXQ_MAX_DB_DEFERRED(txq, in_use) txq_max_db_deferred((txq), (in_use))
#define TXQ_MAX_RS_DEFERRED(txq) txq_max_rs_deferred(txq)
#define TXQ_MAX_DB_CONSUMED(size) ((size) >> 4)

/* forward compatibility for cxgb */
#define FIRST_QSET(ctx) 0
#define NTXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_ntxqsets)
#define NRXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_nrxqsets)
#define QIDX(ctx, m) ((((m)->m_pkthdr.flowid & (ctx)->ifc_softc_ctx.isc_rss_table_mask) % NTXQSETS(ctx)) + FIRST_QSET(ctx))
#define DESC_RECLAIMABLE(q) ((int)((q)->ift_processed - (q)->ift_cleaned - (q)->ift_ctx->ifc_softc_ctx.isc_tx_nsegments))

/* XXX we should be setting this to something other than zero */
#define RECLAIM_THRESH(ctx) ((ctx)->ifc_sctx->isc_tx_reclaim_thresh)
/*
 * Use the maximum of isc_tx_{nsegments,tso_segments_max}:
 * isc_tx_tso_segments_max is 0 for devices without TSO support, and the
 * two maxima may differ (DESC_RECLAIMABLE is based on isc_tx_nsegments
 * only).
 */
#define MAX_TX_DESC(ctx) MAX((ctx)->ifc_softc_ctx.isc_tx_tso_segments_max, \
    (ctx)->ifc_softc_ctx.isc_tx_nsegments)
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2017-03-13 22:53:06 +00:00
|
|
|
static inline bool
|
2020-12-19 01:08:33 +00:00
|
|
|
iflib_txd_db_check(iflib_txq_t txq, int ring)
|
2017-03-13 22:53:06 +00:00
|
|
|
{
|
2020-12-19 01:08:33 +00:00
|
|
|
if_ctx_t ctx = txq->ift_ctx;
|
2017-03-13 22:53:06 +00:00
|
|
|
qidx_t dbval, max;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2020-12-19 01:08:33 +00:00
|
|
|
max = TXQ_MAX_DB_DEFERRED(txq, txq->ift_in_use);
|
|
|
|
|
|
|
|
/* force || threshold exceeded || at the edge of the ring */
|
|
|
|
if (ring || (txq->ift_db_pending >= max) || (TXQ_AVAIL(txq) <= MAX_TX_DESC(ctx) + 2)) {
|
|
|
|
|
|
|
|
/*
|
|
|
|
* 'npending' is used if the card's doorbell is in terms of the number of descriptors
|
|
|
|
* pending flush (BRCM). 'pidx' is used in cases where the card's doorbeel uses the
|
|
|
|
* producer index explicitly (INTC).
|
|
|
|
*/
|
2016-05-18 04:35:58 +00:00
|
|
|
dbval = txq->ift_npending ? txq->ift_npending : txq->ift_pidx;
|
Further correct and optimize the bus_dma(9) usage of iflib(4):
o Correct the obvious bugs in the netmap(4) parts:
- No longer check for the existence of DMA maps as bus_dma(9)
is used unconditionally in iflib(4) since r341095.
- Supply the correct DMA tag and map pairs to bus_dma(9)
functions (see also the commit message of r343753).
- In iflib_netmap_timer_adjust(), add synchronization of the
TX descriptors before calling the ift_txd_credits_update
method as the latter evaluates the TX descriptors possibly
updated by the MAC.
- In _task_fn_tx(), wrap the netmap(4)-specific bits in
#ifdef DEV_NETMAP just as done in _task_fn_admin() and
_task_fn_rx() respectively.
o In iflib_fast_intr_rxtx(), synchronize the TX rather than
the RX descriptors before calling the ift_txd_credits_update
method (see also above).
o There's no need to synchronize an RX buffer that is going to
be recycled in iflib_rxd_pkt_get(), yet; it's sufficient to
do that as late as passing RX buffers to the MAC via the
ift_rxd_refill method. Hence, combine that synchronization
with the synchronization of new buffers into a common spot
in _iflib_fl_refill().
o There's no need to synchronize the RX descriptors of a free
list in preparation of the MAC updating their statuses with
every invocation of rxd_frag_to_sd(); it's enough to do this
once before handing control over to the MAC, i. e. before
calling ift_rxd_flush method in _iflib_fl_refill(), which
already performs the necessary synchronization.
o Given that the ift_rxd_available method evaluates the RX
descriptors which possibly have been altered by the MAC,
synchronize as appropriate beforehand. Most notably this
is now done in iflib_rxd_avail(), which in turn means that
we don't need to issue the same synchronization yet again
before calling the ift_rxd_pkt_get method in iflib_rxeof().
o In iflib_txd_db_check(), synchronize the TX descriptors
before handing them over to the MAC for transmission via
the ift_txd_flush method.
o In iflib_encap(), move the TX buffer synchronization after
the invocation of the ift_txd_encap() method. If the MAC
driver fails to encapsulate the packet and we retry with
a defragmented mbuf chain or finally fail, the cycles for
TX buffer synchronization have been wasted. Synchronizing
afterwards matches what non-iflib(4) drivers typically do
and is sufficient as the MAC will not actually start with
the transmission before - in this case - the ift_txd_flush
method is called.
Moreover, for the latter reason the synchronization of the
TX descriptors in iflib_encap() can go as it's enough to
synchronize them before passing control over to the MAC by
issuing the ift_txd_flush() method (see above).
o In iflib_txq_can_drain(), only synchronize TX descriptors
if the ift_txd_credits_update method accessing these is
actually called.
Differential Revision: https://reviews.freebsd.org/D19081
2019-02-12 21:08:44 +00:00
|
|
|
bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
|
|
|
|
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
|
2016-05-18 04:35:58 +00:00
|
|
|
ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, dbval);
|
2020-12-19 01:08:33 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Absent bugs there are zero packets pending so reset pending counts to zero.
|
|
|
|
*/
|
2016-05-18 04:35:58 +00:00
|
|
|
txq->ift_db_pending = txq->ift_npending = 0;
|
2020-12-19 01:08:33 +00:00
|
|
|
return (true);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
2020-12-19 01:08:33 +00:00
|
|
|
return (false);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef PKT_DEBUG
|
|
|
|
static void
|
|
|
|
print_pkt(if_pkt_info_t pi)
|
|
|
|
{
|
|
|
|
printf("pi len: %d qsidx: %d nsegs: %d ndescs: %d flags: %x pidx: %d\n",
|
|
|
|
pi->ipi_len, pi->ipi_qsidx, pi->ipi_nsegs, pi->ipi_ndescs, pi->ipi_flags, pi->ipi_pidx);
|
|
|
|
printf("pi new_pidx: %d csum_flags: %lx tso_segsz: %d mflags: %x vtag: %d\n",
|
|
|
|
pi->ipi_new_pidx, pi->ipi_csum_flags, pi->ipi_tso_segsz, pi->ipi_mflags, pi->ipi_vtag);
|
|
|
|
printf("pi etype: %d ehdrlen: %d ip_hlen: %d ipproto: %d\n",
|
|
|
|
pi->ipi_etype, pi->ipi_ehdrlen, pi->ipi_ip_hlen, pi->ipi_ipproto);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#define IS_TSO4(pi) ((pi)->ipi_csum_flags & CSUM_IP_TSO)
|
2018-06-07 13:03:07 +00:00
|
|
|
#define IS_TX_OFFLOAD4(pi) ((pi)->ipi_csum_flags & (CSUM_IP_TCP | CSUM_IP_TSO))
|
2016-05-18 04:35:58 +00:00
|
|
|
#define IS_TSO6(pi) ((pi)->ipi_csum_flags & CSUM_IP6_TSO)
|
2018-06-07 13:03:07 +00:00
|
|
|
#define IS_TX_OFFLOAD6(pi) ((pi)->ipi_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_TSO))
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
static int
|
|
|
|
iflib_parse_header(iflib_txq_t txq, if_pkt_info_t pi, struct mbuf **mp)
|
|
|
|
{
|
2017-09-16 02:41:38 +00:00
|
|
|
if_shared_ctx_t sctx = txq->ift_ctx->ifc_sctx;
|
2016-05-18 04:35:58 +00:00
|
|
|
struct ether_vlan_header *eh;
|
2018-07-24 23:40:27 +00:00
|
|
|
struct mbuf *m;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2018-07-22 17:45:44 +00:00
|
|
|
m = *mp;
|
2017-09-16 02:41:38 +00:00
|
|
|
if ((sctx->isc_flags & IFLIB_NEED_SCRATCH) &&
|
|
|
|
M_WRITABLE(m) == 0) {
|
|
|
|
if ((m = m_dup(m, M_NOWAIT)) == NULL) {
|
|
|
|
return (ENOMEM);
|
|
|
|
} else {
|
|
|
|
m_freem(*mp);
|
2018-09-06 18:51:52 +00:00
|
|
|
DBG_COUNTER_INC(tx_frees);
|
2018-07-22 17:45:44 +00:00
|
|
|
*mp = m;
|
2017-09-16 02:41:38 +00:00
|
|
|
}
|
|
|
|
}
|
2017-01-02 00:56:33 +00:00
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
/*
|
|
|
|
* Determine where frame payload starts.
|
|
|
|
* Jump over vlan headers if already present,
|
|
|
|
* helpful for QinQ too.
|
|
|
|
*/
|
|
|
|
if (__predict_false(m->m_len < sizeof(*eh))) {
|
|
|
|
txq->ift_pullups++;
|
|
|
|
if (__predict_false((m = m_pullup(m, sizeof(*eh))) == NULL))
|
|
|
|
return (ENOMEM);
|
|
|
|
}
|
|
|
|
eh = mtod(m, struct ether_vlan_header *);
|
|
|
|
if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
|
|
|
|
pi->ipi_etype = ntohs(eh->evl_proto);
|
|
|
|
pi->ipi_ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
|
|
|
|
} else {
|
|
|
|
pi->ipi_etype = ntohs(eh->evl_encap_proto);
|
|
|
|
pi->ipi_ehdrlen = ETHER_HDR_LEN;
|
|
|
|
}
|
|
|
|
|
|
|
|
switch (pi->ipi_etype) {
|
|
|
|
#ifdef INET
|
|
|
|
case ETHERTYPE_IP:
|
|
|
|
{
|
2018-07-24 23:40:27 +00:00
|
|
|
struct mbuf *n;
|
2016-05-18 04:35:58 +00:00
|
|
|
struct ip *ip = NULL;
|
|
|
|
struct tcphdr *th = NULL;
|
|
|
|
int minthlen;
|
|
|
|
|
|
|
|
minthlen = min(m->m_pkthdr.len, pi->ipi_ehdrlen + sizeof(*ip) + sizeof(*th));
|
|
|
|
if (__predict_false(m->m_len < minthlen)) {
|
|
|
|
/*
|
|
|
|
* if this code bloat is causing too much of a hit
|
|
|
|
* move it to a separate function and mark it noinline
|
|
|
|
*/
|
|
|
|
if (m->m_len == pi->ipi_ehdrlen) {
|
|
|
|
n = m->m_next;
|
|
|
|
MPASS(n);
|
|
|
|
if (n->m_len >= sizeof(*ip)) {
|
|
|
|
ip = (struct ip *)n->m_data;
|
|
|
|
if (n->m_len >= (ip->ip_hl << 2) + sizeof(*th))
|
|
|
|
th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
|
|
|
|
} else {
|
|
|
|
txq->ift_pullups++;
|
|
|
|
if (__predict_false((m = m_pullup(m, minthlen)) == NULL))
|
|
|
|
return (ENOMEM);
|
|
|
|
ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
txq->ift_pullups++;
|
|
|
|
if (__predict_false((m = m_pullup(m, minthlen)) == NULL))
|
|
|
|
return (ENOMEM);
|
|
|
|
ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
|
|
|
|
if (m->m_len >= (ip->ip_hl << 2) + sizeof(*th))
|
|
|
|
th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
|
|
|
|
if (m->m_len >= (ip->ip_hl << 2) + sizeof(*th))
|
|
|
|
th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
|
|
|
|
}
|
|
|
|
pi->ipi_ip_hlen = ip->ip_hl << 2;
|
|
|
|
pi->ipi_ipproto = ip->ip_p;
|
|
|
|
pi->ipi_flags |= IPI_TX_IPV4;
|
|
|
|
|
2018-06-07 13:03:07 +00:00
|
|
|
/* TCP checksum offload may require TCP header length */
|
|
|
|
if (IS_TX_OFFLOAD4(pi)) {
|
|
|
|
if (__predict_true(pi->ipi_ipproto == IPPROTO_TCP)) {
|
2017-09-23 01:33:20 +00:00
|
|
|
if (__predict_false(th == NULL)) {
|
|
|
|
txq->ift_pullups++;
|
|
|
|
if (__predict_false((m = m_pullup(m, (ip->ip_hl << 2) + sizeof(*th))) == NULL))
|
|
|
|
return (ENOMEM);
|
|
|
|
th = (struct tcphdr *)((caddr_t)ip + pi->ipi_ip_hlen);
|
|
|
|
}
|
|
|
|
pi->ipi_tcp_hflags = th->th_flags;
|
|
|
|
pi->ipi_tcp_hlen = th->th_off << 2;
|
|
|
|
pi->ipi_tcp_seq = th->th_seq;
|
|
|
|
}
|
2018-06-07 13:03:07 +00:00
|
|
|
if (IS_TSO4(pi)) {
|
|
|
|
if (__predict_false(ip->ip_p != IPPROTO_TCP))
|
|
|
|
return (ENXIO);
|
2018-11-14 15:23:39 +00:00
|
|
|
/*
|
|
|
|
* TSO always requires hardware checksum offload.
|
|
|
|
*/
|
|
|
|
pi->ipi_csum_flags |= (CSUM_IP_TCP | CSUM_IP);
|
2018-06-07 13:03:07 +00:00
|
|
|
th->th_sum = in_pseudo(ip->ip_src.s_addr,
|
|
|
|
ip->ip_dst.s_addr, htons(IPPROTO_TCP));
|
|
|
|
pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz;
|
|
|
|
if (sctx->isc_flags & IFLIB_TSO_INIT_IP) {
|
|
|
|
ip->ip_sum = 0;
|
|
|
|
ip->ip_len = htons(pi->ipi_ip_hlen + pi->ipi_tcp_hlen + pi->ipi_tso_segsz);
|
|
|
|
}
|
2017-01-02 00:56:33 +00:00
|
|
|
}
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
2018-11-14 15:23:39 +00:00
|
|
|
if ((sctx->isc_flags & IFLIB_NEED_ZERO_CSUM) && (pi->ipi_csum_flags & CSUM_IP))
|
|
|
|
ip->ip_sum = 0;
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
#ifdef INET6
|
|
|
|
case ETHERTYPE_IPV6:
|
|
|
|
{
|
|
|
|
struct ip6_hdr *ip6 = (struct ip6_hdr *)(m->m_data + pi->ipi_ehdrlen);
|
|
|
|
struct tcphdr *th;
|
|
|
|
pi->ipi_ip_hlen = sizeof(struct ip6_hdr);
|
|
|
|
|
|
|
|
if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) {
|
2018-09-06 18:51:52 +00:00
|
|
|
txq->ift_pullups++;
|
2016-05-18 04:35:58 +00:00
|
|
|
if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) == NULL))
|
|
|
|
return (ENOMEM);
|
|
|
|
}
|
|
|
|
th = (struct tcphdr *)((caddr_t)ip6 + pi->ipi_ip_hlen);
|
|
|
|
|
|
|
|
/* XXX-BZ this will go badly in case of ext hdrs. */
|
|
|
|
pi->ipi_ipproto = ip6->ip6_nxt;
|
|
|
|
pi->ipi_flags |= IPI_TX_IPV6;
|
|
|
|
|
2018-06-07 13:03:07 +00:00
|
|
|
/* TCP checksum offload may require TCP header length */
|
|
|
|
if (IS_TX_OFFLOAD6(pi)) {
|
2017-09-23 01:33:20 +00:00
|
|
|
if (pi->ipi_ipproto == IPPROTO_TCP) {
|
|
|
|
if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) {
|
2018-06-07 13:03:07 +00:00
|
|
|
txq->ift_pullups++;
|
2017-09-23 01:33:20 +00:00
|
|
|
if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) == NULL))
|
|
|
|
return (ENOMEM);
|
|
|
|
}
|
|
|
|
pi->ipi_tcp_hflags = th->th_flags;
|
|
|
|
pi->ipi_tcp_hlen = th->th_off << 2;
|
2018-06-07 13:03:07 +00:00
|
|
|
pi->ipi_tcp_seq = th->th_seq;
|
|
|
|
}
|
|
|
|
if (IS_TSO6(pi)) {
|
|
|
|
if (__predict_false(ip6->ip6_nxt != IPPROTO_TCP))
|
|
|
|
return (ENXIO);
|
|
|
|
/*
|
2018-11-14 15:23:39 +00:00
|
|
|
* TSO always requires hardware checksum offload.
|
2018-06-07 13:03:07 +00:00
|
|
|
*/
|
|
|
|
pi->ipi_csum_flags |= CSUM_IP6_TCP;
|
|
|
|
th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
|
|
|
|
pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz;
|
2017-09-23 01:33:20 +00:00
|
|
|
}
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
default:
|
|
|
|
pi->ipi_csum_flags &= ~CSUM_OFFLOAD;
|
|
|
|
pi->ipi_ip_hlen = 0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
*mp = m;
|
2017-01-02 00:56:33 +00:00
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If dodgy hardware rejects the scatter gather chain we've handed it
|
2016-08-12 21:29:44 +00:00
|
|
|
* we'll need to remove the mbuf chain from ifsg_m[] before we can add the
|
|
|
|
* m_defrag'd mbufs
|
2016-05-18 04:35:58 +00:00
|
|
|
*/
|
|
|
|
static __noinline struct mbuf *
|
2016-08-12 21:29:44 +00:00
|
|
|
iflib_remove_mbuf(iflib_txq_t txq)
|
2016-05-18 04:35:58 +00:00
|
|
|
{
|
2018-11-27 20:01:05 +00:00
|
|
|
int ntxd, pidx;
|
|
|
|
struct mbuf *m, **ifsd_m;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
ifsd_m = txq->ift_sds.ifsd_m;
|
2016-08-12 21:29:44 +00:00
|
|
|
ntxd = txq->ift_size;
|
2018-11-27 20:01:05 +00:00
|
|
|
pidx = txq->ift_pidx & (ntxd - 1);
|
|
|
|
ifsd_m = txq->ift_sds.ifsd_m;
|
|
|
|
m = ifsd_m[pidx];
|
2016-05-18 04:35:58 +00:00
|
|
|
ifsd_m[pidx] = NULL;
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
bus_dmamap_unload(txq->ift_buf_tag, txq->ift_sds.ifsd_map[pidx]);
|
2019-01-16 05:44:14 +00:00
|
|
|
if (txq->ift_sds.ifsd_tso_map != NULL)
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
bus_dmamap_unload(txq->ift_tso_buf_tag,
|
2019-01-16 05:44:14 +00:00
|
|
|
txq->ift_sds.ifsd_tso_map[pidx]);
|
2016-05-18 04:35:58 +00:00
|
|
|
#if MEMORY_LOGGING
|
|
|
|
txq->ift_dequeued++;
|
|
|
|
#endif
|
2018-11-27 20:01:05 +00:00
|
|
|
return (m);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
2017-03-13 22:53:06 +00:00
|
|
|
static inline caddr_t
|
|
|
|
calc_next_txd(iflib_txq_t txq, int cidx, uint8_t qid)
|
|
|
|
{
|
|
|
|
qidx_t size;
|
|
|
|
int ntxd;
|
|
|
|
caddr_t start, end, cur, next;
|
|
|
|
|
|
|
|
ntxd = txq->ift_size;
|
|
|
|
size = txq->ift_txd_size[qid];
|
|
|
|
start = txq->ift_ifdi[qid].idi_vaddr;
|
|
|
|
|
|
|
|
if (__predict_false(size == 0))
|
|
|
|
return (start);
|
|
|
|
cur = start + size*cidx;
|
|
|
|
end = start + size*ntxd;
|
|
|
|
next = CACHE_PTR_NEXT(cur);
|
|
|
|
return (next < end ? next : start);
|
|
|
|
}
|
|
|
|
|
2017-12-05 21:00:31 +00:00
|
|
|
/*
|
|
|
|
* Pad an mbuf to ensure a minimum ethernet frame size.
|
|
|
|
* min_frame_size is the frame size (less CRC) to pad the mbuf to
|
|
|
|
*/
|
|
|
|
static __noinline int
|
2017-12-08 18:43:31 +00:00
|
|
|
iflib_ether_pad(device_t dev, struct mbuf **m_head, uint16_t min_frame_size)
|
2017-12-05 21:00:31 +00:00
|
|
|
{
|
|
|
|
/*
|
|
|
|
* 18 is enough bytes to pad an ARP packet to 46 bytes, and
|
|
|
|
* and ARP message is the smallest common payload I can think of
|
|
|
|
*/
|
|
|
|
static char pad[18]; /* just zeros */
|
|
|
|
int n;
|
2017-12-08 18:43:31 +00:00
|
|
|
struct mbuf *new_head;
|
2017-12-05 21:00:31 +00:00
|
|
|
|
2017-12-08 18:43:31 +00:00
|
|
|
if (!M_WRITABLE(*m_head)) {
|
|
|
|
new_head = m_dup(*m_head, M_NOWAIT);
|
|
|
|
if (new_head == NULL) {
|
2017-12-08 19:50:06 +00:00
|
|
|
m_freem(*m_head);
|
2017-12-08 18:43:31 +00:00
|
|
|
device_printf(dev, "cannot pad short frame, m_dup() failed");
|
2017-12-11 20:01:28 +00:00
|
|
|
DBG_COUNTER_INC(encap_pad_mbuf_fail);
|
2018-09-06 18:51:52 +00:00
|
|
|
DBG_COUNTER_INC(tx_frees);
|
2017-12-08 18:43:31 +00:00
|
|
|
return ENOMEM;
|
|
|
|
}
|
|
|
|
m_freem(*m_head);
|
|
|
|
*m_head = new_head;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (n = min_frame_size - (*m_head)->m_pkthdr.len;
|
2017-12-05 21:00:31 +00:00
|
|
|
n > 0; n -= sizeof(pad))
|
2017-12-08 18:43:31 +00:00
|
|
|
if (!m_append(*m_head, min(n, sizeof(pad)), pad))
|
2017-12-05 21:00:31 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
if (n > 0) {
|
2017-12-08 18:43:31 +00:00
|
|
|
m_freem(*m_head);
|
2017-12-05 21:00:31 +00:00
|
|
|
device_printf(dev, "cannot pad short frame\n");
|
|
|
|
DBG_COUNTER_INC(encap_pad_mbuf_fail);
|
2018-09-06 18:51:52 +00:00
|
|
|
DBG_COUNTER_INC(tx_frees);
|
2017-12-05 21:00:31 +00:00
|
|
|
return (ENOBUFS);
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
static int
|
|
|
|
iflib_encap(iflib_txq_t txq, struct mbuf **m_headp)
|
|
|
|
{
|
|
|
|
if_ctx_t ctx;
|
|
|
|
if_shared_ctx_t sctx;
|
|
|
|
if_softc_ctx_t scctx;
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
bus_dma_tag_t buf_tag;
|
2016-05-18 04:35:58 +00:00
|
|
|
bus_dma_segment_t *segs;
|
2018-11-27 20:01:05 +00:00
|
|
|
struct mbuf *m_head, **ifsd_m;
|
2017-03-13 22:53:06 +00:00
|
|
|
void *next_txd;
|
2016-05-18 04:35:58 +00:00
|
|
|
bus_dmamap_t map;
|
|
|
|
struct if_pkt_info pi;
|
|
|
|
int remap = 0;
|
|
|
|
int err, nsegs, ndesc, max_segs, pidx, cidx, next, ntxd;
|
|
|
|
|
|
|
|
ctx = txq->ift_ctx;
|
|
|
|
sctx = ctx->ifc_sctx;
|
|
|
|
scctx = &ctx->ifc_softc_ctx;
|
|
|
|
segs = txq->ift_segs;
|
2016-08-12 21:29:44 +00:00
|
|
|
ntxd = txq->ift_size;
|
2016-05-18 04:35:58 +00:00
|
|
|
m_head = *m_headp;
|
|
|
|
map = NULL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we're doing TSO the next descriptor to clean may be quite far ahead
|
|
|
|
*/
|
|
|
|
cidx = txq->ift_cidx;
|
|
|
|
pidx = txq->ift_pidx;
|
2017-03-13 22:53:06 +00:00
|
|
|
if (ctx->ifc_flags & IFC_PREFETCH) {
|
|
|
|
next = (cidx + CACHE_PTR_INCREMENT) & (ntxd-1);
|
|
|
|
if (!(ctx->ifc_flags & IFLIB_HAS_TXCQ)) {
|
|
|
|
next_txd = calc_next_txd(txq, cidx, 0);
|
|
|
|
prefetch(next_txd);
|
|
|
|
}
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2017-03-13 22:53:06 +00:00
|
|
|
/* prefetch the next cache line of mbuf pointers and flags */
|
|
|
|
prefetch(&txq->ift_sds.ifsd_m[next]);
|
2018-11-27 20:01:05 +00:00
|
|
|
prefetch(&txq->ift_sds.ifsd_map[next]);
|
|
|
|
next = (cidx + CACHE_LINE_SIZE) & (ntxd-1);
|
|
|
|
}
|
|
|
|
map = txq->ift_sds.ifsd_map[pidx];
|
|
|
|
ifsd_m = txq->ift_sds.ifsd_m;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
buf_tag = txq->ift_tso_buf_tag;
|
2016-05-18 04:35:58 +00:00
|
|
|
max_segs = scctx->isc_tx_tso_segments_max;
|
2019-01-16 05:44:14 +00:00
|
|
|
map = txq->ift_sds.ifsd_tso_map[pidx];
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
MPASS(buf_tag != NULL);
|
Assorted TSO fixes for em(4)/iflib(9) and dead code removal:
- Ever since the workaround for the silicon bug of TSO4 causing MAC hangs
was committed in r295133, CSUM_TSO always got disabled unconditionally
by em(4) on the first invocation of em_init_locked(). However, even with
that problem fixed, it turned out that for at least e. g. 82579 not all
necessary TSO workarounds are in place, still causing MAC hangs even at
Gigabit speed. Thus, for stable/11, TSO usage was deliberately disabled
in r323292 (r323293 for stable/10) for the EM-class by default, allowing
users to turn it on if it happens to work with their particular EM MAC
in a Gigabit-only environment.
In head, the TSO workaround for speeds other than Gigabit was lost with
the conversion to iflib(9) in r311849 (possibly along with another one
or two TSO workarounds). Yet at the same time, for EM-class MACs TSO4
got enabled by default again, causing device hangs. Therefore, change the
default for this hardware class back to have TSO4 off, allowing users
to turn it on manually if it happens to work in their environment as
we do in stable/{10,11}. An alternative would be to add a whitelist of
EM-class devices where TSO4 actually is reliable with the workarounds in
place, but given that the advantage of TSO at Gigabit speed is rather
limited - especially with the overhead of these workarounds -, that's
really not worth it. [1]
This change includes the addition of an isc_capabilities to struct
if_softc_ctx so iflib(9) can also handle interface capabilities that
shouldn't be enabled by default which is used to handle the default-off
capabilities of e1000 as suggested by shurd@ and moving their handling
from em_setup_interface() to em_if_attach_pre() accordingly.
- Although 82543 support TSO4 in theory, the former lem(4) didn't have
support for TSO4, presumably because TSO4 is even more broken in the
LEM-class of MACs than the later EM ones. Still, TSO4 for LEM-class
devices was enabled as part of the conversion to iflib(9) in r311849,
causing device hangs. So revert back to the pre-r311849 behavior of
not supporting TSO4 for LEM-class at all, which includes not creating
a TSO DMA tag in iflib(9) for devices not having IFCAP_TSO4 set. [2]
- In fact, the FreeBSD TCP stack can handle a TSO size of IP_MAXPACKET
(65535) rather than FREEBSD_TSO_SIZE_MAX (65518). However, the TSO
DMA must have a maxsize of the maximum TSO size plus the size of a
VLAN header for software VLAN tagging. The iflib(9) converted em(4),
thus, first correctly sets scctx->isc_tx_tso_size_max to EM_TSO_SIZE
in em_if_attach_pre(), but later on overrides it with IP_MAXPACKET
in em_setup_interface() (apparently, left-over from pre-iflib(9)
times). So remove the later and correct iflib(9) to correctly cap
the maximum TSO size reported to the stack at IP_MAXPACKET. While at
it, let iflib(9) use if_sethwtsomax*().
This change includes the addition of isc_tso_max{seg,}size DMA engine
constraints for the TSO DMA tag to struct if_shared_ctx and letting
iflib_txsd_alloc() automatically adjust the maxsize of that tag in case
IFCAP_VLAN_MTU is supported as requested by shurd@.
- Move the if_setifheaderlen(9) call for adjusting the maximum Ethernet
header length from {ixgbe,ixl,ixlv,ixv,em}_setup_interface() to iflib(9)
so adjustment is automatically done in case IFCAP_VLAN_MTU is supported.
As a consequence, this adjustment now is also done in case of bnxt(4)
which missed it previously.
- Move the reduction of the maximum TSO segment count reported to the
stack by the number of m_pullup(9) calls (which in the worst case,
can add another mbuf and, thus, the requirement for another DMA
segment each) in the transmit path for performance reasons from
em_setup_interface() to iflib_txsd_alloc() as these pull-ups are now
done in iflib_parse_header() rather than in the no longer existing
em_xmit(). Moreover, this optimization applies to all drivers using
iflib(9) and not just em(4); all in-tree iflib(9) consumers still
have enough room to handle full size TSO packets. Also, reduce the
adjustment to the maximum number of m_pullup(9)'s now performed in
iflib_parse_header().
- Prior to the conversion of em(4)/igb(4)/lem(4) and ixl(4) to iflib(9)
in r311849 and r335338 respectively, these drivers didn't enable
IFCAP_VLAN_HWFILTER by default due to VLAN events not being passed
through by lagg(4). With iflib(9), IFCAP_VLAN_HWFILTER was turned on
by default but also lagg(4) was fixed in that regard in r203548. So
just remove the now redundant and defunct IFCAP_VLAN_HWFILTER handling
in {em,ixl,ixlv}_setup_interface().
- Nuke other redundant IFCAP_* setting in {em,ixl,ixlv}_setup_interface()
which is (more completely) already done in {em,ixl,ixlv}_if_attach_pre()
now.
- Remove some redundant/dead setting of scctx->isc_tx_csum_flags in
em_if_attach_pre().
- Remove some IFCAP_* duplicated either directly or indirectly (e. g.
via IFCAP_HWCSUM) in {EM,IGB,IXL}_CAPS.
- Don't bother to fiddle with IFCAP_HWSTATS in ixgbe(4)/ixgbev(4) as
iflib(9) adds that capability unconditionally.
- Remove some unused macros from em(4).
- Bump __FreeBSD_version as some of the above changes require the modules
of drivers using iflib(9) to be recompiled.
Okayed by: sbruno@ at 201806 DevSummit Transport Working Group [1]
Reviewed by: sbruno (earlier version), erj
PR: 219428 (part of; comment #10) [1], 220997 (part of; comment #3) [2]
Differential Revision: https://reviews.freebsd.org/D15720
2018-07-15 19:04:23 +00:00
|
|
|
MPASS(max_segs > 0);
|
2016-05-18 04:35:58 +00:00
|
|
|
} else {
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
buf_tag = txq->ift_buf_tag;
|
2016-05-18 04:35:58 +00:00
|
|
|
max_segs = scctx->isc_tx_nsegments;
|
2019-01-16 05:44:14 +00:00
|
|
|
map = txq->ift_sds.ifsd_map[pidx];
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
2017-12-05 21:00:31 +00:00
|
|
|
if ((sctx->isc_flags & IFLIB_NEED_ETHER_PAD) &&
|
|
|
|
__predict_false(m_head->m_pkthdr.len < scctx->isc_min_frame_size)) {
|
2017-12-08 18:43:31 +00:00
|
|
|
err = iflib_ether_pad(ctx->ifc_dev, m_headp, scctx->isc_min_frame_size);
|
2018-09-06 18:51:52 +00:00
|
|
|
if (err) {
|
|
|
|
DBG_COUNTER_INC(encap_txd_encap_fail);
|
2017-12-05 21:00:31 +00:00
|
|
|
return err;
|
2018-09-06 18:51:52 +00:00
|
|
|
}
|
2017-12-05 21:00:31 +00:00
|
|
|
}
|
2017-12-08 18:43:31 +00:00
|
|
|
m_head = *m_headp;
|
2017-03-13 22:53:06 +00:00
|
|
|
|
|
|
|
pkt_info_zero(&pi);
|
2017-09-16 02:41:38 +00:00
|
|
|
pi.ipi_mflags = (m_head->m_flags & (M_VLANTAG|M_BCAST|M_MCAST));
|
|
|
|
pi.ipi_pidx = pidx;
|
|
|
|
pi.ipi_qsidx = txq->ift_id;
|
2017-10-23 20:50:08 +00:00
|
|
|
pi.ipi_len = m_head->m_pkthdr.len;
|
|
|
|
pi.ipi_csum_flags = m_head->m_pkthdr.csum_flags;
|
2019-05-06 20:56:41 +00:00
|
|
|
pi.ipi_vtag = M_HAS_VLANTAG(m_head) ? m_head->m_pkthdr.ether_vtag : 0;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
/* deliberate bitwise OR to make one condition */
|
|
|
|
if (__predict_true((pi.ipi_csum_flags | pi.ipi_vtag))) {
|
2018-09-06 18:51:52 +00:00
|
|
|
if (__predict_false((err = iflib_parse_header(txq, &pi, m_headp)) != 0)) {
|
|
|
|
DBG_COUNTER_INC(encap_txd_encap_fail);
|
2016-05-18 04:35:58 +00:00
|
|
|
return (err);
|
2018-09-06 18:51:52 +00:00
|
|
|
}
|
2016-05-18 04:35:58 +00:00
|
|
|
m_head = *m_headp;
|
|
|
|
}
|
|
|
|
|
|
|
|
retry:
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
err = bus_dmamap_load_mbuf_sg(buf_tag, map, m_head, segs, &nsegs,
|
2018-11-27 20:01:05 +00:00
|
|
|
BUS_DMA_NOWAIT);
|
2016-05-18 04:35:58 +00:00
|
|
|
defrag:
|
|
|
|
if (__predict_false(err)) {
|
|
|
|
switch (err) {
|
|
|
|
case EFBIG:
|
|
|
|
/* try collapse once and defrag once */
|
2018-04-30 23:53:27 +00:00
|
|
|
if (remap == 0) {
|
2016-05-18 04:35:58 +00:00
|
|
|
m_head = m_collapse(*m_headp, M_NOWAIT, max_segs);
|
2018-04-30 23:53:27 +00:00
|
|
|
/* try defrag if collapsing fails */
|
|
|
|
if (m_head == NULL)
|
|
|
|
remap++;
|
|
|
|
}
|
2018-09-06 18:51:52 +00:00
|
|
|
if (remap == 1) {
|
|
|
|
txq->ift_mbuf_defrag++;
|
2016-05-18 04:35:58 +00:00
|
|
|
m_head = m_defrag(*m_headp, M_NOWAIT);
|
2018-09-06 18:51:52 +00:00
|
|
|
}
|
iflib: prevent possible infinite loop in iflib_encap
From Jake:
iflib_encap calls bus_dmamap_load_mbuf_sg. Upon it returning EFBIG, an
m_collapse and an m_defrag are attempted to shrink the mbuf cluster to
fit within the DMA segment limitations.
However, if we call m_defrag, and then bus_dmamap_load_mbuf_sg returns
EFBIG on the now defragmented mbuf, we will continuously re-call
bus_dmamap_load_mbuf_sg over and over.
This happens because m_head isn't NULL, and remap is >1, so we don't try
to m_collapse or m_defrag again. The only way we exit the loop is if
m_head is NULL. However, m_head can't be modified by the call to
bus_dmamap_load_mbuf_sg, because we don't pass it as a double pointer.
I believe this will be an incredibly rare occurrence, because it is
unlikely that bus_dmamap_load_mbuf_sg will actually fail on the second
defragment with an EFBIG error. However, it still seems like
a possibility that we should account for.
Fix the exit check to ensure that if remap is >1, we will also exit,
even if m_head is not NULL.
Submitted by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed by: shurd@, gallatin@
MFC after: 1 week
Sponsored by: Intel Corporation
Differential Revision: https://reviews.freebsd.org/D19468
2019-03-19 17:49:03 +00:00
|
|
|
/*
|
|
|
|
* remap should never be >1 unless bus_dmamap_load_mbuf_sg
|
|
|
|
* failed to map an mbuf that was run through m_defrag
|
|
|
|
*/
|
|
|
|
MPASS(remap <= 1);
|
|
|
|
if (__predict_false(m_head == NULL || remap > 1))
|
2016-05-18 04:35:58 +00:00
|
|
|
goto defrag_failed;
|
iflib: prevent possible infinite loop in iflib_encap
From Jake:
iflib_encap calls bus_dmamap_load_mbuf_sg. Upon it returning EFBIG, an
m_collapse and an m_defrag are attempted to shrink the mbuf cluster to
fit within the DMA segment limitations.
However, if we call m_defrag, and then bus_dmamap_load_mbuf_sg returns
EFBIG on the now defragmented mbuf, we will continuously re-call
bus_dmamap_load_mbuf_sg over and over.
This happens because m_head isn't NULL, and remap is >1, so we don't try
to m_collapse or m_defrag again. The only way we exit the loop is if
m_head is NULL. However, m_head can't be modified by the call to
bus_dmamap_load_mbuf_sg, because we don't pass it as a double pointer.
I believe this will be an incredibly rare occurrence, because it is
unlikely that bus_dmamap_load_mbuf_sg will actually fail on the second
defragment with an EFBIG error. However, it still seems like
a possibility that we should account for.
Fix the exit check to ensure that if remap is >1, we will also exit,
even if m_head is not NULL.
Submitted by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed by: shurd@, gallatin@
MFC after: 1 week
Sponsored by: Intel Corporation
Differential Revision: https://reviews.freebsd.org/D19468
2019-03-19 17:49:03 +00:00
|
|
|
remap++;
|
2016-05-18 04:35:58 +00:00
|
|
|
*m_headp = m_head;
|
|
|
|
goto retry;
|
|
|
|
break;
|
|
|
|
case ENOMEM:
|
|
|
|
txq->ift_no_tx_dma_setup++;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
txq->ift_no_tx_dma_setup++;
|
|
|
|
m_freem(*m_headp);
|
|
|
|
DBG_COUNTER_INC(tx_frees);
|
|
|
|
*m_headp = NULL;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
txq->ift_map_failed++;
|
|
|
|
DBG_COUNTER_INC(encap_load_mbuf_fail);
|
2018-09-06 18:51:52 +00:00
|
|
|
DBG_COUNTER_INC(encap_txd_encap_fail);
|
2016-05-18 04:35:58 +00:00
|
|
|
return (err);
|
|
|
|
}
|
2018-11-27 20:01:05 +00:00
|
|
|
ifsd_m[pidx] = m_head;
|
2016-05-18 04:35:58 +00:00
|
|
|
/*
|
|
|
|
* XXX assumes a 1 to 1 relationship between segments and
|
|
|
|
* descriptors - this does not hold true on all drivers, e.g.
|
|
|
|
* cxgb
|
|
|
|
*/
|
|
|
|
if (__predict_false(nsegs + 2 > TXQ_AVAIL(txq))) {
|
|
|
|
txq->ift_no_desc_avail++;
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
bus_dmamap_unload(buf_tag, map);
|
2016-05-18 04:35:58 +00:00
|
|
|
DBG_COUNTER_INC(encap_txq_avail_fail);
|
2018-09-06 18:51:52 +00:00
|
|
|
DBG_COUNTER_INC(encap_txd_encap_fail);
|
2016-08-12 21:29:44 +00:00
|
|
|
if ((txq->ift_task.gt_task.ta_flags & TASK_ENQUEUED) == 0)
|
2016-05-18 04:35:58 +00:00
|
|
|
GROUPTASK_ENQUEUE(&txq->ift_task);
|
|
|
|
return (ENOBUFS);
|
|
|
|
}
|
2017-03-13 22:53:06 +00:00
|
|
|
/*
|
|
|
|
* On Intel cards we can greatly reduce the number of TX interrupts
|
|
|
|
* we see by only setting report status on every Nth descriptor.
|
|
|
|
* However, this also means that the driver will need to keep track
|
|
|
|
* of the descriptors that RS was set on to check them for the DD bit.
|
|
|
|
*/
|
|
|
|
txq->ift_rs_pending += nsegs + 1;
|
|
|
|
if (txq->ift_rs_pending > TXQ_MAX_RS_DEFERRED(txq) ||
|
2018-05-07 18:11:22 +00:00
|
|
|
iflib_no_tx_batch || (TXQ_AVAIL(txq) - nsegs) <= MAX_TX_DESC(ctx) + 2) {
|
2017-03-13 22:53:06 +00:00
|
|
|
pi.ipi_flags |= IPI_TX_INTR;
|
|
|
|
txq->ift_rs_pending = 0;
|
|
|
|
}
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
pi.ipi_segs = segs;
|
|
|
|
pi.ipi_nsegs = nsegs;
|
|
|
|
|
2016-08-12 21:29:44 +00:00
|
|
|
MPASS(pidx >= 0 && pidx < txq->ift_size);
|
2016-05-18 04:35:58 +00:00
|
|
|
#ifdef PKT_DEBUG
|
|
|
|
print_pkt(&pi);
|
|
|
|
#endif
|
|
|
|
if ((err = ctx->isc_txd_encap(ctx->ifc_softc, &pi)) == 0) {
|
Further correct and optimize the bus_dma(9) usage of iflib(4):
o Correct the obvious bugs in the netmap(4) parts:
- No longer check for the existence of DMA maps as bus_dma(9)
is used unconditionally in iflib(4) since r341095.
- Supply the correct DMA tag and map pairs to bus_dma(9)
functions (see also the commit message of r343753).
- In iflib_netmap_timer_adjust(), add synchronization of the
TX descriptors before calling the ift_txd_credits_update
method as the latter evaluates the TX descriptors possibly
updated by the MAC.
- In _task_fn_tx(), wrap the netmap(4)-specific bits in
#ifdef DEV_NETMAP just as done in _task_fn_admin() and
_task_fn_rx() respectively.
o In iflib_fast_intr_rxtx(), synchronize the TX rather than
the RX descriptors before calling the ift_txd_credits_update
method (see also above).
o There's no need to synchronize an RX buffer that is going to
be recycled in iflib_rxd_pkt_get(), yet; it's sufficient to
do that as late as passing RX buffers to the MAC via the
ift_rxd_refill method. Hence, combine that synchronization
with the synchronization of new buffers into a common spot
in _iflib_fl_refill().
o There's no need to synchronize the RX descriptors of a free
list in preparation of the MAC updating their statuses with
every invocation of rxd_frag_to_sd(); it's enough to do this
once before handing control over to the MAC, i. e. before
calling ift_rxd_flush method in _iflib_fl_refill(), which
already performs the necessary synchronization.
o Given that the ift_rxd_available method evaluates the RX
descriptors which possibly have been altered by the MAC,
synchronize as appropriate beforehand. Most notably this
is now done in iflib_rxd_avail(), which in turn means that
we don't need to issue the same synchronization yet again
before calling the ift_rxd_pkt_get method in iflib_rxeof().
o In iflib_txd_db_check(), synchronize the TX descriptors
before handing them over to the MAC for transmission via
the ift_txd_flush method.
o In iflib_encap(), move the TX buffer synchronization after
the invocation of the ift_txd_encap() method. If the MAC
driver fails to encapsulate the packet and we retry with
a defragmented mbuf chain or finally fail, the cycles for
TX buffer synchronization have been wasted. Synchronizing
afterwards matches what non-iflib(4) drivers typically do
and is sufficient as the MAC will not actually start with
the transmission before - in this case - the ift_txd_flush
method is called.
Moreover, for the latter reason the synchronization of the
TX descriptors in iflib_encap() can go as it's enough to
synchronize them before passing control over to the MAC by
issuing the ift_txd_flush() method (see above).
o In iflib_txq_can_drain(), only synchronize TX descriptors
if the ift_txd_credits_update method accessing these is
actually called.
Differential Revision: https://reviews.freebsd.org/D19081
2019-02-12 21:08:44 +00:00
|
|
|
bus_dmamap_sync(buf_tag, map, BUS_DMASYNC_PREWRITE);
|
2016-05-18 04:35:58 +00:00
|
|
|
DBG_COUNTER_INC(tx_encap);
|
2017-03-13 22:53:06 +00:00
|
|
|
MPASS(pi.ipi_new_pidx < txq->ift_size);
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
ndesc = pi.ipi_new_pidx - pi.ipi_pidx;
|
|
|
|
if (pi.ipi_new_pidx < pi.ipi_pidx) {
|
2016-08-12 21:29:44 +00:00
|
|
|
ndesc += txq->ift_size;
|
2016-05-18 04:35:58 +00:00
|
|
|
txq->ift_gen = 1;
|
|
|
|
}
|
2017-01-02 00:56:33 +00:00
|
|
|
/*
|
|
|
|
* drivers can need as many as
|
|
|
|
* two sentinels
|
|
|
|
*/
|
|
|
|
MPASS(ndesc <= pi.ipi_nsegs + 2);
|
2016-05-18 04:35:58 +00:00
|
|
|
MPASS(pi.ipi_new_pidx != pidx);
|
|
|
|
MPASS(ndesc > 0);
|
|
|
|
txq->ift_in_use += ndesc;
|
2020-12-19 01:08:33 +00:00
|
|
|
txq->ift_db_pending += ndesc;
|
2017-03-13 22:53:06 +00:00
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
/*
|
|
|
|
* We update the last software descriptor again here because there may
|
|
|
|
* be a sentinel and/or there may be more mbufs than segments
|
|
|
|
*/
|
|
|
|
txq->ift_pidx = pi.ipi_new_pidx;
|
|
|
|
txq->ift_npending += pi.ipi_ndescs;
|
2018-04-30 23:53:27 +00:00
|
|
|
} else {
|
2016-08-12 21:29:44 +00:00
|
|
|
*m_headp = m_head = iflib_remove_mbuf(txq);
|
2018-04-30 23:53:27 +00:00
|
|
|
if (err == EFBIG) {
|
|
|
|
txq->ift_txd_encap_efbig++;
|
|
|
|
if (remap < 2) {
|
|
|
|
remap = 1;
|
|
|
|
goto defrag;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
goto defrag_failed;
|
|
|
|
}
|
2018-09-06 18:51:52 +00:00
|
|
|
/*
|
|
|
|
* err can't possibly be non-zero here, so we don't need to test it
|
|
|
|
* to see if we need to DBG_COUNTER_INC(encap_txd_encap_fail).
|
|
|
|
*/
|
2016-05-18 04:35:58 +00:00
|
|
|
return (err);
|
|
|
|
|
|
|
|
defrag_failed:
|
|
|
|
txq->ift_mbuf_defrag_failed++;
|
|
|
|
txq->ift_map_failed++;
|
|
|
|
m_freem(*m_headp);
|
|
|
|
DBG_COUNTER_INC(tx_frees);
|
|
|
|
*m_headp = NULL;
|
2018-09-06 18:51:52 +00:00
|
|
|
DBG_COUNTER_INC(encap_txd_encap_fail);
|
2016-05-18 04:35:58 +00:00
|
|
|
return (ENOMEM);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
iflib_tx_desc_free(iflib_txq_t txq, int n)
|
|
|
|
{
|
|
|
|
uint32_t qsize, cidx, mask, gen;
|
|
|
|
struct mbuf *m, **ifsd_m;
|
2017-03-13 22:53:06 +00:00
|
|
|
bool do_prefetch;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
cidx = txq->ift_cidx;
|
|
|
|
gen = txq->ift_gen;
|
2016-08-12 21:29:44 +00:00
|
|
|
qsize = txq->ift_size;
|
2016-05-18 04:35:58 +00:00
|
|
|
mask = qsize-1;
|
|
|
|
ifsd_m = txq->ift_sds.ifsd_m;
|
2017-03-13 22:53:06 +00:00
|
|
|
do_prefetch = (txq->ift_ctx->ifc_flags & IFC_PREFETCH);
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2018-05-06 00:57:52 +00:00
|
|
|
while (n-- > 0) {
|
2017-03-13 22:53:06 +00:00
|
|
|
if (do_prefetch) {
|
|
|
|
prefetch(ifsd_m[(cidx + 3) & mask]);
|
|
|
|
prefetch(ifsd_m[(cidx + 4) & mask]);
|
|
|
|
}
|
2018-11-27 20:01:05 +00:00
|
|
|
if ((m = ifsd_m[cidx]) != NULL) {
|
2016-05-18 04:35:58 +00:00
|
|
|
prefetch(&ifsd_m[(cidx + CACHE_PTR_INCREMENT) & mask]);
|
2019-01-16 05:44:14 +00:00
|
|
|
if (m->m_pkthdr.csum_flags & CSUM_TSO) {
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
bus_dmamap_sync(txq->ift_tso_buf_tag,
|
2019-01-16 05:44:14 +00:00
|
|
|
txq->ift_sds.ifsd_tso_map[cidx],
|
|
|
|
BUS_DMASYNC_POSTWRITE);
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
bus_dmamap_unload(txq->ift_tso_buf_tag,
|
2019-01-16 05:44:14 +00:00
|
|
|
txq->ift_sds.ifsd_tso_map[cidx]);
|
|
|
|
} else {
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
bus_dmamap_sync(txq->ift_buf_tag,
|
2019-01-16 05:44:14 +00:00
|
|
|
txq->ift_sds.ifsd_map[cidx],
|
|
|
|
BUS_DMASYNC_POSTWRITE);
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
bus_dmamap_unload(txq->ift_buf_tag,
|
2019-01-16 05:44:14 +00:00
|
|
|
txq->ift_sds.ifsd_map[cidx]);
|
|
|
|
}
|
2018-11-27 20:01:05 +00:00
|
|
|
/* XXX we don't support any drivers that batch packets yet */
|
|
|
|
MPASS(m->m_nextpkt == NULL);
|
|
|
|
m_freem(m);
|
|
|
|
ifsd_m[cidx] = NULL;
|
2016-05-18 04:35:58 +00:00
|
|
|
#if MEMORY_LOGGING
|
2018-11-27 20:01:05 +00:00
|
|
|
txq->ift_dequeued++;
|
2016-05-18 04:35:58 +00:00
|
|
|
#endif
|
2018-11-27 20:01:05 +00:00
|
|
|
DBG_COUNTER_INC(tx_frees);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
if (__predict_false(++cidx == qsize)) {
|
|
|
|
cidx = 0;
|
|
|
|
gen = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
txq->ift_cidx = cidx;
|
|
|
|
txq->ift_gen = gen;
|
|
|
|
}
|
|
|
|
|
|
|
|
static __inline int
|
|
|
|
iflib_completed_tx_reclaim(iflib_txq_t txq, int thresh)
|
|
|
|
{
|
|
|
|
int reclaim;
|
|
|
|
if_ctx_t ctx = txq->ift_ctx;
|
|
|
|
|
|
|
|
KASSERT(thresh >= 0, ("invalid threshold to reclaim"));
|
|
|
|
MPASS(thresh /*+ MAX_TX_DESC(txq->ift_ctx) */ < txq->ift_size);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Need a rate-limiting check so that this isn't called every time
|
|
|
|
*/
|
|
|
|
iflib_tx_credits_update(ctx, txq);
|
|
|
|
reclaim = DESC_RECLAIMABLE(txq);
|
|
|
|
|
|
|
|
if (reclaim <= thresh /* + MAX_TX_DESC(txq->ift_ctx) */) {
|
|
|
|
#ifdef INVARIANTS
|
|
|
|
if (iflib_verbose_debug) {
|
|
|
|
printf("%s processed=%ju cleaned=%ju tx_nsegments=%d reclaim=%d thresh=%d\n", __FUNCTION__,
|
|
|
|
txq->ift_processed, txq->ift_cleaned, txq->ift_ctx->ifc_softc_ctx.isc_tx_nsegments,
|
|
|
|
reclaim, thresh);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
iflib_tx_desc_free(txq, reclaim);
|
|
|
|
txq->ift_cleaned += reclaim;
|
|
|
|
txq->ift_in_use -= reclaim;
|
|
|
|
|
|
|
|
return (reclaim);
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct mbuf **
|
2017-03-13 22:53:06 +00:00
|
|
|
_ring_peek_one(struct ifmp_ring *r, int cidx, int offset, int remaining)
|
2016-05-18 04:35:58 +00:00
|
|
|
{
|
2017-03-13 22:53:06 +00:00
|
|
|
int next, size;
|
|
|
|
struct mbuf **items;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2017-03-13 22:53:06 +00:00
|
|
|
size = r->size;
|
|
|
|
next = (cidx + CACHE_PTR_INCREMENT) & (size-1);
|
|
|
|
items = __DEVOLATILE(struct mbuf **, &r->items[0]);
|
|
|
|
|
|
|
|
prefetch(items[(cidx + offset) & (size-1)]);
|
|
|
|
if (remaining > 1) {
|
2017-10-23 20:50:08 +00:00
|
|
|
prefetch2cachelines(&items[next]);
|
|
|
|
prefetch2cachelines(items[(cidx + offset + 1) & (size-1)]);
|
|
|
|
prefetch2cachelines(items[(cidx + offset + 2) & (size-1)]);
|
|
|
|
prefetch2cachelines(items[(cidx + offset + 3) & (size-1)]);
|
2017-03-13 22:53:06 +00:00
|
|
|
}
|
|
|
|
return (__DEVOLATILE(struct mbuf **, &r->items[(cidx + offset) & (size-1)]));
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
iflib_txq_check_drain(iflib_txq_t txq, int budget)
|
|
|
|
{
|
|
|
|
|
2017-03-13 22:53:06 +00:00
|
|
|
ifmp_ring_check_drainage(txq->ift_br, budget);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static uint32_t
|
|
|
|
iflib_txq_can_drain(struct ifmp_ring *r)
|
|
|
|
{
|
|
|
|
iflib_txq_t txq = r->cookie;
|
|
|
|
if_ctx_t ctx = txq->ift_ctx;
|
|
|
|
|
Further correct and optimize the bus_dma(9) usage of iflib(4):
o Correct the obvious bugs in the netmap(4) parts:
- No longer check for the existence of DMA maps as bus_dma(9)
is used unconditionally in iflib(4) since r341095.
- Supply the correct DMA tag and map pairs to bus_dma(9)
functions (see also the commit message of r343753).
- In iflib_netmap_timer_adjust(), add synchronization of the
TX descriptors before calling the ift_txd_credits_update
method as the latter evaluates the TX descriptors possibly
updated by the MAC.
- In _task_fn_tx(), wrap the netmap(4)-specific bits in
#ifdef DEV_NETMAP just as done in _task_fn_admin() and
_task_fn_rx() respectively.
o In iflib_fast_intr_rxtx(), synchronize the TX rather than
the RX descriptors before calling the ift_txd_credits_update
method (see also above).
o There's no need to synchronize an RX buffer that is going to
be recycled in iflib_rxd_pkt_get(), yet; it's sufficient to
do that as late as passing RX buffers to the MAC via the
ift_rxd_refill method. Hence, combine that synchronization
with the synchronization of new buffers into a common spot
in _iflib_fl_refill().
o There's no need to synchronize the RX descriptors of a free
list in preparation of the MAC updating their statuses with
every invocation of rxd_frag_to_sd(); it's enough to do this
once before handing control over to the MAC, i. e. before
calling ift_rxd_flush method in _iflib_fl_refill(), which
already performs the necessary synchronization.
o Given that the ift_rxd_available method evaluates the RX
descriptors which possibly have been altered by the MAC,
synchronize as appropriate beforehand. Most notably this
is now done in iflib_rxd_avail(), which in turn means that
we don't need to issue the same synchronization yet again
before calling the ift_rxd_pkt_get method in iflib_rxeof().
o In iflib_txd_db_check(), synchronize the TX descriptors
before handing them over to the MAC for transmission via
the ift_txd_flush method.
o In iflib_encap(), move the TX buffer synchronization after
the invocation of the ift_txd_encap() method. If the MAC
driver fails to encapsulate the packet and we retry with
a defragmented mbuf chain or finally fail, the cycles for
TX buffer synchronization have been wasted. Synchronizing
afterwards matches what non-iflib(4) drivers typically do
and is sufficient as the MAC will not actually start with
the transmission before - in this case - the ift_txd_flush
method is called.
Moreover, for the latter reason the synchronization of the
TX descriptors in iflib_encap() can go as it's enough to
synchronize them before passing control over to the MAC by
issuing the ift_txd_flush() method (see above).
o In iflib_txq_can_drain(), only synchronize TX descriptors
if the ift_txd_credits_update method accessing these is
actually called.
Differential Revision: https://reviews.freebsd.org/D19081
2019-02-12 21:08:44 +00:00
|
|
|
if (TXQ_AVAIL(txq) > MAX_TX_DESC(ctx) + 2)
|
|
|
|
return (1);
|
2019-01-16 05:44:14 +00:00
|
|
|
bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
|
|
|
|
BUS_DMASYNC_POSTREAD);
|
Further correct and optimize the bus_dma(9) usage of iflib(4):
o Correct the obvious bugs in the netmap(4) parts:
- No longer check for the existence of DMA maps as bus_dma(9)
is used unconditionally in iflib(4) since r341095.
- Supply the correct DMA tag and map pairs to bus_dma(9)
functions (see also the commit message of r343753).
- In iflib_netmap_timer_adjust(), add synchronization of the
TX descriptors before calling the ift_txd_credits_update
method as the latter evaluates the TX descriptors possibly
updated by the MAC.
- In _task_fn_tx(), wrap the netmap(4)-specific bits in
#ifdef DEV_NETMAP just as done in _task_fn_admin() and
_task_fn_rx() respectively.
o In iflib_fast_intr_rxtx(), synchronize the TX rather than
the RX descriptors before calling the ift_txd_credits_update
method (see also above).
o There's no need to synchronize an RX buffer that is going to
be recycled in iflib_rxd_pkt_get(), yet; it's sufficient to
do that as late as passing RX buffers to the MAC via the
ift_rxd_refill method. Hence, combine that synchronization
with the synchronization of new buffers into a common spot
in _iflib_fl_refill().
o There's no need to synchronize the RX descriptors of a free
list in preparation of the MAC updating their statuses with
every invocation of rxd_frag_to_sd(); it's enough to do this
once before handing control over to the MAC, i. e. before
calling ift_rxd_flush method in _iflib_fl_refill(), which
already performs the necessary synchronization.
o Given that the ift_rxd_available method evaluates the RX
descriptors which possibly have been altered by the MAC,
synchronize as appropriate beforehand. Most notably this
is now done in iflib_rxd_avail(), which in turn means that
we don't need to issue the same synchronization yet again
before calling the ift_rxd_pkt_get method in iflib_rxeof().
o In iflib_txd_db_check(), synchronize the TX descriptors
before handing them over to the MAC for transmission via
the ift_txd_flush method.
o In iflib_encap(), move the TX buffer synchronization after
the invocation of the ift_txd_encap() method. If the MAC
driver fails to encapsulate the packet and we retry with
a defragmented mbuf chain or finally fail, the cycles for
TX buffer synchronization have been wasted. Synchronizing
afterwards matches what non-iflib(4) drivers typically do
and is sufficient as the MAC will not actually start with
the transmission before - in this case - the ift_txd_flush
method is called.
Moreover, for the latter reason the synchronization of the
TX descriptors in iflib_encap() can go as it's enough to
synchronize them before passing control over to the MAC by
issuing the ift_txd_flush() method (see above).
o In iflib_txq_can_drain(), only synchronize TX descriptors
if the ift_txd_credits_update method accessing these is
actually called.
Differential Revision: https://reviews.freebsd.org/D19081
2019-02-12 21:08:44 +00:00
|
|
|
return (ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id,
|
|
|
|
false));
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static uint32_t
|
|
|
|
iflib_txq_drain(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
|
|
|
|
{
|
|
|
|
iflib_txq_t txq = r->cookie;
|
|
|
|
if_ctx_t ctx = txq->ift_ctx;
|
2019-05-06 20:56:41 +00:00
|
|
|
if_t ifp = ctx->ifc_ifp;
|
2019-06-26 15:28:21 +00:00
|
|
|
struct mbuf *m, **mp;
|
2020-12-19 01:08:33 +00:00
|
|
|
int avail, bytes_sent, skipped, count, err, i;
|
|
|
|
int mcast_sent, pkt_sent, reclaimed;
|
2019-06-26 15:28:21 +00:00
|
|
|
bool do_prefetch, rang, ring;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
if (__predict_false(!(if_getdrvflags(ifp) & IFF_DRV_RUNNING) ||
|
|
|
|
!LINK_ACTIVE(ctx))) {
|
|
|
|
DBG_COUNTER_INC(txq_drain_notready);
|
|
|
|
return (0);
|
|
|
|
}
|
2017-03-13 22:53:06 +00:00
|
|
|
reclaimed = iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
|
2020-12-19 01:08:33 +00:00
|
|
|
rang = iflib_txd_db_check(txq, reclaimed && txq->ift_db_pending);
|
2016-05-18 04:35:58 +00:00
|
|
|
avail = IDXDIFF(pidx, cidx, r->size);
|
2020-12-19 01:08:33 +00:00
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
if (__predict_false(ctx->ifc_flags & IFC_QFLUSH)) {
|
2020-12-19 01:08:33 +00:00
|
|
|
/*
|
|
|
|
* The driver is unloading so we need to free all pending packets.
|
|
|
|
*/
|
2016-05-18 04:35:58 +00:00
|
|
|
DBG_COUNTER_INC(txq_drain_flushing);
|
|
|
|
for (i = 0; i < avail; i++) {
|
2018-08-29 16:21:34 +00:00
|
|
|
if (__predict_true(r->items[(cidx + i) & (r->size-1)] != (void *)txq))
|
2020-11-11 18:00:06 +00:00
|
|
|
m_freem(r->items[(cidx + i) & (r->size-1)]);
|
2016-05-18 04:35:58 +00:00
|
|
|
r->items[(cidx + i) & (r->size-1)] = NULL;
|
|
|
|
}
|
|
|
|
return (avail);
|
|
|
|
}
|
2017-03-13 22:53:06 +00:00
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
if (__predict_false(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE)) {
|
|
|
|
txq->ift_qstatus = IFLIB_QUEUE_IDLE;
|
|
|
|
CALLOUT_LOCK(txq);
|
|
|
|
callout_stop(&txq->ift_timer);
|
|
|
|
CALLOUT_UNLOCK(txq);
|
|
|
|
DBG_COUNTER_INC(txq_drain_oactive);
|
|
|
|
return (0);
|
|
|
|
}
|
2020-12-19 01:08:33 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we've reclaimed any packets this queue cannot be hung.
|
|
|
|
*/
|
2017-03-13 22:53:06 +00:00
|
|
|
if (reclaimed)
|
|
|
|
txq->ift_qstatus = IFLIB_QUEUE_IDLE;
|
2020-12-19 01:08:33 +00:00
|
|
|
skipped = mcast_sent = bytes_sent = pkt_sent = 0;
|
2016-05-18 04:35:58 +00:00
|
|
|
count = MIN(avail, TX_BATCH_SIZE);
|
2016-11-18 04:19:21 +00:00
|
|
|
#ifdef INVARIANTS
|
|
|
|
if (iflib_verbose_debug)
|
|
|
|
printf("%s avail=%d ifc_flags=%x txq_avail=%d ", __FUNCTION__,
|
|
|
|
avail, ctx->ifc_flags, TXQ_AVAIL(txq));
|
|
|
|
#endif
|
2017-03-13 22:53:06 +00:00
|
|
|
do_prefetch = (ctx->ifc_flags & IFC_PREFETCH);
|
2018-05-04 18:57:05 +00:00
|
|
|
err = 0;
|
2020-12-19 01:08:33 +00:00
|
|
|
for (i = 0; i < count && TXQ_AVAIL(txq) >= MAX_TX_DESC(ctx) + 2; i++) {
|
2018-05-04 18:57:05 +00:00
|
|
|
int rem = do_prefetch ? count - i : 0;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2017-03-13 22:53:06 +00:00
|
|
|
mp = _ring_peek_one(r, cidx, i, rem);
|
2016-11-18 04:19:21 +00:00
|
|
|
MPASS(mp != NULL && *mp != NULL);
|
2020-12-19 01:08:33 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Completion interrupts will use the address of the txq
|
|
|
|
* as a sentinel to enqueue _something_ in order to acquire
|
|
|
|
* the lock on the mp_ring (there's no direct lock call).
|
|
|
|
* We obviously whave to check for these sentinel cases
|
|
|
|
* and skip them.
|
|
|
|
*/
|
2017-03-13 22:53:06 +00:00
|
|
|
if (__predict_false(*mp == (struct mbuf *)txq)) {
|
2020-12-19 01:08:33 +00:00
|
|
|
skipped++;
|
2017-03-13 22:53:06 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
err = iflib_encap(txq, mp);
|
|
|
|
if (__predict_false(err)) {
|
2016-11-18 04:19:21 +00:00
|
|
|
/* no room - bail out */
|
2017-03-13 22:53:06 +00:00
|
|
|
if (err == ENOBUFS)
|
|
|
|
break;
|
2020-12-19 01:08:33 +00:00
|
|
|
skipped++;
|
2016-11-18 04:19:21 +00:00
|
|
|
/* we can't send this packet - skip it */
|
2016-05-18 04:35:58 +00:00
|
|
|
continue;
|
2016-11-18 04:19:21 +00:00
|
|
|
}
|
2016-05-18 04:35:58 +00:00
|
|
|
pkt_sent++;
|
|
|
|
m = *mp;
|
|
|
|
DBG_COUNTER_INC(tx_sent);
|
|
|
|
bytes_sent += m->m_pkthdr.len;
|
2017-03-13 22:53:06 +00:00
|
|
|
mcast_sent += !!(m->m_flags & M_MCAST);
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2017-03-13 22:53:06 +00:00
|
|
|
if (__predict_false(!(ifp->if_drv_flags & IFF_DRV_RUNNING)))
|
2016-05-18 04:35:58 +00:00
|
|
|
break;
|
2020-12-19 01:08:33 +00:00
|
|
|
ETHER_BPF_MTAP(ifp, m);
|
|
|
|
rang = iflib_txd_db_check(txq, false);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
2017-03-13 22:53:06 +00:00
|
|
|
/* deliberate use of bitwise or to avoid gratuitous short-circuit */
|
2020-12-19 01:08:33 +00:00
|
|
|
ring = rang ? false : (iflib_min_tx_latency | err);
|
|
|
|
iflib_txd_db_check(txq, ring);
|
2016-05-18 04:35:58 +00:00
|
|
|
if_inc_counter(ifp, IFCOUNTER_OBYTES, bytes_sent);
|
|
|
|
if_inc_counter(ifp, IFCOUNTER_OPACKETS, pkt_sent);
|
|
|
|
if (mcast_sent)
|
|
|
|
if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast_sent);
|
2016-11-18 04:19:21 +00:00
|
|
|
#ifdef INVARIANTS
|
|
|
|
if (iflib_verbose_debug)
|
2020-12-19 01:08:33 +00:00
|
|
|
printf("consumed=%d\n", skipped + pkt_sent);
|
2016-11-18 04:19:21 +00:00
|
|
|
#endif
|
2020-12-19 01:08:33 +00:00
|
|
|
return (skipped + pkt_sent);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
2016-11-18 04:19:21 +00:00
|
|
|
/*
 * mp_ring can_drain callback that unconditionally answers "yes".
 * The ring argument is not examined.
 */
static uint32_t
iflib_txq_drain_always(struct ifmp_ring *r)
{

	return (1);
}
|
|
|
|
|
|
|
|
static uint32_t
|
|
|
|
iflib_txq_drain_free(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
|
|
|
|
{
|
|
|
|
int i, avail;
|
|
|
|
struct mbuf **mp;
|
|
|
|
iflib_txq_t txq;
|
|
|
|
|
|
|
|
txq = r->cookie;
|
|
|
|
|
|
|
|
txq->ift_qstatus = IFLIB_QUEUE_IDLE;
|
|
|
|
CALLOUT_LOCK(txq);
|
|
|
|
callout_stop(&txq->ift_timer);
|
|
|
|
CALLOUT_UNLOCK(txq);
|
|
|
|
|
|
|
|
avail = IDXDIFF(pidx, cidx, r->size);
|
|
|
|
for (i = 0; i < avail; i++) {
|
2017-03-13 22:53:06 +00:00
|
|
|
mp = _ring_peek_one(r, cidx, i, avail - i);
|
|
|
|
if (__predict_false(*mp == (struct mbuf *)txq))
|
|
|
|
continue;
|
2016-11-18 04:19:21 +00:00
|
|
|
m_freem(*mp);
|
2018-09-06 18:51:52 +00:00
|
|
|
DBG_COUNTER_INC(tx_frees);
|
2016-11-18 04:19:21 +00:00
|
|
|
}
|
|
|
|
MPASS(ifmp_ring_is_stalled(r) == 0);
|
|
|
|
return (avail);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
iflib_ifmp_purge(iflib_txq_t txq)
|
|
|
|
{
|
|
|
|
struct ifmp_ring *r;
|
|
|
|
|
2017-03-13 22:53:06 +00:00
|
|
|
r = txq->ift_br;
|
2016-11-18 04:19:21 +00:00
|
|
|
r->drain = iflib_txq_drain_free;
|
|
|
|
r->can_drain = iflib_txq_drain_always;
|
|
|
|
|
|
|
|
ifmp_ring_check_drainage(r, r->size);
|
|
|
|
|
|
|
|
r->drain = iflib_txq_drain;
|
|
|
|
r->can_drain = iflib_txq_can_drain;
|
|
|
|
}
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
static void
|
2016-08-12 21:29:44 +00:00
|
|
|
_task_fn_tx(void *context)
|
2016-05-18 04:35:58 +00:00
|
|
|
{
|
|
|
|
iflib_txq_t txq = context;
|
|
|
|
if_ctx_t ctx = txq->ift_ctx;
|
2019-02-12 22:33:17 +00:00
|
|
|
if_t ifp = ctx->ifc_ifp;
|
2018-07-20 17:45:26 +00:00
|
|
|
int abdicate = ctx->ifc_sysctl_tx_abdicate;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2017-01-02 00:56:33 +00:00
|
|
|
#ifdef IFLIB_DIAGNOSTICS
|
|
|
|
txq->ift_cpu_exec_count[curcpu]++;
|
|
|
|
#endif
|
2020-06-25 19:35:43 +00:00
|
|
|
if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING))
|
2016-05-18 04:35:58 +00:00
|
|
|
return;
|
Further correct and optimize the bus_dma(9) usage of iflib(4):
o Correct the obvious bugs in the netmap(4) parts:
- No longer check for the existence of DMA maps as bus_dma(9)
is used unconditionally in iflib(4) since r341095.
- Supply the correct DMA tag and map pairs to bus_dma(9)
functions (see also the commit message of r343753).
- In iflib_netmap_timer_adjust(), add synchronization of the
TX descriptors before calling the ift_txd_credits_update
method as the latter evaluates the TX descriptors possibly
updated by the MAC.
- In _task_fn_tx(), wrap the netmap(4)-specific bits in
#ifdef DEV_NETMAP just as done in _task_fn_admin() and
_task_fn_rx() respectively.
o In iflib_fast_intr_rxtx(), synchronize the TX rather than
the RX descriptors before calling the ift_txd_credits_update
method (see also above).
o There's no need to synchronize an RX buffer that is going to
be recycled in iflib_rxd_pkt_get(), yet; it's sufficient to
do that as late as passing RX buffers to the MAC via the
ift_rxd_refill method. Hence, combine that synchronization
with the synchronization of new buffers into a common spot
in _iflib_fl_refill().
o There's no need to synchronize the RX descriptors of a free
list in preparation of the MAC updating their statuses with
every invocation of rxd_frag_to_sd(); it's enough to do this
once before handing control over to the MAC, i. e. before
calling ift_rxd_flush method in _iflib_fl_refill(), which
already performs the necessary synchronization.
o Given that the ift_rxd_available method evaluates the RX
descriptors which possibly have been altered by the MAC,
synchronize as appropriate beforehand. Most notably this
is now done in iflib_rxd_avail(), which in turn means that
we don't need to issue the same synchronization yet again
before calling the ift_rxd_pkt_get method in iflib_rxeof().
o In iflib_txd_db_check(), synchronize the TX descriptors
before handing them over to the MAC for transmission via
the ift_txd_flush method.
o In iflib_encap(), move the TX buffer synchronization after
the invocation of the ift_txd_encap() method. If the MAC
driver fails to encapsulate the packet and we retry with
a defragmented mbuf chain or finally fail, the cycles for
TX buffer synchronization have been wasted. Synchronizing
afterwards matches what non-iflib(4) drivers typically do
and is sufficient as the MAC will not actually start with
the transmission before - in this case - the ift_txd_flush
method is called.
Moreover, for the latter reason the synchronization of the
TX descriptors in iflib_encap() can go as it's enough to
synchronize them before passing control over to the MAC by
issuing the ift_txd_flush() method (see above).
o In iflib_txq_can_drain(), only synchronize TX descriptors
if the ift_txd_credits_update method accessing these is
actually called.
Differential Revision: https://reviews.freebsd.org/D19081
2019-02-12 21:08:44 +00:00
|
|
|
#ifdef DEV_NETMAP
|
2020-06-25 19:35:43 +00:00
|
|
|
if ((if_getcapenable(ifp) & IFCAP_NETMAP) &&
|
|
|
|
netmap_tx_irq(ifp, txq->ift_id))
|
|
|
|
goto skip_ifmp;
|
Further correct and optimize the bus_dma(9) usage of iflib(4):
o Correct the obvious bugs in the netmap(4) parts:
- No longer check for the existence of DMA maps as bus_dma(9)
is used unconditionally in iflib(4) since r341095.
- Supply the correct DMA tag and map pairs to bus_dma(9)
functions (see also the commit message of r343753).
- In iflib_netmap_timer_adjust(), add synchronization of the
TX descriptors before calling the ift_txd_credits_update
method as the latter evaluates the TX descriptors possibly
updated by the MAC.
- In _task_fn_tx(), wrap the netmap(4)-specific bits in
#ifdef DEV_NETMAP just as done in _task_fn_admin() and
_task_fn_rx() respectively.
o In iflib_fast_intr_rxtx(), synchronize the TX rather than
the RX descriptors before calling the ift_txd_credits_update
method (see also above).
o There's no need to synchronize an RX buffer that is going to
be recycled in iflib_rxd_pkt_get(), yet; it's sufficient to
do that as late as passing RX buffers to the MAC via the
ift_rxd_refill method. Hence, combine that synchronization
with the synchronization of new buffers into a common spot
in _iflib_fl_refill().
o There's no need to synchronize the RX descriptors of a free
list in preparation of the MAC updating their statuses with
every invocation of rxd_frag_to_sd(); it's enough to do this
once before handing control over to the MAC, i. e. before
calling ift_rxd_flush method in _iflib_fl_refill(), which
already performs the necessary synchronization.
o Given that the ift_rxd_available method evaluates the RX
descriptors which possibly have been altered by the MAC,
synchronize as appropriate beforehand. Most notably this
is now done in iflib_rxd_avail(), which in turn means that
we don't need to issue the same synchronization yet again
before calling the ift_rxd_pkt_get method in iflib_rxeof().
o In iflib_txd_db_check(), synchronize the TX descriptors
before handing them over to the MAC for transmission via
the ift_txd_flush method.
o In iflib_encap(), move the TX buffer synchronization after
the invocation of the ift_txd_encap() method. If the MAC
driver fails to encapsulate the packet and we retry with
a defragmented mbuf chain or finally fail, the cycles for
TX buffer synchronization have been wasted. Synchronizing
afterwards matches what non-iflib(4) drivers typically do
and is sufficient as the MAC will not actually start with
the transmission before - in this case - the ift_txd_flush
method is called.
Moreover, for the latter reason the synchronization of the
TX descriptors in iflib_encap() can go as it's enough to
synchronize them before passing control over to the MAC by
issuing the ift_txd_flush() method (see above).
o In iflib_txq_can_drain(), only synchronize TX descriptors
if the ift_txd_credits_update method accessing these is
actually called.
Differential Revision: https://reviews.freebsd.org/D19081
2019-02-12 21:08:44 +00:00
|
|
|
#endif
|
2018-07-25 22:46:36 +00:00
|
|
|
#ifdef ALTQ
|
|
|
|
if (ALTQ_IS_ENABLED(&ifp->if_snd))
|
|
|
|
iflib_altq_if_start(ifp);
|
|
|
|
#endif
|
2017-03-13 22:53:06 +00:00
|
|
|
if (txq->ift_db_pending)
|
2018-07-20 17:45:26 +00:00
|
|
|
ifmp_ring_enqueue(txq->ift_br, (void **)&txq, 1, TX_BATCH_SIZE, abdicate);
|
|
|
|
else if (!abdicate)
|
|
|
|
ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
|
|
|
|
/*
|
|
|
|
* When abdicating, we always need to check drainage, not just when we don't enqueue
|
|
|
|
*/
|
|
|
|
if (abdicate)
|
|
|
|
ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
|
2020-06-25 19:35:43 +00:00
|
|
|
#ifdef DEV_NETMAP
|
|
|
|
skip_ifmp:
|
|
|
|
#endif
|
2017-03-13 22:53:06 +00:00
|
|
|
if (ctx->ifc_flags & IFC_LEGACY)
|
|
|
|
IFDI_INTR_ENABLE(ctx);
|
o Use iflib_fast_intr_rxtx() also for "legacy" interrupts, i. e. INTx and
MSI. Unlike as with iflib_fast_intr_ctx(), the former will also enqueue
_task_fn_tx() in addition to _task_fn_rx() if appropriate, bringing TCP
TX throughput of EM-class devices on par with the MSI-X case and, thus,
close to wirespeed/pre-iflib(4) times again. [1]
Note that independently of the interrupt type, the UDP performance with
these MACs still is abysmal and nowhere near to where it was before the
conversion of em(4) to iflib(4).
o In iflib_init_locked(), announce which free list failed to set up.
o In _task_fn_tx() when running netmap(4), issue ifdi_intr_enable instead
of the ifdi_tx_queue_intr_enable method in case of a "legacy" interrupt
as the latter is valid with MSI-X only.
o Instead of adding the missing - and apparently convoluted enough that a
DBG_COUNTER_INC was put into a wrong spot in _task_fn_rx() - checks for
ifdi_{r,t}x_queue_intr_enable being available in the MSI-X case also to
iflib_fast_intr_rxtx(), factor these out to iflib_device_register() and
make the checks fail gracefully rather than panic. This avoids invoking
the checks at runtime over and over again in iflib_fast_intr_rxtx() and
_task_fn_{r,t}x() - even if it's just in case of INVARIANTS - and makes
these functions more readable.
o In iflib_rx_structures_setup(), only initialize LRO resources if device
and driver have LRO capability in order to not waste memory. Also, free
the LRO resources again if setting them up fails for one of the queues.
However, don't bother invoking iflib_rx_sds_free() in that case because
iflib_rx_structures_setup() doesn't call iflib_rxsd_alloc() either (and
iflib_{device,pseudo}_register() will issue iflib_rx_sds_free() in case
of failure via iflib_rx_structures_free(), but there definitely is some
asymmetry left to be fixed, though).
o Similarly, free LRO resources again in iflib_rx_structures_free().
o In iflib_irq_set_affinity(), handle get_core_offset() errors gracefully
instead of panicing (but only in case of INVARIANTS). This is a follow-
up to r344132, as such driver bugs shouldn't be fatal.
o Likewise, handle unknown iflib_intr_type_t in iflib_irq_alloc_generic()
gracefully, too.
o Bring yet more sanity to iflib_msix_init():
- If the device doesn't provide enough MSI-X vectors or not all vectors
can be allocate so the expected number of queues in addition to admin
interrupts can't be supported, try MSI next (and then INTx) as proper
MSI-X vector distribution can't be assured in such cases. In essence,
this change brings r254008 forward to iflib(4). Also, this is the fix
alluded to in the commit message of r343934.
- If the MSI-X allocation has failed, don't prematurely announce MSI is
going to be used as the latter in fact may not be available either.
- When falling back to MSI, only release the MSI-X table resource again
if it was allocated in iflib_msix_init(), i. e. isn't supplied by the
driver, in the first place.
o In mp_ndesc_handler(), handle unknown type arguments gracefully, too.
PR: 235031 (likely) [1]
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D20175
2019-05-07 08:28:35 +00:00
|
|
|
else
|
|
|
|
IFDI_TX_QUEUE_INTR_ENABLE(ctx, txq->ift_id);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2016-08-12 21:29:44 +00:00
|
|
|
_task_fn_rx(void *context)
|
2016-05-18 04:35:58 +00:00
|
|
|
{
|
|
|
|
iflib_rxq_t rxq = context;
|
|
|
|
if_ctx_t ctx = rxq->ifr_ctx;
|
2020-02-12 08:30:07 +00:00
|
|
|
uint8_t more;
|
2017-09-23 01:37:01 +00:00
|
|
|
uint16_t budget;
|
2020-06-09 19:15:43 +00:00
|
|
|
#ifdef DEV_NETMAP
|
|
|
|
u_int work = 0;
|
|
|
|
int nmirq;
|
|
|
|
#endif
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2017-01-02 00:56:33 +00:00
|
|
|
#ifdef IFLIB_DIAGNOSTICS
|
|
|
|
rxq->ifr_cpu_exec_count[curcpu]++;
|
|
|
|
#endif
|
2016-05-18 04:35:58 +00:00
|
|
|
DBG_COUNTER_INC(task_fn_rxs);
|
|
|
|
if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)))
|
|
|
|
return;
|
2017-09-20 20:40:49 +00:00
|
|
|
#ifdef DEV_NETMAP
|
2020-06-09 19:15:43 +00:00
|
|
|
nmirq = netmap_rx_irq(ctx->ifc_ifp, rxq->ifr_id, &work);
|
|
|
|
if (nmirq != NM_IRQ_PASS) {
|
|
|
|
more = (nmirq == NM_IRQ_RESCHED) ? IFLIB_RXEOF_MORE : 0;
|
|
|
|
goto skip_rxeof;
|
2017-09-20 20:40:49 +00:00
|
|
|
}
|
|
|
|
#endif
|
2017-09-23 01:37:01 +00:00
|
|
|
budget = ctx->ifc_sysctl_rx_budget;
|
|
|
|
if (budget == 0)
|
|
|
|
budget = 16; /* XXX */
|
2020-02-12 08:30:07 +00:00
|
|
|
more = iflib_rxeof(rxq, budget);
|
|
|
|
#ifdef DEV_NETMAP
|
|
|
|
skip_rxeof:
|
|
|
|
#endif
|
|
|
|
if ((more & IFLIB_RXEOF_MORE) == 0) {
|
2016-05-18 04:35:58 +00:00
|
|
|
if (ctx->ifc_flags & IFC_LEGACY)
|
|
|
|
IFDI_INTR_ENABLE(ctx);
|
o Use iflib_fast_intr_rxtx() also for "legacy" interrupts, i. e. INTx and
MSI. Unlike as with iflib_fast_intr_ctx(), the former will also enqueue
_task_fn_tx() in addition to _task_fn_rx() if appropriate, bringing TCP
TX throughput of EM-class devices on par with the MSI-X case and, thus,
close to wirespeed/pre-iflib(4) times again. [1]
Note that independently of the interrupt type, the UDP performance with
these MACs still is abysmal and nowhere near to where it was before the
conversion of em(4) to iflib(4).
o In iflib_init_locked(), announce which free list failed to set up.
o In _task_fn_tx() when running netmap(4), issue ifdi_intr_enable instead
of the ifdi_tx_queue_intr_enable method in case of a "legacy" interrupt
as the latter is valid with MSI-X only.
o Instead of adding the missing - and apparently convoluted enough that a
DBG_COUNTER_INC was put into a wrong spot in _task_fn_rx() - checks for
ifdi_{r,t}x_queue_intr_enable being available in the MSI-X case also to
iflib_fast_intr_rxtx(), factor these out to iflib_device_register() and
make the checks fail gracefully rather than panic. This avoids invoking
the checks at runtime over and over again in iflib_fast_intr_rxtx() and
_task_fn_{r,t}x() - even if it's just in case of INVARIANTS - and makes
these functions more readable.
o In iflib_rx_structures_setup(), only initialize LRO resources if device
and driver have LRO capability in order to not waste memory. Also, free
the LRO resources again if setting them up fails for one of the queues.
However, don't bother invoking iflib_rx_sds_free() in that case because
iflib_rx_structures_setup() doesn't call iflib_rxsd_alloc() either (and
iflib_{device,pseudo}_register() will issue iflib_rx_sds_free() in case
of failure via iflib_rx_structures_free(), but there definitely is some
asymmetry left to be fixed, though).
o Similarly, free LRO resources again in iflib_rx_structures_free().
o In iflib_irq_set_affinity(), handle get_core_offset() errors gracefully
instead of panicing (but only in case of INVARIANTS). This is a follow-
up to r344132, as such driver bugs shouldn't be fatal.
o Likewise, handle unknown iflib_intr_type_t in iflib_irq_alloc_generic()
gracefully, too.
o Bring yet more sanity to iflib_msix_init():
- If the device doesn't provide enough MSI-X vectors or not all vectors
can be allocate so the expected number of queues in addition to admin
interrupts can't be supported, try MSI next (and then INTx) as proper
MSI-X vector distribution can't be assured in such cases. In essence,
this change brings r254008 forward to iflib(4). Also, this is the fix
alluded to in the commit message of r343934.
- If the MSI-X allocation has failed, don't prematurely announce MSI is
going to be used as the latter in fact may not be available either.
- When falling back to MSI, only release the MSI-X table resource again
if it was allocated in iflib_msix_init(), i. e. isn't supplied by the
driver, in the first place.
o In mp_ndesc_handler(), handle unknown type arguments gracefully, too.
PR: 235031 (likely) [1]
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D20175
2019-05-07 08:28:35 +00:00
|
|
|
else
|
|
|
|
IFDI_RX_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id);
|
|
|
|
DBG_COUNTER_INC(rx_intr_enables);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)))
|
|
|
|
return;
|
2020-02-12 08:30:07 +00:00
|
|
|
|
|
|
|
if (more & IFLIB_RXEOF_MORE)
|
2016-05-18 04:35:58 +00:00
|
|
|
GROUPTASK_ENQUEUE(&rxq->ifr_task);
|
2020-02-12 08:30:07 +00:00
|
|
|
else if (more & IFLIB_RXEOF_EMPTY)
|
|
|
|
callout_reset_curcpu(&rxq->ifr_watchdog, 1, &_task_fn_rx_watchdog, rxq);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2016-08-12 21:29:44 +00:00
|
|
|
_task_fn_admin(void *context)
|
2016-05-18 04:35:58 +00:00
|
|
|
{
|
|
|
|
if_ctx_t ctx = context;
|
|
|
|
if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
|
|
|
|
iflib_txq_t txq;
|
2017-09-16 02:41:38 +00:00
|
|
|
int i;
|
2018-10-12 22:40:54 +00:00
|
|
|
bool oactive, running, do_reset, do_watchdog, in_detach;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2018-04-12 14:35:37 +00:00
|
|
|
STATE_LOCK(ctx);
|
|
|
|
running = (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING);
|
|
|
|
oactive = (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE);
|
|
|
|
do_reset = (ctx->ifc_flags & IFC_DO_RESET);
|
|
|
|
do_watchdog = (ctx->ifc_flags & IFC_DO_WATCHDOG);
|
2018-10-12 22:40:54 +00:00
|
|
|
in_detach = (ctx->ifc_flags & IFC_IN_DETACH);
|
2018-04-12 14:35:37 +00:00
|
|
|
ctx->ifc_flags &= ~(IFC_DO_RESET|IFC_DO_WATCHDOG);
|
|
|
|
STATE_UNLOCK(ctx);
|
|
|
|
|
2018-10-12 22:40:54 +00:00
|
|
|
if ((!running && !oactive) && !(ctx->ifc_sctx->isc_flags & IFLIB_ADMIN_ALWAYS_RUN))
|
|
|
|
return;
|
|
|
|
if (in_detach)
|
2018-04-12 14:35:37 +00:00
|
|
|
return;
|
2017-09-13 01:18:42 +00:00
|
|
|
|
2017-09-16 02:41:38 +00:00
|
|
|
CTX_LOCK(ctx);
|
2016-05-18 04:35:58 +00:00
|
|
|
for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) {
|
|
|
|
CALLOUT_LOCK(txq);
|
|
|
|
callout_stop(&txq->ift_timer);
|
|
|
|
CALLOUT_UNLOCK(txq);
|
|
|
|
}
|
2021-03-02 23:24:29 +00:00
|
|
|
if (ctx->ifc_sctx->isc_flags & IFLIB_HAS_ADMINCQ)
|
|
|
|
IFDI_ADMIN_COMPLETION_HANDLE(ctx);
|
2018-04-12 14:35:37 +00:00
|
|
|
if (do_watchdog) {
|
|
|
|
ctx->ifc_watchdog_events++;
|
|
|
|
IFDI_WATCHDOG_RESET(ctx);
|
|
|
|
}
|
2017-09-16 02:41:38 +00:00
|
|
|
IFDI_UPDATE_ADMIN_STATUS(ctx);
|
2018-07-20 17:24:45 +00:00
|
|
|
for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) {
|
2020-12-19 01:08:33 +00:00
|
|
|
callout_reset_on(&txq->ift_timer, iflib_timer_default, iflib_timer, txq,
|
2020-10-27 21:53:33 +00:00
|
|
|
txq->ift_timer.c_cpu);
|
2018-07-20 17:24:45 +00:00
|
|
|
}
|
2017-09-16 02:41:38 +00:00
|
|
|
IFDI_LINK_INTR_ENABLE(ctx);
|
2018-04-12 14:35:37 +00:00
|
|
|
if (do_reset)
|
2017-09-16 02:41:38 +00:00
|
|
|
iflib_if_init_locked(ctx);
|
2016-05-18 04:35:58 +00:00
|
|
|
CTX_UNLOCK(ctx);
|
|
|
|
|
2017-09-16 02:41:38 +00:00
|
|
|
if (LINK_ACTIVE(ctx) == 0)
|
2016-05-18 04:35:58 +00:00
|
|
|
return;
|
|
|
|
for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++)
|
|
|
|
iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2016-08-12 21:29:44 +00:00
|
|
|
_task_fn_iov(void *context)
|
2016-05-18 04:35:58 +00:00
|
|
|
{
|
|
|
|
if_ctx_t ctx = context;
|
|
|
|
|
2018-10-12 22:40:54 +00:00
|
|
|
if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING) &&
|
|
|
|
!(ctx->ifc_sctx->isc_flags & IFLIB_ADMIN_ALWAYS_RUN))
|
2016-05-18 04:35:58 +00:00
|
|
|
return;
|
|
|
|
|
|
|
|
CTX_LOCK(ctx);
|
|
|
|
IFDI_VFLR_HANDLE(ctx);
|
|
|
|
CTX_UNLOCK(ctx);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
iflib_sysctl_int_delay(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
|
|
|
int err;
|
|
|
|
if_int_delay_info_t info;
|
|
|
|
if_ctx_t ctx;
|
|
|
|
|
|
|
|
info = (if_int_delay_info_t)arg1;
|
|
|
|
ctx = info->iidi_ctx;
|
|
|
|
info->iidi_req = req;
|
|
|
|
info->iidi_oidp = oidp;
|
|
|
|
CTX_LOCK(ctx);
|
|
|
|
err = IFDI_SYSCTL_INT_DELAY(ctx, info);
|
|
|
|
CTX_UNLOCK(ctx);
|
|
|
|
return (err);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*********************************************************************
|
|
|
|
*
|
|
|
|
* IFNET FUNCTIONS
|
|
|
|
*
|
|
|
|
**********************************************************************/
|
|
|
|
|
|
|
|
static void
|
|
|
|
iflib_if_init_locked(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
iflib_stop(ctx);
|
|
|
|
iflib_init_locked(ctx);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
iflib_if_init(void *arg)
|
|
|
|
{
|
|
|
|
if_ctx_t ctx = arg;
|
|
|
|
|
|
|
|
CTX_LOCK(ctx);
|
|
|
|
iflib_if_init_locked(ctx);
|
|
|
|
CTX_UNLOCK(ctx);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
iflib_if_transmit(if_t ifp, struct mbuf *m)
|
|
|
|
{
|
|
|
|
if_ctx_t ctx = if_getsoftc(ifp);
|
|
|
|
|
|
|
|
iflib_txq_t txq;
|
2016-08-12 21:29:44 +00:00
|
|
|
int err, qidx;
|
2018-07-20 17:45:26 +00:00
|
|
|
int abdicate = ctx->ifc_sysctl_tx_abdicate;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
if (__predict_false((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || !LINK_ACTIVE(ctx))) {
|
|
|
|
DBG_COUNTER_INC(tx_frees);
|
|
|
|
m_freem(m);
|
2019-03-28 20:46:45 +00:00
|
|
|
return (ENETDOWN);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
2016-08-12 21:29:44 +00:00
|
|
|
MPASS(m->m_nextpkt == NULL);
|
2018-07-25 22:46:36 +00:00
|
|
|
/* ALTQ-enabled interfaces always use queue 0. */
|
2016-05-18 04:35:58 +00:00
|
|
|
qidx = 0;
|
2018-07-25 22:46:36 +00:00
|
|
|
if ((NTXQSETS(ctx) > 1) && M_HASHTYPE_GET(m) && !ALTQ_IS_ENABLED(&ifp->if_snd))
|
2016-05-18 04:35:58 +00:00
|
|
|
qidx = QIDX(ctx, m);
|
|
|
|
/*
|
|
|
|
* XXX calculate buf_ring based on flowid (divvy up bits?)
|
|
|
|
*/
|
|
|
|
txq = &ctx->ifc_txqs[qidx];
|
|
|
|
|
|
|
|
#ifdef DRIVER_BACKPRESSURE
|
|
|
|
if (txq->ift_closed) {
|
|
|
|
while (m != NULL) {
|
|
|
|
next = m->m_nextpkt;
|
|
|
|
m->m_nextpkt = NULL;
|
|
|
|
m_freem(m);
|
2018-09-06 18:51:52 +00:00
|
|
|
DBG_COUNTER_INC(tx_frees);
|
2016-05-18 04:35:58 +00:00
|
|
|
m = next;
|
|
|
|
}
|
|
|
|
return (ENOBUFS);
|
|
|
|
}
|
|
|
|
#endif
|
2016-08-12 21:29:44 +00:00
|
|
|
#ifdef notyet
|
2016-05-18 04:35:58 +00:00
|
|
|
qidx = count = 0;
|
|
|
|
mp = marr;
|
|
|
|
next = m;
|
|
|
|
do {
|
|
|
|
count++;
|
|
|
|
next = next->m_nextpkt;
|
|
|
|
} while (next != NULL);
|
|
|
|
|
2016-06-07 19:49:08 +00:00
|
|
|
if (count > nitems(marr))
|
2016-05-18 04:35:58 +00:00
|
|
|
if ((mp = malloc(count*sizeof(struct mbuf *), M_IFLIB, M_NOWAIT)) == NULL) {
|
|
|
|
/* XXX check nextpkt */
|
|
|
|
m_freem(m);
|
|
|
|
/* XXX simplify for now */
|
|
|
|
DBG_COUNTER_INC(tx_frees);
|
|
|
|
return (ENOBUFS);
|
|
|
|
}
|
|
|
|
for (next = m, i = 0; next != NULL; i++) {
|
|
|
|
mp[i] = next;
|
|
|
|
next = next->m_nextpkt;
|
|
|
|
mp[i]->m_nextpkt = NULL;
|
|
|
|
}
|
2016-08-12 21:29:44 +00:00
|
|
|
#endif
|
2016-05-18 04:35:58 +00:00
|
|
|
DBG_COUNTER_INC(tx_seen);
|
2018-07-20 17:45:26 +00:00
|
|
|
err = ifmp_ring_enqueue(txq->ift_br, (void **)&m, 1, TX_BATCH_SIZE, abdicate);
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2018-07-20 17:45:26 +00:00
|
|
|
if (abdicate)
|
|
|
|
GROUPTASK_ENQUEUE(&txq->ift_task);
|
|
|
|
if (err) {
|
|
|
|
if (!abdicate)
|
|
|
|
GROUPTASK_ENQUEUE(&txq->ift_task);
|
2016-05-18 04:35:58 +00:00
|
|
|
/* support forthcoming later */
|
|
|
|
#ifdef DRIVER_BACKPRESSURE
|
|
|
|
txq->ift_closed = TRUE;
|
|
|
|
#endif
|
2017-03-13 22:53:06 +00:00
|
|
|
ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
|
2016-08-12 21:29:44 +00:00
|
|
|
m_freem(m);
|
2018-09-06 18:51:52 +00:00
|
|
|
DBG_COUNTER_INC(tx_frees);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return (err);
|
|
|
|
}
|
|
|
|
|
2018-07-25 22:46:36 +00:00
|
|
|
#ifdef ALTQ
|
|
|
|
/*
|
|
|
|
* The overall approach to integrating iflib with ALTQ is to continue to use
|
|
|
|
* the iflib mp_ring machinery between the ALTQ queue(s) and the hardware
|
|
|
|
* ring. Technically, when using ALTQ, queueing to an intermediate mp_ring
|
|
|
|
* is redundant/unnecessary, but doing so minimizes the amount of
|
|
|
|
* ALTQ-specific code required in iflib. It is assumed that the overhead of
|
|
|
|
* redundantly queueing to an intermediate mp_ring is swamped by the
|
|
|
|
* performance limitations inherent in using ALTQ.
|
|
|
|
*
|
|
|
|
* When ALTQ support is compiled in, all iflib drivers will use a transmit
|
|
|
|
* routine, iflib_altq_if_transmit(), that checks if ALTQ is enabled for the
|
|
|
|
* given interface. If ALTQ is enabled for an interface, then all
|
|
|
|
* transmitted packets for that interface will be submitted to the ALTQ
|
|
|
|
* subsystem via IFQ_ENQUEUE(). We don't use the legacy if_transmit()
|
|
|
|
* implementation because it uses IFQ_HANDOFF(), which will duplicatively
|
|
|
|
* update stats that the iflib machinery handles, and which is sensitive to
|
|
|
|
* the disused IFF_DRV_OACTIVE flag. Additionally, iflib_altq_if_start()
|
|
|
|
* will be installed as the start routine for use by ALTQ facilities that
|
|
|
|
* need to trigger queue drains on a scheduled basis.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
iflib_altq_if_start(if_t ifp)
|
|
|
|
{
|
|
|
|
struct ifaltq *ifq = &ifp->if_snd;
|
|
|
|
struct mbuf *m;
|
2020-09-01 21:19:14 +00:00
|
|
|
|
2018-07-25 22:46:36 +00:00
|
|
|
IFQ_LOCK(ifq);
|
|
|
|
IFQ_DEQUEUE_NOLOCK(ifq, m);
|
|
|
|
while (m != NULL) {
|
|
|
|
iflib_if_transmit(ifp, m);
|
|
|
|
IFQ_DEQUEUE_NOLOCK(ifq, m);
|
|
|
|
}
|
|
|
|
IFQ_UNLOCK(ifq);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
iflib_altq_if_transmit(if_t ifp, struct mbuf *m)
|
|
|
|
{
|
|
|
|
int err;
|
|
|
|
|
|
|
|
if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
|
|
|
|
IFQ_ENQUEUE(&ifp->if_snd, m, err);
|
|
|
|
if (err == 0)
|
|
|
|
iflib_altq_if_start(ifp);
|
|
|
|
} else
|
|
|
|
err = iflib_if_transmit(ifp, m);
|
|
|
|
|
|
|
|
return (err);
|
|
|
|
}
|
|
|
|
#endif /* ALTQ */
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
static void
|
|
|
|
iflib_if_qflush(if_t ifp)
|
|
|
|
{
|
|
|
|
if_ctx_t ctx = if_getsoftc(ifp);
|
|
|
|
iflib_txq_t txq = ctx->ifc_txqs;
|
|
|
|
int i;
|
|
|
|
|
2018-04-12 14:35:37 +00:00
|
|
|
STATE_LOCK(ctx);
|
2016-05-18 04:35:58 +00:00
|
|
|
ctx->ifc_flags |= IFC_QFLUSH;
|
2018-04-12 14:35:37 +00:00
|
|
|
STATE_UNLOCK(ctx);
|
2016-05-18 04:35:58 +00:00
|
|
|
for (i = 0; i < NTXQSETS(ctx); i++, txq++)
|
2017-03-13 22:53:06 +00:00
|
|
|
while (!(ifmp_ring_is_idle(txq->ift_br) || ifmp_ring_is_stalled(txq->ift_br)))
|
2016-05-18 04:35:58 +00:00
|
|
|
iflib_txq_check_drain(txq, 0);
|
2018-04-12 14:35:37 +00:00
|
|
|
STATE_LOCK(ctx);
|
2016-05-18 04:35:58 +00:00
|
|
|
ctx->ifc_flags &= ~IFC_QFLUSH;
|
2018-04-12 14:35:37 +00:00
|
|
|
STATE_UNLOCK(ctx);
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2018-07-25 22:46:36 +00:00
|
|
|
/*
|
|
|
|
* When ALTQ is enabled, this will also take care of purging the
|
|
|
|
* ALTQ queue(s).
|
|
|
|
*/
|
2016-05-18 04:35:58 +00:00
|
|
|
if_qflush(ifp);
|
|
|
|
}
|
|
|
|
|
2018-09-20 19:35:35 +00:00
|
|
|
/* Capability bits that the SIOCSIFCAP handler masks requests against. */
#define	IFCAP_FLAGS							\
	(IFCAP_HWCSUM_IPV6 | IFCAP_HWCSUM | IFCAP_LRO | IFCAP_TSO |	\
	 IFCAP_VLAN_HWTAGGING | IFCAP_HWSTATS | IFCAP_VLAN_MTU |	\
	 IFCAP_VLAN_HWFILTER | IFCAP_VLAN_HWTSO | IFCAP_VLAN_HWCSUM |	\
	 IFCAP_MEXTPG)
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
static int
|
|
|
|
iflib_if_ioctl(if_t ifp, u_long command, caddr_t data)
|
|
|
|
{
|
|
|
|
if_ctx_t ctx = if_getsoftc(ifp);
|
|
|
|
struct ifreq *ifr = (struct ifreq *)data;
|
|
|
|
#if defined(INET) || defined(INET6)
|
|
|
|
struct ifaddr *ifa = (struct ifaddr *)data;
|
|
|
|
#endif
|
2019-05-06 20:56:41 +00:00
|
|
|
bool avoid_reset = false;
|
2016-05-18 04:35:58 +00:00
|
|
|
int err = 0, reinit = 0, bits;
|
|
|
|
|
|
|
|
switch (command) {
|
|
|
|
case SIOCSIFADDR:
|
|
|
|
#ifdef INET
|
|
|
|
if (ifa->ifa_addr->sa_family == AF_INET)
|
2019-05-06 20:56:41 +00:00
|
|
|
avoid_reset = true;
|
2016-05-18 04:35:58 +00:00
|
|
|
#endif
|
|
|
|
#ifdef INET6
|
|
|
|
if (ifa->ifa_addr->sa_family == AF_INET6)
|
2019-05-06 20:56:41 +00:00
|
|
|
avoid_reset = true;
|
2016-05-18 04:35:58 +00:00
|
|
|
#endif
|
|
|
|
/*
|
|
|
|
** Calling init results in link renegotiation,
|
|
|
|
** so we avoid doing it when possible.
|
|
|
|
*/
|
|
|
|
if (avoid_reset) {
|
|
|
|
if_setflagbits(ifp, IFF_UP,0);
|
2018-06-18 17:27:43 +00:00
|
|
|
if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING))
|
2016-05-18 04:35:58 +00:00
|
|
|
reinit = 1;
|
|
|
|
#ifdef INET
|
|
|
|
if (!(if_getflags(ifp) & IFF_NOARP))
|
|
|
|
arp_ifinit(ifp, ifa);
|
|
|
|
#endif
|
|
|
|
} else
|
|
|
|
err = ether_ioctl(ifp, command, data);
|
|
|
|
break;
|
|
|
|
case SIOCSIFMTU:
|
|
|
|
CTX_LOCK(ctx);
|
|
|
|
if (ifr->ifr_mtu == if_getmtu(ifp)) {
|
|
|
|
CTX_UNLOCK(ctx);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
bits = if_getdrvflags(ifp);
|
|
|
|
/* stop the driver and free any clusters before proceeding */
|
|
|
|
iflib_stop(ctx);
|
|
|
|
|
|
|
|
if ((err = IFDI_MTU_SET(ctx, ifr->ifr_mtu)) == 0) {
|
2018-04-12 14:35:37 +00:00
|
|
|
STATE_LOCK(ctx);
|
2016-05-18 04:35:58 +00:00
|
|
|
if (ifr->ifr_mtu > ctx->ifc_max_fl_buf_size)
|
|
|
|
ctx->ifc_flags |= IFC_MULTISEG;
|
|
|
|
else
|
|
|
|
ctx->ifc_flags &= ~IFC_MULTISEG;
|
2018-04-12 14:35:37 +00:00
|
|
|
STATE_UNLOCK(ctx);
|
2016-05-18 04:35:58 +00:00
|
|
|
err = if_setmtu(ifp, ifr->ifr_mtu);
|
|
|
|
}
|
|
|
|
iflib_init_locked(ctx);
|
2018-04-12 14:35:37 +00:00
|
|
|
STATE_LOCK(ctx);
|
2016-05-18 04:35:58 +00:00
|
|
|
if_setdrvflags(ifp, bits);
|
2018-04-12 14:35:37 +00:00
|
|
|
STATE_UNLOCK(ctx);
|
2016-05-18 04:35:58 +00:00
|
|
|
CTX_UNLOCK(ctx);
|
|
|
|
break;
|
|
|
|
case SIOCSIFFLAGS:
|
2017-09-16 02:41:38 +00:00
|
|
|
CTX_LOCK(ctx);
|
|
|
|
if (if_getflags(ifp) & IFF_UP) {
|
|
|
|
if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
|
|
|
|
if ((if_getflags(ifp) ^ ctx->ifc_if_flags) &
|
|
|
|
(IFF_PROMISC | IFF_ALLMULTI)) {
|
2020-07-27 01:17:59 +00:00
|
|
|
CTX_UNLOCK(ctx);
|
2017-09-16 02:41:38 +00:00
|
|
|
err = IFDI_PROMISC_SET(ctx, if_getflags(ifp));
|
2020-07-27 01:17:59 +00:00
|
|
|
CTX_LOCK(ctx);
|
2017-09-16 02:41:38 +00:00
|
|
|
}
|
|
|
|
} else
|
|
|
|
reinit = 1;
|
|
|
|
} else if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
|
|
|
|
iflib_stop(ctx);
|
|
|
|
}
|
|
|
|
ctx->ifc_if_flags = if_getflags(ifp);
|
|
|
|
CTX_UNLOCK(ctx);
|
2016-05-18 04:35:58 +00:00
|
|
|
break;
|
|
|
|
case SIOCADDMULTI:
|
|
|
|
case SIOCDELMULTI:
|
|
|
|
if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
|
2017-09-16 02:41:38 +00:00
|
|
|
CTX_LOCK(ctx);
|
|
|
|
IFDI_INTR_DISABLE(ctx);
|
|
|
|
IFDI_MULTI_SET(ctx);
|
|
|
|
IFDI_INTR_ENABLE(ctx);
|
|
|
|
CTX_UNLOCK(ctx);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
case SIOCSIFMEDIA:
|
|
|
|
CTX_LOCK(ctx);
|
|
|
|
IFDI_MEDIA_SET(ctx);
|
|
|
|
CTX_UNLOCK(ctx);
|
2019-05-06 20:56:41 +00:00
|
|
|
/* FALLTHROUGH */
|
2016-05-18 04:35:58 +00:00
|
|
|
case SIOCGIFMEDIA:
|
2017-12-01 17:58:20 +00:00
|
|
|
case SIOCGIFXMEDIA:
|
2019-05-03 20:05:31 +00:00
|
|
|
err = ifmedia_ioctl(ifp, ifr, ctx->ifc_mediap, command);
|
2016-05-18 04:35:58 +00:00
|
|
|
break;
|
|
|
|
case SIOCGI2C:
|
|
|
|
{
|
|
|
|
struct ifi2creq i2c;
|
|
|
|
|
2018-03-30 18:50:13 +00:00
|
|
|
err = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
|
2016-05-18 04:35:58 +00:00
|
|
|
if (err != 0)
|
|
|
|
break;
|
|
|
|
if (i2c.dev_addr != 0xA0 && i2c.dev_addr != 0xA2) {
|
|
|
|
err = EINVAL;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (i2c.len > sizeof(i2c.data)) {
|
|
|
|
err = EINVAL;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((err = IFDI_I2C_REQ(ctx, &i2c)) == 0)
|
2018-03-30 18:50:13 +00:00
|
|
|
err = copyout(&i2c, ifr_data_get_ptr(ifr),
|
|
|
|
sizeof(i2c));
|
2016-05-18 04:35:58 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
case SIOCSIFCAP:
|
|
|
|
{
|
2018-09-20 19:35:35 +00:00
|
|
|
int mask, setmask, oldmask;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2018-09-20 19:35:35 +00:00
|
|
|
oldmask = if_getcapenable(ifp);
|
|
|
|
mask = ifr->ifr_reqcap ^ oldmask;
|
2021-01-28 21:08:48 +00:00
|
|
|
mask &= ctx->ifc_softc_ctx.isc_capabilities | IFCAP_MEXTPG;
|
2016-05-18 04:35:58 +00:00
|
|
|
setmask = 0;
|
|
|
|
#ifdef TCP_OFFLOAD
|
|
|
|
setmask |= mask & (IFCAP_TOE4|IFCAP_TOE6);
|
|
|
|
#endif
|
|
|
|
setmask |= (mask & IFCAP_FLAGS);
|
2018-09-20 19:35:35 +00:00
|
|
|
setmask |= (mask & IFCAP_WOL);
|
|
|
|
|
|
|
|
/*
|
2018-11-07 19:31:48 +00:00
|
|
|
* If any RX csum has changed, change all the ones that
|
|
|
|
* are supported by the driver.
|
2018-09-20 19:35:35 +00:00
|
|
|
*/
|
2018-11-07 19:31:48 +00:00
|
|
|
if (setmask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) {
|
|
|
|
setmask |= ctx->ifc_softc_ctx.isc_capabilities &
|
|
|
|
(IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6);
|
|
|
|
}
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* want to ensure that traffic has stopped before we change any of the flags
|
|
|
|
*/
|
|
|
|
if (setmask) {
|
|
|
|
CTX_LOCK(ctx);
|
|
|
|
bits = if_getdrvflags(ifp);
|
2018-09-20 19:35:35 +00:00
|
|
|
if (bits & IFF_DRV_RUNNING && setmask & ~IFCAP_WOL)
|
2016-05-18 04:35:58 +00:00
|
|
|
iflib_stop(ctx);
|
2018-04-12 14:35:37 +00:00
|
|
|
STATE_LOCK(ctx);
|
2016-05-18 04:35:58 +00:00
|
|
|
if_togglecapenable(ifp, setmask);
|
2021-12-28 10:47:13 +00:00
|
|
|
ctx->ifc_softc_ctx.isc_capenable ^= setmask;
|
2018-04-12 14:35:37 +00:00
|
|
|
STATE_UNLOCK(ctx);
|
2018-09-20 19:35:35 +00:00
|
|
|
if (bits & IFF_DRV_RUNNING && setmask & ~IFCAP_WOL)
|
2016-05-18 04:35:58 +00:00
|
|
|
iflib_init_locked(ctx);
|
2018-04-12 14:35:37 +00:00
|
|
|
STATE_LOCK(ctx);
|
2016-05-18 04:35:58 +00:00
|
|
|
if_setdrvflags(ifp, bits);
|
2018-04-12 14:35:37 +00:00
|
|
|
STATE_UNLOCK(ctx);
|
2016-05-18 04:35:58 +00:00
|
|
|
CTX_UNLOCK(ctx);
|
|
|
|
}
|
2018-09-20 19:35:35 +00:00
|
|
|
if_vlancap(ifp);
|
2016-05-18 04:35:58 +00:00
|
|
|
break;
|
2018-06-18 17:27:43 +00:00
|
|
|
}
|
2016-05-18 04:35:58 +00:00
|
|
|
case SIOCGPRIVATE_0:
|
|
|
|
case SIOCSDRVSPEC:
|
|
|
|
case SIOCGDRVSPEC:
|
|
|
|
CTX_LOCK(ctx);
|
|
|
|
err = IFDI_PRIV_IOCTL(ctx, command, data);
|
|
|
|
CTX_UNLOCK(ctx);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
err = ether_ioctl(ifp, command, data);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (reinit)
|
|
|
|
iflib_if_init(ctx);
|
|
|
|
return (err);
|
|
|
|
}
|
|
|
|
|
|
|
|
static uint64_t
|
|
|
|
iflib_if_get_counter(if_t ifp, ift_counter cnt)
|
|
|
|
{
|
|
|
|
if_ctx_t ctx = if_getsoftc(ifp);
|
|
|
|
|
|
|
|
return (IFDI_GET_COUNTER(ctx, cnt));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*********************************************************************
|
|
|
|
*
|
|
|
|
* OTHER FUNCTIONS EXPORTED TO THE STACK
|
|
|
|
*
|
|
|
|
**********************************************************************/
|
|
|
|
|
|
|
|
static void
|
|
|
|
iflib_vlan_register(void *arg, if_t ifp, uint16_t vtag)
|
|
|
|
{
|
|
|
|
if_ctx_t ctx = if_getsoftc(ifp);
|
|
|
|
|
|
|
|
if ((void *)ctx != arg)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if ((vtag == 0) || (vtag > 4095))
|
|
|
|
return;
|
|
|
|
|
2019-09-24 17:03:31 +00:00
|
|
|
if (iflib_in_detach(ctx))
|
|
|
|
return;
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
CTX_LOCK(ctx);
|
iflib: Stop interface before (un)registering VLAN
This patch is intended to solve a specific problem that iavf(4)
encounters, but what it does can be extended to solve other issues.
To summarize the iavf(4) issue, if the PF driver configures VLAN
anti-spoof, then the VF driver needs to make sure no untagged traffic is
sent if a VLAN is configured, and vice-versa. This can be an issue when
a VLAN is being registered or unregistered, e.g. when a packet may be on
the ring with a VLAN in it, but the VLANs are being unregistered. This
can cause that tagged packet to go out and cause an MDD event.
To fix this, include a new interface-dependent function that drivers can
implement named IFDI_NEEDS_RESTART(). Right now, this function is called
in iflib_vlan_unregister/register() to determine whether the interface
needs to be stopped and started when a VLAN is registered or
unregistered. The default return value of IFDI_NEEDS_RESTART() is true,
so this fixes the MDD problem that iavf(4) encounters, since the
interface rings are flushed during a stop/init.
A future change to iavf(4) will implement that function just in case the
default value changes, and to make it explicit that this interface reset
is required when a VLAN is added or removed.
Reviewed by: gallatin@
MFC after: 1 week
Sponsored by: Intel Corporation
Differential Revision: https://reviews.freebsd.org/D22086
2020-04-27 22:02:44 +00:00
|
|
|
/* Driver may need all untagged packets to be flushed */
|
|
|
|
if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VLAN_CONFIG))
|
|
|
|
iflib_stop(ctx);
|
2016-05-18 04:35:58 +00:00
|
|
|
IFDI_VLAN_REGISTER(ctx, vtag);
|
iflib: Stop interface before (un)registering VLAN
This patch is intended to solve a specific problem that iavf(4)
encounters, but what it does can be extended to solve other issues.
To summarize the iavf(4) issue, if the PF driver configures VLAN
anti-spoof, then the VF driver needs to make sure no untagged traffic is
sent if a VLAN is configured, and vice-versa. This can be an issue when
a VLAN is being registered or unregistered, e.g. when a packet may be on
the ring with a VLAN in it, but the VLANs are being unregistered. This
can cause that tagged packet to go out and cause an MDD event.
To fix this, include a new interface-dependent function that drivers can
implement named IFDI_NEEDS_RESTART(). Right now, this function is called
in iflib_vlan_unregister/register() to determine whether the interface
needs to be stopped and started when a VLAN is registered or
unregistered. The default return value of IFDI_NEEDS_RESTART() is true,
so this fixes the MDD problem that iavf(4) encounters, since the
interface rings are flushed during a stop/init.
A future change to iavf(4) will implement that function just in case the
default value changes, and to make it explicit that this interface reset
is required when a VLAN is added or removed.
Reviewed by: gallatin@
MFC after: 1 week
Sponsored by: Intel Corporation
Differential Revision: https://reviews.freebsd.org/D22086
2020-04-27 22:02:44 +00:00
|
|
|
/* Re-init to load the changes, if required */
|
|
|
|
if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VLAN_CONFIG))
|
|
|
|
iflib_init_locked(ctx);
|
2016-05-18 04:35:58 +00:00
|
|
|
CTX_UNLOCK(ctx);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
iflib_vlan_unregister(void *arg, if_t ifp, uint16_t vtag)
|
|
|
|
{
|
|
|
|
if_ctx_t ctx = if_getsoftc(ifp);
|
|
|
|
|
|
|
|
if ((void *)ctx != arg)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if ((vtag == 0) || (vtag > 4095))
|
|
|
|
return;
|
|
|
|
|
|
|
|
CTX_LOCK(ctx);
|
iflib: Stop interface before (un)registering VLAN
This patch is intended to solve a specific problem that iavf(4)
encounters, but what it does can be extended to solve other issues.
To summarize the iavf(4) issue, if the PF driver configures VLAN
anti-spoof, then the VF driver needs to make sure no untagged traffic is
sent if a VLAN is configured, and vice-versa. This can be an issue when
a VLAN is being registered or unregistered, e.g. when a packet may be on
the ring with a VLAN in it, but the VLANs are being unregistered. This
can cause that tagged packet to go out and cause an MDD event.
To fix this, include a new interface-dependent function that drivers can
implement named IFDI_NEEDS_RESTART(). Right now, this function is called
in iflib_vlan_unregister/register() to determine whether the interface
needs to be stopped and started when a VLAN is registered or
unregistered. The default return value of IFDI_NEEDS_RESTART() is true,
so this fixes the MDD problem that iavf(4) encounters, since the
interface rings are flushed during a stop/init.
A future change to iavf(4) will implement that function just in case the
default value changes, and to make it explicit that this interface reset
is required when a VLAN is added or removed.
Reviewed by: gallatin@
MFC after: 1 week
Sponsored by: Intel Corporation
Differential Revision: https://reviews.freebsd.org/D22086
2020-04-27 22:02:44 +00:00
|
|
|
/* Driver may need all tagged packets to be flushed */
|
|
|
|
if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VLAN_CONFIG))
|
|
|
|
iflib_stop(ctx);
|
2016-05-18 04:35:58 +00:00
|
|
|
IFDI_VLAN_UNREGISTER(ctx, vtag);
|
iflib: Stop interface before (un)registering VLAN
This patch is intended to solve a specific problem that iavf(4)
encounters, but what it does can be extended to solve other issues.
To summarize the iavf(4) issue, if the PF driver configures VLAN
anti-spoof, then the VF driver needs to make sure no untagged traffic is
sent if a VLAN is configured, and vice-versa. This can be an issue when
a VLAN is being registered or unregistered, e.g. when a packet may be on
the ring with a VLAN in it, but the VLANs are being unregistered. This
can cause that tagged packet to go out and cause an MDD event.
To fix this, include a new interface-dependent function that drivers can
implement named IFDI_NEEDS_RESTART(). Right now, this function is called
in iflib_vlan_unregister/register() to determine whether the interface
needs to be stopped and started when a VLAN is registered or
unregistered. The default return value of IFDI_NEEDS_RESTART() is true,
so this fixes the MDD problem that iavf(4) encounters, since the
interface rings are flushed during a stop/init.
A future change to iavf(4) will implement that function just in case the
default value changes, and to make it explicit that this interface reset
is required when a VLAN is added or removed.
Reviewed by: gallatin@
MFC after: 1 week
Sponsored by: Intel Corporation
Differential Revision: https://reviews.freebsd.org/D22086
2020-04-27 22:02:44 +00:00
|
|
|
/* Re-init to load the changes, if required */
|
|
|
|
if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VLAN_CONFIG))
|
|
|
|
iflib_init_locked(ctx);
|
2016-05-18 04:35:58 +00:00
|
|
|
CTX_UNLOCK(ctx);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
iflib_led_func(void *arg, int onoff)
|
|
|
|
{
|
|
|
|
if_ctx_t ctx = arg;
|
|
|
|
|
|
|
|
CTX_LOCK(ctx);
|
|
|
|
IFDI_LED_FUNC(ctx, onoff);
|
|
|
|
CTX_UNLOCK(ctx);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*********************************************************************
|
|
|
|
*
|
|
|
|
* BUS FUNCTION DEFINITIONS
|
|
|
|
*
|
|
|
|
**********************************************************************/
|
|
|
|
|
|
|
|
int
|
|
|
|
iflib_device_probe(device_t dev)
|
|
|
|
{
|
2019-06-15 11:07:41 +00:00
|
|
|
const pci_vendor_info_t *ent;
|
2016-05-18 04:35:58 +00:00
|
|
|
if_shared_ctx_t sctx;
|
2019-06-15 11:07:41 +00:00
|
|
|
uint16_t pci_device_id, pci_rev_id, pci_subdevice_id, pci_subvendor_id;
|
|
|
|
uint16_t pci_vendor_id;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
if ((sctx = DEVICE_REGISTER(dev)) == NULL || sctx->isc_magic != IFLIB_MAGIC)
|
|
|
|
return (ENOTSUP);
|
|
|
|
|
|
|
|
pci_vendor_id = pci_get_vendor(dev);
|
|
|
|
pci_device_id = pci_get_device(dev);
|
|
|
|
pci_subvendor_id = pci_get_subvendor(dev);
|
|
|
|
pci_subdevice_id = pci_get_subdevice(dev);
|
|
|
|
pci_rev_id = pci_get_revid(dev);
|
|
|
|
if (sctx->isc_parse_devinfo != NULL)
|
|
|
|
sctx->isc_parse_devinfo(&pci_device_id, &pci_subvendor_id, &pci_subdevice_id, &pci_rev_id);
|
|
|
|
|
|
|
|
ent = sctx->isc_vendor_info;
|
|
|
|
while (ent->pvi_vendor_id != 0) {
|
|
|
|
if (pci_vendor_id != ent->pvi_vendor_id) {
|
|
|
|
ent++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if ((pci_device_id == ent->pvi_device_id) &&
|
|
|
|
((pci_subvendor_id == ent->pvi_subvendor_id) ||
|
|
|
|
(ent->pvi_subvendor_id == 0)) &&
|
|
|
|
((pci_subdevice_id == ent->pvi_subdevice_id) ||
|
|
|
|
(ent->pvi_subdevice_id == 0)) &&
|
|
|
|
((pci_rev_id == ent->pvi_rev_id) ||
|
|
|
|
(ent->pvi_rev_id == 0))) {
|
|
|
|
device_set_desc_copy(dev, ent->pvi_name);
|
|
|
|
/* this needs to be changed to zero if the bus probing code
|
|
|
|
* ever stops re-probing on best match because the sctx
|
|
|
|
* may have its values over written by register calls
|
|
|
|
* in subsequent probes
|
|
|
|
*/
|
|
|
|
return (BUS_PROBE_DEFAULT);
|
|
|
|
}
|
|
|
|
ent++;
|
|
|
|
}
|
|
|
|
return (ENXIO);
|
|
|
|
}
|
|
|
|
|
2019-05-29 22:24:10 +00:00
|
|
|
int
|
|
|
|
iflib_device_probe_vendor(device_t dev)
|
|
|
|
{
|
|
|
|
int probe;
|
|
|
|
|
|
|
|
probe = iflib_device_probe(dev);
|
|
|
|
if (probe == BUS_PROBE_DEFAULT)
|
|
|
|
return (BUS_PROBE_VENDOR);
|
|
|
|
else
|
|
|
|
return (probe);
|
|
|
|
}
|
|
|
|
|
2018-05-11 20:08:28 +00:00
|
|
|
static void
|
|
|
|
iflib_reset_qvalues(if_ctx_t ctx)
|
2016-05-18 04:35:58 +00:00
|
|
|
{
|
2018-05-11 20:08:28 +00:00
|
|
|
if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
|
|
|
|
if_shared_ctx_t sctx = ctx->ifc_sctx;
|
|
|
|
device_t dev = ctx->ifc_dev;
|
2018-05-19 05:27:49 +00:00
|
|
|
int i;
|
2017-01-02 00:56:33 +00:00
|
|
|
|
2016-08-12 21:29:44 +00:00
|
|
|
if (ctx->ifc_sysctl_ntxqs != 0)
|
|
|
|
scctx->isc_ntxqsets = ctx->ifc_sysctl_ntxqs;
|
|
|
|
if (ctx->ifc_sysctl_nrxqs != 0)
|
|
|
|
scctx->isc_nrxqsets = ctx->ifc_sysctl_nrxqs;
|
|
|
|
|
|
|
|
for (i = 0; i < sctx->isc_ntxqs; i++) {
|
|
|
|
if (ctx->ifc_sysctl_ntxds[i] != 0)
|
|
|
|
scctx->isc_ntxd[i] = ctx->ifc_sysctl_ntxds[i];
|
|
|
|
else
|
|
|
|
scctx->isc_ntxd[i] = sctx->isc_ntxd_default[i];
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; i < sctx->isc_nrxqs; i++) {
|
|
|
|
if (ctx->ifc_sysctl_nrxds[i] != 0)
|
|
|
|
scctx->isc_nrxd[i] = ctx->ifc_sysctl_nrxds[i];
|
|
|
|
else
|
|
|
|
scctx->isc_nrxd[i] = sctx->isc_nrxd_default[i];
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; i < sctx->isc_nrxqs; i++) {
|
|
|
|
if (scctx->isc_nrxd[i] < sctx->isc_nrxd_min[i]) {
|
|
|
|
device_printf(dev, "nrxd%d: %d less than nrxd_min %d - resetting to min\n",
|
|
|
|
i, scctx->isc_nrxd[i], sctx->isc_nrxd_min[i]);
|
|
|
|
scctx->isc_nrxd[i] = sctx->isc_nrxd_min[i];
|
|
|
|
}
|
|
|
|
if (scctx->isc_nrxd[i] > sctx->isc_nrxd_max[i]) {
|
|
|
|
device_printf(dev, "nrxd%d: %d greater than nrxd_max %d - resetting to max\n",
|
|
|
|
i, scctx->isc_nrxd[i], sctx->isc_nrxd_max[i]);
|
|
|
|
scctx->isc_nrxd[i] = sctx->isc_nrxd_max[i];
|
|
|
|
}
|
2019-05-10 00:41:42 +00:00
|
|
|
if (!powerof2(scctx->isc_nrxd[i])) {
|
|
|
|
device_printf(dev, "nrxd%d: %d is not a power of 2 - using default value of %d\n",
|
|
|
|
i, scctx->isc_nrxd[i], sctx->isc_nrxd_default[i]);
|
|
|
|
scctx->isc_nrxd[i] = sctx->isc_nrxd_default[i];
|
|
|
|
}
|
2016-08-12 21:29:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; i < sctx->isc_ntxqs; i++) {
|
|
|
|
if (scctx->isc_ntxd[i] < sctx->isc_ntxd_min[i]) {
|
|
|
|
device_printf(dev, "ntxd%d: %d less than ntxd_min %d - resetting to min\n",
|
|
|
|
i, scctx->isc_ntxd[i], sctx->isc_ntxd_min[i]);
|
|
|
|
scctx->isc_ntxd[i] = sctx->isc_ntxd_min[i];
|
|
|
|
}
|
|
|
|
if (scctx->isc_ntxd[i] > sctx->isc_ntxd_max[i]) {
|
|
|
|
device_printf(dev, "ntxd%d: %d greater than ntxd_max %d - resetting to max\n",
|
|
|
|
i, scctx->isc_ntxd[i], sctx->isc_ntxd_max[i]);
|
|
|
|
scctx->isc_ntxd[i] = sctx->isc_ntxd_max[i];
|
|
|
|
}
|
2019-05-10 00:41:42 +00:00
|
|
|
if (!powerof2(scctx->isc_ntxd[i])) {
|
|
|
|
device_printf(dev, "ntxd%d: %d is not a power of 2 - using default value of %d\n",
|
|
|
|
i, scctx->isc_ntxd[i], sctx->isc_ntxd_default[i]);
|
|
|
|
scctx->isc_ntxd[i] = sctx->isc_ntxd_default[i];
|
|
|
|
}
|
2016-08-12 21:29:44 +00:00
|
|
|
}
|
2018-05-11 20:08:28 +00:00
|
|
|
}
|
|
|
|
|
2019-04-24 13:32:04 +00:00
|
|
|
static void
|
|
|
|
iflib_add_pfil(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
struct pfil_head *pfil;
|
|
|
|
struct pfil_head_args pa;
|
|
|
|
iflib_rxq_t rxq;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
pa.pa_version = PFIL_VERSION;
|
|
|
|
pa.pa_flags = PFIL_IN;
|
|
|
|
pa.pa_type = PFIL_TYPE_ETHERNET;
|
|
|
|
pa.pa_headname = ctx->ifc_ifp->if_xname;
|
|
|
|
pfil = pfil_head_register(&pa);
|
|
|
|
|
|
|
|
for (i = 0, rxq = ctx->ifc_rxqs; i < NRXQSETS(ctx); i++, rxq++) {
|
|
|
|
rxq->pfil = pfil;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
iflib_rem_pfil(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
struct pfil_head *pfil;
|
|
|
|
iflib_rxq_t rxq;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
rxq = ctx->ifc_rxqs;
|
|
|
|
pfil = rxq->pfil;
|
|
|
|
for (i = 0; i < NRXQSETS(ctx); i++, rxq++) {
|
|
|
|
rxq->pfil = NULL;
|
|
|
|
}
|
|
|
|
pfil_head_unregister(pfil);
|
|
|
|
}
|
|
|
|
|
iflib: Improve mapping of TX/RX queues to CPUs
iflib now supports mapping each (TX,RX) queue pair to the same CPU
(default), to separate CPUs, or to a pair of physical and logical CPUs
that share the same L2 cache. The mapping mechanism supports unequal
numbers of TX and RX queues, with the excess queues always being
mapped to consecutive physical CPUs. When the platform cannot
distinguish between physical and logical CPUs, all are treated as
physical CPUs. See the comment on get_cpuid_for_queue() for the
entire matrix.
The following device-specific tunables influence the mapping process:
dev.<device>.<unit>.iflib.core_offset (existing)
dev.<device>.<unit>.iflib.separate_txrx (existing)
dev.<device>.<unit>.iflib.use_logical_cores (new)
The following new, read-only sysctls provide visibility of the mapping
results:
dev.<device>.<unit>.iflib.{t,r}xq<n>.cpu
When an iflib driver allocates TX softirqs without providing reference
RX IRQs, iflib now binds those TX softirqs to CPUs using the above
mapping mechanism (that is, treats them as if they were TX IRQs).
Previously, such bindings were left up to the grouptaskqueue code and
thus fell outside of the iflib CPU mapping strategy.
Reviewed by: kbowling
Tested by: olivier, pkelsey
MFC after: 3 weeks
Differential Revision: https://reviews.freebsd.org/D24094
2021-04-26 04:25:59 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Advance forward by n members of the cpuset ctx->ifc_cpus starting from
|
|
|
|
* cpuid and wrapping as necessary.
|
|
|
|
*/
|
|
|
|
static unsigned int
|
|
|
|
cpuid_advance(if_ctx_t ctx, unsigned int cpuid, unsigned int n)
|
|
|
|
{
|
|
|
|
unsigned int first_valid;
|
|
|
|
unsigned int last_valid;
|
|
|
|
|
|
|
|
/* cpuid should always be in the valid set */
|
|
|
|
MPASS(CPU_ISSET(cpuid, &ctx->ifc_cpus));
|
|
|
|
|
|
|
|
/* valid set should never be empty */
|
|
|
|
MPASS(!CPU_EMPTY(&ctx->ifc_cpus));
|
|
|
|
|
|
|
|
first_valid = CPU_FFS(&ctx->ifc_cpus) - 1;
|
|
|
|
last_valid = CPU_FLS(&ctx->ifc_cpus) - 1;
|
|
|
|
n = n % CPU_COUNT(&ctx->ifc_cpus);
|
|
|
|
while (n > 0) {
|
|
|
|
do {
|
|
|
|
cpuid++;
|
|
|
|
if (cpuid > last_valid)
|
|
|
|
cpuid = first_valid;
|
|
|
|
} while (!CPU_ISSET(cpuid, &ctx->ifc_cpus));
|
|
|
|
n--;
|
|
|
|
}
|
|
|
|
|
|
|
|
return (cpuid);
|
|
|
|
}
|
|
|
|
|
|
|
|
#if defined(SMP) && defined(SCHED_ULE)
|
|
|
|
extern struct cpu_group *cpu_top; /* CPU topology */
|
|
|
|
|
|
|
|
static int
|
|
|
|
find_child_with_core(int cpu, struct cpu_group *grp)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (grp->cg_children == 0)
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
MPASS(grp->cg_child);
|
|
|
|
for (i = 0; i < grp->cg_children; i++) {
|
|
|
|
if (CPU_ISSET(cpu, &grp->cg_child[i].cg_mask))
|
|
|
|
return i;
|
|
|
|
}
|
|
|
|
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Find an L2 neighbor of the given CPU or return -1 if none found. This
|
|
|
|
* does not distinguish among multiple L2 neighbors if the given CPU has
|
|
|
|
* more than one (it will always return the same result in that case).
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
find_l2_neighbor(int cpu)
|
|
|
|
{
|
|
|
|
struct cpu_group *grp;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
grp = cpu_top;
|
|
|
|
if (grp == NULL)
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Find the smallest CPU group that contains the given core.
|
|
|
|
*/
|
|
|
|
i = 0;
|
|
|
|
while ((i = find_child_with_core(cpu, grp)) != -1) {
|
|
|
|
/*
|
|
|
|
* If the smallest group containing the given CPU has less
|
|
|
|
* than two members, we conclude the given CPU has no
|
|
|
|
* L2 neighbor.
|
|
|
|
*/
|
|
|
|
if (grp->cg_child[i].cg_count <= 1)
|
|
|
|
return (-1);
|
|
|
|
grp = &grp->cg_child[i];
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Must share L2. */
|
|
|
|
if (grp->cg_level > CG_SHARE_L2 || grp->cg_level == CG_SHARE_NONE)
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Select the first member of the set that isn't the reference
|
|
|
|
* CPU, which at this point is guaranteed to exist.
|
|
|
|
*/
|
|
|
|
for (i = 0; i < CPU_SETSIZE; i++) {
|
|
|
|
if (CPU_ISSET(i, &grp->cg_mask) && i != cpu)
|
|
|
|
return (i);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Should never be reached */
|
|
|
|
return (-1);
|
|
|
|
}
|
|
|
|
|
|
|
|
#else
|
|
|
|
/*
 * Fallback used when the SMP && SCHED_ULE variant is not compiled in:
 * always reports that the given CPU has no L2 neighbor.
 */
static int
find_l2_neighbor(int cpu)
{
	const int no_neighbor = -1;

	return (no_neighbor);
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
|
|
|
* CPU mapping behaviors
|
|
|
|
* ---------------------
|
|
|
|
* 'separate txrx' refers to the separate_txrx sysctl
|
|
|
|
* 'use logical' refers to the use_logical_cores sysctl
|
|
|
|
* 'INTR CPUS' indicates whether bus_get_cpus(INTR_CPUS) succeeded
|
|
|
|
*
|
|
|
|
* separate use INTR
|
|
|
|
* txrx logical CPUS result
|
|
|
|
* ---------- --------- ------ ------------------------------------------------
|
|
|
|
* - - X RX and TX queues mapped to consecutive physical
|
|
|
|
* cores with RX/TX pairs on same core and excess
|
|
|
|
* of either following
|
|
|
|
* - X X RX and TX queues mapped to consecutive cores
|
|
|
|
* of any type with RX/TX pairs on same core and
|
|
|
|
* excess of either following
|
|
|
|
* X - X RX and TX queues mapped to consecutive physical
|
|
|
|
* cores; all RX then all TX
|
|
|
|
* X X X RX queues mapped to consecutive physical cores
|
|
|
|
* first, then TX queues mapped to L2 neighbor of
|
|
|
|
* the corresponding RX queue if one exists,
|
|
|
|
* otherwise to consecutive physical cores
|
|
|
|
* - n/a - RX and TX queues mapped to consecutive cores of
|
|
|
|
* any type with RX/TX pairs on same core and excess
|
|
|
|
* of either following
|
|
|
|
* X n/a - RX and TX queues mapped to consecutive cores of
|
|
|
|
* any type; all RX then all TX
|
|
|
|
*/
|
|
|
|
static unsigned int
|
|
|
|
get_cpuid_for_queue(if_ctx_t ctx, unsigned int base_cpuid, unsigned int qid,
|
|
|
|
bool is_tx)
|
|
|
|
{
|
|
|
|
if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
|
|
|
|
unsigned int core_index;
|
|
|
|
|
|
|
|
if (ctx->ifc_sysctl_separate_txrx) {
|
|
|
|
/*
|
|
|
|
* When using separate CPUs for TX and RX, the assignment
|
|
|
|
* will always be of a consecutive CPU out of the set of
|
|
|
|
* context CPUs, except for the specific case where the
|
|
|
|
* context CPUs are phsyical cores, the use of logical cores
|
|
|
|
* has been enabled, the assignment is for TX, the TX qid
|
|
|
|
* corresponds to an RX qid, and the CPU assigned to the
|
|
|
|
* corresponding RX queue has an L2 neighbor.
|
|
|
|
*/
|
|
|
|
if (ctx->ifc_sysctl_use_logical_cores &&
|
|
|
|
ctx->ifc_cpus_are_physical_cores &&
|
|
|
|
is_tx && qid < scctx->isc_nrxqsets) {
|
|
|
|
int l2_neighbor;
|
|
|
|
unsigned int rx_cpuid;
|
|
|
|
|
|
|
|
rx_cpuid = cpuid_advance(ctx, base_cpuid, qid);
|
|
|
|
l2_neighbor = find_l2_neighbor(rx_cpuid);
|
|
|
|
if (l2_neighbor != -1) {
|
|
|
|
return (l2_neighbor);
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* ... else fall through to the normal
|
|
|
|
* consecutive-after-RX assignment scheme.
|
|
|
|
*
|
|
|
|
* Note that we are assuming that all RX queue CPUs
|
|
|
|
* have an L2 neighbor, or all do not. If a mixed
|
|
|
|
* scenario is possible, we will have to keep track
|
|
|
|
* separately of how many queues prior to this one
|
|
|
|
* were not able to be assigned to an L2 neighbor.
|
|
|
|
*/
|
|
|
|
}
|
|
|
|
if (is_tx)
|
|
|
|
core_index = scctx->isc_nrxqsets + qid;
|
|
|
|
else
|
|
|
|
core_index = qid;
|
|
|
|
} else {
|
|
|
|
core_index = qid;
|
|
|
|
}
|
|
|
|
|
|
|
|
return (cpuid_advance(ctx, base_cpuid, core_index));
|
|
|
|
}
|
|
|
|
|
2019-04-25 21:24:56 +00:00
|
|
|
static uint16_t
|
|
|
|
get_ctx_core_offset(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
|
|
|
|
struct cpu_offset *op;
|
iflib: Improve mapping of TX/RX queues to CPUs
iflib now supports mapping each (TX,RX) queue pair to the same CPU
(default), to separate CPUs, or to a pair of physical and logical CPUs
that share the same L2 cache. The mapping mechanism supports unequal
numbers of TX and RX queues, with the excess queues always being
mapped to consecutive physical CPUs. When the platform cannot
distinguish between physical and logical CPUs, all are treated as
physical CPUs. See the comment on get_cpuid_for_queue() for the
entire matrix.
The following device-specific tunables influence the mapping process:
dev.<device>.<unit>.iflib.core_offset (existing)
dev.<device>.<unit>.iflib.separate_txrx (existing)
dev.<device>.<unit>.iflib.use_logical_cores (new)
The following new, read-only sysctls provide visibility of the mapping
results:
dev.<device>.<unit>.iflib.{t,r}xq<n>.cpu
When an iflib driver allocates TX softirqs without providing reference
RX IRQs, iflib now binds those TX softirqs to CPUs using the above
mapping mechanism (that is, treats them as if they were TX IRQs).
Previously, such bindings were left up to the grouptaskqueue code and
thus fell outside of the iflib CPU mapping strategy.
Reviewed by: kbowling
Tested by: olivier, pkelsey
MFC after: 3 weeks
Differential Revision: https://reviews.freebsd.org/D24094
2021-04-26 04:25:59 +00:00
|
|
|
cpuset_t assigned_cpus;
|
|
|
|
unsigned int cores_consumed;
|
|
|
|
unsigned int base_cpuid = ctx->ifc_sysctl_core_offset;
|
|
|
|
unsigned int first_valid;
|
|
|
|
unsigned int last_valid;
|
|
|
|
unsigned int i;
|
2019-04-25 21:24:56 +00:00
|
|
|
|
iflib: Improve mapping of TX/RX queues to CPUs
iflib now supports mapping each (TX,RX) queue pair to the same CPU
(default), to separate CPUs, or to a pair of physical and logical CPUs
that share the same L2 cache. The mapping mechanism supports unequal
numbers of TX and RX queues, with the excess queues always being
mapped to consecutive physical CPUs. When the platform cannot
distinguish between physical and logical CPUs, all are treated as
physical CPUs. See the comment on get_cpuid_for_queue() for the
entire matrix.
The following device-specific tunables influence the mapping process:
dev.<device>.<unit>.iflib.core_offset (existing)
dev.<device>.<unit>.iflib.separate_txrx (existing)
dev.<device>.<unit>.iflib.use_logical_cores (new)
The following new, read-only sysctls provide visibility of the mapping
results:
dev.<device>.<unit>.iflib.{t,r}xq<n>.cpu
When an iflib driver allocates TX softirqs without providing reference
RX IRQs, iflib now binds those TX softirqs to CPUs using the above
mapping mechanism (that is, treats them as if they were TX IRQs).
Previously, such bindings were left up to the grouptaskqueue code and
thus fell outside of the iflib CPU mapping strategy.
Reviewed by: kbowling
Tested by: olivier, pkelsey
MFC after: 3 weeks
Differential Revision: https://reviews.freebsd.org/D24094
2021-04-26 04:25:59 +00:00
|
|
|
first_valid = CPU_FFS(&ctx->ifc_cpus) - 1;
|
|
|
|
last_valid = CPU_FLS(&ctx->ifc_cpus) - 1;
|
2019-04-25 21:24:56 +00:00
|
|
|
|
iflib: Improve mapping of TX/RX queues to CPUs
iflib now supports mapping each (TX,RX) queue pair to the same CPU
(default), to separate CPUs, or to a pair of physical and logical CPUs
that share the same L2 cache. The mapping mechanism supports unequal
numbers of TX and RX queues, with the excess queues always being
mapped to consecutive physical CPUs. When the platform cannot
distinguish between physical and logical CPUs, all are treated as
physical CPUs. See the comment on get_cpuid_for_queue() for the
entire matrix.
The following device-specific tunables influence the mapping process:
dev.<device>.<unit>.iflib.core_offset (existing)
dev.<device>.<unit>.iflib.separate_txrx (existing)
dev.<device>.<unit>.iflib.use_logical_cores (new)
The following new, read-only sysctls provide visibility of the mapping
results:
dev.<device>.<unit>.iflib.{t,r}xq<n>.cpu
When an iflib driver allocates TX softirqs without providing reference
RX IRQs, iflib now binds those TX softirqs to CPUs using the above
mapping mechanism (that is, treats them as if they were TX IRQs).
Previously, such bindings were left up to the grouptaskqueue code and
thus fell outside of the iflib CPU mapping strategy.
Reviewed by: kbowling
Tested by: olivier, pkelsey
MFC after: 3 weeks
Differential Revision: https://reviews.freebsd.org/D24094
2021-04-26 04:25:59 +00:00
|
|
|
if (base_cpuid != CORE_OFFSET_UNSPECIFIED) {
|
|
|
|
/*
|
|
|
|
* Align the user-chosen base CPU ID to the next valid CPU
|
|
|
|
* for this device. If the chosen base CPU ID is smaller
|
|
|
|
* than the first valid CPU or larger than the last valid
|
|
|
|
* CPU, we assume the user does not know what the valid
|
|
|
|
* range is for this device and is thinking in terms of a
|
|
|
|
* zero-based reference frame, and so we shift the given
|
|
|
|
* value into the valid range (and wrap accordingly) so the
|
|
|
|
* intent is translated to the proper frame of reference.
|
|
|
|
* If the base CPU ID is within the valid first/last, but
|
|
|
|
* does not correspond to a valid CPU, it is advanced to the
|
|
|
|
* next valid CPU (wrapping if necessary).
|
|
|
|
*/
|
|
|
|
if (base_cpuid < first_valid || base_cpuid > last_valid) {
|
|
|
|
/* shift from zero-based to first_valid-based */
|
|
|
|
base_cpuid += first_valid;
|
|
|
|
/* wrap to range [first_valid, last_valid] */
|
|
|
|
base_cpuid = (base_cpuid - first_valid) %
|
|
|
|
(last_valid - first_valid + 1);
|
|
|
|
}
|
|
|
|
if (!CPU_ISSET(base_cpuid, &ctx->ifc_cpus)) {
|
|
|
|
/*
|
|
|
|
* base_cpuid is in [first_valid, last_valid], but
|
|
|
|
* not a member of the valid set. In this case,
|
|
|
|
* there will always be a member of the valid set
|
|
|
|
* with a CPU ID that is greater than base_cpuid,
|
|
|
|
* and we simply advance to it.
|
|
|
|
*/
|
|
|
|
while (!CPU_ISSET(base_cpuid, &ctx->ifc_cpus))
|
|
|
|
base_cpuid++;
|
|
|
|
}
|
|
|
|
return (base_cpuid);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Determine how many cores will be consumed by performing the CPU
|
|
|
|
* assignments and counting how many of the assigned CPUs correspond
|
|
|
|
* to CPUs in the set of context CPUs. This is done using the CPU
|
|
|
|
* ID first_valid as the base CPU ID, as the base CPU must be within
|
|
|
|
* the set of context CPUs.
|
|
|
|
*
|
|
|
|
* Note not all assigned CPUs will be in the set of context CPUs
|
|
|
|
* when separate CPUs are being allocated to TX and RX queues,
|
|
|
|
* assignment to logical cores has been enabled, the set of context
|
|
|
|
* CPUs contains only physical CPUs, and TX queues are mapped to L2
|
|
|
|
* neighbors of CPUs that RX queues have been mapped to - in this
|
|
|
|
* case we do only want to count how many CPUs in the set of context
|
|
|
|
* CPUs have been consumed, as that determines the next CPU in that
|
|
|
|
* set to start allocating at for the next device for which
|
|
|
|
* core_offset is not set.
|
|
|
|
*/
|
|
|
|
CPU_ZERO(&assigned_cpus);
|
|
|
|
for (i = 0; i < scctx->isc_ntxqsets; i++)
|
|
|
|
CPU_SET(get_cpuid_for_queue(ctx, first_valid, i, true),
|
|
|
|
&assigned_cpus);
|
|
|
|
for (i = 0; i < scctx->isc_nrxqsets; i++)
|
|
|
|
CPU_SET(get_cpuid_for_queue(ctx, first_valid, i, false),
|
|
|
|
&assigned_cpus);
|
|
|
|
CPU_AND(&assigned_cpus, &ctx->ifc_cpus);
|
|
|
|
cores_consumed = CPU_COUNT(&assigned_cpus);
|
2019-04-25 21:24:56 +00:00
|
|
|
|
|
|
|
mtx_lock(&cpu_offset_mtx);
|
|
|
|
SLIST_FOREACH(op, &cpu_offsets, entries) {
|
|
|
|
if (CPU_CMP(&ctx->ifc_cpus, &op->set) == 0) {
|
iflib: Improve mapping of TX/RX queues to CPUs
iflib now supports mapping each (TX,RX) queue pair to the same CPU
(default), to separate CPUs, or to a pair of physical and logical CPUs
that share the same L2 cache. The mapping mechanism supports unequal
numbers of TX and RX queues, with the excess queues always being
mapped to consecutive physical CPUs. When the platform cannot
distinguish between physical and logical CPUs, all are treated as
physical CPUs. See the comment on get_cpuid_for_queue() for the
entire matrix.
The following device-specific tunables influence the mapping process:
dev.<device>.<unit>.iflib.core_offset (existing)
dev.<device>.<unit>.iflib.separate_txrx (existing)
dev.<device>.<unit>.iflib.use_logical_cores (new)
The following new, read-only sysctls provide visibility of the mapping
results:
dev.<device>.<unit>.iflib.{t,r}xq<n>.cpu
When an iflib driver allocates TX softirqs without providing reference
RX IRQs, iflib now binds those TX softirqs to CPUs using the above
mapping mechanism (that is, treats them as if they were TX IRQs).
Previously, such bindings were left up to the grouptaskqueue code and
thus fell outside of the iflib CPU mapping strategy.
Reviewed by: kbowling
Tested by: olivier, pkelsey
MFC after: 3 weeks
Differential Revision: https://reviews.freebsd.org/D24094
2021-04-26 04:25:59 +00:00
|
|
|
base_cpuid = op->next_cpuid;
|
|
|
|
op->next_cpuid = cpuid_advance(ctx, op->next_cpuid,
|
|
|
|
cores_consumed);
|
2019-04-25 21:24:56 +00:00
|
|
|
MPASS(op->refcount < UINT_MAX);
|
|
|
|
op->refcount++;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
iflib: Improve mapping of TX/RX queues to CPUs
iflib now supports mapping each (TX,RX) queue pair to the same CPU
(default), to separate CPUs, or to a pair of physical and logical CPUs
that share the same L2 cache. The mapping mechanism supports unequal
numbers of TX and RX queues, with the excess queues always being
mapped to consecutive physical CPUs. When the platform cannot
distinguish between physical and logical CPUs, all are treated as
physical CPUs. See the comment on get_cpuid_for_queue() for the
entire matrix.
The following device-specific tunables influence the mapping process:
dev.<device>.<unit>.iflib.core_offset (existing)
dev.<device>.<unit>.iflib.separate_txrx (existing)
dev.<device>.<unit>.iflib.use_logical_cores (new)
The following new, read-only sysctls provide visibility of the mapping
results:
dev.<device>.<unit>.iflib.{t,r}xq<n>.cpu
When an iflib driver allocates TX softirqs without providing reference
RX IRQs, iflib now binds those TX softirqs to CPUs using the above
mapping mechanism (that is, treats them as if they were TX IRQs).
Previously, such bindings were left up to the grouptaskqueue code and
thus fell outside of the iflib CPU mapping strategy.
Reviewed by: kbowling
Tested by: olivier, pkelsey
MFC after: 3 weeks
Differential Revision: https://reviews.freebsd.org/D24094
2021-04-26 04:25:59 +00:00
|
|
|
if (base_cpuid == CORE_OFFSET_UNSPECIFIED) {
|
|
|
|
base_cpuid = first_valid;
|
2019-04-25 21:24:56 +00:00
|
|
|
op = malloc(sizeof(struct cpu_offset), M_IFLIB,
|
|
|
|
M_NOWAIT | M_ZERO);
|
|
|
|
if (op == NULL) {
|
|
|
|
device_printf(ctx->ifc_dev,
|
|
|
|
"allocation for cpu offset failed.\n");
|
|
|
|
} else {
|
iflib: Improve mapping of TX/RX queues to CPUs
iflib now supports mapping each (TX,RX) queue pair to the same CPU
(default), to separate CPUs, or to a pair of physical and logical CPUs
that share the same L2 cache. The mapping mechanism supports unequal
numbers of TX and RX queues, with the excess queues always being
mapped to consecutive physical CPUs. When the platform cannot
distinguish between physical and logical CPUs, all are treated as
physical CPUs. See the comment on get_cpuid_for_queue() for the
entire matrix.
The following device-specific tunables influence the mapping process:
dev.<device>.<unit>.iflib.core_offset (existing)
dev.<device>.<unit>.iflib.separate_txrx (existing)
dev.<device>.<unit>.iflib.use_logical_cores (new)
The following new, read-only sysctls provide visibility of the mapping
results:
dev.<device>.<unit>.iflib.{t,r}xq<n>.cpu
When an iflib driver allocates TX softirqs without providing reference
RX IRQs, iflib now binds those TX softirqs to CPUs using the above
mapping mechanism (that is, treats them as if they were TX IRQs).
Previously, such bindings were left up to the grouptaskqueue code and
thus fell outside of the iflib CPU mapping strategy.
Reviewed by: kbowling
Tested by: olivier, pkelsey
MFC after: 3 weeks
Differential Revision: https://reviews.freebsd.org/D24094
2021-04-26 04:25:59 +00:00
|
|
|
op->next_cpuid = cpuid_advance(ctx, base_cpuid,
|
|
|
|
cores_consumed);
|
2019-04-25 21:24:56 +00:00
|
|
|
op->refcount = 1;
|
|
|
|
CPU_COPY(&ctx->ifc_cpus, &op->set);
|
|
|
|
SLIST_INSERT_HEAD(&cpu_offsets, op, entries);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
mtx_unlock(&cpu_offset_mtx);
|
|
|
|
|
iflib: Improve mapping of TX/RX queues to CPUs
iflib now supports mapping each (TX,RX) queue pair to the same CPU
(default), to separate CPUs, or to a pair of physical and logical CPUs
that share the same L2 cache. The mapping mechanism supports unequal
numbers of TX and RX queues, with the excess queues always being
mapped to consecutive physical CPUs. When the platform cannot
distinguish between physical and logical CPUs, all are treated as
physical CPUs. See the comment on get_cpuid_for_queue() for the
entire matrix.
The following device-specific tunables influence the mapping process:
dev.<device>.<unit>.iflib.core_offset (existing)
dev.<device>.<unit>.iflib.separate_txrx (existing)
dev.<device>.<unit>.iflib.use_logical_cores (new)
The following new, read-only sysctls provide visibility of the mapping
results:
dev.<device>.<unit>.iflib.{t,r}xq<n>.cpu
When an iflib driver allocates TX softirqs without providing reference
RX IRQs, iflib now binds those TX softirqs to CPUs using the above
mapping mechanism (that is, treats them as if they were TX IRQs).
Previously, such bindings were left up to the grouptaskqueue code and
thus fell outside of the iflib CPU mapping strategy.
Reviewed by: kbowling
Tested by: olivier, pkelsey
MFC after: 3 weeks
Differential Revision: https://reviews.freebsd.org/D24094
2021-04-26 04:25:59 +00:00
|
|
|
return (base_cpuid);
|
2019-04-25 21:24:56 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
unref_ctx_core_offset(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
struct cpu_offset *op, *top;
|
|
|
|
|
|
|
|
mtx_lock(&cpu_offset_mtx);
|
|
|
|
SLIST_FOREACH_SAFE(op, &cpu_offsets, entries, top) {
|
|
|
|
if (CPU_CMP(&ctx->ifc_cpus, &op->set) == 0) {
|
|
|
|
MPASS(op->refcount > 0);
|
|
|
|
op->refcount--;
|
|
|
|
if (op->refcount == 0) {
|
|
|
|
SLIST_REMOVE(&cpu_offsets, op, cpu_offset, entries);
|
|
|
|
free(op, M_IFLIB);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
mtx_unlock(&cpu_offset_mtx);
|
|
|
|
}
|
|
|
|
|
2018-05-11 20:08:28 +00:00
|
|
|
int
|
|
|
|
iflib_device_register(device_t dev, void *sc, if_shared_ctx_t sctx, if_ctx_t *ctxp)
|
|
|
|
{
|
|
|
|
if_ctx_t ctx;
|
|
|
|
if_t ifp;
|
|
|
|
if_softc_ctx_t scctx;
|
o Use iflib_fast_intr_rxtx() also for "legacy" interrupts, i. e. INTx and
MSI. Unlike as with iflib_fast_intr_ctx(), the former will also enqueue
_task_fn_tx() in addition to _task_fn_rx() if appropriate, bringing TCP
TX throughput of EM-class devices on par with the MSI-X case and, thus,
close to wirespeed/pre-iflib(4) times again. [1]
Note that independently of the interrupt type, the UDP performance with
these MACs still is abysmal and nowhere near to where it was before the
conversion of em(4) to iflib(4).
o In iflib_init_locked(), announce which free list failed to set up.
o In _task_fn_tx() when running netmap(4), issue ifdi_intr_enable instead
of the ifdi_tx_queue_intr_enable method in case of a "legacy" interrupt
as the latter is valid with MSI-X only.
o Instead of adding the missing - and apparently convoluted enough that a
DBG_COUNTER_INC was put into a wrong spot in _task_fn_rx() - checks for
ifdi_{r,t}x_queue_intr_enable being available in the MSI-X case also to
iflib_fast_intr_rxtx(), factor these out to iflib_device_register() and
make the checks fail gracefully rather than panic. This avoids invoking
the checks at runtime over and over again in iflib_fast_intr_rxtx() and
_task_fn_{r,t}x() - even if it's just in case of INVARIANTS - and makes
these functions more readable.
o In iflib_rx_structures_setup(), only initialize LRO resources if device
and driver have LRO capability in order to not waste memory. Also, free
the LRO resources again if setting them up fails for one of the queues.
However, don't bother invoking iflib_rx_sds_free() in that case because
iflib_rx_structures_setup() doesn't call iflib_rxsd_alloc() either (and
iflib_{device,pseudo}_register() will issue iflib_rx_sds_free() in case
of failure via iflib_rx_structures_free(), but there definitely is some
asymmetry left to be fixed, though).
o Similarly, free LRO resources again in iflib_rx_structures_free().
o In iflib_irq_set_affinity(), handle get_core_offset() errors gracefully
instead of panicking (but only in case of INVARIANTS). This is a follow-
up to r344132, as such driver bugs shouldn't be fatal.
o Likewise, handle unknown iflib_intr_type_t in iflib_irq_alloc_generic()
gracefully, too.
o Bring yet more sanity to iflib_msix_init():
- If the device doesn't provide enough MSI-X vectors or not all vectors
can be allocated so the expected number of queues in addition to admin
interrupts can't be supported, try MSI next (and then INTx) as proper
MSI-X vector distribution can't be assured in such cases. In essence,
this change brings r254008 forward to iflib(4). Also, this is the fix
alluded to in the commit message of r343934.
- If the MSI-X allocation has failed, don't prematurely announce MSI is
going to be used as the latter in fact may not be available either.
- When falling back to MSI, only release the MSI-X table resource again
if it was allocated in iflib_msix_init(), i. e. isn't supplied by the
driver, in the first place.
o In mp_ndesc_handler(), handle unknown type arguments gracefully, too.
PR: 235031 (likely) [1]
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D20175
2019-05-07 08:28:35 +00:00
|
|
|
kobjop_desc_t kobj_desc;
|
|
|
|
kobj_method_t *kobj_method;
|
2019-05-10 00:41:42 +00:00
|
|
|
int err, msix, rid;
|
2020-07-20 21:08:56 +00:00
|
|
|
int num_txd, num_rxd;
|
2018-05-11 20:08:28 +00:00
|
|
|
|
|
|
|
ctx = malloc(sizeof(* ctx), M_IFLIB, M_WAITOK|M_ZERO);
|
|
|
|
|
|
|
|
if (sc == NULL) {
|
|
|
|
sc = malloc(sctx->isc_driver->size, M_IFLIB, M_WAITOK|M_ZERO);
|
|
|
|
device_set_softc(dev, ctx);
|
|
|
|
ctx->ifc_flags |= IFC_SC_ALLOCATED;
|
|
|
|
}
|
|
|
|
|
|
|
|
ctx->ifc_sctx = sctx;
|
|
|
|
ctx->ifc_dev = dev;
|
|
|
|
ctx->ifc_softc = sc;
|
|
|
|
|
|
|
|
if ((err = iflib_register(ctx)) != 0) {
|
|
|
|
device_printf(dev, "iflib_register failed %d\n", err);
|
2019-01-22 00:56:44 +00:00
|
|
|
goto fail_ctx_free;
|
2018-05-11 20:08:28 +00:00
|
|
|
}
|
|
|
|
iflib_add_device_sysctl_pre(ctx);
|
|
|
|
|
|
|
|
scctx = &ctx->ifc_softc_ctx;
|
|
|
|
ifp = ctx->ifc_ifp;
|
2017-09-16 02:41:38 +00:00
|
|
|
|
2018-05-11 20:08:28 +00:00
|
|
|
iflib_reset_qvalues(ctx);
|
2018-05-03 17:02:31 +00:00
|
|
|
CTX_LOCK(ctx);
|
2017-09-16 02:41:38 +00:00
|
|
|
if ((err = IFDI_ATTACH_PRE(ctx)) != 0) {
|
2016-05-18 04:35:58 +00:00
|
|
|
device_printf(dev, "IFDI_ATTACH_PRE failed %d\n", err);
|
2019-01-22 00:56:44 +00:00
|
|
|
goto fail_unlock;
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
2017-01-02 00:56:33 +00:00
|
|
|
_iflib_pre_assert(scctx);
|
|
|
|
ctx->ifc_txrx = *scctx->isc_txrx;
|
|
|
|
|
2021-02-25 01:29:33 +00:00
|
|
|
MPASS(scctx->isc_dma_width <= flsll(BUS_SPACE_MAXADDR));
|
2021-02-24 22:56:45 +00:00
|
|
|
|
2019-05-03 20:05:31 +00:00
|
|
|
if (sctx->isc_flags & IFLIB_DRIVER_MEDIA)
|
|
|
|
ctx->ifc_mediap = scctx->isc_media;
|
|
|
|
|
2017-01-02 00:56:33 +00:00
|
|
|
#ifdef INVARIANTS
|
Assorted TSO fixes for em(4)/iflib(9) and dead code removal:
- Ever since the workaround for the silicon bug of TSO4 causing MAC hangs
was committed in r295133, CSUM_TSO always got disabled unconditionally
by em(4) on the first invocation of em_init_locked(). However, even with
that problem fixed, it turned out that for at least e. g. 82579 not all
necessary TSO workarounds are in place, still causing MAC hangs even at
Gigabit speed. Thus, for stable/11, TSO usage was deliberately disabled
in r323292 (r323293 for stable/10) for the EM-class by default, allowing
users to turn it on if it happens to work with their particular EM MAC
in a Gigabit-only environment.
In head, the TSO workaround for speeds other than Gigabit was lost with
the conversion to iflib(9) in r311849 (possibly along with another one
or two TSO workarounds). Yet at the same time, for EM-class MACs TSO4
got enabled by default again, causing device hangs. Therefore, change the
default for this hardware class back to have TSO4 off, allowing users
to turn it on manually if it happens to work in their environment as
we do in stable/{10,11}. An alternative would be to add a whitelist of
EM-class devices where TSO4 actually is reliable with the workarounds in
place, but given that the advantage of TSO at Gigabit speed is rather
limited - especially with the overhead of these workarounds -, that's
really not worth it. [1]
This change includes the addition of an isc_capabilities to struct
if_softc_ctx so iflib(9) can also handle interface capabilities that
shouldn't be enabled by default which is used to handle the default-off
capabilities of e1000 as suggested by shurd@ and moving their handling
from em_setup_interface() to em_if_attach_pre() accordingly.
- Although 82543 support TSO4 in theory, the former lem(4) didn't have
support for TSO4, presumably because TSO4 is even more broken in the
LEM-class of MACs than the later EM ones. Still, TSO4 for LEM-class
devices was enabled as part of the conversion to iflib(9) in r311849,
causing device hangs. So revert back to the pre-r311849 behavior of
not supporting TSO4 for LEM-class at all, which includes not creating
a TSO DMA tag in iflib(9) for devices not having IFCAP_TSO4 set. [2]
- In fact, the FreeBSD TCP stack can handle a TSO size of IP_MAXPACKET
(65535) rather than FREEBSD_TSO_SIZE_MAX (65518). However, the TSO
DMA must have a maxsize of the maximum TSO size plus the size of a
VLAN header for software VLAN tagging. The iflib(9) converted em(4),
thus, first correctly sets scctx->isc_tx_tso_size_max to EM_TSO_SIZE
in em_if_attach_pre(), but later on overrides it with IP_MAXPACKET
in em_setup_interface() (apparently, left-over from pre-iflib(9)
times). So remove the later and correct iflib(9) to correctly cap
the maximum TSO size reported to the stack at IP_MAXPACKET. While at
it, let iflib(9) use if_sethwtsomax*().
This change includes the addition of isc_tso_max{seg,}size DMA engine
constraints for the TSO DMA tag to struct if_shared_ctx and letting
iflib_txsd_alloc() automatically adjust the maxsize of that tag in case
IFCAP_VLAN_MTU is supported as requested by shurd@.
- Move the if_setifheaderlen(9) call for adjusting the maximum Ethernet
header length from {ixgbe,ixl,ixlv,ixv,em}_setup_interface() to iflib(9)
so adjustment is automatically done in case IFCAP_VLAN_MTU is supported.
As a consequence, this adjustment now is also done in case of bnxt(4)
which missed it previously.
- Move the reduction of the maximum TSO segment count reported to the
stack by the number of m_pullup(9) calls (which in the worst case,
can add another mbuf and, thus, the requirement for another DMA
segment each) in the transmit path for performance reasons from
em_setup_interface() to iflib_txsd_alloc() as these pull-ups are now
done in iflib_parse_header() rather than in the no longer existing
em_xmit(). Moreover, this optimization applies to all drivers using
iflib(9) and not just em(4); all in-tree iflib(9) consumers still
have enough room to handle full size TSO packets. Also, reduce the
adjustment to the maximum number of m_pullup(9)'s now performed in
iflib_parse_header().
- Prior to the conversion of em(4)/igb(4)/lem(4) and ixl(4) to iflib(9)
in r311849 and r335338 respectively, these drivers didn't enable
IFCAP_VLAN_HWFILTER by default due to VLAN events not being passed
through by lagg(4). With iflib(9), IFCAP_VLAN_HWFILTER was turned on
by default but also lagg(4) was fixed in that regard in r203548. So
just remove the now redundant and defunct IFCAP_VLAN_HWFILTER handling
in {em,ixl,ixlv}_setup_interface().
- Nuke other redundant IFCAP_* setting in {em,ixl,ixlv}_setup_interface()
which is (more completely) already done in {em,ixl,ixlv}_if_attach_pre()
now.
- Remove some redundant/dead setting of scctx->isc_tx_csum_flags in
em_if_attach_pre().
- Remove some IFCAP_* duplicated either directly or indirectly (e. g.
via IFCAP_HWCSUM) in {EM,IGB,IXL}_CAPS.
- Don't bother to fiddle with IFCAP_HWSTATS in ixgbe(4)/ixgbev(4) as
iflib(9) adds that capability unconditionally.
- Remove some unused macros from em(4).
- Bump __FreeBSD_version as some of the above changes require the modules
of drivers using iflib(9) to be recompiled.
Okayed by: sbruno@ at 201806 DevSummit Transport Working Group [1]
Reviewed by: sbruno (earlier version), erj
PR: 219428 (part of; comment #10) [1], 220997 (part of; comment #3) [2]
Differential Revision: https://reviews.freebsd.org/D15720
2018-07-15 19:04:23 +00:00
|
|
|
if (scctx->isc_capabilities & IFCAP_TXCSUM)
|
2017-01-02 00:56:33 +00:00
|
|
|
MPASS(scctx->isc_tx_csum_flags);
|
|
|
|
#endif
|
|
|
|
|
2019-09-27 19:17:40 +00:00
|
|
|
if_setcapabilities(ifp,
|
2021-01-28 21:08:48 +00:00
|
|
|
scctx->isc_capabilities | IFCAP_HWSTATS | IFCAP_MEXTPG);
|
2019-09-27 19:17:40 +00:00
|
|
|
if_setcapenable(ifp,
|
2021-01-28 21:08:48 +00:00
|
|
|
scctx->isc_capenable | IFCAP_HWSTATS | IFCAP_MEXTPG);
|
2017-01-02 00:56:33 +00:00
|
|
|
|
|
|
|
if (scctx->isc_ntxqsets == 0 || (scctx->isc_ntxqsets_max && scctx->isc_ntxqsets_max < scctx->isc_ntxqsets))
|
|
|
|
scctx->isc_ntxqsets = scctx->isc_ntxqsets_max;
|
|
|
|
if (scctx->isc_nrxqsets == 0 || (scctx->isc_nrxqsets_max && scctx->isc_nrxqsets_max < scctx->isc_nrxqsets))
|
|
|
|
scctx->isc_nrxqsets = scctx->isc_nrxqsets_max;
|
2016-08-12 21:29:44 +00:00
|
|
|
|
2020-07-20 21:08:56 +00:00
|
|
|
num_txd = iflib_num_tx_descs(ctx);
|
|
|
|
num_rxd = iflib_num_rx_descs(ctx);
|
2016-08-12 21:29:44 +00:00
|
|
|
|
|
|
|
/* XXX change for per-queue sizes */
|
2019-05-06 20:56:41 +00:00
|
|
|
device_printf(dev, "Using %d TX descriptors and %d RX descriptors\n",
|
2020-07-20 21:08:56 +00:00
|
|
|
num_txd, num_rxd);
|
2016-08-12 21:29:44 +00:00
|
|
|
|
2020-07-20 21:08:56 +00:00
|
|
|
if (scctx->isc_tx_nsegments > num_txd / MAX_SINGLE_PACKET_FRACTION)
|
|
|
|
scctx->isc_tx_nsegments = max(1, num_txd /
|
2016-08-12 21:29:44 +00:00
|
|
|
MAX_SINGLE_PACKET_FRACTION);
|
2020-07-20 21:08:56 +00:00
|
|
|
if (scctx->isc_tx_tso_segments_max > num_txd /
|
2016-08-12 21:29:44 +00:00
|
|
|
MAX_SINGLE_PACKET_FRACTION)
|
|
|
|
scctx->isc_tx_tso_segments_max = max(1,
|
2020-07-20 21:08:56 +00:00
|
|
|
num_txd / MAX_SINGLE_PACKET_FRACTION);
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
/* TSO parameters - dig these out of the data sheet - simply correspond to tag setup */
|
Assorted TSO fixes for em(4)/iflib(9) and dead code removal:
- Ever since the workaround for the silicon bug of TSO4 causing MAC hangs
was committed in r295133, CSUM_TSO always got disabled unconditionally
by em(4) on the first invocation of em_init_locked(). However, even with
that problem fixed, it turned out that for at least e. g. 82579 not all
necessary TSO workarounds are in place, still causing MAC hangs even at
Gigabit speed. Thus, for stable/11, TSO usage was deliberately disabled
in r323292 (r323293 for stable/10) for the EM-class by default, allowing
users to turn it on if it happens to work with their particular EM MAC
in a Gigabit-only environment.
In head, the TSO workaround for speeds other than Gigabit was lost with
the conversion to iflib(9) in r311849 (possibly along with another one
or two TSO workarounds). Yet at the same time, for EM-class MACs TSO4
got enabled by default again, causing device hangs. Therefore, change the
default for this hardware class back to have TSO4 off, allowing users
to turn it on manually if it happens to work in their environment as
we do in stable/{10,11}. An alternative would be to add a whitelist of
EM-class devices where TSO4 actually is reliable with the workarounds in
place, but given that the advantage of TSO at Gigabit speed is rather
limited - especially with the overhead of these workarounds -, that's
really not worth it. [1]
This change includes the addition of an isc_capabilities to struct
if_softc_ctx so iflib(9) can also handle interface capabilities that
shouldn't be enabled by default which is used to handle the default-off
capabilities of e1000 as suggested by shurd@ and moving their handling
from em_setup_interface() to em_if_attach_pre() accordingly.
- Although 82543 support TSO4 in theory, the former lem(4) didn't have
support for TSO4, presumably because TSO4 is even more broken in the
LEM-class of MACs than the later EM ones. Still, TSO4 for LEM-class
devices was enabled as part of the conversion to iflib(9) in r311849,
causing device hangs. So revert back to the pre-r311849 behavior of
not supporting TSO4 for LEM-class at all, which includes not creating
a TSO DMA tag in iflib(9) for devices not having IFCAP_TSO4 set. [2]
- In fact, the FreeBSD TCP stack can handle a TSO size of IP_MAXPACKET
(65535) rather than FREEBSD_TSO_SIZE_MAX (65518). However, the TSO
DMA must have a maxsize of the maximum TSO size plus the size of a
VLAN header for software VLAN tagging. The iflib(9) converted em(4),
thus, first correctly sets scctx->isc_tx_tso_size_max to EM_TSO_SIZE
in em_if_attach_pre(), but later on overrides it with IP_MAXPACKET
in em_setup_interface() (apparently, left-over from pre-iflib(9)
times). So remove the later and correct iflib(9) to correctly cap
the maximum TSO size reported to the stack at IP_MAXPACKET. While at
it, let iflib(9) use if_sethwtsomax*().
This change includes the addition of isc_tso_max{seg,}size DMA engine
constraints for the TSO DMA tag to struct if_shared_ctx and letting
iflib_txsd_alloc() automatically adjust the maxsize of that tag in case
IFCAP_VLAN_MTU is supported as requested by shurd@.
- Move the if_setifheaderlen(9) call for adjusting the maximum Ethernet
header length from {ixgbe,ixl,ixlv,ixv,em}_setup_interface() to iflib(9)
so adjustment is automatically done in case IFCAP_VLAN_MTU is supported.
As a consequence, this adjustment now is also done in case of bnxt(4)
which missed it previously.
- Move the reduction of the maximum TSO segment count reported to the
stack by the number of m_pullup(9) calls (which in the worst case,
can add another mbuf and, thus, the requirement for another DMA
segment each) in the transmit path for performance reasons from
em_setup_interface() to iflib_txsd_alloc() as these pull-ups are now
done in iflib_parse_header() rather than in the no longer existing
em_xmit(). Moreover, this optimization applies to all drivers using
iflib(9) and not just em(4); all in-tree iflib(9) consumers still
have enough room to handle full size TSO packets. Also, reduce the
adjustment to the maximum number of m_pullup(9)'s now performed in
iflib_parse_header().
- Prior to the conversion of em(4)/igb(4)/lem(4) and ixl(4) to iflib(9)
in r311849 and r335338 respectively, these drivers didn't enable
IFCAP_VLAN_HWFILTER by default due to VLAN events not being passed
through by lagg(4). With iflib(9), IFCAP_VLAN_HWFILTER was turned on
by default but also lagg(4) was fixed in that regard in r203548. So
just remove the now redundant and defunct IFCAP_VLAN_HWFILTER handling
in {em,ixl,ixlv}_setup_interface().
- Nuke other redundant IFCAP_* setting in {em,ixl,ixlv}_setup_interface()
which is (more completely) already done in {em,ixl,ixlv}_if_attach_pre()
now.
- Remove some redundant/dead setting of scctx->isc_tx_csum_flags in
em_if_attach_pre().
- Remove some IFCAP_* duplicated either directly or indirectly (e. g.
via IFCAP_HWCSUM) in {EM,IGB,IXL}_CAPS.
- Don't bother to fiddle with IFCAP_HWSTATS in ixgbe(4)/ixgbev(4) as
iflib(9) adds that capability unconditionally.
- Remove some unused macros from em(4).
- Bump __FreeBSD_version as some of the above changes require the modules
of drivers using iflib(9) to be recompiled.
Okayed by: sbruno@ at 201806 DevSummit Transport Working Group [1]
Reviewed by: sbruno (earlier version), erj
PR: 219428 (part of; comment #10) [1], 220997 (part of; comment #3) [2]
Differential Revision: https://reviews.freebsd.org/D15720
2018-07-15 19:04:23 +00:00
|
|
|
if (if_getcapabilities(ifp) & IFCAP_TSO) {
|
|
|
|
/*
|
|
|
|
* The stack can't handle a TSO size larger than IP_MAXPACKET,
|
|
|
|
* but some MACs do.
|
|
|
|
*/
|
|
|
|
if_sethwtsomax(ifp, min(scctx->isc_tx_tso_size_max,
|
|
|
|
IP_MAXPACKET));
|
|
|
|
/*
|
|
|
|
* Take maximum number of m_pullup(9)'s in iflib_parse_header()
|
|
|
|
* into account. In the worst case, each of these calls will
|
|
|
|
* add another mbuf and, thus, the requirement for another DMA
|
|
|
|
* segment. So for best performance, it doesn't make sense to
|
|
|
|
* advertize a maximum of TSO segments that typically will
|
|
|
|
* require defragmentation in iflib_encap().
|
|
|
|
*/
|
|
|
|
if_sethwtsomaxsegcount(ifp, scctx->isc_tx_tso_segments_max - 3);
|
|
|
|
if_sethwtsomaxsegsize(ifp, scctx->isc_tx_tso_segsize_max);
|
|
|
|
}
|
2016-05-18 04:35:58 +00:00
|
|
|
if (scctx->isc_rss_table_size == 0)
|
|
|
|
scctx->isc_rss_table_size = 64;
|
2016-08-12 21:29:44 +00:00
|
|
|
scctx->isc_rss_table_mask = scctx->isc_rss_table_size-1;
|
2016-11-18 04:19:21 +00:00
|
|
|
|
|
|
|
GROUPTASK_INIT(&ctx->ifc_admin_task, 0, _task_fn_admin, ctx);
|
|
|
|
/* XXX format name */
|
Make taskqgroup_attach{,_cpu}(9) work across architectures
So far, intr_{g,s}etaffinity(9) take a single int for identifying
a device interrupt. This approach doesn't work on all architectures
supported, as a single int isn't sufficient to globally specify a
device interrupt. In particular, with multiple interrupt controllers
in one system as found on e. g. arm and arm64 machines, an interrupt
number as returned by rman_get_start(9) may be only unique relative
to the bus and, thus, interrupt controller, a certain device hangs
off from.
In turn, this makes taskqgroup_attach{,_cpu}(9) and - internal to
the gtaskqueue implementation - taskqgroup_attach_deferred{,_cpu}()
not work across architectures. Yet in turn, iflib(4) as gtaskqueue
consumer so far doesn't fit architectures where interrupt numbers
aren't globally unique.
However, at least for intr_setaffinity(..., CPU_WHICH_IRQ, ...) as
employed by the gtaskqueue implementation to bind an interrupt to a
particular CPU, using bus_bind_intr(9) instead is equivalent from
a functional point of view, with bus_bind_intr(9) taking the device
and interrupt resource arguments required for uniquely specifying a
device interrupt.
Thus, change the gtaskqueue implementation to employ bus_bind_intr(9)
instead and intr_{g,s}etaffinity(9) to take the device and interrupt
resource arguments required respectively. This change also moves
struct grouptask from <sys/_task.h> to <sys/gtaskqueue.h> and wraps
struct gtask along with the gtask_fn_t typedef into #ifdef _KERNEL
as userland likes to include <sys/_task.h> or indirectly drags it
in - for better or worse also with _KERNEL defined -, which with
device_t and struct resource dependencies otherwise is no longer
as easily possible now.
The userland inclusion problem probably can be improved a bit by
introducing a _WANT_TASK (as well as a _WANT_MOUNT) akin to the
existing _WANT_PRISON etc., which is orthogonal to this change,
though, and likely needs an exp-run.
While at it:
- Change the gt_cpu member in the grouptask structure to be of type
int as used elsewhere for specifying CPUs (an int16_t may be too
narrow sooner or later),
- move the gtaskqueue_enqueue_fn typedef from <sys/gtaskqueue.h> to
the gtaskqueue implementation as it's only used and needed there,
- change the GTASK_INIT macro to use "gtask" rather than "task" as
argument given that it actually operates on a struct gtask rather
than a struct task, and
- let subr_gtaskqueue.c consistently use __func__ to print functions
names.
Reported by: mmel
Reviewed by: mmel
Differential Revision: https://reviews.freebsd.org/D19139
2019-02-12 21:23:59 +00:00
|
|
|
taskqgroup_attach(qgroup_if_config_tqg, &ctx->ifc_admin_task, ctx,
|
|
|
|
NULL, NULL, "admin");
|
2017-11-29 18:14:57 +00:00
|
|
|
|
2017-11-29 18:21:17 +00:00
|
|
|
/* Set up cpu set. If it fails, use the set of all CPUs. */
|
2017-11-29 18:14:57 +00:00
|
|
|
if (bus_get_cpus(dev, INTR_CPUS, sizeof(ctx->ifc_cpus), &ctx->ifc_cpus) != 0) {
|
|
|
|
device_printf(dev, "Unable to fetch CPU list\n");
|
|
|
|
CPU_COPY(&all_cpus, &ctx->ifc_cpus);
|
iflib: Improve mapping of TX/RX queues to CPUs
iflib now supports mapping each (TX,RX) queue pair to the same CPU
(default), to separate CPUs, or to a pair of physical and logical CPUs
that share the same L2 cache. The mapping mechanism supports unequal
numbers of TX and RX queues, with the excess queues always being
mapped to consecutive physical CPUs. When the platform cannot
distinguish between physical and logical CPUs, all are treated as
physical CPUs. See the comment on get_cpuid_for_queue() for the
entire matrix.
The following device-specific tunables influence the mapping process:
dev.<device>.<unit>.iflib.core_offset (existing)
dev.<device>.<unit>.iflib.separate_txrx (existing)
dev.<device>.<unit>.iflib.use_logical_cores (new)
The following new, read-only sysctls provide visibility of the mapping
results:
dev.<device>.<unit>.iflib.{t,r}xq<n>.cpu
When an iflib driver allocates TX softirqs without providing reference
RX IRQs, iflib now binds those TX softirqs to CPUs using the above
mapping mechanism (that is, treats them as if they were TX IRQs).
Previously, such bindings were left up to the grouptaskqueue code and
thus fell outside of the iflib CPU mapping strategy.
Reviewed by: kbowling
Tested by: olivier, pkelsey
MFC after: 3 weeks
Differential Revision: https://reviews.freebsd.org/D24094
2021-04-26 04:25:59 +00:00
|
|
|
ctx->ifc_cpus_are_physical_cores = false;
|
|
|
|
} else
|
|
|
|
ctx->ifc_cpus_are_physical_cores = true;
|
2017-11-29 18:14:57 +00:00
|
|
|
MPASS(CPU_COUNT(&ctx->ifc_cpus) > 0);
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
/*
|
2019-01-30 13:21:26 +00:00
|
|
|
** Now set up MSI or MSI-X, should return us the number of supported
|
|
|
|
** vectors (will be 1 for a legacy interrupt and MSI).
|
2016-05-18 04:35:58 +00:00
|
|
|
*/
|
|
|
|
if (sctx->isc_flags & IFLIB_SKIP_MSIX) {
|
|
|
|
msix = scctx->isc_vectors;
|
|
|
|
} else if (scctx->isc_msix_bar != 0)
|
2017-01-25 14:37:05 +00:00
|
|
|
/*
|
|
|
|
* The simple fact that isc_msix_bar is not 0 does not mean we
|
|
|
|
* have a good value there that is known to work.
|
|
|
|
*/
|
2016-05-18 04:35:58 +00:00
|
|
|
msix = iflib_msix_init(ctx);
|
|
|
|
else {
|
|
|
|
scctx->isc_vectors = 1;
|
|
|
|
scctx->isc_ntxqsets = 1;
|
|
|
|
scctx->isc_nrxqsets = 1;
|
|
|
|
scctx->isc_intr = IFLIB_INTR_LEGACY;
|
|
|
|
msix = 0;
|
|
|
|
}
|
|
|
|
/* Get memory for the station queues */
|
|
|
|
if ((err = iflib_queues_alloc(ctx))) {
|
|
|
|
device_printf(dev, "Unable to allocate queue memory\n");
|
2019-01-22 00:56:44 +00:00
|
|
|
goto fail_intr_free;
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
2018-05-08 17:15:10 +00:00
|
|
|
if ((err = iflib_qset_structures_setup(ctx)))
|
2016-05-18 04:35:58 +00:00
|
|
|
goto fail_queues;
|
2017-01-26 13:50:09 +00:00
|
|
|
|
2019-04-25 21:24:56 +00:00
|
|
|
/*
|
|
|
|
* Now that we know how many queues there are, get the core offset.
|
|
|
|
*/
|
|
|
|
ctx->ifc_sysctl_core_offset = get_ctx_core_offset(ctx);
|
|
|
|
|
o Use iflib_fast_intr_rxtx() also for "legacy" interrupts, i. e. INTx and
MSI. Unlike as with iflib_fast_intr_ctx(), the former will also enqueue
_task_fn_tx() in addition to _task_fn_rx() if appropriate, bringing TCP
TX throughput of EM-class devices on par with the MSI-X case and, thus,
close to wirespeed/pre-iflib(4) times again. [1]
Note that independently of the interrupt type, the UDP performance with
these MACs still is abysmal and nowhere near to where it was before the
conversion of em(4) to iflib(4).
o In iflib_init_locked(), announce which free list failed to set up.
o In _task_fn_tx() when running netmap(4), issue ifdi_intr_enable instead
of the ifdi_tx_queue_intr_enable method in case of a "legacy" interrupt
as the latter is valid with MSI-X only.
o Instead of adding the missing - and apparently convoluted enough that a
DBG_COUNTER_INC was put into a wrong spot in _task_fn_rx() - checks for
ifdi_{r,t}x_queue_intr_enable being available in the MSI-X case also to
iflib_fast_intr_rxtx(), factor these out to iflib_device_register() and
make the checks fail gracefully rather than panic. This avoids invoking
the checks at runtime over and over again in iflib_fast_intr_rxtx() and
_task_fn_{r,t}x() - even if it's just in case of INVARIANTS - and makes
these functions more readable.
o In iflib_rx_structures_setup(), only initialize LRO resources if device
and driver have LRO capability in order to not waste memory. Also, free
the LRO resources again if setting them up fails for one of the queues.
However, don't bother invoking iflib_rx_sds_free() in that case because
iflib_rx_structures_setup() doesn't call iflib_rxsd_alloc() either (and
iflib_{device,pseudo}_register() will issue iflib_rx_sds_free() in case
of failure via iflib_rx_structures_free(), but there definitely is some
asymmetry left to be fixed, though).
o Similarly, free LRO resources again in iflib_rx_structures_free().
o In iflib_irq_set_affinity(), handle get_core_offset() errors gracefully
instead of panicking (but only in case of INVARIANTS). This is a follow-
up to r344132, as such driver bugs shouldn't be fatal.
o Likewise, handle unknown iflib_intr_type_t in iflib_irq_alloc_generic()
gracefully, too.
o Bring yet more sanity to iflib_msix_init():
- If the device doesn't provide enough MSI-X vectors or not all vectors
can be allocated so the expected number of queues in addition to admin
interrupts can't be supported, try MSI next (and then INTx) as proper
MSI-X vector distribution can't be assured in such cases. In essence,
this change brings r254008 forward to iflib(4). Also, this is the fix
alluded to in the commit message of r343934.
- If the MSI-X allocation has failed, don't prematurely announce MSI is
going to be used as the latter in fact may not be available either.
- When falling back to MSI, only release the MSI-X table resource again
if it was allocated in iflib_msix_init(), i. e. isn't supplied by the
driver, in the first place.
o In mp_ndesc_handler(), handle unknown type arguments gracefully, too.
PR: 235031 (likely) [1]
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D20175
2019-05-07 08:28:35 +00:00
|
|
|
if (msix > 1) {
|
|
|
|
/*
|
|
|
|
* When using MSI-X, ensure that ifdi_{r,t}x_queue_intr_enable
|
|
|
|
* aren't the default NULL implementation.
|
|
|
|
*/
|
|
|
|
kobj_desc = &ifdi_rx_queue_intr_enable_desc;
|
|
|
|
kobj_method = kobj_lookup_method(((kobj_t)ctx)->ops->cls, NULL,
|
|
|
|
kobj_desc);
|
|
|
|
if (kobj_method == &kobj_desc->deflt) {
|
|
|
|
device_printf(dev,
|
|
|
|
"MSI-X requires ifdi_rx_queue_intr_enable method");
|
|
|
|
err = EOPNOTSUPP;
|
|
|
|
goto fail_queues;
|
|
|
|
}
|
|
|
|
kobj_desc = &ifdi_tx_queue_intr_enable_desc;
|
|
|
|
kobj_method = kobj_lookup_method(((kobj_t)ctx)->ops->cls, NULL,
|
|
|
|
kobj_desc);
|
|
|
|
if (kobj_method == &kobj_desc->deflt) {
|
|
|
|
device_printf(dev,
|
|
|
|
"MSI-X requires ifdi_tx_queue_intr_enable method");
|
|
|
|
err = EOPNOTSUPP;
|
|
|
|
goto fail_queues;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Assign the MSI-X vectors.
|
|
|
|
* Note that the default NULL ifdi_msix_intr_assign method will
|
|
|
|
* fail here, too.
|
|
|
|
*/
|
|
|
|
err = IFDI_MSIX_INTR_ASSIGN(ctx, msix);
|
|
|
|
if (err != 0) {
|
|
|
|
device_printf(dev, "IFDI_MSIX_INTR_ASSIGN failed %d\n",
|
|
|
|
err);
|
|
|
|
goto fail_queues;
|
|
|
|
}
|
iflib: Prevent kernel panic caused by loading driver with a specific interrupt configuration
If a device has only 1 MSI-X interrupt available and does not support either
MSI or legacy interrupts, iflib_device_register() will fail, leak memory and
MSI resources, and the driver will not load. Worse, if another iflib-using
driver tries to unload afterwards, a kernel panic will occur because the
previous failed iflib driver load did not properly call "taskqgroup_detach()"
during its cleanup.
This patch is a band-aid for this situation -- don't try allocating MSI or legacy
interrupts if a single MSI-X interrupt was allocated, but fail to load instead.
As well, during the cleanup, properly call taskqgroup_detach() on the admin
task to prevent panics when other iflib drivers unload.
This whole interrupt allocation process actually needs re-doing to properly
support devices with only a single MSI-X interrupt, devices that only support
MSI-X, non-PCI devices, and multiple non-MSIX interrupts, as well.
Signed-off-by: Eric Joyner <erj@freebsd.org>
Reviewed by: marius@
MFC after: 1 week
Sponsored by: Intel Corporation
Differential Revision: https://reviews.freebsd.org/D20747
2019-08-01 17:37:25 +00:00
|
|
|
} else if (scctx->isc_intr != IFLIB_INTR_MSIX) {
|
2016-05-18 04:35:58 +00:00
|
|
|
rid = 0;
|
|
|
|
if (scctx->isc_intr == IFLIB_INTR_MSI) {
|
|
|
|
MPASS(msix == 1);
|
|
|
|
rid = 1;
|
|
|
|
}
|
2016-08-12 21:29:44 +00:00
|
|
|
if ((err = iflib_legacy_setup(ctx, ctx->isc_legacy_intr, ctx->ifc_softc, &rid, "irq0")) != 0) {
|
2016-05-18 04:35:58 +00:00
|
|
|
device_printf(dev, "iflib_legacy_setup failed %d\n", err);
|
2019-01-22 00:56:44 +00:00
|
|
|
goto fail_queues;
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
iflib: Prevent kernel panic caused by loading driver with a specific interrupt configuration
If a device has only 1 MSI-X interrupt available and does not support either
MSI or legacy interrupts, iflib_device_register() will fail, leak memory and
MSI resources, and the driver will not load. Worse, if another iflib-using
driver tries to unload afterwards, a kernel panic will occur because the
previous failed iflib driver load did not properly call "taskqgroup_detach()"
during its cleanup.
This patch is a band-aid for this situation -- don't try allocating MSI or legacy
interrupts if a single MSI-X interrupt was allocated, but fail to load instead.
As well, during the cleanup, properly call taskqgroup_detach() on the admin
task to prevent panics when other iflib drivers unload.
This whole interrupt allocation process actually needs re-doing to properly
support devices with only a single MSI-X interrupt, devices that only support
MSI-X, non-PCI devices, and multiple non-MSIX interrupts, as well.
Signed-off-by: Eric Joyner <erj@freebsd.org>
Reviewed by: marius@
MFC after: 1 week
Sponsored by: Intel Corporation
Differential Revision: https://reviews.freebsd.org/D20747
2019-08-01 17:37:25 +00:00
|
|
|
} else {
|
|
|
|
device_printf(dev,
|
|
|
|
"Cannot use iflib with only 1 MSI-X interrupt!\n");
|
|
|
|
err = ENODEV;
|
2021-02-01 16:13:00 +00:00
|
|
|
goto fail_queues;
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
Assorted TSO fixes for em(4)/iflib(9) and dead code removal:
- Ever since the workaround for the silicon bug of TSO4 causing MAC hangs
was committed in r295133, CSUM_TSO always got disabled unconditionally
by em(4) on the first invocation of em_init_locked(). However, even with
that problem fixed, it turned out that for at least e. g. 82579 not all
necessary TSO workarounds are in place, still causing MAC hangs even at
Gigabit speed. Thus, for stable/11, TSO usage was deliberately disabled
in r323292 (r323293 for stable/10) for the EM-class by default, allowing
users to turn it on if it happens to work with their particular EM MAC
in a Gigabit-only environment.
In head, the TSO workaround for speeds other than Gigabit was lost with
the conversion to iflib(9) in r311849 (possibly along with another one
or two TSO workarounds). Yet at the same time, for EM-class MACs TSO4
got enabled by default again, causing device hangs. Therefore, change the
default for this hardware class back to have TSO4 off, allowing users
to turn it on manually if it happens to work in their environment as
we do in stable/{10,11}. An alternative would be to add a whitelist of
EM-class devices where TSO4 actually is reliable with the workarounds in
place, but given that the advantage of TSO at Gigabit speed is rather
limited - especially with the overhead of these workarounds -, that's
really not worth it. [1]
This change includes the addition of an isc_capabilities to struct
if_softc_ctx so iflib(9) can also handle interface capabilities that
shouldn't be enabled by default which is used to handle the default-off
capabilities of e1000 as suggested by shurd@ and moving their handling
from em_setup_interface() to em_if_attach_pre() accordingly.
- Although the 82543 supports TSO4 in theory, the former lem(4) didn't have
support for TSO4, presumably because TSO4 is even more broken in the
LEM-class of MACs than the later EM ones. Still, TSO4 for LEM-class
devices was enabled as part of the conversion to iflib(9) in r311849,
causing device hangs. So revert back to the pre-r311849 behavior of
not supporting TSO4 for LEM-class at all, which includes not creating
a TSO DMA tag in iflib(9) for devices not having IFCAP_TSO4 set. [2]
- In fact, the FreeBSD TCP stack can handle a TSO size of IP_MAXPACKET
(65535) rather than FREEBSD_TSO_SIZE_MAX (65518). However, the TSO
DMA must have a maxsize of the maximum TSO size plus the size of a
VLAN header for software VLAN tagging. The iflib(9) converted em(4),
thus, first correctly sets scctx->isc_tx_tso_size_max to EM_TSO_SIZE
in em_if_attach_pre(), but later on overrides it with IP_MAXPACKET
in em_setup_interface() (apparently, left-over from pre-iflib(9)
times). So remove the latter and fix iflib(9) to correctly cap
the maximum TSO size reported to the stack at IP_MAXPACKET. While at
it, let iflib(9) use if_sethwtsomax*().
This change includes the addition of isc_tso_max{seg,}size DMA engine
constraints for the TSO DMA tag to struct if_shared_ctx and letting
iflib_txsd_alloc() automatically adjust the maxsize of that tag in case
IFCAP_VLAN_MTU is supported as requested by shurd@.
- Move the if_setifheaderlen(9) call for adjusting the maximum Ethernet
header length from {ixgbe,ixl,ixlv,ixv,em}_setup_interface() to iflib(9)
so adjustment is automatically done in case IFCAP_VLAN_MTU is supported.
As a consequence, this adjustment now is also done in case of bnxt(4)
which missed it previously.
- Move the reduction of the maximum TSO segment count reported to the
stack by the number of m_pullup(9) calls (which in the worst case,
can add another mbuf and, thus, the requirement for another DMA
segment each) in the transmit path for performance reasons from
em_setup_interface() to iflib_txsd_alloc() as these pull-ups are now
done in iflib_parse_header() rather than in the no longer existing
em_xmit(). Moreover, this optimization applies to all drivers using
iflib(9) and not just em(4); all in-tree iflib(9) consumers still
have enough room to handle full size TSO packets. Also, reduce the
adjustment to the maximum number of m_pullup(9)'s now performed in
iflib_parse_header().
- Prior to the conversion of em(4)/igb(4)/lem(4) and ixl(4) to iflib(9)
in r311849 and r335338 respectively, these drivers didn't enable
IFCAP_VLAN_HWFILTER by default due to VLAN events not being passed
through by lagg(4). With iflib(9), IFCAP_VLAN_HWFILTER was turned on
by default but also lagg(4) was fixed in that regard in r203548. So
just remove the now redundant and defunct IFCAP_VLAN_HWFILTER handling
in {em,ixl,ixlv}_setup_interface().
- Nuke other redundant IFCAP_* setting in {em,ixl,ixlv}_setup_interface()
which is (more completely) already done in {em,ixl,ixlv}_if_attach_pre()
now.
- Remove some redundant/dead setting of scctx->isc_tx_csum_flags in
em_if_attach_pre().
- Remove some IFCAP_* duplicated either directly or indirectly (e. g.
via IFCAP_HWCSUM) in {EM,IGB,IXL}_CAPS.
- Don't bother to fiddle with IFCAP_HWSTATS in ixgbe(4)/ixgbev(4) as
iflib(9) adds that capability unconditionally.
- Remove some unused macros from em(4).
- Bump __FreeBSD_version as some of the above changes require the modules
of drivers using iflib(9) to be recompiled.
Okayed by: sbruno@ at 201806 DevSummit Transport Working Group [1]
Reviewed by: sbruno (earlier version), erj
PR: 219428 (part of; comment #10) [1], 220997 (part of; comment #3) [2]
Differential Revision: https://reviews.freebsd.org/D15720
2018-07-15 19:04:23 +00:00
|
|
|
|
2019-04-17 17:19:54 +00:00
|
|
|
ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac.octet);
|
Assorted TSO fixes for em(4)/iflib(9) and dead code removal:
- Ever since the workaround for the silicon bug of TSO4 causing MAC hangs
was committed in r295133, CSUM_TSO always got disabled unconditionally
by em(4) on the first invocation of em_init_locked(). However, even with
that problem fixed, it turned out that for at least e. g. 82579 not all
necessary TSO workarounds are in place, still causing MAC hangs even at
Gigabit speed. Thus, for stable/11, TSO usage was deliberately disabled
in r323292 (r323293 for stable/10) for the EM-class by default, allowing
users to turn it on if it happens to work with their particular EM MAC
in a Gigabit-only environment.
In head, the TSO workaround for speeds other than Gigabit was lost with
the conversion to iflib(9) in r311849 (possibly along with another one
or two TSO workarounds). Yet at the same time, for EM-class MACs TSO4
got enabled by default again, causing device hangs. Therefore, change the
default for this hardware class back to have TSO4 off, allowing users
to turn it on manually if it happens to work in their environment as
we do in stable/{10,11}. An alternative would be to add a whitelist of
EM-class devices where TSO4 actually is reliable with the workarounds in
place, but given that the advantage of TSO at Gigabit speed is rather
limited - especially with the overhead of these workarounds -, that's
really not worth it. [1]
This change includes the addition of an isc_capabilities to struct
if_softc_ctx so iflib(9) can also handle interface capabilities that
shouldn't be enabled by default which is used to handle the default-off
capabilities of e1000 as suggested by shurd@ and moving their handling
from em_setup_interface() to em_if_attach_pre() accordingly.
- Although the 82543 supports TSO4 in theory, the former lem(4) didn't have
support for TSO4, presumably because TSO4 is even more broken in the
LEM-class of MACs than the later EM ones. Still, TSO4 for LEM-class
devices was enabled as part of the conversion to iflib(9) in r311849,
causing device hangs. So revert back to the pre-r311849 behavior of
not supporting TSO4 for LEM-class at all, which includes not creating
a TSO DMA tag in iflib(9) for devices not having IFCAP_TSO4 set. [2]
- In fact, the FreeBSD TCP stack can handle a TSO size of IP_MAXPACKET
(65535) rather than FREEBSD_TSO_SIZE_MAX (65518). However, the TSO
DMA must have a maxsize of the maximum TSO size plus the size of a
VLAN header for software VLAN tagging. The iflib(9) converted em(4),
thus, first correctly sets scctx->isc_tx_tso_size_max to EM_TSO_SIZE
in em_if_attach_pre(), but later on overrides it with IP_MAXPACKET
in em_setup_interface() (apparently, left-over from pre-iflib(9)
times). So remove the latter and fix iflib(9) to correctly cap
the maximum TSO size reported to the stack at IP_MAXPACKET. While at
it, let iflib(9) use if_sethwtsomax*().
This change includes the addition of isc_tso_max{seg,}size DMA engine
constraints for the TSO DMA tag to struct if_shared_ctx and letting
iflib_txsd_alloc() automatically adjust the maxsize of that tag in case
IFCAP_VLAN_MTU is supported as requested by shurd@.
- Move the if_setifheaderlen(9) call for adjusting the maximum Ethernet
header length from {ixgbe,ixl,ixlv,ixv,em}_setup_interface() to iflib(9)
so adjustment is automatically done in case IFCAP_VLAN_MTU is supported.
As a consequence, this adjustment now is also done in case of bnxt(4)
which missed it previously.
- Move the reduction of the maximum TSO segment count reported to the
stack by the number of m_pullup(9) calls (which in the worst case,
can add another mbuf and, thus, the requirement for another DMA
segment each) in the transmit path for performance reasons from
em_setup_interface() to iflib_txsd_alloc() as these pull-ups are now
done in iflib_parse_header() rather than in the no longer existing
em_xmit(). Moreover, this optimization applies to all drivers using
iflib(9) and not just em(4); all in-tree iflib(9) consumers still
have enough room to handle full size TSO packets. Also, reduce the
adjustment to the maximum number of m_pullup(9)'s now performed in
iflib_parse_header().
- Prior to the conversion of em(4)/igb(4)/lem(4) and ixl(4) to iflib(9)
in r311849 and r335338 respectively, these drivers didn't enable
IFCAP_VLAN_HWFILTER by default due to VLAN events not being passed
through by lagg(4). With iflib(9), IFCAP_VLAN_HWFILTER was turned on
by default but also lagg(4) was fixed in that regard in r203548. So
just remove the now redundant and defunct IFCAP_VLAN_HWFILTER handling
in {em,ixl,ixlv}_setup_interface().
- Nuke other redundant IFCAP_* setting in {em,ixl,ixlv}_setup_interface()
which is (more completely) already done in {em,ixl,ixlv}_if_attach_pre()
now.
- Remove some redundant/dead setting of scctx->isc_tx_csum_flags in
em_if_attach_pre().
- Remove some IFCAP_* duplicated either directly or indirectly (e. g.
via IFCAP_HWCSUM) in {EM,IGB,IXL}_CAPS.
- Don't bother to fiddle with IFCAP_HWSTATS in ixgbe(4)/ixgbev(4) as
iflib(9) adds that capability unconditionally.
- Remove some unused macros from em(4).
- Bump __FreeBSD_version as some of the above changes require the modules
of drivers using iflib(9) to be recompiled.
Okayed by: sbruno@ at 201806 DevSummit Transport Working Group [1]
Reviewed by: sbruno (earlier version), erj
PR: 219428 (part of; comment #10) [1], 220997 (part of; comment #3) [2]
Differential Revision: https://reviews.freebsd.org/D15720
2018-07-15 19:04:23 +00:00
|
|
|
|
2017-09-16 02:41:38 +00:00
|
|
|
if ((err = IFDI_ATTACH_POST(ctx)) != 0) {
|
2016-05-18 04:35:58 +00:00
|
|
|
device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err);
|
|
|
|
goto fail_detach;
|
|
|
|
}
|
Assorted TSO fixes for em(4)/iflib(9) and dead code removal:
- Ever since the workaround for the silicon bug of TSO4 causing MAC hangs
was committed in r295133, CSUM_TSO always got disabled unconditionally
by em(4) on the first invocation of em_init_locked(). However, even with
that problem fixed, it turned out that for at least e. g. 82579 not all
necessary TSO workarounds are in place, still causing MAC hangs even at
Gigabit speed. Thus, for stable/11, TSO usage was deliberately disabled
in r323292 (r323293 for stable/10) for the EM-class by default, allowing
users to turn it on if it happens to work with their particular EM MAC
in a Gigabit-only environment.
In head, the TSO workaround for speeds other than Gigabit was lost with
the conversion to iflib(9) in r311849 (possibly along with another one
or two TSO workarounds). Yet at the same time, for EM-class MACs TSO4
got enabled by default again, causing device hangs. Therefore, change the
default for this hardware class back to have TSO4 off, allowing users
to turn it on manually if it happens to work in their environment as
we do in stable/{10,11}. An alternative would be to add a whitelist of
EM-class devices where TSO4 actually is reliable with the workarounds in
place, but given that the advantage of TSO at Gigabit speed is rather
limited - especially with the overhead of these workarounds -, that's
really not worth it. [1]
This change includes the addition of an isc_capabilities to struct
if_softc_ctx so iflib(9) can also handle interface capabilities that
shouldn't be enabled by default which is used to handle the default-off
capabilities of e1000 as suggested by shurd@ and moving their handling
from em_setup_interface() to em_if_attach_pre() accordingly.
- Although the 82543 supports TSO4 in theory, the former lem(4) didn't have
support for TSO4, presumably because TSO4 is even more broken in the
LEM-class of MACs than the later EM ones. Still, TSO4 for LEM-class
devices was enabled as part of the conversion to iflib(9) in r311849,
causing device hangs. So revert back to the pre-r311849 behavior of
not supporting TSO4 for LEM-class at all, which includes not creating
a TSO DMA tag in iflib(9) for devices not having IFCAP_TSO4 set. [2]
- In fact, the FreeBSD TCP stack can handle a TSO size of IP_MAXPACKET
(65535) rather than FREEBSD_TSO_SIZE_MAX (65518). However, the TSO
DMA must have a maxsize of the maximum TSO size plus the size of a
VLAN header for software VLAN tagging. The iflib(9) converted em(4),
thus, first correctly sets scctx->isc_tx_tso_size_max to EM_TSO_SIZE
in em_if_attach_pre(), but later on overrides it with IP_MAXPACKET
in em_setup_interface() (apparently, left-over from pre-iflib(9)
times). So remove the latter and fix iflib(9) to correctly cap
the maximum TSO size reported to the stack at IP_MAXPACKET. While at
it, let iflib(9) use if_sethwtsomax*().
This change includes the addition of isc_tso_max{seg,}size DMA engine
constraints for the TSO DMA tag to struct if_shared_ctx and letting
iflib_txsd_alloc() automatically adjust the maxsize of that tag in case
IFCAP_VLAN_MTU is supported as requested by shurd@.
- Move the if_setifheaderlen(9) call for adjusting the maximum Ethernet
header length from {ixgbe,ixl,ixlv,ixv,em}_setup_interface() to iflib(9)
so adjustment is automatically done in case IFCAP_VLAN_MTU is supported.
As a consequence, this adjustment now is also done in case of bnxt(4)
which missed it previously.
- Move the reduction of the maximum TSO segment count reported to the
stack by the number of m_pullup(9) calls (which in the worst case,
can add another mbuf and, thus, the requirement for another DMA
segment each) in the transmit path for performance reasons from
em_setup_interface() to iflib_txsd_alloc() as these pull-ups are now
done in iflib_parse_header() rather than in the no longer existing
em_xmit(). Moreover, this optimization applies to all drivers using
iflib(9) and not just em(4); all in-tree iflib(9) consumers still
have enough room to handle full size TSO packets. Also, reduce the
adjustment to the maximum number of m_pullup(9)'s now performed in
iflib_parse_header().
- Prior to the conversion of em(4)/igb(4)/lem(4) and ixl(4) to iflib(9)
in r311849 and r335338 respectively, these drivers didn't enable
IFCAP_VLAN_HWFILTER by default due to VLAN events not being passed
through by lagg(4). With iflib(9), IFCAP_VLAN_HWFILTER was turned on
by default but also lagg(4) was fixed in that regard in r203548. So
just remove the now redundant and defunct IFCAP_VLAN_HWFILTER handling
in {em,ixl,ixlv}_setup_interface().
- Nuke other redundant IFCAP_* setting in {em,ixl,ixlv}_setup_interface()
which is (more completely) already done in {em,ixl,ixlv}_if_attach_pre()
now.
- Remove some redundant/dead setting of scctx->isc_tx_csum_flags in
em_if_attach_pre().
- Remove some IFCAP_* duplicated either directly or indirectly (e. g.
via IFCAP_HWCSUM) in {EM,IGB,IXL}_CAPS.
- Don't bother to fiddle with IFCAP_HWSTATS in ixgbe(4)/ixgbev(4) as
iflib(9) adds that capability unconditionally.
- Remove some unused macros from em(4).
- Bump __FreeBSD_version as some of the above changes require the modules
of drivers using iflib(9) to be recompiled.
Okayed by: sbruno@ at 201806 DevSummit Transport Working Group [1]
Reviewed by: sbruno (earlier version), erj
PR: 219428 (part of; comment #10) [1], 220997 (part of; comment #3) [2]
Differential Revision: https://reviews.freebsd.org/D15720
2018-07-15 19:04:23 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Tell the upper layer(s) if IFCAP_VLAN_MTU is supported.
|
|
|
|
* This must appear after the call to ether_ifattach() because
|
|
|
|
* ether_ifattach() sets if_hdrlen to the default value.
|
|
|
|
*/
|
|
|
|
if (if_getcapabilities(ifp) & IFCAP_VLAN_MTU)
|
|
|
|
if_setifheaderlen(ifp, sizeof(struct ether_vlan_header));
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
if ((err = iflib_netmap_attach(ctx))) {
|
|
|
|
device_printf(ctx->ifc_dev, "netmap attach failed: %d\n", err);
|
|
|
|
goto fail_detach;
|
|
|
|
}
|
|
|
|
*ctxp = ctx;
|
|
|
|
|
2019-10-17 16:23:03 +00:00
|
|
|
DEBUGNET_SET(ctx->ifc_ifp, iflib);
|
2018-05-06 00:57:52 +00:00
|
|
|
|
2016-08-12 21:29:44 +00:00
|
|
|
if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter);
|
2016-05-18 04:35:58 +00:00
|
|
|
iflib_add_device_sysctl_post(ctx);
|
2019-04-24 13:32:04 +00:00
|
|
|
iflib_add_pfil(ctx);
|
2017-01-15 00:50:10 +00:00
|
|
|
ctx->ifc_flags |= IFC_INIT_DONE;
|
2018-05-03 17:02:31 +00:00
|
|
|
CTX_UNLOCK(ctx);
|
o Use iflib_fast_intr_rxtx() also for "legacy" interrupts, i. e. INTx and
MSI. Unlike iflib_fast_intr_ctx(), the former will also enqueue
_task_fn_tx() in addition to _task_fn_rx() if appropriate, bringing TCP
TX throughput of EM-class devices on par with the MSI-X case and, thus,
close to wirespeed/pre-iflib(4) times again. [1]
Note that independently of the interrupt type, the UDP performance with
these MACs still is abysmal and nowhere near to where it was before the
conversion of em(4) to iflib(4).
o In iflib_init_locked(), announce which free list failed to set up.
o In _task_fn_tx() when running netmap(4), issue ifdi_intr_enable instead
of the ifdi_tx_queue_intr_enable method in case of a "legacy" interrupt
as the latter is valid with MSI-X only.
o Instead of adding the missing - and apparently convoluted enough that a
DBG_COUNTER_INC was put into a wrong spot in _task_fn_rx() - checks for
ifdi_{r,t}x_queue_intr_enable being available in the MSI-X case also to
iflib_fast_intr_rxtx(), factor these out to iflib_device_register() and
make the checks fail gracefully rather than panic. This avoids invoking
the checks at runtime over and over again in iflib_fast_intr_rxtx() and
_task_fn_{r,t}x() - even if it's just in case of INVARIANTS - and makes
these functions more readable.
o In iflib_rx_structures_setup(), only initialize LRO resources if device
and driver have LRO capability in order to not waste memory. Also, free
the LRO resources again if setting them up fails for one of the queues.
However, don't bother invoking iflib_rx_sds_free() in that case because
iflib_rx_structures_setup() doesn't call iflib_rxsd_alloc() either (and
iflib_{device,pseudo}_register() will issue iflib_rx_sds_free() in case
of failure via iflib_rx_structures_free(), but there definitely is some
asymmetry left to be fixed, though).
o Similarly, free LRO resources again in iflib_rx_structures_free().
o In iflib_irq_set_affinity(), handle get_core_offset() errors gracefully
instead of panicking (but only in case of INVARIANTS). This is a follow-
up to r344132, as such driver bugs shouldn't be fatal.
o Likewise, handle unknown iflib_intr_type_t in iflib_irq_alloc_generic()
gracefully, too.
o Bring yet more sanity to iflib_msix_init():
- If the device doesn't provide enough MSI-X vectors or not all vectors
can be allocated so the expected number of queues in addition to admin
interrupts can't be supported, try MSI next (and then INTx) as proper
MSI-X vector distribution can't be assured in such cases. In essence,
this change brings r254008 forward to iflib(4). Also, this is the fix
alluded to in the commit message of r343934.
- If the MSI-X allocation has failed, don't prematurely announce MSI is
going to be used as the latter in fact may not be available either.
- When falling back to MSI, only release the MSI-X table resource again
if it was allocated in iflib_msix_init(), i. e. isn't supplied by the
driver, in the first place.
o In mp_ndesc_handler(), handle unknown type arguments gracefully, too.
PR: 235031 (likely) [1]
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D20175
2019-05-07 08:28:35 +00:00
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
return (0);
|
2018-10-12 22:40:54 +00:00
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
fail_detach:
|
|
|
|
ether_ifdetach(ctx->ifc_ifp);
|
|
|
|
fail_queues:
|
2021-02-01 16:13:00 +00:00
|
|
|
iflib_tqg_detach(ctx);
|
2018-05-08 16:56:02 +00:00
|
|
|
iflib_tx_structures_free(ctx);
|
|
|
|
iflib_rx_structures_free(ctx);
|
2016-05-18 04:35:58 +00:00
|
|
|
IFDI_DETACH(ctx);
|
2021-02-01 16:13:00 +00:00
|
|
|
IFDI_QUEUES_FREE(ctx);
|
|
|
|
fail_intr_free:
|
|
|
|
iflib_free_intr_mem(ctx);
|
2019-01-22 00:56:44 +00:00
|
|
|
fail_unlock:
|
2018-05-03 17:02:31 +00:00
|
|
|
CTX_UNLOCK(ctx);
|
2019-08-16 23:33:44 +00:00
|
|
|
iflib_deregister(ctx);
|
2019-01-22 00:56:44 +00:00
|
|
|
fail_ctx_free:
|
2019-07-24 21:43:41 +00:00
|
|
|
device_set_softc(ctx->ifc_dev, NULL);
|
2019-01-22 00:56:44 +00:00
|
|
|
if (ctx->ifc_flags & IFC_SC_ALLOCATED)
|
|
|
|
free(ctx->ifc_softc, M_IFLIB);
|
|
|
|
free(ctx, M_IFLIB);
|
2016-05-18 04:35:58 +00:00
|
|
|
return (err);
|
|
|
|
}
|
|
|
|
|
2018-05-11 20:08:28 +00:00
|
|
|
int
|
|
|
|
iflib_pseudo_register(device_t dev, if_shared_ctx_t sctx, if_ctx_t *ctxp,
|
|
|
|
struct iflib_cloneattach_ctx *clctx)
|
|
|
|
{
|
2020-07-20 21:08:56 +00:00
|
|
|
int num_txd, num_rxd;
|
2018-05-11 20:08:28 +00:00
|
|
|
int err;
|
|
|
|
if_ctx_t ctx;
|
|
|
|
if_t ifp;
|
|
|
|
if_softc_ctx_t scctx;
|
|
|
|
int i;
|
|
|
|
void *sc;
|
|
|
|
|
|
|
|
ctx = malloc(sizeof(*ctx), M_IFLIB, M_WAITOK|M_ZERO);
|
|
|
|
sc = malloc(sctx->isc_driver->size, M_IFLIB, M_WAITOK|M_ZERO);
|
|
|
|
ctx->ifc_flags |= IFC_SC_ALLOCATED;
|
|
|
|
if (sctx->isc_flags & (IFLIB_PSEUDO|IFLIB_VIRTUAL))
|
|
|
|
ctx->ifc_flags |= IFC_PSEUDO;
|
|
|
|
|
|
|
|
ctx->ifc_sctx = sctx;
|
|
|
|
ctx->ifc_softc = sc;
|
|
|
|
ctx->ifc_dev = dev;
|
|
|
|
|
|
|
|
if ((err = iflib_register(ctx)) != 0) {
|
|
|
|
device_printf(dev, "%s: iflib_register failed %d\n", __func__, err);
|
2019-01-22 00:56:44 +00:00
|
|
|
goto fail_ctx_free;
|
2018-05-11 20:08:28 +00:00
|
|
|
}
|
|
|
|
iflib_add_device_sysctl_pre(ctx);
|
|
|
|
|
|
|
|
scctx = &ctx->ifc_softc_ctx;
|
|
|
|
ifp = ctx->ifc_ifp;
|
|
|
|
|
|
|
|
iflib_reset_qvalues(ctx);
|
2019-03-28 20:43:47 +00:00
|
|
|
CTX_LOCK(ctx);
|
2018-05-11 20:08:28 +00:00
|
|
|
if ((err = IFDI_ATTACH_PRE(ctx)) != 0) {
|
|
|
|
device_printf(dev, "IFDI_ATTACH_PRE failed %d\n", err);
|
2019-03-28 20:43:47 +00:00
|
|
|
goto fail_unlock;
|
2018-05-11 20:08:28 +00:00
|
|
|
}
|
|
|
|
if (sctx->isc_flags & IFLIB_GEN_MAC)
|
2019-04-17 17:19:54 +00:00
|
|
|
ether_gen_addr(ifp, &ctx->ifc_mac);
|
2018-05-11 20:08:28 +00:00
|
|
|
if ((err = IFDI_CLONEATTACH(ctx, clctx->cc_ifc, clctx->cc_name,
|
|
|
|
clctx->cc_params)) != 0) {
|
|
|
|
device_printf(dev, "IFDI_CLONEATTACH failed %d\n", err);
|
2020-06-21 22:02:49 +00:00
|
|
|
goto fail_unlock;
|
2018-05-11 20:08:28 +00:00
|
|
|
}
|
|
|
|
#ifdef INVARIANTS
|
Assorted TSO fixes for em(4)/iflib(9) and dead code removal:
- Ever since the workaround for the silicon bug of TSO4 causing MAC hangs
was committed in r295133, CSUM_TSO always got disabled unconditionally
by em(4) on the first invocation of em_init_locked(). However, even with
that problem fixed, it turned out that for at least e. g. 82579 not all
necessary TSO workarounds are in place, still causing MAC hangs even at
Gigabit speed. Thus, for stable/11, TSO usage was deliberately disabled
in r323292 (r323293 for stable/10) for the EM-class by default, allowing
users to turn it on if it happens to work with their particular EM MAC
in a Gigabit-only environment.
In head, the TSO workaround for speeds other than Gigabit was lost with
the conversion to iflib(9) in r311849 (possibly along with another one
or two TSO workarounds). Yet at the same time, for EM-class MACs TSO4
got enabled by default again, causing device hangs. Therefore, change the
default for this hardware class back to have TSO4 off, allowing users
to turn it on manually if it happens to work in their environment as
we do in stable/{10,11}. An alternative would be to add a whitelist of
EM-class devices where TSO4 actually is reliable with the workarounds in
place, but given that the advantage of TSO at Gigabit speed is rather
limited - especially with the overhead of these workarounds -, that's
really not worth it. [1]
This change includes the addition of an isc_capabilities to struct
if_softc_ctx so iflib(9) can also handle interface capabilities that
shouldn't be enabled by default which is used to handle the default-off
capabilities of e1000 as suggested by shurd@ and moving their handling
from em_setup_interface() to em_if_attach_pre() accordingly.
- Although the 82543 supports TSO4 in theory, the former lem(4) didn't have
support for TSO4, presumably because TSO4 is even more broken in the
LEM-class of MACs than the later EM ones. Still, TSO4 for LEM-class
devices was enabled as part of the conversion to iflib(9) in r311849,
causing device hangs. So revert back to the pre-r311849 behavior of
not supporting TSO4 for LEM-class at all, which includes not creating
a TSO DMA tag in iflib(9) for devices not having IFCAP_TSO4 set. [2]
- In fact, the FreeBSD TCP stack can handle a TSO size of IP_MAXPACKET
(65535) rather than FREEBSD_TSO_SIZE_MAX (65518). However, the TSO
DMA must have a maxsize of the maximum TSO size plus the size of a
VLAN header for software VLAN tagging. The iflib(9) converted em(4),
thus, first correctly sets scctx->isc_tx_tso_size_max to EM_TSO_SIZE
in em_if_attach_pre(), but later on overrides it with IP_MAXPACKET
in em_setup_interface() (apparently, left-over from pre-iflib(9)
times). So remove the latter and fix iflib(9) to correctly cap
the maximum TSO size reported to the stack at IP_MAXPACKET. While at
it, let iflib(9) use if_sethwtsomax*().
This change includes the addition of isc_tso_max{seg,}size DMA engine
constraints for the TSO DMA tag to struct if_shared_ctx and letting
iflib_txsd_alloc() automatically adjust the maxsize of that tag in case
IFCAP_VLAN_MTU is supported as requested by shurd@.
- Move the if_setifheaderlen(9) call for adjusting the maximum Ethernet
header length from {ixgbe,ixl,ixlv,ixv,em}_setup_interface() to iflib(9)
so adjustment is automatically done in case IFCAP_VLAN_MTU is supported.
As a consequence, this adjustment now is also done in case of bnxt(4)
which missed it previously.
- Move the reduction of the maximum TSO segment count reported to the
stack by the number of m_pullup(9) calls (which in the worst case,
can add another mbuf and, thus, the requirement for another DMA
segment each) in the transmit path for performance reasons from
em_setup_interface() to iflib_txsd_alloc() as these pull-ups are now
done in iflib_parse_header() rather than in the no longer existing
em_xmit(). Moreover, this optimization applies to all drivers using
iflib(9) and not just em(4); all in-tree iflib(9) consumers still
have enough room to handle full size TSO packets. Also, reduce the
adjustment to the maximum number of m_pullup(9)'s now performed in
iflib_parse_header().
- Prior to the conversion of em(4)/igb(4)/lem(4) and ixl(4) to iflib(9)
in r311849 and r335338 respectively, these drivers didn't enable
IFCAP_VLAN_HWFILTER by default due to VLAN events not being passed
through by lagg(4). With iflib(9), IFCAP_VLAN_HWFILTER was turned on
by default but also lagg(4) was fixed in that regard in r203548. So
just remove the now redundant and defunct IFCAP_VLAN_HWFILTER handling
in {em,ixl,ixlv}_setup_interface().
- Nuke other redundant IFCAP_* setting in {em,ixl,ixlv}_setup_interface()
which is (more completely) already done in {em,ixl,ixlv}_if_attach_pre()
now.
- Remove some redundant/dead setting of scctx->isc_tx_csum_flags in
em_if_attach_pre().
- Remove some IFCAP_* duplicated either directly or indirectly (e. g.
via IFCAP_HWCSUM) in {EM,IGB,IXL}_CAPS.
- Don't bother to fiddle with IFCAP_HWSTATS in ixgbe(4)/ixgbev(4) as
iflib(9) adds that capability unconditionally.
- Remove some unused macros from em(4).
- Bump __FreeBSD_version as some of the above changes require the modules
of drivers using iflib(9) to be recompiled.
Okayed by: sbruno@ at 201806 DevSummit Transport Working Group [1]
Reviewed by: sbruno (earlier version), erj
PR: 219428 (part of; comment #10) [1], 220997 (part of; comment #3) [2]
Differential Revision: https://reviews.freebsd.org/D15720
2018-07-15 19:04:23 +00:00
|
|
|
if (scctx->isc_capabilities & IFCAP_TXCSUM)
|
2018-05-11 20:08:28 +00:00
|
|
|
MPASS(scctx->isc_tx_csum_flags);
|
|
|
|
#endif
|
|
|
|
|
Assorted TSO fixes for em(4)/iflib(9) and dead code removal:
- Ever since the workaround for the silicon bug of TSO4 causing MAC hangs
was committed in r295133, CSUM_TSO always got disabled unconditionally
by em(4) on the first invocation of em_init_locked(). However, even with
that problem fixed, it turned out that for at least e. g. 82579 not all
necessary TSO workarounds are in place, still causing MAC hangs even at
Gigabit speed. Thus, for stable/11, TSO usage was deliberately disabled
in r323292 (r323293 for stable/10) for the EM-class by default, allowing
users to turn it on if it happens to work with their particular EM MAC
in a Gigabit-only environment.
In head, the TSO workaround for speeds other than Gigabit was lost with
the conversion to iflib(9) in r311849 (possibly along with another one
or two TSO workarounds). Yet at the same time, for EM-class MACs TSO4
got enabled by default again, causing device hangs. Therefore, change the
default for this hardware class back to have TSO4 off, allowing users
to turn it on manually if it happens to work in their environment as
we do in stable/{10,11}. An alternative would be to add a whitelist of
EM-class devices where TSO4 actually is reliable with the workarounds in
place, but given that the advantage of TSO at Gigabit speed is rather
limited - especially with the overhead of these workarounds -, that's
really not worth it. [1]
This change includes the addition of an isc_capabilities to struct
if_softc_ctx so iflib(9) can also handle interface capabilities that
shouldn't be enabled by default which is used to handle the default-off
capabilities of e1000 as suggested by shurd@ and moving their handling
from em_setup_interface() to em_if_attach_pre() accordingly.
- Although 82543 supports TSO4 in theory, the former lem(4) didn't have
support for TSO4, presumably because TSO4 is even more broken in the
LEM-class of MACs than the later EM ones. Still, TSO4 for LEM-class
devices was enabled as part of the conversion to iflib(9) in r311849,
causing device hangs. So revert back to the pre-r311849 behavior of
not supporting TSO4 for LEM-class at all, which includes not creating
a TSO DMA tag in iflib(9) for devices not having IFCAP_TSO4 set. [2]
- In fact, the FreeBSD TCP stack can handle a TSO size of IP_MAXPACKET
(65535) rather than FREEBSD_TSO_SIZE_MAX (65518). However, the TSO
DMA must have a maxsize of the maximum TSO size plus the size of a
VLAN header for software VLAN tagging. The iflib(9) converted em(4),
thus, first correctly sets scctx->isc_tx_tso_size_max to EM_TSO_SIZE
in em_if_attach_pre(), but later on overrides it with IP_MAXPACKET
in em_setup_interface() (apparently, left-over from pre-iflib(9)
times). So remove the latter and correct iflib(9) to correctly cap
the maximum TSO size reported to the stack at IP_MAXPACKET. While at
it, let iflib(9) use if_sethwtsomax*().
This change includes the addition of isc_tso_max{seg,}size DMA engine
constraints for the TSO DMA tag to struct if_shared_ctx and letting
iflib_txsd_alloc() automatically adjust the maxsize of that tag in case
IFCAP_VLAN_MTU is supported as requested by shurd@.
- Move the if_setifheaderlen(9) call for adjusting the maximum Ethernet
header length from {ixgbe,ixl,ixlv,ixv,em}_setup_interface() to iflib(9)
so adjustment is automatically done in case IFCAP_VLAN_MTU is supported.
As a consequence, this adjustment now is also done in case of bnxt(4)
which missed it previously.
- Move the reduction of the maximum TSO segment count reported to the
stack by the number of m_pullup(9) calls (which in the worst case,
can add another mbuf and, thus, the requirement for another DMA
segment each) in the transmit path for performance reasons from
em_setup_interface() to iflib_txsd_alloc() as these pull-ups are now
done in iflib_parse_header() rather than in the no longer existing
em_xmit(). Moreover, this optimization applies to all drivers using
iflib(9) and not just em(4); all in-tree iflib(9) consumers still
have enough room to handle full size TSO packets. Also, reduce the
adjustment to the maximum number of m_pullup(9)'s now performed in
iflib_parse_header().
- Prior to the conversion of em(4)/igb(4)/lem(4) and ixl(4) to iflib(9)
in r311849 and r335338 respectively, these drivers didn't enable
IFCAP_VLAN_HWFILTER by default due to VLAN events not being passed
through by lagg(4). With iflib(9), IFCAP_VLAN_HWFILTER was turned on
by default but also lagg(4) was fixed in that regard in r203548. So
just remove the now redundant and defunct IFCAP_VLAN_HWFILTER handling
in {em,ixl,ixlv}_setup_interface().
- Nuke other redundant IFCAP_* setting in {em,ixl,ixlv}_setup_interface()
which is (more completely) already done in {em,ixl,ixlv}_if_attach_pre()
now.
- Remove some redundant/dead setting of scctx->isc_tx_csum_flags in
em_if_attach_pre().
- Remove some IFCAP_* duplicated either directly or indirectly (e. g.
via IFCAP_HWCSUM) in {EM,IGB,IXL}_CAPS.
- Don't bother to fiddle with IFCAP_HWSTATS in ixgbe(4)/ixgbev(4) as
iflib(9) adds that capability unconditionally.
- Remove some unused macros from em(4).
- Bump __FreeBSD_version as some of the above changes require the modules
of drivers using iflib(9) to be recompiled.
Okayed by: sbruno@ at 201806 DevSummit Transport Working Group [1]
Reviewed by: sbruno (earlier version), erj
PR: 219428 (part of; comment #10) [1], 220997 (part of; comment #3) [2]
Differential Revision: https://reviews.freebsd.org/D15720
2018-07-15 19:04:23 +00:00
|
|
|
if_setcapabilities(ifp, scctx->isc_capabilities | IFCAP_HWSTATS | IFCAP_LINKSTATE);
|
2018-05-11 20:08:28 +00:00
|
|
|
if_setcapenable(ifp, scctx->isc_capenable | IFCAP_HWSTATS | IFCAP_LINKSTATE);
|
|
|
|
|
|
|
|
ifp->if_flags |= IFF_NOGROUP;
|
|
|
|
if (sctx->isc_flags & IFLIB_PSEUDO) {
|
2020-06-21 22:02:49 +00:00
|
|
|
ifmedia_add(ctx->ifc_mediap, IFM_ETHER | IFM_AUTO, 0, NULL);
|
|
|
|
ifmedia_set(ctx->ifc_mediap, IFM_ETHER | IFM_AUTO);
|
|
|
|
if (sctx->isc_flags & IFLIB_PSEUDO_ETHER) {
|
|
|
|
ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac.octet);
|
|
|
|
} else {
|
|
|
|
if_attach(ctx->ifc_ifp);
|
|
|
|
bpfattach(ctx->ifc_ifp, DLT_NULL, sizeof(u_int32_t));
|
|
|
|
}
|
2018-05-11 20:08:28 +00:00
|
|
|
|
|
|
|
if ((err = IFDI_ATTACH_POST(ctx)) != 0) {
|
|
|
|
device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err);
|
|
|
|
goto fail_detach;
|
|
|
|
}
|
|
|
|
*ctxp = ctx;
|
|
|
|
|
Assorted TSO fixes for em(4)/iflib(9) and dead code removal:
- Ever since the workaround for the silicon bug of TSO4 causing MAC hangs
was committed in r295133, CSUM_TSO always got disabled unconditionally
by em(4) on the first invocation of em_init_locked(). However, even with
that problem fixed, it turned out that for at least e. g. 82579 not all
necessary TSO workarounds are in place, still causing MAC hangs even at
Gigabit speed. Thus, for stable/11, TSO usage was deliberately disabled
in r323292 (r323293 for stable/10) for the EM-class by default, allowing
users to turn it on if it happens to work with their particular EM MAC
in a Gigabit-only environment.
In head, the TSO workaround for speeds other than Gigabit was lost with
the conversion to iflib(9) in r311849 (possibly along with another one
or two TSO workarounds). Yet at the same time, for EM-class MACs TSO4
got enabled by default again, causing device hangs. Therefore, change the
default for this hardware class back to have TSO4 off, allowing users
to turn it on manually if it happens to work in their environment as
we do in stable/{10,11}. An alternative would be to add a whitelist of
EM-class devices where TSO4 actually is reliable with the workarounds in
place, but given that the advantage of TSO at Gigabit speed is rather
limited - especially with the overhead of these workarounds -, that's
really not worth it. [1]
This change includes the addition of an isc_capabilities to struct
if_softc_ctx so iflib(9) can also handle interface capabilities that
shouldn't be enabled by default which is used to handle the default-off
capabilities of e1000 as suggested by shurd@ and moving their handling
from em_setup_interface() to em_if_attach_pre() accordingly.
- Although 82543 supports TSO4 in theory, the former lem(4) didn't have
support for TSO4, presumably because TSO4 is even more broken in the
LEM-class of MACs than the later EM ones. Still, TSO4 for LEM-class
devices was enabled as part of the conversion to iflib(9) in r311849,
causing device hangs. So revert back to the pre-r311849 behavior of
not supporting TSO4 for LEM-class at all, which includes not creating
a TSO DMA tag in iflib(9) for devices not having IFCAP_TSO4 set. [2]
- In fact, the FreeBSD TCP stack can handle a TSO size of IP_MAXPACKET
(65535) rather than FREEBSD_TSO_SIZE_MAX (65518). However, the TSO
DMA must have a maxsize of the maximum TSO size plus the size of a
VLAN header for software VLAN tagging. The iflib(9) converted em(4),
thus, first correctly sets scctx->isc_tx_tso_size_max to EM_TSO_SIZE
in em_if_attach_pre(), but later on overrides it with IP_MAXPACKET
in em_setup_interface() (apparently, left-over from pre-iflib(9)
times). So remove the latter and correct iflib(9) to correctly cap
the maximum TSO size reported to the stack at IP_MAXPACKET. While at
it, let iflib(9) use if_sethwtsomax*().
This change includes the addition of isc_tso_max{seg,}size DMA engine
constraints for the TSO DMA tag to struct if_shared_ctx and letting
iflib_txsd_alloc() automatically adjust the maxsize of that tag in case
IFCAP_VLAN_MTU is supported as requested by shurd@.
- Move the if_setifheaderlen(9) call for adjusting the maximum Ethernet
header length from {ixgbe,ixl,ixlv,ixv,em}_setup_interface() to iflib(9)
so adjustment is automatically done in case IFCAP_VLAN_MTU is supported.
As a consequence, this adjustment now is also done in case of bnxt(4)
which missed it previously.
- Move the reduction of the maximum TSO segment count reported to the
stack by the number of m_pullup(9) calls (which in the worst case,
can add another mbuf and, thus, the requirement for another DMA
segment each) in the transmit path for performance reasons from
em_setup_interface() to iflib_txsd_alloc() as these pull-ups are now
done in iflib_parse_header() rather than in the no longer existing
em_xmit(). Moreover, this optimization applies to all drivers using
iflib(9) and not just em(4); all in-tree iflib(9) consumers still
have enough room to handle full size TSO packets. Also, reduce the
adjustment to the maximum number of m_pullup(9)'s now performed in
iflib_parse_header().
- Prior to the conversion of em(4)/igb(4)/lem(4) and ixl(4) to iflib(9)
in r311849 and r335338 respectively, these drivers didn't enable
IFCAP_VLAN_HWFILTER by default due to VLAN events not being passed
through by lagg(4). With iflib(9), IFCAP_VLAN_HWFILTER was turned on
by default but also lagg(4) was fixed in that regard in r203548. So
just remove the now redundant and defunct IFCAP_VLAN_HWFILTER handling
in {em,ixl,ixlv}_setup_interface().
- Nuke other redundant IFCAP_* setting in {em,ixl,ixlv}_setup_interface()
which is (more completely) already done in {em,ixl,ixlv}_if_attach_pre()
now.
- Remove some redundant/dead setting of scctx->isc_tx_csum_flags in
em_if_attach_pre().
- Remove some IFCAP_* duplicated either directly or indirectly (e. g.
via IFCAP_HWCSUM) in {EM,IGB,IXL}_CAPS.
- Don't bother to fiddle with IFCAP_HWSTATS in ixgbe(4)/ixgbev(4) as
iflib(9) adds that capability unconditionally.
- Remove some unused macros from em(4).
- Bump __FreeBSD_version as some of the above changes require the modules
of drivers using iflib(9) to be recompiled.
Okayed by: sbruno@ at 201806 DevSummit Transport Working Group [1]
Reviewed by: sbruno (earlier version), erj
PR: 219428 (part of; comment #10) [1], 220997 (part of; comment #3) [2]
Differential Revision: https://reviews.freebsd.org/D15720
2018-07-15 19:04:23 +00:00
|
|
|
/*
|
|
|
|
* Tell the upper layer(s) if IFCAP_VLAN_MTU is supported.
|
|
|
|
* This must appear after the call to ether_ifattach() because
|
|
|
|
* ether_ifattach() sets if_hdrlen to the default value.
|
|
|
|
*/
|
|
|
|
if (if_getcapabilities(ifp) & IFCAP_VLAN_MTU)
|
|
|
|
if_setifheaderlen(ifp,
|
|
|
|
sizeof(struct ether_vlan_header));
|
|
|
|
|
2018-05-11 20:08:28 +00:00
|
|
|
if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter);
|
|
|
|
iflib_add_device_sysctl_post(ctx);
|
|
|
|
ctx->ifc_flags |= IFC_INIT_DONE;
|
2020-05-31 18:42:00 +00:00
|
|
|
CTX_UNLOCK(ctx);
|
2018-05-11 20:08:28 +00:00
|
|
|
return (0);
|
|
|
|
}
|
2020-06-21 22:02:49 +00:00
|
|
|
ifmedia_add(ctx->ifc_mediap, IFM_ETHER | IFM_1000_T | IFM_FDX, 0, NULL);
|
|
|
|
ifmedia_add(ctx->ifc_mediap, IFM_ETHER | IFM_AUTO, 0, NULL);
|
|
|
|
ifmedia_set(ctx->ifc_mediap, IFM_ETHER | IFM_AUTO);
|
|
|
|
|
2018-05-11 20:08:28 +00:00
|
|
|
_iflib_pre_assert(scctx);
|
|
|
|
ctx->ifc_txrx = *scctx->isc_txrx;
|
|
|
|
|
|
|
|
if (scctx->isc_ntxqsets == 0 || (scctx->isc_ntxqsets_max && scctx->isc_ntxqsets_max < scctx->isc_ntxqsets))
|
|
|
|
scctx->isc_ntxqsets = scctx->isc_ntxqsets_max;
|
|
|
|
if (scctx->isc_nrxqsets == 0 || (scctx->isc_nrxqsets_max && scctx->isc_nrxqsets_max < scctx->isc_nrxqsets))
|
|
|
|
scctx->isc_nrxqsets = scctx->isc_nrxqsets_max;
|
|
|
|
|
2020-07-20 21:08:56 +00:00
|
|
|
num_txd = iflib_num_tx_descs(ctx);
|
|
|
|
num_rxd = iflib_num_rx_descs(ctx);
|
2018-05-11 20:08:28 +00:00
|
|
|
|
|
|
|
/* XXX change for per-queue sizes */
|
2019-05-06 20:56:41 +00:00
|
|
|
device_printf(dev, "Using %d TX descriptors and %d RX descriptors\n",
|
2020-07-20 21:08:56 +00:00
|
|
|
num_txd, num_rxd);
|
2018-05-11 20:08:28 +00:00
|
|
|
|
2020-07-20 21:08:56 +00:00
|
|
|
if (scctx->isc_tx_nsegments > num_txd / MAX_SINGLE_PACKET_FRACTION)
|
|
|
|
scctx->isc_tx_nsegments = max(1, num_txd /
|
2018-05-11 20:08:28 +00:00
|
|
|
MAX_SINGLE_PACKET_FRACTION);
|
2020-07-20 21:08:56 +00:00
|
|
|
if (scctx->isc_tx_tso_segments_max > num_txd /
|
2018-05-11 20:08:28 +00:00
|
|
|
MAX_SINGLE_PACKET_FRACTION)
|
|
|
|
scctx->isc_tx_tso_segments_max = max(1,
|
2020-07-20 21:08:56 +00:00
|
|
|
num_txd / MAX_SINGLE_PACKET_FRACTION);
|
2018-05-11 20:08:28 +00:00
|
|
|
|
|
|
|
/* TSO parameters - dig these out of the data sheet - simply correspond to tag setup */
|
Assorted TSO fixes for em(4)/iflib(9) and dead code removal:
- Ever since the workaround for the silicon bug of TSO4 causing MAC hangs
was committed in r295133, CSUM_TSO always got disabled unconditionally
by em(4) on the first invocation of em_init_locked(). However, even with
that problem fixed, it turned out that for at least e. g. 82579 not all
necessary TSO workarounds are in place, still causing MAC hangs even at
Gigabit speed. Thus, for stable/11, TSO usage was deliberately disabled
in r323292 (r323293 for stable/10) for the EM-class by default, allowing
users to turn it on if it happens to work with their particular EM MAC
in a Gigabit-only environment.
In head, the TSO workaround for speeds other than Gigabit was lost with
the conversion to iflib(9) in r311849 (possibly along with another one
or two TSO workarounds). Yet at the same time, for EM-class MACs TSO4
got enabled by default again, causing device hangs. Therefore, change the
default for this hardware class back to have TSO4 off, allowing users
to turn it on manually if it happens to work in their environment as
we do in stable/{10,11}. An alternative would be to add a whitelist of
EM-class devices where TSO4 actually is reliable with the workarounds in
place, but given that the advantage of TSO at Gigabit speed is rather
limited - especially with the overhead of these workarounds -, that's
really not worth it. [1]
This change includes the addition of an isc_capabilities to struct
if_softc_ctx so iflib(9) can also handle interface capabilities that
shouldn't be enabled by default which is used to handle the default-off
capabilities of e1000 as suggested by shurd@ and moving their handling
from em_setup_interface() to em_if_attach_pre() accordingly.
- Although 82543 supports TSO4 in theory, the former lem(4) didn't have
support for TSO4, presumably because TSO4 is even more broken in the
LEM-class of MACs than the later EM ones. Still, TSO4 for LEM-class
devices was enabled as part of the conversion to iflib(9) in r311849,
causing device hangs. So revert back to the pre-r311849 behavior of
not supporting TSO4 for LEM-class at all, which includes not creating
a TSO DMA tag in iflib(9) for devices not having IFCAP_TSO4 set. [2]
- In fact, the FreeBSD TCP stack can handle a TSO size of IP_MAXPACKET
(65535) rather than FREEBSD_TSO_SIZE_MAX (65518). However, the TSO
DMA must have a maxsize of the maximum TSO size plus the size of a
VLAN header for software VLAN tagging. The iflib(9) converted em(4),
thus, first correctly sets scctx->isc_tx_tso_size_max to EM_TSO_SIZE
in em_if_attach_pre(), but later on overrides it with IP_MAXPACKET
in em_setup_interface() (apparently, left-over from pre-iflib(9)
times). So remove the latter and correct iflib(9) to correctly cap
the maximum TSO size reported to the stack at IP_MAXPACKET. While at
it, let iflib(9) use if_sethwtsomax*().
This change includes the addition of isc_tso_max{seg,}size DMA engine
constraints for the TSO DMA tag to struct if_shared_ctx and letting
iflib_txsd_alloc() automatically adjust the maxsize of that tag in case
IFCAP_VLAN_MTU is supported as requested by shurd@.
- Move the if_setifheaderlen(9) call for adjusting the maximum Ethernet
header length from {ixgbe,ixl,ixlv,ixv,em}_setup_interface() to iflib(9)
so adjustment is automatically done in case IFCAP_VLAN_MTU is supported.
As a consequence, this adjustment now is also done in case of bnxt(4)
which missed it previously.
- Move the reduction of the maximum TSO segment count reported to the
stack by the number of m_pullup(9) calls (which in the worst case,
can add another mbuf and, thus, the requirement for another DMA
segment each) in the transmit path for performance reasons from
em_setup_interface() to iflib_txsd_alloc() as these pull-ups are now
done in iflib_parse_header() rather than in the no longer existing
em_xmit(). Moreover, this optimization applies to all drivers using
iflib(9) and not just em(4); all in-tree iflib(9) consumers still
have enough room to handle full size TSO packets. Also, reduce the
adjustment to the maximum number of m_pullup(9)'s now performed in
iflib_parse_header().
- Prior to the conversion of em(4)/igb(4)/lem(4) and ixl(4) to iflib(9)
in r311849 and r335338 respectively, these drivers didn't enable
IFCAP_VLAN_HWFILTER by default due to VLAN events not being passed
through by lagg(4). With iflib(9), IFCAP_VLAN_HWFILTER was turned on
by default but also lagg(4) was fixed in that regard in r203548. So
just remove the now redundant and defunct IFCAP_VLAN_HWFILTER handling
in {em,ixl,ixlv}_setup_interface().
- Nuke other redundant IFCAP_* setting in {em,ixl,ixlv}_setup_interface()
which is (more completely) already done in {em,ixl,ixlv}_if_attach_pre()
now.
- Remove some redundant/dead setting of scctx->isc_tx_csum_flags in
em_if_attach_pre().
- Remove some IFCAP_* duplicated either directly or indirectly (e. g.
via IFCAP_HWCSUM) in {EM,IGB,IXL}_CAPS.
- Don't bother to fiddle with IFCAP_HWSTATS in ixgbe(4)/ixgbev(4) as
iflib(9) adds that capability unconditionally.
- Remove some unused macros from em(4).
- Bump __FreeBSD_version as some of the above changes require the modules
of drivers using iflib(9) to be recompiled.
Okayed by: sbruno@ at 201806 DevSummit Transport Working Group [1]
Reviewed by: sbruno (earlier version), erj
PR: 219428 (part of; comment #10) [1], 220997 (part of; comment #3) [2]
Differential Revision: https://reviews.freebsd.org/D15720
2018-07-15 19:04:23 +00:00
|
|
|
if (if_getcapabilities(ifp) & IFCAP_TSO) {
|
|
|
|
/*
|
|
|
|
* The stack can't handle a TSO size larger than IP_MAXPACKET,
|
|
|
|
* but some MACs do.
|
|
|
|
*/
|
|
|
|
if_sethwtsomax(ifp, min(scctx->isc_tx_tso_size_max,
|
|
|
|
IP_MAXPACKET));
|
|
|
|
/*
|
|
|
|
* Take maximum number of m_pullup(9)'s in iflib_parse_header()
|
|
|
|
* into account. In the worst case, each of these calls will
|
|
|
|
* add another mbuf and, thus, the requirement for another DMA
|
|
|
|
* segment. So for best performance, it doesn't make sense to
|
|
|
|
* advertize a maximum of TSO segments that typically will
|
|
|
|
* require defragmentation in iflib_encap().
|
|
|
|
*/
|
|
|
|
if_sethwtsomaxsegcount(ifp, scctx->isc_tx_tso_segments_max - 3);
|
|
|
|
if_sethwtsomaxsegsize(ifp, scctx->isc_tx_tso_segsize_max);
|
|
|
|
}
|
2018-05-11 20:08:28 +00:00
|
|
|
if (scctx->isc_rss_table_size == 0)
|
|
|
|
scctx->isc_rss_table_size = 64;
|
|
|
|
scctx->isc_rss_table_mask = scctx->isc_rss_table_size-1;
|
|
|
|
|
|
|
|
GROUPTASK_INIT(&ctx->ifc_admin_task, 0, _task_fn_admin, ctx);
|
|
|
|
/* XXX format name */
|
Make taskqgroup_attach{,_cpu}(9) work across architectures
So far, intr_{g,s}etaffinity(9) take a single int for identifying
a device interrupt. This approach doesn't work on all architectures
supported, as a single int isn't sufficient to globally specify a
device interrupt. In particular, with multiple interrupt controllers
in one system as found on e. g. arm and arm64 machines, an interrupt
number as returned by rman_get_start(9) may be only unique relative
to the bus and, thus, interrupt controller, a certain device hangs
off from.
In turn, this makes taskqgroup_attach{,_cpu}(9) and - internal to
the gtaskqueue implementation - taskqgroup_attach_deferred{,_cpu}()
not work across architectures. Yet in turn, iflib(4) as gtaskqueue
consumer so far doesn't fit architectures where interrupt numbers
aren't globally unique.
However, at least for intr_setaffinity(..., CPU_WHICH_IRQ, ...) as
employed by the gtaskqueue implementation to bind an interrupt to a
particular CPU, using bus_bind_intr(9) instead is equivalent from
a functional point of view, with bus_bind_intr(9) taking the device
and interrupt resource arguments required for uniquely specifying a
device interrupt.
Thus, change the gtaskqueue implementation to employ bus_bind_intr(9)
instead and intr_{g,s}etaffinity(9) to take the device and interrupt
resource arguments required respectively. This change also moves
struct grouptask from <sys/_task.h> to <sys/gtaskqueue.h> and wraps
struct gtask along with the gtask_fn_t typedef into #ifdef _KERNEL
as userland likes to include <sys/_task.h> or indirectly drags it
in - for better or worse also with _KERNEL defined -, which with
device_t and struct resource dependencies otherwise is no longer
as easily possible now.
The userland inclusion problem probably can be improved a bit by
introducing a _WANT_TASK (as well as a _WANT_MOUNT) akin to the
existing _WANT_PRISON etc., which is orthogonal to this change,
though, and likely needs an exp-run.
While at it:
- Change the gt_cpu member in the grouptask structure to be of type
int as used elsewhere for specifying CPUs (an int16_t may be too
narrow sooner or later),
- move the gtaskqueue_enqueue_fn typedef from <sys/gtaskqueue.h> to
the gtaskqueue implementation as it's only used and needed there,
- change the GTASK_INIT macro to use "gtask" rather than "task" as
argument given that it actually operates on a struct gtask rather
than a struct task, and
- let subr_gtaskqueue.c consistently use __func__ to print function
names.
Reported by: mmel
Reviewed by: mmel
Differential Revision: https://reviews.freebsd.org/D19139
2019-02-12 21:23:59 +00:00
|
|
|
taskqgroup_attach(qgroup_if_config_tqg, &ctx->ifc_admin_task, ctx,
|
|
|
|
NULL, NULL, "admin");
|
2018-05-11 20:08:28 +00:00
|
|
|
|
|
|
|
/* XXX --- can support > 1 -- but keep it simple for now */
|
|
|
|
scctx->isc_intr = IFLIB_INTR_LEGACY;
|
|
|
|
|
|
|
|
/* Get memory for the station queues */
|
|
|
|
if ((err = iflib_queues_alloc(ctx))) {
|
|
|
|
device_printf(dev, "Unable to allocate queue memory\n");
|
2019-01-22 00:56:44 +00:00
|
|
|
goto fail_iflib_detach;
|
2018-05-11 20:08:28 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if ((err = iflib_qset_structures_setup(ctx))) {
|
|
|
|
device_printf(dev, "qset structure setup failed %d\n", err);
|
|
|
|
goto fail_queues;
|
|
|
|
}
|
Assorted TSO fixes for em(4)/iflib(9) and dead code removal:
- Ever since the workaround for the silicon bug of TSO4 causing MAC hangs
was committed in r295133, CSUM_TSO always got disabled unconditionally
by em(4) on the first invocation of em_init_locked(). However, even with
that problem fixed, it turned out that for at least e. g. 82579 not all
necessary TSO workarounds are in place, still causing MAC hangs even at
Gigabit speed. Thus, for stable/11, TSO usage was deliberately disabled
in r323292 (r323293 for stable/10) for the EM-class by default, allowing
users to turn it on if it happens to work with their particular EM MAC
in a Gigabit-only environment.
In head, the TSO workaround for speeds other than Gigabit was lost with
the conversion to iflib(9) in r311849 (possibly along with another one
or two TSO workarounds). Yet at the same time, for EM-class MACs TSO4
got enabled by default again, causing device hangs. Therefore, change the
default for this hardware class back to have TSO4 off, allowing users
to turn it on manually if it happens to work in their environment as
we do in stable/{10,11}. An alternative would be to add a whitelist of
EM-class devices where TSO4 actually is reliable with the workarounds in
place, but given that the advantage of TSO at Gigabit speed is rather
limited - especially with the overhead of these workarounds -, that's
really not worth it. [1]
This change includes the addition of an isc_capabilities to struct
if_softc_ctx so iflib(9) can also handle interface capabilities that
shouldn't be enabled by default which is used to handle the default-off
capabilities of e1000 as suggested by shurd@ and moving their handling
from em_setup_interface() to em_if_attach_pre() accordingly.
- Although 82543 supports TSO4 in theory, the former lem(4) didn't have
support for TSO4, presumably because TSO4 is even more broken in the
LEM-class of MACs than the later EM ones. Still, TSO4 for LEM-class
devices was enabled as part of the conversion to iflib(9) in r311849,
causing device hangs. So revert back to the pre-r311849 behavior of
not supporting TSO4 for LEM-class at all, which includes not creating
a TSO DMA tag in iflib(9) for devices not having IFCAP_TSO4 set. [2]
- In fact, the FreeBSD TCP stack can handle a TSO size of IP_MAXPACKET
(65535) rather than FREEBSD_TSO_SIZE_MAX (65518). However, the TSO
DMA must have a maxsize of the maximum TSO size plus the size of a
VLAN header for software VLAN tagging. The iflib(9) converted em(4),
thus, first correctly sets scctx->isc_tx_tso_size_max to EM_TSO_SIZE
in em_if_attach_pre(), but later on overrides it with IP_MAXPACKET
in em_setup_interface() (apparently, left-over from pre-iflib(9)
times). So remove the latter and correct iflib(9) to correctly cap
the maximum TSO size reported to the stack at IP_MAXPACKET. While at
it, let iflib(9) use if_sethwtsomax*().
This change includes the addition of isc_tso_max{seg,}size DMA engine
constraints for the TSO DMA tag to struct if_shared_ctx and letting
iflib_txsd_alloc() automatically adjust the maxsize of that tag in case
IFCAP_VLAN_MTU is supported as requested by shurd@.
- Move the if_setifheaderlen(9) call for adjusting the maximum Ethernet
header length from {ixgbe,ixl,ixlv,ixv,em}_setup_interface() to iflib(9)
so adjustment is automatically done in case IFCAP_VLAN_MTU is supported.
As a consequence, this adjustment now is also done in case of bnxt(4)
which missed it previously.
- Move the reduction of the maximum TSO segment count reported to the
stack by the number of m_pullup(9) calls (which in the worst case,
can add another mbuf and, thus, the requirement for another DMA
segment each) in the transmit path for performance reasons from
em_setup_interface() to iflib_txsd_alloc() as these pull-ups are now
done in iflib_parse_header() rather than in the no longer existing
em_xmit(). Moreover, this optimization applies to all drivers using
iflib(9) and not just em(4); all in-tree iflib(9) consumers still
have enough room to handle full size TSO packets. Also, reduce the
adjustment to the maximum number of m_pullup(9)'s now performed in
iflib_parse_header().
- Prior to the conversion of em(4)/igb(4)/lem(4) and ixl(4) to iflib(9)
in r311849 and r335338 respectively, these drivers didn't enable
IFCAP_VLAN_HWFILTER by default due to VLAN events not being passed
through by lagg(4). With iflib(9), IFCAP_VLAN_HWFILTER was turned on
by default but also lagg(4) was fixed in that regard in r203548. So
just remove the now redundant and defunct IFCAP_VLAN_HWFILTER handling
in {em,ixl,ixlv}_setup_interface().
- Nuke other redundant IFCAP_* setting in {em,ixl,ixlv}_setup_interface()
which is (more completely) already done in {em,ixl,ixlv}_if_attach_pre()
now.
- Remove some redundant/dead setting of scctx->isc_tx_csum_flags in
em_if_attach_pre().
- Remove some IFCAP_* duplicated either directly or indirectly (e. g.
via IFCAP_HWCSUM) in {EM,IGB,IXL}_CAPS.
- Don't bother to fiddle with IFCAP_HWSTATS in ixgbe(4)/ixgbev(4) as
iflib(9) adds that capability unconditionally.
- Remove some unused macros from em(4).
- Bump __FreeBSD_version as some of the above changes require the modules
of drivers using iflib(9) to be recompiled.
Okayed by: sbruno@ at 201806 DevSummit Transport Working Group [1]
Reviewed by: sbruno (earlier version), erj
PR: 219428 (part of; comment #10) [1], 220997 (part of; comment #3) [2]
Differential Revision: https://reviews.freebsd.org/D15720
2018-07-15 19:04:23 +00:00
|
|
|
|
2018-05-11 20:08:28 +00:00
|
|
|
/*
|
|
|
|
* XXX What if anything do we want to do about interrupts?
|
|
|
|
*/
|
2019-04-17 17:19:54 +00:00
|
|
|
ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac.octet);
|
2018-05-11 20:08:28 +00:00
|
|
|
if ((err = IFDI_ATTACH_POST(ctx)) != 0) {
|
|
|
|
device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err);
|
|
|
|
goto fail_detach;
|
|
|
|
}
|
Assorted TSO fixes for em(4)/iflib(9) and dead code removal:
- Ever since the workaround for the silicon bug of TSO4 causing MAC hangs
was committed in r295133, CSUM_TSO always got disabled unconditionally
by em(4) on the first invocation of em_init_locked(). However, even with
that problem fixed, it turned out that for at least e. g. 82579 not all
necessary TSO workarounds are in place, still causing MAC hangs even at
Gigabit speed. Thus, for stable/11, TSO usage was deliberately disabled
in r323292 (r323293 for stable/10) for the EM-class by default, allowing
users to turn it on if it happens to work with their particular EM MAC
in a Gigabit-only environment.
In head, the TSO workaround for speeds other than Gigabit was lost with
the conversion to iflib(9) in r311849 (possibly along with another one
or two TSO workarounds). Yet at the same time, for EM-class MACs TSO4
got enabled by default again, causing device hangs. Therefore, change the
default for this hardware class back to have TSO4 off, allowing users
to turn it on manually if it happens to work in their environment as
we do in stable/{10,11}. An alternative would be to add a whitelist of
EM-class devices where TSO4 actually is reliable with the workarounds in
place, but given that the advantage of TSO at Gigabit speed is rather
limited - especially with the overhead of these workarounds -, that's
really not worth it. [1]
This change includes the addition of an isc_capabilities to struct
if_softc_ctx so iflib(9) can also handle interface capabilities that
shouldn't be enabled by default which is used to handle the default-off
capabilities of e1000 as suggested by shurd@ and moving their handling
from em_setup_interface() to em_if_attach_pre() accordingly.
- Although 82543 support TSO4 in theory, the former lem(4) didn't have
support for TSO4, presumably because TSO4 is even more broken in the
LEM-class of MACs than the later EM ones. Still, TSO4 for LEM-class
devices was enabled as part of the conversion to iflib(9) in r311849,
causing device hangs. So revert back to the pre-r311849 behavior of
not supporting TSO4 for LEM-class at all, which includes not creating
a TSO DMA tag in iflib(9) for devices not having IFCAP_TSO4 set. [2]
- In fact, the FreeBSD TCP stack can handle a TSO size of IP_MAXPACKET
(65535) rather than FREEBSD_TSO_SIZE_MAX (65518). However, the TSO
DMA must have a maxsize of the maximum TSO size plus the size of a
VLAN header for software VLAN tagging. The iflib(9) converted em(4),
thus, first correctly sets scctx->isc_tx_tso_size_max to EM_TSO_SIZE
in em_if_attach_pre(), but later on overrides it with IP_MAXPACKET
in em_setup_interface() (apparently, left-over from pre-iflib(9)
times). So remove the latter and correct iflib(9) to correctly cap
the maximum TSO size reported to the stack at IP_MAXPACKET. While at
it, let iflib(9) use if_sethwtsomax*().
This change includes the addition of isc_tso_max{seg,}size DMA engine
constraints for the TSO DMA tag to struct if_shared_ctx and letting
iflib_txsd_alloc() automatically adjust the maxsize of that tag in case
IFCAP_VLAN_MTU is supported as requested by shurd@.
- Move the if_setifheaderlen(9) call for adjusting the maximum Ethernet
header length from {ixgbe,ixl,ixlv,ixv,em}_setup_interface() to iflib(9)
so adjustment is automatically done in case IFCAP_VLAN_MTU is supported.
As a consequence, this adjustment now is also done in case of bnxt(4)
which missed it previously.
- Move the reduction of the maximum TSO segment count reported to the
stack by the number of m_pullup(9) calls (which in the worst case,
can add another mbuf and, thus, the requirement for another DMA
segment each) in the transmit path for performance reasons from
em_setup_interface() to iflib_txsd_alloc() as these pull-ups are now
done in iflib_parse_header() rather than in the no longer existing
em_xmit(). Moreover, this optimization applies to all drivers using
iflib(9) and not just em(4); all in-tree iflib(9) consumers still
have enough room to handle full size TSO packets. Also, reduce the
adjustment to the maximum number of m_pullup(9)'s now performed in
iflib_parse_header().
- Prior to the conversion of em(4)/igb(4)/lem(4) and ixl(4) to iflib(9)
in r311849 and r335338 respectively, these drivers didn't enable
IFCAP_VLAN_HWFILTER by default due to VLAN events not being passed
through by lagg(4). With iflib(9), IFCAP_VLAN_HWFILTER was turned on
by default but also lagg(4) was fixed in that regard in r203548. So
just remove the now redundant and defunct IFCAP_VLAN_HWFILTER handling
in {em,ixl,ixlv}_setup_interface().
- Nuke other redundant IFCAP_* setting in {em,ixl,ixlv}_setup_interface()
which is (more completely) already done in {em,ixl,ixlv}_if_attach_pre()
now.
- Remove some redundant/dead setting of scctx->isc_tx_csum_flags in
em_if_attach_pre().
- Remove some IFCAP_* duplicated either directly or indirectly (e. g.
via IFCAP_HWCSUM) in {EM,IGB,IXL}_CAPS.
- Don't bother to fiddle with IFCAP_HWSTATS in ixgbe(4)/ixgbev(4) as
iflib(9) adds that capability unconditionally.
- Remove some unused macros from em(4).
- Bump __FreeBSD_version as some of the above changes require the modules
of drivers using iflib(9) to be recompiled.
Okayed by: sbruno@ at 201806 DevSummit Transport Working Group [1]
Reviewed by: sbruno (earlier version), erj
PR: 219428 (part of; comment #10) [1], 220997 (part of; comment #3) [2]
Differential Revision: https://reviews.freebsd.org/D15720
2018-07-15 19:04:23 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Tell the upper layer(s) if IFCAP_VLAN_MTU is supported.
|
|
|
|
* This must appear after the call to ether_ifattach() because
|
|
|
|
* ether_ifattach() sets if_hdrlen to the default value.
|
|
|
|
*/
|
|
|
|
if (if_getcapabilities(ifp) & IFCAP_VLAN_MTU)
|
|
|
|
if_setifheaderlen(ifp, sizeof(struct ether_vlan_header));
|
|
|
|
|
2018-05-11 20:08:28 +00:00
|
|
|
/* XXX handle more than one queue */
|
|
|
|
for (i = 0; i < scctx->isc_nrxqsets; i++)
|
|
|
|
IFDI_RX_CLSET(ctx, 0, i, ctx->ifc_rxqs[i].ifr_fl[0].ifl_sds.ifsd_cl);
|
|
|
|
|
|
|
|
*ctxp = ctx;
|
|
|
|
|
|
|
|
if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter);
|
|
|
|
iflib_add_device_sysctl_post(ctx);
|
|
|
|
ctx->ifc_flags |= IFC_INIT_DONE;
|
2019-03-28 20:43:47 +00:00
|
|
|
CTX_UNLOCK(ctx);
|
o Use iflib_fast_intr_rxtx() also for "legacy" interrupts, i. e. INTx and
MSI. Unlike as with iflib_fast_intr_ctx(), the former will also enqueue
_task_fn_tx() in addition to _task_fn_rx() if appropriate, bringing TCP
TX throughput of EM-class devices on par with the MSI-X case and, thus,
close to wirespeed/pre-iflib(4) times again. [1]
Note that independently of the interrupt type, the UDP performance with
these MACs still is abysmal and nowhere near to where it was before the
conversion of em(4) to iflib(4).
o In iflib_init_locked(), announce which free list failed to set up.
o In _task_fn_tx() when running netmap(4), issue ifdi_intr_enable instead
of the ifdi_tx_queue_intr_enable method in case of a "legacy" interrupt
as the latter is valid with MSI-X only.
o Instead of adding the missing - and apparently convoluted enough that a
DBG_COUNTER_INC was put into a wrong spot in _task_fn_rx() - checks for
ifdi_{r,t}x_queue_intr_enable being available in the MSI-X case also to
iflib_fast_intr_rxtx(), factor these out to iflib_device_register() and
make the checks fail gracefully rather than panic. This avoids invoking
the checks at runtime over and over again in iflib_fast_intr_rxtx() and
_task_fn_{r,t}x() - even if it's just in case of INVARIANTS - and makes
these functions more readable.
o In iflib_rx_structures_setup(), only initialize LRO resources if device
and driver have LRO capability in order to not waste memory. Also, free
the LRO resources again if setting them up fails for one of the queues.
However, don't bother invoking iflib_rx_sds_free() in that case because
iflib_rx_structures_setup() doesn't call iflib_rxsd_alloc() either (and
iflib_{device,pseudo}_register() will issue iflib_rx_sds_free() in case
of failure via iflib_rx_structures_free(), but there definitely is some
asymmetry left to be fixed, though).
o Similarly, free LRO resources again in iflib_rx_structures_free().
o In iflib_irq_set_affinity(), handle get_core_offset() errors gracefully
instead of panicking (but only in case of INVARIANTS). This is a follow-
up to r344132, as such driver bugs shouldn't be fatal.
o Likewise, handle unknown iflib_intr_type_t in iflib_irq_alloc_generic()
gracefully, too.
o Bring yet more sanity to iflib_msix_init():
- If the device doesn't provide enough MSI-X vectors or not all vectors
can be allocated so the expected number of queues in addition to admin
interrupts can't be supported, try MSI next (and then INTx) as proper
MSI-X vector distribution can't be assured in such cases. In essence,
this change brings r254008 forward to iflib(4). Also, this is the fix
alluded to in the commit message of r343934.
- If the MSI-X allocation has failed, don't prematurely announce MSI is
going to be used as the latter in fact may not be available either.
- When falling back to MSI, only release the MSI-X table resource again
if it was allocated in iflib_msix_init(), i. e. isn't supplied by the
driver, in the first place.
o In mp_ndesc_handler(), handle unknown type arguments gracefully, too.
PR: 235031 (likely) [1]
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D20175
2019-05-07 08:28:35 +00:00
|
|
|
|
2018-05-11 20:08:28 +00:00
|
|
|
return (0);
|
|
|
|
fail_detach:
|
|
|
|
ether_ifdetach(ctx->ifc_ifp);
|
|
|
|
fail_queues:
|
2021-02-01 16:13:00 +00:00
|
|
|
iflib_tqg_detach(ctx);
|
2018-05-11 20:08:28 +00:00
|
|
|
iflib_tx_structures_free(ctx);
|
|
|
|
iflib_rx_structures_free(ctx);
|
2019-01-22 00:56:44 +00:00
|
|
|
fail_iflib_detach:
|
2018-05-11 20:08:28 +00:00
|
|
|
IFDI_DETACH(ctx);
|
2021-02-01 16:13:00 +00:00
|
|
|
IFDI_QUEUES_FREE(ctx);
|
2019-03-28 20:43:47 +00:00
|
|
|
fail_unlock:
|
|
|
|
CTX_UNLOCK(ctx);
|
2019-08-16 23:33:44 +00:00
|
|
|
iflib_deregister(ctx);
|
2019-01-22 00:56:44 +00:00
|
|
|
fail_ctx_free:
|
|
|
|
free(ctx->ifc_softc, M_IFLIB);
|
|
|
|
free(ctx, M_IFLIB);
|
2018-05-11 20:08:28 +00:00
|
|
|
return (err);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
iflib_pseudo_deregister(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
if_t ifp = ctx->ifc_ifp;
|
2020-06-21 22:02:49 +00:00
|
|
|
if_shared_ctx_t sctx = ctx->ifc_sctx;
|
2018-05-11 20:08:28 +00:00
|
|
|
|
2019-10-23 23:20:49 +00:00
|
|
|
/* Unregister VLAN event handlers early */
|
|
|
|
iflib_unregister_vlan_handlers(ctx);
|
|
|
|
|
2020-06-21 22:02:49 +00:00
|
|
|
if ((sctx->isc_flags & IFLIB_PSEUDO) &&
|
|
|
|
(sctx->isc_flags & IFLIB_PSEUDO_ETHER) == 0) {
|
|
|
|
bpfdetach(ifp);
|
|
|
|
if_detach(ifp);
|
|
|
|
} else {
|
|
|
|
ether_ifdetach(ifp);
|
|
|
|
}
|
2018-05-11 20:08:28 +00:00
|
|
|
|
2020-12-07 14:52:57 +00:00
|
|
|
iflib_tqg_detach(ctx);
|
2018-05-11 20:08:28 +00:00
|
|
|
iflib_tx_structures_free(ctx);
|
|
|
|
iflib_rx_structures_free(ctx);
|
2021-02-01 16:13:00 +00:00
|
|
|
IFDI_DETACH(ctx);
|
|
|
|
IFDI_QUEUES_FREE(ctx);
|
2019-08-16 23:33:44 +00:00
|
|
|
|
|
|
|
iflib_deregister(ctx);
|
|
|
|
|
2018-05-11 20:08:28 +00:00
|
|
|
if (ctx->ifc_flags & IFC_SC_ALLOCATED)
|
|
|
|
free(ctx->ifc_softc, M_IFLIB);
|
|
|
|
free(ctx, M_IFLIB);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
int
|
|
|
|
iflib_device_attach(device_t dev)
|
|
|
|
{
|
|
|
|
if_ctx_t ctx;
|
|
|
|
if_shared_ctx_t sctx;
|
|
|
|
|
|
|
|
if ((sctx = DEVICE_REGISTER(dev)) == NULL || sctx->isc_magic != IFLIB_MAGIC)
|
|
|
|
return (ENOTSUP);
|
|
|
|
|
|
|
|
pci_enable_busmaster(dev);
|
|
|
|
|
|
|
|
return (iflib_device_register(dev, NULL, sctx, &ctx));
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
iflib_device_deregister(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
if_t ifp = ctx->ifc_ifp;
|
|
|
|
device_t dev = ctx->ifc_dev;
|
|
|
|
|
|
|
|
/* Make sure VLANS are not using driver */
|
|
|
|
if (if_vlantrunkinuse(ifp)) {
|
2018-10-12 22:40:54 +00:00
|
|
|
device_printf(dev, "Vlan in use, detach first\n");
|
2016-05-18 04:35:58 +00:00
|
|
|
return (EBUSY);
|
|
|
|
}
|
2018-10-12 22:40:54 +00:00
|
|
|
#ifdef PCI_IOV
|
|
|
|
if (!CTX_IS_VF(ctx) && pci_iov_detach(dev) != 0) {
|
|
|
|
device_printf(dev, "SR-IOV in use; detach first.\n");
|
|
|
|
return (EBUSY);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
STATE_LOCK(ctx);
|
|
|
|
ctx->ifc_flags |= IFC_IN_DETACH;
|
|
|
|
STATE_UNLOCK(ctx);
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2019-10-23 23:20:49 +00:00
|
|
|
/* Unregister VLAN handlers before calling iflib_stop() */
|
|
|
|
iflib_unregister_vlan_handlers(ctx);
|
|
|
|
|
|
|
|
iflib_netmap_detach(ifp);
|
|
|
|
ether_ifdetach(ifp);
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
CTX_LOCK(ctx);
|
|
|
|
iflib_stop(ctx);
|
|
|
|
CTX_UNLOCK(ctx);
|
|
|
|
|
2019-04-24 13:32:04 +00:00
|
|
|
iflib_rem_pfil(ctx);
|
2016-05-18 04:35:58 +00:00
|
|
|
if (ctx->ifc_led_dev != NULL)
|
|
|
|
led_destroy(ctx->ifc_led_dev);
|
2017-07-03 18:23:35 +00:00
|
|
|
|
2020-12-07 14:52:57 +00:00
|
|
|
iflib_tqg_detach(ctx);
|
2021-02-01 16:13:00 +00:00
|
|
|
iflib_tx_structures_free(ctx);
|
|
|
|
iflib_rx_structures_free(ctx);
|
|
|
|
|
2018-05-29 18:03:43 +00:00
|
|
|
CTX_LOCK(ctx);
|
2016-05-18 04:35:58 +00:00
|
|
|
IFDI_DETACH(ctx);
|
2021-02-01 16:13:00 +00:00
|
|
|
IFDI_QUEUES_FREE(ctx);
|
2018-05-29 18:03:43 +00:00
|
|
|
CTX_UNLOCK(ctx);
|
|
|
|
|
|
|
|
/* ether_ifdetach calls if_qflush - lock must be destroy afterwards*/
|
2018-10-12 22:40:54 +00:00
|
|
|
iflib_free_intr_mem(ctx);
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
bus_generic_detach(dev);
|
|
|
|
|
2019-08-16 23:33:44 +00:00
|
|
|
iflib_deregister(ctx);
|
|
|
|
|
|
|
|
device_set_softc(ctx->ifc_dev, NULL);
|
2016-08-12 21:29:44 +00:00
|
|
|
if (ctx->ifc_flags & IFC_SC_ALLOCATED)
|
|
|
|
free(ctx->ifc_softc, M_IFLIB);
|
2019-04-25 21:24:56 +00:00
|
|
|
unref_ctx_core_offset(ctx);
|
2016-08-12 21:29:44 +00:00
|
|
|
free(ctx, M_IFLIB);
|
2016-05-18 04:35:58 +00:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2020-12-07 14:52:57 +00:00
|
|
|
static void
|
|
|
|
iflib_tqg_detach(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
iflib_txq_t txq;
|
|
|
|
iflib_rxq_t rxq;
|
|
|
|
int i;
|
|
|
|
struct taskqgroup *tqg;
|
|
|
|
|
|
|
|
/* XXX drain any dependent tasks */
|
|
|
|
tqg = qgroup_if_io_tqg;
|
|
|
|
for (txq = ctx->ifc_txqs, i = 0; i < NTXQSETS(ctx); i++, txq++) {
|
|
|
|
callout_drain(&txq->ift_timer);
|
|
|
|
#ifdef DEV_NETMAP
|
|
|
|
callout_drain(&txq->ift_netmap_timer);
|
|
|
|
#endif /* DEV_NETMAP */
|
|
|
|
if (txq->ift_task.gt_uniq != NULL)
|
|
|
|
taskqgroup_detach(tqg, &txq->ift_task);
|
|
|
|
}
|
|
|
|
for (i = 0, rxq = ctx->ifc_rxqs; i < NRXQSETS(ctx); i++, rxq++) {
|
|
|
|
if (rxq->ifr_task.gt_uniq != NULL)
|
|
|
|
taskqgroup_detach(tqg, &rxq->ifr_task);
|
|
|
|
}
|
|
|
|
tqg = qgroup_if_config_tqg;
|
|
|
|
if (ctx->ifc_admin_task.gt_uniq != NULL)
|
|
|
|
taskqgroup_detach(tqg, &ctx->ifc_admin_task);
|
|
|
|
if (ctx->ifc_vflr_task.gt_uniq != NULL)
|
|
|
|
taskqgroup_detach(tqg, &ctx->ifc_vflr_task);
|
|
|
|
}
|
|
|
|
|
2018-10-12 22:40:54 +00:00
|
|
|
static void
|
|
|
|
iflib_free_intr_mem(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
|
|
|
|
if (ctx->ifc_softc_ctx.isc_intr != IFLIB_INTR_MSIX) {
|
|
|
|
iflib_irq_free(ctx, &ctx->ifc_legacy_irq);
|
|
|
|
}
|
2019-01-30 13:21:26 +00:00
|
|
|
if (ctx->ifc_softc_ctx.isc_intr != IFLIB_INTR_LEGACY) {
|
|
|
|
pci_release_msi(ctx->ifc_dev);
|
|
|
|
}
|
2018-10-12 22:40:54 +00:00
|
|
|
if (ctx->ifc_msix_mem != NULL) {
|
|
|
|
bus_release_resource(ctx->ifc_dev, SYS_RES_MEMORY,
|
2019-01-30 13:21:26 +00:00
|
|
|
rman_get_rid(ctx->ifc_msix_mem), ctx->ifc_msix_mem);
|
2018-10-12 22:40:54 +00:00
|
|
|
ctx->ifc_msix_mem = NULL;
|
|
|
|
}
|
|
|
|
}
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
int
|
|
|
|
iflib_device_detach(device_t dev)
|
|
|
|
{
|
|
|
|
if_ctx_t ctx = device_get_softc(dev);
|
|
|
|
|
|
|
|
return (iflib_device_deregister(ctx));
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
iflib_device_suspend(device_t dev)
|
|
|
|
{
|
|
|
|
if_ctx_t ctx = device_get_softc(dev);
|
|
|
|
|
|
|
|
CTX_LOCK(ctx);
|
|
|
|
IFDI_SUSPEND(ctx);
|
|
|
|
CTX_UNLOCK(ctx);
|
|
|
|
|
|
|
|
return bus_generic_suspend(dev);
|
|
|
|
}
|
|
|
|
int
|
|
|
|
iflib_device_shutdown(device_t dev)
|
|
|
|
{
|
|
|
|
if_ctx_t ctx = device_get_softc(dev);
|
|
|
|
|
|
|
|
CTX_LOCK(ctx);
|
|
|
|
IFDI_SHUTDOWN(ctx);
|
|
|
|
CTX_UNLOCK(ctx);
|
|
|
|
|
|
|
|
return bus_generic_suspend(dev);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
iflib_device_resume(device_t dev)
|
|
|
|
{
|
|
|
|
if_ctx_t ctx = device_get_softc(dev);
|
|
|
|
iflib_txq_t txq = ctx->ifc_txqs;
|
|
|
|
|
|
|
|
CTX_LOCK(ctx);
|
|
|
|
IFDI_RESUME(ctx);
|
2019-01-07 23:46:54 +00:00
|
|
|
iflib_if_init_locked(ctx);
|
2016-05-18 04:35:58 +00:00
|
|
|
CTX_UNLOCK(ctx);
|
|
|
|
for (int i = 0; i < NTXQSETS(ctx); i++, txq++)
|
|
|
|
iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET);
|
|
|
|
|
|
|
|
return (bus_generic_resume(dev));
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
iflib_device_iov_init(device_t dev, uint16_t num_vfs, const nvlist_t *params)
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
if_ctx_t ctx = device_get_softc(dev);
|
|
|
|
|
|
|
|
CTX_LOCK(ctx);
|
|
|
|
error = IFDI_IOV_INIT(ctx, num_vfs, params);
|
|
|
|
CTX_UNLOCK(ctx);
|
|
|
|
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
iflib_device_iov_uninit(device_t dev)
|
|
|
|
{
|
|
|
|
if_ctx_t ctx = device_get_softc(dev);
|
|
|
|
|
|
|
|
CTX_LOCK(ctx);
|
|
|
|
IFDI_IOV_UNINIT(ctx);
|
|
|
|
CTX_UNLOCK(ctx);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
iflib_device_iov_add_vf(device_t dev, uint16_t vfnum, const nvlist_t *params)
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
if_ctx_t ctx = device_get_softc(dev);
|
|
|
|
|
|
|
|
CTX_LOCK(ctx);
|
|
|
|
error = IFDI_IOV_VF_ADD(ctx, vfnum, params);
|
|
|
|
CTX_UNLOCK(ctx);
|
|
|
|
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*********************************************************************
|
|
|
|
*
|
|
|
|
* MODULE FUNCTION DEFINITIONS
|
|
|
|
*
|
|
|
|
**********************************************************************/
|
|
|
|
|
2017-09-16 02:41:38 +00:00
|
|
|
/*
|
|
|
|
* - Start a fast taskqueue thread for each core
|
|
|
|
* - Start a taskqueue for control operations
|
|
|
|
*/
|
2016-05-18 04:35:58 +00:00
|
|
|
static int
|
|
|
|
iflib_module_init(void)
|
|
|
|
{
|
2020-12-19 01:08:33 +00:00
|
|
|
iflib_timer_default = hz / 2;
|
2016-05-18 04:35:58 +00:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
iflib_module_event_handler(module_t mod, int what, void *arg)
|
|
|
|
{
|
|
|
|
int err;
|
|
|
|
|
|
|
|
switch (what) {
|
|
|
|
case MOD_LOAD:
|
|
|
|
if ((err = iflib_module_init()) != 0)
|
|
|
|
return (err);
|
|
|
|
break;
|
|
|
|
case MOD_UNLOAD:
|
|
|
|
return (EBUSY);
|
|
|
|
default:
|
|
|
|
return (EOPNOTSUPP);
|
|
|
|
}
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*********************************************************************
|
|
|
|
*
|
|
|
|
* PUBLIC FUNCTION DEFINITIONS
|
|
|
|
* ordered as in iflib.h
|
|
|
|
*
|
|
|
|
**********************************************************************/
|
|
|
|
|
|
|
|
static void
|
|
|
|
_iflib_assert(if_shared_ctx_t sctx)
|
|
|
|
{
|
2019-05-10 00:41:42 +00:00
|
|
|
int i;
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
MPASS(sctx->isc_tx_maxsize);
|
|
|
|
MPASS(sctx->isc_tx_maxsegsize);
|
|
|
|
|
|
|
|
MPASS(sctx->isc_rx_maxsize);
|
|
|
|
MPASS(sctx->isc_rx_nsegments);
|
|
|
|
MPASS(sctx->isc_rx_maxsegsize);
|
|
|
|
|
2019-05-10 00:41:42 +00:00
|
|
|
MPASS(sctx->isc_nrxqs >= 1 && sctx->isc_nrxqs <= 8);
|
|
|
|
for (i = 0; i < sctx->isc_nrxqs; i++) {
|
|
|
|
MPASS(sctx->isc_nrxd_min[i]);
|
|
|
|
MPASS(powerof2(sctx->isc_nrxd_min[i]));
|
|
|
|
MPASS(sctx->isc_nrxd_max[i]);
|
|
|
|
MPASS(powerof2(sctx->isc_nrxd_max[i]));
|
|
|
|
MPASS(sctx->isc_nrxd_default[i]);
|
|
|
|
MPASS(powerof2(sctx->isc_nrxd_default[i]));
|
|
|
|
}
|
|
|
|
|
|
|
|
MPASS(sctx->isc_ntxqs >= 1 && sctx->isc_ntxqs <= 8);
|
|
|
|
for (i = 0; i < sctx->isc_ntxqs; i++) {
|
|
|
|
MPASS(sctx->isc_ntxd_min[i]);
|
|
|
|
MPASS(powerof2(sctx->isc_ntxd_min[i]));
|
|
|
|
MPASS(sctx->isc_ntxd_max[i]);
|
|
|
|
MPASS(powerof2(sctx->isc_ntxd_max[i]));
|
|
|
|
MPASS(sctx->isc_ntxd_default[i]);
|
|
|
|
MPASS(powerof2(sctx->isc_ntxd_default[i]));
|
|
|
|
}
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
2017-01-02 00:56:33 +00:00
|
|
|
static void
|
|
|
|
_iflib_pre_assert(if_softc_ctx_t scctx)
|
|
|
|
{
|
|
|
|
|
|
|
|
MPASS(scctx->isc_txrx->ift_txd_encap);
|
|
|
|
MPASS(scctx->isc_txrx->ift_txd_flush);
|
|
|
|
MPASS(scctx->isc_txrx->ift_txd_credits_update);
|
|
|
|
MPASS(scctx->isc_txrx->ift_rxd_available);
|
|
|
|
MPASS(scctx->isc_txrx->ift_rxd_pkt_get);
|
|
|
|
MPASS(scctx->isc_txrx->ift_rxd_refill);
|
|
|
|
MPASS(scctx->isc_txrx->ift_rxd_flush);
|
|
|
|
}
|
2016-10-18 14:02:45 +00:00
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
static int
|
|
|
|
iflib_register(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
if_shared_ctx_t sctx = ctx->ifc_sctx;
|
|
|
|
driver_t *driver = sctx->isc_driver;
|
|
|
|
device_t dev = ctx->ifc_dev;
|
|
|
|
if_t ifp;
|
2020-06-21 22:02:49 +00:00
|
|
|
u_char type;
|
|
|
|
int iflags;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2020-05-31 18:42:00 +00:00
|
|
|
if ((sctx->isc_flags & IFLIB_PSEUDO) == 0)
|
|
|
|
_iflib_assert(sctx);
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2018-05-03 17:02:31 +00:00
|
|
|
CTX_LOCK_INIT(ctx);
|
2018-04-12 14:35:37 +00:00
|
|
|
STATE_LOCK_INIT(ctx, device_get_nameunit(ctx->ifc_dev));
|
2020-06-21 22:02:49 +00:00
|
|
|
if (sctx->isc_flags & IFLIB_PSEUDO) {
|
|
|
|
if (sctx->isc_flags & IFLIB_PSEUDO_ETHER)
|
|
|
|
type = IFT_ETHER;
|
|
|
|
else
|
|
|
|
type = IFT_PPP;
|
|
|
|
} else
|
|
|
|
type = IFT_ETHER;
|
|
|
|
ifp = ctx->ifc_ifp = if_alloc(type);
|
2016-05-18 04:35:58 +00:00
|
|
|
if (ifp == NULL) {
|
|
|
|
device_printf(dev, "can not allocate ifnet structure\n");
|
|
|
|
return (ENOMEM);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Initialize our context's device specific methods
|
|
|
|
*/
|
|
|
|
kobj_init((kobj_t) ctx, (kobj_class_t) driver);
|
|
|
|
kobj_class_compile((kobj_class_t) driver);
|
|
|
|
|
|
|
|
if_initname(ifp, device_get_name(dev), device_get_unit(dev));
|
|
|
|
if_setsoftc(ifp, ctx);
|
|
|
|
if_setdev(ifp, dev);
|
|
|
|
if_setinitfn(ifp, iflib_if_init);
|
|
|
|
if_setioctlfn(ifp, iflib_if_ioctl);
|
2018-07-25 22:46:36 +00:00
|
|
|
#ifdef ALTQ
|
|
|
|
if_setstartfn(ifp, iflib_altq_if_start);
|
|
|
|
if_settransmitfn(ifp, iflib_altq_if_transmit);
|
2018-08-04 01:45:17 +00:00
|
|
|
if_setsendqready(ifp);
|
2018-07-25 22:46:36 +00:00
|
|
|
#else
|
2016-05-18 04:35:58 +00:00
|
|
|
if_settransmitfn(ifp, iflib_if_transmit);
|
2018-07-25 22:46:36 +00:00
|
|
|
#endif
|
2016-05-18 04:35:58 +00:00
|
|
|
if_setqflushfn(ifp, iflib_if_qflush);
|
2020-06-21 22:02:49 +00:00
|
|
|
iflags = IFF_MULTICAST | IFF_KNOWSEPOCH;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2020-06-21 22:02:49 +00:00
|
|
|
if ((sctx->isc_flags & IFLIB_PSEUDO) &&
|
|
|
|
(sctx->isc_flags & IFLIB_PSEUDO_ETHER) == 0)
|
|
|
|
iflags |= IFF_POINTOPOINT;
|
|
|
|
else
|
|
|
|
iflags |= IFF_BROADCAST | IFF_SIMPLEX;
|
|
|
|
if_setflags(ifp, iflags);
|
2016-05-18 04:35:58 +00:00
|
|
|
ctx->ifc_vlan_attach_event =
|
|
|
|
EVENTHANDLER_REGISTER(vlan_config, iflib_vlan_register, ctx,
|
|
|
|
EVENTHANDLER_PRI_FIRST);
|
|
|
|
ctx->ifc_vlan_detach_event =
|
|
|
|
EVENTHANDLER_REGISTER(vlan_unconfig, iflib_vlan_unregister, ctx,
|
|
|
|
EVENTHANDLER_PRI_FIRST);
|
|
|
|
|
2019-05-03 20:05:31 +00:00
|
|
|
if ((sctx->isc_flags & IFLIB_DRIVER_MEDIA) == 0) {
|
|
|
|
ctx->ifc_mediap = &ctx->ifc_media;
|
|
|
|
ifmedia_init(ctx->ifc_mediap, IFM_IMASK,
|
|
|
|
iflib_media_change, iflib_media_status);
|
|
|
|
}
|
2016-05-18 04:35:58 +00:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2019-08-16 23:33:44 +00:00
|
|
|
static void
|
2019-10-23 23:20:49 +00:00
|
|
|
iflib_unregister_vlan_handlers(if_ctx_t ctx)
|
2019-08-16 23:33:44 +00:00
|
|
|
{
|
|
|
|
/* Unregister VLAN events */
|
|
|
|
if (ctx->ifc_vlan_attach_event != NULL) {
|
|
|
|
EVENTHANDLER_DEREGISTER(vlan_config, ctx->ifc_vlan_attach_event);
|
|
|
|
ctx->ifc_vlan_attach_event = NULL;
|
|
|
|
}
|
|
|
|
if (ctx->ifc_vlan_detach_event != NULL) {
|
|
|
|
EVENTHANDLER_DEREGISTER(vlan_unconfig, ctx->ifc_vlan_detach_event);
|
|
|
|
ctx->ifc_vlan_detach_event = NULL;
|
|
|
|
}
|
|
|
|
|
2019-10-23 23:20:49 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
iflib_deregister(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
if_t ifp = ctx->ifc_ifp;
|
|
|
|
|
|
|
|
/* Remove all media */
|
|
|
|
ifmedia_removeall(&ctx->ifc_media);
|
|
|
|
|
|
|
|
/* Ensure that VLAN event handlers are unregistered */
|
|
|
|
iflib_unregister_vlan_handlers(ctx);
|
|
|
|
|
2019-08-16 23:33:44 +00:00
|
|
|
/* Release kobject reference */
|
|
|
|
kobj_delete((kobj_t) ctx, NULL);
|
|
|
|
|
|
|
|
/* Free the ifnet structure */
|
|
|
|
if_free(ifp);
|
|
|
|
|
|
|
|
STATE_LOCK_DESTROY(ctx);
|
|
|
|
|
|
|
|
/* ether_ifdetach calls if_qflush - lock must be destroy afterwards*/
|
|
|
|
CTX_LOCK_DESTROY(ctx);
|
|
|
|
}
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
static int
|
|
|
|
iflib_queues_alloc(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
if_shared_ctx_t sctx = ctx->ifc_sctx;
|
2016-08-12 21:29:44 +00:00
|
|
|
if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
|
2016-05-18 04:35:58 +00:00
|
|
|
device_t dev = ctx->ifc_dev;
|
2016-08-12 21:29:44 +00:00
|
|
|
int nrxqsets = scctx->isc_nrxqsets;
|
|
|
|
int ntxqsets = scctx->isc_ntxqsets;
|
2016-05-18 04:35:58 +00:00
|
|
|
iflib_txq_t txq;
|
|
|
|
iflib_rxq_t rxq;
|
|
|
|
iflib_fl_t fl = NULL;
|
2016-08-12 21:29:44 +00:00
|
|
|
int i, j, cpu, err, txconf, rxconf;
|
2016-05-18 04:35:58 +00:00
|
|
|
iflib_dma_info_t ifdip;
|
2016-08-12 21:29:44 +00:00
|
|
|
uint32_t *rxqsizes = scctx->isc_rxqsizes;
|
|
|
|
uint32_t *txqsizes = scctx->isc_txqsizes;
|
2016-05-18 04:35:58 +00:00
|
|
|
uint8_t nrxqs = sctx->isc_nrxqs;
|
|
|
|
uint8_t ntxqs = sctx->isc_ntxqs;
|
|
|
|
int nfree_lists = sctx->isc_nfl ? sctx->isc_nfl : 1;
|
2021-01-10 13:49:51 +00:00
|
|
|
int fl_offset = (sctx->isc_flags & IFLIB_HAS_RXCQ ? 1 : 0);
|
2016-05-18 04:35:58 +00:00
|
|
|
caddr_t *vaddrs;
|
|
|
|
uint64_t *paddrs;
|
|
|
|
|
2016-08-12 21:29:44 +00:00
|
|
|
KASSERT(ntxqs > 0, ("number of queues per qset must be at least 1"));
|
|
|
|
KASSERT(nrxqs > 0, ("number of queues per qset must be at least 1"));
|
2021-01-10 13:49:51 +00:00
|
|
|
KASSERT(nrxqs >= fl_offset + nfree_lists,
|
|
|
|
("there must be at least a rxq for each free list"));
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2018-06-18 17:27:43 +00:00
|
|
|
/* Allocate the TX ring struct memory */
|
iflib: fix invalid free during queue allocation failure
In r301567, code was added to cleanup to prevent memory leaks for the
Tx and Rx ring structs. This code carefully tracked txq and rxq, and
made sure to free them properly during cleanup.
Because we assigned the txq and rxq pointers into the ctx->ifc_txqs and
ctx->ifc_rxqs, we carefully reset these pointers to NULL, so that
cleanup code would not accidentally free the memory twice.
This was changed by r304021 ("Update iflib to support more NIC designs"),
which removed this resetting of the pointers to NULL, because it re-used
the txq and rxq pointers as an index into the queue set array.
Unfortunately, the cleanup code was left alone. Thus, if we fail to
allocate DMA or fail to configure the queues using the drivers ifdi
methods, we will attempt to free txq and rxq. These variables would now
incorrectly point to the wrong location, resulting in a page fault.
There are a number of methods to correct this, but ultimately the root
cause was that we reuse the txq and rxq pointers for two different
purposes.
Instead, when allocating, store the returned pointer directly into
ctx->ifc_txqs and ctx->ifc_rxqs. Then, assign this to txq and rxq as
index pointers before starting the loop to allocate each queue.
Drop the cleanup code for txq and rxq, and only use ctx->ifc_txqs and
ctx->ifc_rxqs.
Thus, we no longer need to free txq or rxq under any error flow, and
instead rely solely on the pointers stored in ctx->ifc_txqs and
ctx->ifc_rxqs. This prevents the invalid free(), and ensures that we
still properly cleanup after ourselves as before when failing to
allocate.
Submitted by: Jacob Keller
Reviewed by: gallatin, sbruno
Sponsored by: Intel Corporation
Differential Revision: https://reviews.freebsd.org/D15285
2018-05-04 15:20:34 +00:00
|
|
|
if (!(ctx->ifc_txqs =
|
2018-01-21 15:42:36 +00:00
|
|
|
(iflib_txq_t) malloc(sizeof(struct iflib_txq) *
|
|
|
|
ntxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) {
|
2016-05-18 04:35:58 +00:00
|
|
|
device_printf(dev, "Unable to allocate TX ring memory\n");
|
|
|
|
err = ENOMEM;
|
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Now allocate the RX */
|
iflib: fix invalid free during queue allocation failure
In r301567, code was added to cleanup to prevent memory leaks for the
Tx and Rx ring structs. This code carefully tracked txq and rxq, and
made sure to free them properly during cleanup.
Because we assigned the txq and rxq pointers into the ctx->ifc_txqs and
ctx->ifc_rxqs, we carefully reset these pointers to NULL, so that
cleanup code would not accidentally free the memory twice.
This was changed by r304021 ("Update iflib to support more NIC designs"),
which removed this resetting of the pointers to NULL, because it re-used
the txq and rxq pointers as an index into the queue set array.
Unfortunately, the cleanup code was left alone. Thus, if we fail to
allocate DMA or fail to configure the queues using the drivers ifdi
methods, we will attempt to free txq and rxq. These variables would now
incorrectly point to the wrong location, resulting in a page fault.
There are a number of methods to correct this, but ultimately the root
cause was that we reuse the txq and rxq pointers for two different
purposes.
Instead, when allocating, store the returned pointer directly into
ctx->ifc_txqs and ctx->ifc_rxqs. Then, assign this to txq and rxq as
index pointers before starting the loop to allocate each queue.
Drop the cleanup code for txq and rxq, and only use ctx->ifc_txqs and
ctx->ifc_rxqs.
Thus, we no longer need to free txq or rxq under any error flow, and
instead rely solely on the pointers stored in ctx->ifc_txqs and
ctx->ifc_rxqs. This prevents the invalid free(), and ensures that we
still properly cleanup after ourselves as before when failing to
allocate.
Submitted by: Jacob Keller
Reviewed by: gallatin, sbruno
Sponsored by: Intel Corporation
Differential Revision: https://reviews.freebsd.org/D15285
2018-05-04 15:20:34 +00:00
|
|
|
if (!(ctx->ifc_rxqs =
|
2018-01-21 15:42:36 +00:00
|
|
|
(iflib_rxq_t) malloc(sizeof(struct iflib_rxq) *
|
|
|
|
nrxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) {
|
2016-05-18 04:35:58 +00:00
|
|
|
device_printf(dev, "Unable to allocate RX ring memory\n");
|
|
|
|
err = ENOMEM;
|
|
|
|
goto rx_fail;
|
|
|
|
}
|
|
|
|
|
iflib: fix invalid free during queue allocation failure
In r301567, code was added to cleanup to prevent memory leaks for the
Tx and Rx ring structs. This code carefully tracked txq and rxq, and
made sure to free them properly during cleanup.
Because we assigned the txq and rxq pointers into the ctx->ifc_txqs and
ctx->ifc_rxqs, we carefully reset these pointers to NULL, so that
cleanup code would not accidentally free the memory twice.
This was changed by r304021 ("Update iflib to support more NIC designs"),
which removed this resetting of the pointers to NULL, because it re-used
the txq and rxq pointers as an index into the queue set array.
Unfortunately, the cleanup code was left alone. Thus, if we fail to
allocate DMA or fail to configure the queues using the drivers ifdi
methods, we will attempt to free txq and rxq. These variables would now
incorrectly point to the wrong location, resulting in a page fault.
There are a number of methods to correct this, but ultimately the root
cause was that we reuse the txq and rxq pointers for two different
purposes.
Instead, when allocating, store the returned pointer directly into
ctx->ifc_txqs and ctx->ifc_rxqs. Then, assign this to txq and rxq as
index pointers before starting the loop to allocate each queue.
Drop the cleanup code for txq and rxq, and only use ctx->ifc_txqs and
ctx->ifc_rxqs.
Thus, we no longer need to free txq or rxq under any error flow, and
instead rely solely on the pointers stored in ctx->ifc_txqs and
ctx->ifc_rxqs. This prevents the invalid free(), and ensures that we
still properly cleanup after ourselves as before when failing to
allocate.
Submitted by: Jacob Keller
Reviewed by: gallatin, sbruno
Sponsored by: Intel Corporation
Differential Revision: https://reviews.freebsd.org/D15285
2018-05-04 15:20:34 +00:00
|
|
|
txq = ctx->ifc_txqs;
|
|
|
|
rxq = ctx->ifc_rxqs;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* XXX handle allocation failure
|
|
|
|
*/
|
2016-07-06 14:09:49 +00:00
|
|
|
for (txconf = i = 0, cpu = CPU_FIRST(); i < ntxqsets; i++, txconf++, txq++, cpu = CPU_NEXT(cpu)) {
|
2016-05-18 04:35:58 +00:00
|
|
|
/* Set up some basics */
|
|
|
|
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
if ((ifdip = malloc(sizeof(struct iflib_dma_info) * ntxqs,
|
|
|
|
M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
|
|
|
|
device_printf(dev,
|
|
|
|
"Unable to allocate TX DMA info memory\n");
|
2016-05-18 04:35:58 +00:00
|
|
|
err = ENOMEM;
|
2016-06-07 20:26:00 +00:00
|
|
|
goto err_tx_desc;
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
txq->ift_ifdi = ifdip;
|
|
|
|
for (j = 0; j < ntxqs; j++, ifdip++) {
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
if (iflib_dma_alloc(ctx, txqsizes[j], ifdip, 0)) {
|
|
|
|
device_printf(dev,
|
|
|
|
"Unable to allocate TX descriptors\n");
|
2016-05-18 04:35:58 +00:00
|
|
|
err = ENOMEM;
|
|
|
|
goto err_tx_desc;
|
|
|
|
}
|
2017-03-13 22:53:06 +00:00
|
|
|
txq->ift_txd_size[j] = scctx->isc_txd_size[j];
|
2016-05-18 04:35:58 +00:00
|
|
|
bzero((void *)ifdip->idi_vaddr, txqsizes[j]);
|
|
|
|
}
|
|
|
|
txq->ift_ctx = ctx;
|
|
|
|
txq->ift_id = i;
|
2016-08-12 21:29:44 +00:00
|
|
|
if (sctx->isc_flags & IFLIB_HAS_TXCQ) {
|
|
|
|
txq->ift_br_offset = 1;
|
|
|
|
} else {
|
|
|
|
txq->ift_br_offset = 0;
|
|
|
|
}
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
if (iflib_txsd_alloc(txq)) {
|
|
|
|
device_printf(dev, "Critical Failure setting up TX buffers\n");
|
|
|
|
err = ENOMEM;
|
|
|
|
goto err_tx_desc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Initialize the TX lock */
|
2019-05-06 20:56:41 +00:00
|
|
|
snprintf(txq->ift_mtx_name, MTX_NAME_LEN, "%s:TX(%d):callout",
|
2016-05-18 04:35:58 +00:00
|
|
|
device_get_nameunit(dev), txq->ift_id);
|
|
|
|
mtx_init(&txq->ift_mtx, txq->ift_mtx_name, NULL, MTX_DEF);
|
|
|
|
callout_init_mtx(&txq->ift_timer, &txq->ift_mtx, 0);
|
2020-10-27 21:53:33 +00:00
|
|
|
txq->ift_timer.c_cpu = cpu;
|
|
|
|
#ifdef DEV_NETMAP
|
|
|
|
callout_init_mtx(&txq->ift_netmap_timer, &txq->ift_mtx, 0);
|
|
|
|
txq->ift_netmap_timer.c_cpu = cpu;
|
|
|
|
#endif /* DEV_NETMAP */
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2017-03-13 22:53:06 +00:00
|
|
|
err = ifmp_ring_alloc(&txq->ift_br, 2048, txq, iflib_txq_drain,
|
|
|
|
iflib_txq_can_drain, M_IFLIB, M_WAITOK);
|
|
|
|
if (err) {
|
|
|
|
/* XXX free any allocated rings */
|
|
|
|
device_printf(dev, "Unable to allocate buf_ring\n");
|
|
|
|
goto err_tx_desc;
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for (rxconf = i = 0; i < nrxqsets; i++, rxconf++, rxq++) {
|
|
|
|
/* Set up some basics */
|
2020-02-12 08:30:07 +00:00
|
|
|
callout_init(&rxq->ifr_watchdog, 1);
|
2016-05-18 04:35:58 +00:00
|
|
|
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
if ((ifdip = malloc(sizeof(struct iflib_dma_info) * nrxqs,
|
|
|
|
M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
|
|
|
|
device_printf(dev,
|
|
|
|
"Unable to allocate RX DMA info memory\n");
|
2016-05-18 04:35:58 +00:00
|
|
|
err = ENOMEM;
|
2016-06-07 20:26:00 +00:00
|
|
|
goto err_tx_desc;
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
rxq->ifr_ifdi = ifdip;
|
2017-03-13 22:53:06 +00:00
|
|
|
/* XXX this needs to be changed if #rx queues != #tx queues */
|
|
|
|
rxq->ifr_ntxqirq = 1;
|
|
|
|
rxq->ifr_txqid[0] = i;
|
2016-05-18 04:35:58 +00:00
|
|
|
for (j = 0; j < nrxqs; j++, ifdip++) {
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
if (iflib_dma_alloc(ctx, rxqsizes[j], ifdip, 0)) {
|
|
|
|
device_printf(dev,
|
|
|
|
"Unable to allocate RX descriptors\n");
|
2016-05-18 04:35:58 +00:00
|
|
|
err = ENOMEM;
|
|
|
|
goto err_tx_desc;
|
|
|
|
}
|
|
|
|
bzero((void *)ifdip->idi_vaddr, rxqsizes[j]);
|
|
|
|
}
|
|
|
|
rxq->ifr_ctx = ctx;
|
|
|
|
rxq->ifr_id = i;
|
2021-01-10 13:49:51 +00:00
|
|
|
rxq->ifr_fl_offset = fl_offset;
|
2016-05-18 04:35:58 +00:00
|
|
|
rxq->ifr_nfl = nfree_lists;
|
|
|
|
if (!(fl =
|
2018-01-21 15:42:36 +00:00
|
|
|
(iflib_fl_t) malloc(sizeof(struct iflib_fl) * nfree_lists, M_IFLIB, M_NOWAIT | M_ZERO))) {
|
2016-05-18 04:35:58 +00:00
|
|
|
device_printf(dev, "Unable to allocate free list memory\n");
|
|
|
|
err = ENOMEM;
|
2016-06-07 20:26:00 +00:00
|
|
|
goto err_tx_desc;
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
rxq->ifr_fl = fl;
|
|
|
|
for (j = 0; j < nfree_lists; j++) {
|
2017-03-13 22:53:06 +00:00
|
|
|
fl[j].ifl_rxq = rxq;
|
|
|
|
fl[j].ifl_id = j;
|
|
|
|
fl[j].ifl_ifdi = &rxq->ifr_ifdi[j + rxq->ifr_fl_offset];
|
|
|
|
fl[j].ifl_rxd_size = scctx->isc_rxd_size[j];
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
2018-10-12 22:40:54 +00:00
|
|
|
/* Allocate receive buffers for the ring */
|
2016-05-18 04:35:58 +00:00
|
|
|
if (iflib_rxsd_alloc(rxq)) {
|
|
|
|
device_printf(dev,
|
|
|
|
"Critical Failure setting up receive buffers\n");
|
|
|
|
err = ENOMEM;
|
|
|
|
goto err_rx_desc;
|
|
|
|
}
|
2017-07-03 18:23:35 +00:00
|
|
|
|
|
|
|
for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++)
|
2019-01-26 21:35:51 +00:00
|
|
|
fl->ifl_rx_bitmap = bit_alloc(fl->ifl_size, M_IFLIB,
|
|
|
|
M_WAITOK);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* TXQs */
|
|
|
|
vaddrs = malloc(sizeof(caddr_t)*ntxqsets*ntxqs, M_IFLIB, M_WAITOK);
|
|
|
|
paddrs = malloc(sizeof(uint64_t)*ntxqsets*ntxqs, M_IFLIB, M_WAITOK);
|
|
|
|
for (i = 0; i < ntxqsets; i++) {
|
|
|
|
iflib_dma_info_t di = ctx->ifc_txqs[i].ift_ifdi;
|
|
|
|
|
|
|
|
for (j = 0; j < ntxqs; j++, di++) {
|
|
|
|
vaddrs[i*ntxqs + j] = di->idi_vaddr;
|
|
|
|
paddrs[i*ntxqs + j] = di->idi_paddr;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if ((err = IFDI_TX_QUEUES_ALLOC(ctx, vaddrs, paddrs, ntxqs, ntxqsets)) != 0) {
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
device_printf(ctx->ifc_dev,
|
|
|
|
"Unable to allocate device TX queue\n");
|
2016-05-18 04:35:58 +00:00
|
|
|
iflib_tx_structures_free(ctx);
|
|
|
|
free(vaddrs, M_IFLIB);
|
|
|
|
free(paddrs, M_IFLIB);
|
|
|
|
goto err_rx_desc;
|
|
|
|
}
|
|
|
|
free(vaddrs, M_IFLIB);
|
|
|
|
free(paddrs, M_IFLIB);
|
|
|
|
|
|
|
|
/* RXQs */
|
|
|
|
vaddrs = malloc(sizeof(caddr_t)*nrxqsets*nrxqs, M_IFLIB, M_WAITOK);
|
|
|
|
paddrs = malloc(sizeof(uint64_t)*nrxqsets*nrxqs, M_IFLIB, M_WAITOK);
|
|
|
|
for (i = 0; i < nrxqsets; i++) {
|
|
|
|
iflib_dma_info_t di = ctx->ifc_rxqs[i].ifr_ifdi;
|
|
|
|
|
|
|
|
for (j = 0; j < nrxqs; j++, di++) {
|
|
|
|
vaddrs[i*nrxqs + j] = di->idi_vaddr;
|
|
|
|
paddrs[i*nrxqs + j] = di->idi_paddr;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if ((err = IFDI_RX_QUEUES_ALLOC(ctx, vaddrs, paddrs, nrxqs, nrxqsets)) != 0) {
|
o As illustrated by e. g. figure 7-14 of the Intel 82599 10 GbE
controller datasheet revision 3.3, in the context of Ethernet
MACs the control data describing the packet buffers typically
are named "descriptors". Each of these descriptors references
one buffer, multiple of which a packet can be composed of.
By contrast, in comments, messages and the names of structure
members, iflib(4) refers to DMA resources employed for RX and
TX buffers (rather than control data) as "desc(riptors)".
This odd naming convention of iflib(4) made reviewing r343085
and identifying wrong and missing bus_dmamap_sync(9) calls in
particular way harder than it already is. This convention may
also explain why the netmap(4) part of iflib(4) pairs the DMA
tags for control data with DMA maps of buffers and vice versa
in calls to bus_dma(9) functions.
Therefore, change iflib(4) to refer to buf(fers) when buffers
and not the usual understanding of descriptors is meant. This
change does not include corrections to the DMA resources used
in the netmap(4) parts. However, it revises error messages to
state which kind of allocation/creation failed. Specifically,
the "Unable to allocate tx_buffer (map) memory" copy & pasted
inappropriately on several occasions was replaced with proper
messages.
o Enhance some other error messages to indicate which half - RX
or TX - they apply to instead of using identical text in both
cases and generally canonicalize them.
o Correct the descriptions of iflib_{r,t}xsd_alloc() to reflect
reality; current code doesn't use {r,t}x_buffer structures.
o In iflib_queues_alloc():
- Remove redundant BUS_DMA_NOWAIT of iflib_dma_alloc() calls,
- change the M_WAITOK from malloc(9) calls into M_NOWAIT. The
return values are already checked, deferred DMA allocations
not being an option at this point, BUS_DMA_NOWAIT has to be
used anyway and prior malloc(9) calls in this function also
specify M_NOWAIT.
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D19067
2019-02-04 20:46:57 +00:00
|
|
|
device_printf(ctx->ifc_dev,
|
|
|
|
"Unable to allocate device RX queue\n");
|
2016-05-18 04:35:58 +00:00
|
|
|
iflib_tx_structures_free(ctx);
|
|
|
|
free(vaddrs, M_IFLIB);
|
|
|
|
free(paddrs, M_IFLIB);
|
|
|
|
goto err_rx_desc;
|
|
|
|
}
|
|
|
|
free(vaddrs, M_IFLIB);
|
|
|
|
free(paddrs, M_IFLIB);
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
/* XXX handle allocation failure changes */
|
|
|
|
err_rx_desc:
|
|
|
|
err_tx_desc:
|
iflib: fix invalid free during queue allocation failure
In r301567, code was added to cleanup to prevent memory leaks for the
Tx and Rx ring structs. This code carefully tracked txq and rxq, and
made sure to free them properly during cleanup.
Because we assigned the txq and rxq pointers into the ctx->ifc_txqs and
ctx->ifc_rxqs, we carefully reset these pointers to NULL, so that
cleanup code would not accidentally free the memory twice.
This was changed by r304021 ("Update iflib to support more NIC designs"),
which removed this resetting of the pointers to NULL, because it re-used
the txq and rxq pointers as an index into the queue set array.
Unfortunately, the cleanup code was left alone. Thus, if we fail to
allocate DMA or fail to configure the queues using the driver's ifdi
methods, we will attempt to free txq and rxq. These variables would now
incorrectly point to the wrong location, resulting in a page fault.
There are a number of methods to correct this, but ultimately the root
cause was that we reuse the txq and rxq pointers for two different
purposes.
Instead, when allocating, store the returned pointer directly into
ctx->ifc_txqs and ctx->ifc_rxqs. Then, assign this to txq and rxq as
index pointers before starting the loop to allocate each queue.
Drop the cleanup code for txq and rxq, and only use ctx->ifc_txqs and
ctx->ifc_rxqs.
Thus, we no longer need to free txq or rxq under any error flow, and
instead rely solely on the pointers stored in ctx->ifc_txqs and
ctx->ifc_rxqs. This prevents the invalid free(), and ensures that we
still properly clean up after ourselves as before when failing to
allocate.
Submitted by: Jacob Keller
Reviewed by: gallatin, sbruno
Sponsored by: Intel Corporation
Differential Revision: https://reviews.freebsd.org/D15285
2018-05-04 15:20:34 +00:00
|
|
|
rx_fail:
|
2016-05-18 04:35:58 +00:00
|
|
|
if (ctx->ifc_rxqs != NULL)
|
|
|
|
free(ctx->ifc_rxqs, M_IFLIB);
|
|
|
|
ctx->ifc_rxqs = NULL;
|
|
|
|
if (ctx->ifc_txqs != NULL)
|
|
|
|
free(ctx->ifc_txqs, M_IFLIB);
|
|
|
|
ctx->ifc_txqs = NULL;
|
|
|
|
fail:
|
|
|
|
return (err);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Run iflib_txq_setup() on every TX queue set of the context.
 *
 * Walks the array starting at ctx->ifc_txqs for NTXQSETS(ctx) entries.
 * Any result of iflib_txq_setup() is not inspected here; this function
 * always returns 0.
 */
static int
|
|
|
|
iflib_tx_structures_setup(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
iflib_txq_t txq = ctx->ifc_txqs;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < NTXQSETS(ctx); i++, txq++)
|
|
|
|
iflib_txq_setup(txq);
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Tear down all TX queue sets of the context.
 *
 * For each of the NTXQSETS(ctx) entries starting at ctx->ifc_txqs, calls
 * iflib_dma_free() on each of the sctx->isc_ntxqs elements of
 * txq->ift_ifdi and then iflib_txq_destroy() on the queue set itself.
 * Finally the ctx->ifc_txqs array is released with free(..., M_IFLIB) and
 * the pointer is reset to NULL, so a later "if (ctx->ifc_txqs != NULL)"
 * cleanup check sees it as already released.
 */
static void
|
|
|
|
iflib_tx_structures_free(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
iflib_txq_t txq = ctx->ifc_txqs;
|
2018-11-14 15:16:45 +00:00
|
|
|
if_shared_ctx_t sctx = ctx->ifc_sctx;
|
2016-05-18 04:35:58 +00:00
|
|
|
int i, j;
|
|
|
|
|
|
|
|
for (i = 0; i < NTXQSETS(ctx); i++, txq++) {
|
2018-11-14 15:16:45 +00:00
|
|
|
for (j = 0; j < sctx->isc_ntxqs; j++)
|
2016-05-18 04:35:58 +00:00
|
|
|
iflib_dma_free(&txq->ift_ifdi[j]);
|
2019-10-30 20:45:12 +00:00
|
|
|
iflib_txq_destroy(txq);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
free(ctx->ifc_txqs, M_IFLIB);
|
|
|
|
ctx->ifc_txqs = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*********************************************************************
|
|
|
|
*
|
|
|
|
* Initialize all receive rings.
*
* Iterates over the ctx->ifc_softc_ctx.isc_nrxqsets RX queue sets
* starting at ctx->ifc_rxqs. When INET or INET6 is defined, each
* queue set first gets its LRO state (rxq->ifr_lc) initialized via
* tcp_lro_init_args() with TCP_LRO_ENTRIES entries and
* min(1024, isc_nrxd[rxq->ifr_fl_offset]) as the last argument.
* IFDI_RXQ_SETUP() is then invoked with the queue set's ifr_id.
*
* Returns 0 on success. If tcp_lro_init_args() fails for queue set
* 'q', a message is printed, tcp_lro_free() is called for queue sets
* 0 .. q-1 and the error from tcp_lro_init_args() is returned.
* Without INET/INET6 there is no failure path and the function
* always returns 0.
|
|
|
|
*
|
|
|
|
**********************************************************************/
|
|
|
|
static int
|
|
|
|
iflib_rx_structures_setup(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
iflib_rxq_t rxq = ctx->ifc_rxqs;
|
2016-05-18 14:18:03 +00:00
|
|
|
int q;
|
|
|
|
#if defined(INET6) || defined(INET)
|
o Use iflib_fast_intr_rxtx() also for "legacy" interrupts, i. e. INTx and
MSI. Unlike as with iflib_fast_intr_ctx(), the former will also enqueue
_task_fn_tx() in addition to _task_fn_rx() if appropriate, bringing TCP
TX throughput of EM-class devices on par with the MSI-X case and, thus,
close to wirespeed/pre-iflib(4) times again. [1]
Note that independently of the interrupt type, the UDP performance with
these MACs still is abysmal and nowhere near to where it was before the
conversion of em(4) to iflib(4).
o In iflib_init_locked(), announce which free list failed to set up.
o In _task_fn_tx() when running netmap(4), issue ifdi_intr_enable instead
of the ifdi_tx_queue_intr_enable method in case of a "legacy" interrupt
as the latter is valid with MSI-X only.
o Instead of adding the missing - and apparently convoluted enough that a
DBG_COUNTER_INC was put into a wrong spot in _task_fn_rx() - checks for
ifdi_{r,t}x_queue_intr_enable being available in the MSI-X case also to
iflib_fast_intr_rxtx(), factor these out to iflib_device_register() and
make the checks fail gracefully rather than panic. This avoids invoking
the checks at runtime over and over again in iflib_fast_intr_rxtx() and
_task_fn_{r,t}x() - even if it's just in case of INVARIANTS - and makes
these functions more readable.
o In iflib_rx_structures_setup(), only initialize LRO resources if device
and driver have LRO capability in order to not waste memory. Also, free
the LRO resources again if setting them up fails for one of the queues.
However, don't bother invoking iflib_rx_sds_free() in that case because
iflib_rx_structures_setup() doesn't call iflib_rxsd_alloc() either (and
iflib_{device,pseudo}_register() will issue iflib_rx_sds_free() in case
of failure via iflib_rx_structures_free(), but there definitely is some
asymmetry left to be fixed, though).
o Similarly, free LRO resources again in iflib_rx_structures_free().
o In iflib_irq_set_affinity(), handle get_core_offset() errors gracefully
instead of panicing (but only in case of INVARIANTS). This is a follow-
up to r344132, as such driver bugs shouldn't be fatal.
o Likewise, handle unknown iflib_intr_type_t in iflib_irq_alloc_generic()
gracefully, too.
o Bring yet more sanity to iflib_msix_init():
- If the device doesn't provide enough MSI-X vectors or not all vectors
can be allocate so the expected number of queues in addition to admin
interrupts can't be supported, try MSI next (and then INTx) as proper
MSI-X vector distribution can't be assured in such cases. In essence,
this change brings r254008 forward to iflib(4). Also, this is the fix
alluded to in the commit message of r343934.
- If the MSI-X allocation has failed, don't prematurely announce MSI is
going to be used as the latter in fact may not be available either.
- When falling back to MSI, only release the MSI-X table resource again
if it was allocated in iflib_msix_init(), i. e. isn't supplied by the
driver, in the first place.
o In mp_ndesc_handler(), handle unknown type arguments gracefully, too.
PR: 235031 (likely) [1]
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D20175
2019-05-07 08:28:35 +00:00
|
|
|
int err, i;
|
2016-05-18 14:18:03 +00:00
|
|
|
#endif
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
|
for (q = 0; q < ctx->ifc_softc_ctx.isc_nrxqsets; q++, rxq++) {
|
2016-05-18 14:18:03 +00:00
|
|
|
#if defined(INET6) || defined(INET)
|
2021-04-23 09:51:22 +00:00
|
|
|
err = tcp_lro_init_args(&rxq->ifr_lc, ctx->ifc_ifp,
|
|
|
|
TCP_LRO_ENTRIES, min(1024,
|
|
|
|
ctx->ifc_softc_ctx.isc_nrxd[rxq->ifr_fl_offset]));
|
|
|
|
if (err != 0) {
|
|
|
|
device_printf(ctx->ifc_dev,
|
|
|
|
"LRO Initialization failed!\n");
|
|
|
|
goto fail;
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
2016-05-18 14:18:03 +00:00
|
|
|
#endif
|
2016-05-18 04:35:58 +00:00
|
|
|
IFDI_RXQ_SETUP(ctx, rxq->ifr_id);
|
|
|
|
}
|
|
|
|
return (0);
|
2016-05-18 14:18:03 +00:00
|
|
|
#if defined(INET6) || defined(INET)
|
2016-05-18 04:35:58 +00:00
|
|
|
fail:
|
|
|
|
/*
|
o Use iflib_fast_intr_rxtx() also for "legacy" interrupts, i. e. INTx and
MSI. Unlike as with iflib_fast_intr_ctx(), the former will also enqueue
_task_fn_tx() in addition to _task_fn_rx() if appropriate, bringing TCP
TX throughput of EM-class devices on par with the MSI-X case and, thus,
close to wirespeed/pre-iflib(4) times again. [1]
Note that independently of the interrupt type, the UDP performance with
these MACs still is abysmal and nowhere near to where it was before the
conversion of em(4) to iflib(4).
o In iflib_init_locked(), announce which free list failed to set up.
o In _task_fn_tx() when running netmap(4), issue ifdi_intr_enable instead
of the ifdi_tx_queue_intr_enable method in case of a "legacy" interrupt
as the latter is valid with MSI-X only.
o Instead of adding the missing - and apparently convoluted enough that a
DBG_COUNTER_INC was put into a wrong spot in _task_fn_rx() - checks for
ifdi_{r,t}x_queue_intr_enable being available in the MSI-X case also to
iflib_fast_intr_rxtx(), factor these out to iflib_device_register() and
make the checks fail gracefully rather than panic. This avoids invoking
the checks at runtime over and over again in iflib_fast_intr_rxtx() and
_task_fn_{r,t}x() - even if it's just in case of INVARIANTS - and makes
these functions more readable.
o In iflib_rx_structures_setup(), only initialize LRO resources if device
and driver have LRO capability in order to not waste memory. Also, free
the LRO resources again if setting them up fails for one of the queues.
However, don't bother invoking iflib_rx_sds_free() in that case because
iflib_rx_structures_setup() doesn't call iflib_rxsd_alloc() either (and
iflib_{device,pseudo}_register() will issue iflib_rx_sds_free() in case
of failure via iflib_rx_structures_free(), but there definitely is some
asymmetry left to be fixed, though).
o Similarly, free LRO resources again in iflib_rx_structures_free().
o In iflib_irq_set_affinity(), handle get_core_offset() errors gracefully
instead of panicing (but only in case of INVARIANTS). This is a follow-
up to r344132, as such driver bugs shouldn't be fatal.
o Likewise, handle unknown iflib_intr_type_t in iflib_irq_alloc_generic()
gracefully, too.
o Bring yet more sanity to iflib_msix_init():
- If the device doesn't provide enough MSI-X vectors or not all vectors
can be allocate so the expected number of queues in addition to admin
interrupts can't be supported, try MSI next (and then INTx) as proper
MSI-X vector distribution can't be assured in such cases. In essence,
this change brings r254008 forward to iflib(4). Also, this is the fix
alluded to in the commit message of r343934.
- If the MSI-X allocation has failed, don't prematurely announce MSI is
going to be used as the latter in fact may not be available either.
- When falling back to MSI, only release the MSI-X table resource again
if it was allocated in iflib_msix_init(), i. e. isn't supplied by the
driver, in the first place.
o In mp_ndesc_handler(), handle unknown type arguments gracefully, too.
PR: 235031 (likely) [1]
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D20175
2019-05-07 08:28:35 +00:00
|
|
|
* Free LRO resources allocated so far, we will only handle
|
2016-05-18 04:35:58 +00:00
|
|
|
* the rings that completed, the failing case will have
|
o Use iflib_fast_intr_rxtx() also for "legacy" interrupts, i. e. INTx and
MSI. Unlike as with iflib_fast_intr_ctx(), the former will also enqueue
_task_fn_tx() in addition to _task_fn_rx() if appropriate, bringing TCP
TX throughput of EM-class devices on par with the MSI-X case and, thus,
close to wirespeed/pre-iflib(4) times again. [1]
Note that independently of the interrupt type, the UDP performance with
these MACs still is abysmal and nowhere near to where it was before the
conversion of em(4) to iflib(4).
o In iflib_init_locked(), announce which free list failed to set up.
o In _task_fn_tx() when running netmap(4), issue ifdi_intr_enable instead
of the ifdi_tx_queue_intr_enable method in case of a "legacy" interrupt
as the latter is valid with MSI-X only.
o Instead of adding the missing - and apparently convoluted enough that a
DBG_COUNTER_INC was put into a wrong spot in _task_fn_rx() - checks for
ifdi_{r,t}x_queue_intr_enable being available in the MSI-X case also to
iflib_fast_intr_rxtx(), factor these out to iflib_device_register() and
make the checks fail gracefully rather than panic. This avoids invoking
the checks at runtime over and over again in iflib_fast_intr_rxtx() and
_task_fn_{r,t}x() - even if it's just in case of INVARIANTS - and makes
these functions more readable.
o In iflib_rx_structures_setup(), only initialize LRO resources if device
and driver have LRO capability in order to not waste memory. Also, free
the LRO resources again if setting them up fails for one of the queues.
However, don't bother invoking iflib_rx_sds_free() in that case because
iflib_rx_structures_setup() doesn't call iflib_rxsd_alloc() either (and
iflib_{device,pseudo}_register() will issue iflib_rx_sds_free() in case
of failure via iflib_rx_structures_free(), but there definitely is some
asymmetry left to be fixed, though).
o Similarly, free LRO resources again in iflib_rx_structures_free().
o In iflib_irq_set_affinity(), handle get_core_offset() errors gracefully
instead of panicing (but only in case of INVARIANTS). This is a follow-
up to r344132, as such driver bugs shouldn't be fatal.
o Likewise, handle unknown iflib_intr_type_t in iflib_irq_alloc_generic()
gracefully, too.
o Bring yet more sanity to iflib_msix_init():
- If the device doesn't provide enough MSI-X vectors or not all vectors
can be allocate so the expected number of queues in addition to admin
interrupts can't be supported, try MSI next (and then INTx) as proper
MSI-X vector distribution can't be assured in such cases. In essence,
this change brings r254008 forward to iflib(4). Also, this is the fix
alluded to in the commit message of r343934.
- If the MSI-X allocation has failed, don't prematurely announce MSI is
going to be used as the latter in fact may not be available either.
- When falling back to MSI, only release the MSI-X table resource again
if it was allocated in iflib_msix_init(), i. e. isn't supplied by the
driver, in the first place.
o In mp_ndesc_handler(), handle unknown type arguments gracefully, too.
PR: 235031 (likely) [1]
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D20175
2019-05-07 08:28:35 +00:00
|
|
|
* cleaned up for itself. 'q' failed, so its the terminus.
|
2016-05-18 04:35:58 +00:00
|
|
|
*/
|
|
|
|
rxq = ctx->ifc_rxqs;
|
|
|
|
for (i = 0; i < q; ++i, rxq++) {
|
2021-04-23 09:51:22 +00:00
|
|
|
tcp_lro_free(&rxq->ifr_lc);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
return (err);
|
2016-05-18 14:18:03 +00:00
|
|
|
#endif
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*********************************************************************
|
|
|
|
*
|
|
|
|
* Free all receive rings.
*
* For each of the ctx->ifc_softc_ctx.isc_nrxqsets RX queues this releases
* the sctx->isc_nrxqs descriptor DMA areas in ifr_ifdi[] via
* iflib_dma_free(), calls iflib_rx_sds_free() and, when built with INET or
* INET6, tcp_lro_free() on the queue LRO state.  Finally the ctx->ifc_rxqs
* array is freed and the pointer is reset to NULL.
|
|
|
|
*
|
|
|
|
**********************************************************************/
|
|
|
|
static void
|
|
|
|
iflib_rx_structures_free(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
iflib_rxq_t rxq = ctx->ifc_rxqs;
|
iflib: properly release memory allocated for DMA
DMA memory allocations using the bus_dma.h interface are not properly
released in all cases for both Tx and Rx. This causes ~448 bytes of
M_DEVBUF allocations to be leaked.
First, the DMA maps for Rx are not properly destroyed. A slight attempt
is made in iflib_fl_bufs_free to destroy the maps if we're detaching.
However, this function may not be reliably called during detach. Indeed,
there is a comment "asking" if this should be moved out.
Fix this by moving the bus_dmamap_destroy call into iflib_rx_sds_free,
where we already sync and unload the DMA.
Second, the DMA tag associated with the ifr_ifdi descriptor DMA is not
released properly anywhere. Add a call to iflib_dma_free in
iflib_rx_structures_free.
Third, use of NULL as a canary value on the map pointer returned by
bus_dmamap_create is not valid. On some platforms, notably x86, this
value may be NULL. In this case, we fail to properly release the related
resources.
Remove the NULL checks on map values in both iflib_fl_bufs_free and
iflib_txsd_destroy.
With all of these fixes applied, the leaks to M_DEVBUF are squelched,
and iflib drivers now seem to properly cleanup when detaching.
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Submitted by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed by: erj@, gallatin@
MFC after: 1 week
Sponsored by: Intel Corporation
Differential Revision: https://reviews.freebsd.org/D22203
2019-11-04 23:06:57 +00:00
|
|
|
if_shared_ctx_t sctx = ctx->ifc_sctx;
|
|
|
|
int i, j;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
o Use iflib_fast_intr_rxtx() also for "legacy" interrupts, i. e. INTx and
MSI. Unlike as with iflib_fast_intr_ctx(), the former will also enqueue
_task_fn_tx() in addition to _task_fn_rx() if appropriate, bringing TCP
TX throughput of EM-class devices on par with the MSI-X case and, thus,
close to wirespeed/pre-iflib(4) times again. [1]
Note that independently of the interrupt type, the UDP performance with
these MACs still is abysmal and nowhere near to where it was before the
conversion of em(4) to iflib(4).
o In iflib_init_locked(), announce which free list failed to set up.
o In _task_fn_tx() when running netmap(4), issue ifdi_intr_enable instead
of the ifdi_tx_queue_intr_enable method in case of a "legacy" interrupt
as the latter is valid with MSI-X only.
o Instead of adding the missing - and apparently convoluted enough that a
DBG_COUNTER_INC was put into a wrong spot in _task_fn_rx() - checks for
ifdi_{r,t}x_queue_intr_enable being available in the MSI-X case also to
iflib_fast_intr_rxtx(), factor these out to iflib_device_register() and
make the checks fail gracefully rather than panic. This avoids invoking
the checks at runtime over and over again in iflib_fast_intr_rxtx() and
_task_fn_{r,t}x() - even if it's just in case of INVARIANTS - and makes
these functions more readable.
o In iflib_rx_structures_setup(), only initialize LRO resources if device
and driver have LRO capability in order to not waste memory. Also, free
the LRO resources again if setting them up fails for one of the queues.
However, don't bother invoking iflib_rx_sds_free() in that case because
iflib_rx_structures_setup() doesn't call iflib_rxsd_alloc() either (and
iflib_{device,pseudo}_register() will issue iflib_rx_sds_free() in case
of failure via iflib_rx_structures_free(), but there definitely is some
asymmetry left to be fixed, though).
o Similarly, free LRO resources again in iflib_rx_structures_free().
o In iflib_irq_set_affinity(), handle get_core_offset() errors gracefully
instead of panicing (but only in case of INVARIANTS). This is a follow-
up to r344132, as such driver bugs shouldn't be fatal.
o Likewise, handle unknown iflib_intr_type_t in iflib_irq_alloc_generic()
gracefully, too.
o Bring yet more sanity to iflib_msix_init():
- If the device doesn't provide enough MSI-X vectors or not all vectors
can be allocate so the expected number of queues in addition to admin
interrupts can't be supported, try MSI next (and then INTx) as proper
MSI-X vector distribution can't be assured in such cases. In essence,
this change brings r254008 forward to iflib(4). Also, this is the fix
alluded to in the commit message of r343934.
- If the MSI-X allocation has failed, don't prematurely announce MSI is
going to be used as the latter in fact may not be available either.
- When falling back to MSI, only release the MSI-X table resource again
if it was allocated in iflib_msix_init(), i. e. isn't supplied by the
driver, in the first place.
o In mp_ndesc_handler(), handle unknown type arguments gracefully, too.
PR: 235031 (likely) [1]
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D20175
2019-05-07 08:28:35 +00:00
|
|
|
for (i = 0; i < ctx->ifc_softc_ctx.isc_nrxqsets; i++, rxq++) {
|
iflib: properly release memory allocated for DMA
DMA memory allocations using the bus_dma.h interface are not properly
released in all cases for both Tx and Rx. This causes ~448 bytes of
M_DEVBUF allocations to be leaked.
First, the DMA maps for Rx are not properly destroyed. A slight attempt
is made in iflib_fl_bufs_free to destroy the maps if we're detaching.
However, this function may not be reliably called during detach. Indeed,
there is a comment "asking" if this should be moved out.
Fix this by moving the bus_dmamap_destroy call into iflib_rx_sds_free,
where we already sync and unload the DMA.
Second, the DMA tag associated with the ifr_ifdi descriptor DMA is not
released properly anywhere. Add a call to iflib_dma_free in
iflib_rx_structures_free.
Third, use of NULL as a canary value on the map pointer returned by
bus_dmamap_create is not valid. On some platforms, notably x86, this
value may be NULL. In this case, we fail to properly release the related
resources.
Remove the NULL checks on map values in both iflib_fl_bufs_free and
iflib_txsd_destroy.
With all of these fixes applied, the leaks to M_DEVBUF are squelched,
and iflib drivers now seem to properly cleanup when detaching.
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Submitted by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed by: erj@, gallatin@
MFC after: 1 week
Sponsored by: Intel Corporation
Differential Revision: https://reviews.freebsd.org/D22203
2019-11-04 23:06:57 +00:00
|
|
|
for (j = 0; j < sctx->isc_nrxqs; j++)
|
|
|
|
iflib_dma_free(&rxq->ifr_ifdi[j]);
|
2016-05-18 04:35:58 +00:00
|
|
|
iflib_rx_sds_free(rxq);
|
2019-05-08 09:03:43 +00:00
|
|
|
#if defined(INET6) || defined(INET)
|
2021-04-23 09:51:22 +00:00
|
|
|
tcp_lro_free(&rxq->ifr_lc);
|
2019-05-08 09:03:43 +00:00
|
|
|
#endif
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
2018-10-12 22:40:54 +00:00
|
|
|
free(ctx->ifc_rxqs, M_IFLIB);
|
|
|
|
ctx->ifc_rxqs = NULL;
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Set up the TX and then the RX queue structures for ctx.
 *
 * Returns 0 on success, otherwise the error code of the setup routine that
 * failed; the failure is also reported via device_printf().  If the TX
 * setup fails, the RX setup is not attempted.
 */
static int
|
|
|
|
iflib_qset_structures_setup(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
int err;
|
|
|
|
|
2018-05-08 16:56:02 +00:00
|
|
|
/*
|
|
|
|
* It is expected that the caller takes care of freeing queues if this
|
|
|
|
* fails.
|
|
|
|
*/
|
2018-05-08 17:15:10 +00:00
|
|
|
if ((err = iflib_tx_structures_setup(ctx)) != 0) {
|
|
|
|
device_printf(ctx->ifc_dev, "iflib_tx_structures_setup failed: %d\n", err);
|
2016-05-18 04:35:58 +00:00
|
|
|
return (err);
|
2018-05-08 17:15:10 +00:00
|
|
|
}
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2018-05-08 16:56:02 +00:00
|
|
|
if ((err = iflib_rx_structures_setup(ctx)) != 0)
|
2016-05-18 04:35:58 +00:00
|
|
|
device_printf(ctx->ifc_dev, "iflib_rx_structures_setup failed: %d\n", err);
|
2018-05-08 16:56:02 +00:00
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
return (err);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Thin wrapper that forwards ctx, irq, rid, filter, handler, arg and name
 * to _iflib_irq_alloc() and returns its result.
 *
 * NOTE(review): filter_arg is accepted but not passed on to
 * _iflib_irq_alloc(); confirm whether that is intentional.
 */
int
|
|
|
|
iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid,
|
2018-05-29 21:56:39 +00:00
|
|
|
driver_filter_t filter, void *filter_arg, driver_intr_t handler, void *arg, const char *name)
|
2016-05-18 04:35:58 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
return (_iflib_irq_alloc(ctx, irq, rid, filter, handler, arg, name));
|
|
|
|
}
|
|
|
|
|
2017-12-20 01:03:34 +00:00
|
|
|
/*
 * Just to avoid copy/paste.
 *
 * Attach gtask to the taskqueue group tqg on the CPU chosen for queue qid.
 * The CPU comes from get_cpuid_for_queue(), which is given
 * ctx->ifc_sysctl_core_offset as the base CPU id and told whether type is
 * IFLIB_INTR_TX.  The interrupt resource irq->ii_res is handed to
 * taskqgroup_attach_cpu(), or NULL when irq itself is NULL.
 *
 * Returns 0 on success; otherwise logs the failure with device_printf() and
 * returns the error from taskqgroup_attach_cpu().
 */
|
|
|
|
static inline int
|
Make taskqgroup_attach{,_cpu}(9) work across architectures
So far, intr_{g,s}etaffinity(9) take a single int for identifying
a device interrupt. This approach doesn't work on all architectures
supported, as a single int isn't sufficient to globally specify a
device interrupt. In particular, with multiple interrupt controllers
in one system as found on e. g. arm and arm64 machines, an interrupt
number as returned by rman_get_start(9) may be only unique relative
to the bus and, thus, interrupt controller, a certain device hangs
off from.
In turn, this makes taskqgroup_attach{,_cpu}(9) and - internal to
the gtaskqueue implementation - taskqgroup_attach_deferred{,_cpu}()
not work across architectures. Yet in turn, iflib(4) as gtaskqueue
consumer so far doesn't fit architectures where interrupt numbers
aren't globally unique.
However, at least for intr_setaffinity(..., CPU_WHICH_IRQ, ...) as
employed by the gtaskqueue implementation to bind an interrupt to a
particular CPU, using bus_bind_intr(9) instead is equivalent from
a functional point of view, with bus_bind_intr(9) taking the device
and interrupt resource arguments required for uniquely specifying a
device interrupt.
Thus, change the gtaskqueue implementation to employ bus_bind_intr(9)
instead and intr_{g,s}etaffinity(9) to take the device and interrupt
resource arguments required respectively. This change also moves
struct grouptask from <sys/_task.h> to <sys/gtaskqueue.h> and wraps
struct gtask along with the gtask_fn_t typedef into #ifdef _KERNEL
as userland likes to include <sys/_task.h> or indirectly drags it
in - for better or worse also with _KERNEL defined -, which with
device_t and struct resource dependencies otherwise is no longer
as easily possible now.
The userland inclusion problem probably can be improved a bit by
introducing a _WANT_TASK (as well as a _WANT_MOUNT) akin to the
existing _WANT_PRISON etc., which is orthogonal to this change,
though, and likely needs an exp-run.
While at it:
- Change the gt_cpu member in the grouptask structure to be of type
int as used elswhere for specifying CPUs (an int16_t may be too
narrow sooner or later),
- move the gtaskqueue_enqueue_fn typedef from <sys/gtaskqueue.h> to
the gtaskqueue implementation as it's only used and needed there,
- change the GTASK_INIT macro to use "gtask" rather than "task" as
argument given that it actually operates on a struct gtask rather
than a struct task, and
- let subr_gtaskqueue.c consistently use __func__ to print functions
names.
Reported by: mmel
Reviewed by: mmel
Differential Revision: https://reviews.freebsd.org/D19139
2019-02-12 21:23:59 +00:00
|
|
|
iflib_irq_set_affinity(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type,
|
|
|
|
int qid, struct grouptask *gtask, struct taskqgroup *tqg, void *uniq,
|
|
|
|
const char *name)
|
2017-12-20 01:03:34 +00:00
|
|
|
{
|
Make taskqgroup_attach{,_cpu}(9) work across architectures
So far, intr_{g,s}etaffinity(9) take a single int for identifying
a device interrupt. This approach doesn't work on all architectures
supported, as a single int isn't sufficient to globally specify a
device interrupt. In particular, with multiple interrupt controllers
in one system as found on e. g. arm and arm64 machines, an interrupt
number as returned by rman_get_start(9) may be only unique relative
to the bus and, thus, interrupt controller, a certain device hangs
off from.
In turn, this makes taskqgroup_attach{,_cpu}(9) and - internal to
the gtaskqueue implementation - taskqgroup_attach_deferred{,_cpu}()
not work across architectures. Yet in turn, iflib(4) as gtaskqueue
consumer so far doesn't fit architectures where interrupt numbers
aren't globally unique.
However, at least for intr_setaffinity(..., CPU_WHICH_IRQ, ...) as
employed by the gtaskqueue implementation to bind an interrupt to a
particular CPU, using bus_bind_intr(9) instead is equivalent from
a functional point of view, with bus_bind_intr(9) taking the device
and interrupt resource arguments required for uniquely specifying a
device interrupt.
Thus, change the gtaskqueue implementation to employ bus_bind_intr(9)
instead and intr_{g,s}etaffinity(9) to take the device and interrupt
resource arguments required respectively. This change also moves
struct grouptask from <sys/_task.h> to <sys/gtaskqueue.h> and wraps
struct gtask along with the gtask_fn_t typedef into #ifdef _KERNEL
as userland likes to include <sys/_task.h> or indirectly drags it
in - for better or worse also with _KERNEL defined -, which with
device_t and struct resource dependencies otherwise is no longer
as easily possible now.
The userland inclusion problem probably can be improved a bit by
introducing a _WANT_TASK (as well as a _WANT_MOUNT) akin to the
existing _WANT_PRISON etc., which is orthogonal to this change,
though, and likely needs an exp-run.
While at it:
- Change the gt_cpu member in the grouptask structure to be of type
int as used elswhere for specifying CPUs (an int16_t may be too
narrow sooner or later),
- move the gtaskqueue_enqueue_fn typedef from <sys/gtaskqueue.h> to
the gtaskqueue implementation as it's only used and needed there,
- change the GTASK_INIT macro to use "gtask" rather than "task" as
argument given that it actually operates on a struct gtask rather
than a struct task, and
- let subr_gtaskqueue.c consistently use __func__ to print functions
names.
Reported by: mmel
Reviewed by: mmel
Differential Revision: https://reviews.freebsd.org/D19139
2019-02-12 21:23:59 +00:00
|
|
|
device_t dev;
|
iflib: Improve mapping of TX/RX queues to CPUs
iflib now supports mapping each (TX,RX) queue pair to the same CPU
(default), to separate CPUs, or to a pair of physical and logical CPUs
that share the same L2 cache. The mapping mechanism supports unequal
numbers of TX and RX queues, with the excess queues always being
mapped to consecutive physical CPUs. When the platform cannot
distinguish between physical and logical CPUs, all are treated as
physical CPUs. See the comment on get_cpuid_for_queue() for the
entire matrix.
The following device-specific tunables influence the mapping process:
dev.<device>.<unit>.iflib.core_offset (existing)
dev.<device>.<unit>.iflib.separate_txrx (existing)
dev.<device>.<unit>.iflib.use_logical_cores (new)
The following new, read-only sysctls provide visibility of the mapping
results:
dev.<device>.<unit>.iflib.{t,r}xq<n>.cpu
When an iflib driver allocates TX softirqs without providing reference
RX IRQs, iflib now binds those TX softirqs to CPUs using the above
mapping mechanism (that is, treats them as if they were TX IRQs).
Previously, such bindings were left up to the grouptaskqueue code and
thus fell outside of the iflib CPU mapping strategy.
Reviewed by: kbowling
Tested by: olivier, pkelsey
MFC after: 3 weeks
Differential Revision: https://reviews.freebsd.org/D24094
2021-04-26 04:25:59 +00:00
|
|
|
unsigned int base_cpuid, cpuid;
|
|
|
|
int err;
|
2017-12-20 01:03:34 +00:00
|
|
|
|
Make taskqgroup_attach{,_cpu}(9) work across architectures
So far, intr_{g,s}etaffinity(9) take a single int for identifying
a device interrupt. This approach doesn't work on all architectures
supported, as a single int isn't sufficient to globally specify a
device interrupt. In particular, with multiple interrupt controllers
in one system as found on e. g. arm and arm64 machines, an interrupt
number as returned by rman_get_start(9) may be only unique relative
to the bus and, thus, interrupt controller, a certain device hangs
off from.
In turn, this makes taskqgroup_attach{,_cpu}(9) and - internal to
the gtaskqueue implementation - taskqgroup_attach_deferred{,_cpu}()
not work across architectures. Yet in turn, iflib(4) as gtaskqueue
consumer so far doesn't fit architectures where interrupt numbers
aren't globally unique.
However, at least for intr_setaffinity(..., CPU_WHICH_IRQ, ...) as
employed by the gtaskqueue implementation to bind an interrupt to a
particular CPU, using bus_bind_intr(9) instead is equivalent from
a functional point of view, with bus_bind_intr(9) taking the device
and interrupt resource arguments required for uniquely specifying a
device interrupt.
Thus, change the gtaskqueue implementation to employ bus_bind_intr(9)
instead and intr_{g,s}etaffinity(9) to take the device and interrupt
resource arguments required respectively. This change also moves
struct grouptask from <sys/_task.h> to <sys/gtaskqueue.h> and wraps
struct gtask along with the gtask_fn_t typedef into #ifdef _KERNEL
as userland likes to include <sys/_task.h> or indirectly drags it
in - for better or worse also with _KERNEL defined -, which with
device_t and struct resource dependencies otherwise is no longer
as easily possible now.
The userland inclusion problem probably can be improved a bit by
introducing a _WANT_TASK (as well as a _WANT_MOUNT) akin to the
existing _WANT_PRISON etc., which is orthogonal to this change,
though, and likely needs an exp-run.
While at it:
- Change the gt_cpu member in the grouptask structure to be of type
int as used elswhere for specifying CPUs (an int16_t may be too
narrow sooner or later),
- move the gtaskqueue_enqueue_fn typedef from <sys/gtaskqueue.h> to
the gtaskqueue implementation as it's only used and needed there,
- change the GTASK_INIT macro to use "gtask" rather than "task" as
argument given that it actually operates on a struct gtask rather
than a struct task, and
- let subr_gtaskqueue.c consistently use __func__ to print functions
names.
Reported by: mmel
Reviewed by: mmel
Differential Revision: https://reviews.freebsd.org/D19139
2019-02-12 21:23:59 +00:00
|
|
|
dev = ctx->ifc_dev;
|
iflib: Improve mapping of TX/RX queues to CPUs
iflib now supports mapping each (TX,RX) queue pair to the same CPU
(default), to separate CPUs, or to a pair of physical and logical CPUs
that share the same L2 cache. The mapping mechanism supports unequal
numbers of TX and RX queues, with the excess queues always being
mapped to consecutive physical CPUs. When the platform cannot
distinguish between physical and logical CPUs, all are treated as
physical CPUs. See the comment on get_cpuid_for_queue() for the
entire matrix.
The following device-specific tunables influence the mapping process:
dev.<device>.<unit>.iflib.core_offset (existing)
dev.<device>.<unit>.iflib.separate_txrx (existing)
dev.<device>.<unit>.iflib.use_logical_cores (new)
The following new, read-only sysctls provide visibility of the mapping
results:
dev.<device>.<unit>.iflib.{t,r}xq<n>.cpu
When an iflib driver allocates TX softirqs without providing reference
RX IRQs, iflib now binds those TX softirqs to CPUs using the above
mapping mechanism (that is, treats them as if they were TX IRQs).
Previously, such bindings were left up to the grouptaskqueue code and
thus fell outside of the iflib CPU mapping strategy.
Reviewed by: kbowling
Tested by: olivier, pkelsey
MFC after: 3 weeks
Differential Revision: https://reviews.freebsd.org/D24094
2021-04-26 04:25:59 +00:00
|
|
|
base_cpuid = ctx->ifc_sysctl_core_offset;
|
|
|
|
cpuid = get_cpuid_for_queue(ctx, base_cpuid, qid, type == IFLIB_INTR_TX);
|
|
|
|
err = taskqgroup_attach_cpu(tqg, gtask, uniq, cpuid, dev,
|
|
|
|
irq ? irq->ii_res : NULL, name);
|
2017-12-20 01:03:34 +00:00
|
|
|
if (err) {
|
Make taskqgroup_attach{,_cpu}(9) work across architectures
So far, intr_{g,s}etaffinity(9) take a single int for identifying
a device interrupt. This approach doesn't work on all architectures
supported, as a single int isn't sufficient to globally specify a
device interrupt. In particular, with multiple interrupt controllers
in one system as found on e. g. arm and arm64 machines, an interrupt
number as returned by rman_get_start(9) may be only unique relative
to the bus and, thus, interrupt controller, a certain device hangs
off from.
In turn, this makes taskqgroup_attach{,_cpu}(9) and - internal to
the gtaskqueue implementation - taskqgroup_attach_deferred{,_cpu}()
not work across architectures. Yet in turn, iflib(4) as gtaskqueue
consumer so far doesn't fit architectures where interrupt numbers
aren't globally unique.
However, at least for intr_setaffinity(..., CPU_WHICH_IRQ, ...) as
employed by the gtaskqueue implementation to bind an interrupt to a
particular CPU, using bus_bind_intr(9) instead is equivalent from
a functional point of view, with bus_bind_intr(9) taking the device
and interrupt resource arguments required for uniquely specifying a
device interrupt.
Thus, change the gtaskqueue implementation to employ bus_bind_intr(9)
instead and intr_{g,s}etaffinity(9) to take the device and interrupt
resource arguments required respectively. This change also moves
struct grouptask from <sys/_task.h> to <sys/gtaskqueue.h> and wraps
struct gtask along with the gtask_fn_t typedef into #ifdef _KERNEL
as userland likes to include <sys/_task.h> or indirectly drags it
in - for better or worse also with _KERNEL defined -, which with
device_t and struct resource dependencies otherwise is no longer
as easily possible now.
The userland inclusion problem probably can be improved a bit by
introducing a _WANT_TASK (as well as a _WANT_MOUNT) akin to the
existing _WANT_PRISON etc., which is orthogonal to this change,
though, and likely needs an exp-run.
While at it:
- Change the gt_cpu member in the grouptask structure to be of type
int as used elswhere for specifying CPUs (an int16_t may be too
narrow sooner or later),
- move the gtaskqueue_enqueue_fn typedef from <sys/gtaskqueue.h> to
the gtaskqueue implementation as it's only used and needed there,
- change the GTASK_INIT macro to use "gtask" rather than "task" as
argument given that it actually operates on a struct gtask rather
than a struct task, and
- let subr_gtaskqueue.c consistently use __func__ to print functions
names.
Reported by: mmel
Reviewed by: mmel
Differential Revision: https://reviews.freebsd.org/D19139
2019-02-12 21:23:59 +00:00
|
|
|
device_printf(dev, "taskqgroup_attach_cpu failed %d\n", err);
|
2017-12-20 01:03:34 +00:00
|
|
|
return (err);
|
|
|
|
}
|
|
|
|
#ifdef notyet
|
|
|
|
if (cpuid > ctx->ifc_cpuid_highest)
|
|
|
|
ctx->ifc_cpuid_highest = cpuid;
|
|
|
|
#endif
|
o Use iflib_fast_intr_rxtx() also for "legacy" interrupts, i. e. INTx and
MSI. Unlike as with iflib_fast_intr_ctx(), the former will also enqueue
_task_fn_tx() in addition to _task_fn_rx() if appropriate, bringing TCP
TX throughput of EM-class devices on par with the MSI-X case and, thus,
close to wirespeed/pre-iflib(4) times again. [1]
Note that independently of the interrupt type, the UDP performance with
these MACs still is abysmal and nowhere near to where it was before the
conversion of em(4) to iflib(4).
o In iflib_init_locked(), announce which free list failed to set up.
o In _task_fn_tx() when running netmap(4), issue ifdi_intr_enable instead
of the ifdi_tx_queue_intr_enable method in case of a "legacy" interrupt
as the latter is valid with MSI-X only.
o Instead of adding the missing - and apparently convoluted enough that a
DBG_COUNTER_INC was put into a wrong spot in _task_fn_rx() - checks for
ifdi_{r,t}x_queue_intr_enable being available in the MSI-X case also to
iflib_fast_intr_rxtx(), factor these out to iflib_device_register() and
make the checks fail gracefully rather than panic. This avoids invoking
the checks at runtime over and over again in iflib_fast_intr_rxtx() and
_task_fn_{r,t}x() - even if it's just in case of INVARIANTS - and makes
these functions more readable.
o In iflib_rx_structures_setup(), only initialize LRO resources if device
and driver have LRO capability in order to not waste memory. Also, free
the LRO resources again if setting them up fails for one of the queues.
However, don't bother invoking iflib_rx_sds_free() in that case because
iflib_rx_structures_setup() doesn't call iflib_rxsd_alloc() either (and
iflib_{device,pseudo}_register() will issue iflib_rx_sds_free() in case
of failure via iflib_rx_structures_free(), but there definitely is some
asymmetry left to be fixed, though).
o Similarly, free LRO resources again in iflib_rx_structures_free().
o In iflib_irq_set_affinity(), handle get_core_offset() errors gracefully
instead of panicing (but only in case of INVARIANTS). This is a follow-
up to r344132, as such driver bugs shouldn't be fatal.
o Likewise, handle unknown iflib_intr_type_t in iflib_irq_alloc_generic()
gracefully, too.
o Bring yet more sanity to iflib_msix_init():
- If the device doesn't provide enough MSI-X vectors or not all vectors
can be allocate so the expected number of queues in addition to admin
interrupts can't be supported, try MSI next (and then INTx) as proper
MSI-X vector distribution can't be assured in such cases. In essence,
this change brings r254008 forward to iflib(4). Also, this is the fix
alluded to in the commit message of r343934.
- If the MSI-X allocation has failed, don't prematurely announce MSI is
going to be used as the latter in fact may not be available either.
- When falling back to MSI, only release the MSI-X table resource again
if it was allocated in iflib_msix_init(), i. e. isn't supplied by the
driver, in the first place.
o In mp_ndesc_handler(), handle unknown type arguments gracefully, too.
PR: 235031 (likely) [1]
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D20175
2019-05-07 08:28:35 +00:00
|
|
|
return (0);
|
2017-12-20 01:03:34 +00:00
|
|
|
}
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
int
|
|
|
|
iflib_irq_alloc_generic(if_ctx_t ctx, if_irq_t irq, int rid,
|
2018-05-29 21:56:39 +00:00
|
|
|
iflib_intr_type_t type, driver_filter_t *filter,
|
|
|
|
void *filter_arg, int qid, const char *name)
|
2016-05-18 04:35:58 +00:00
|
|
|
{
|
Make taskqgroup_attach{,_cpu}(9) work across architectures
So far, intr_{g,s}etaffinity(9) take a single int for identifying
a device interrupt. This approach doesn't work on all architectures
supported, as a single int isn't sufficient to globally specify a
device interrupt. In particular, with multiple interrupt controllers
in one system as found on e. g. arm and arm64 machines, an interrupt
number as returned by rman_get_start(9) may be only unique relative
to the bus and, thus, interrupt controller, a certain device hangs
off from.
In turn, this makes taskqgroup_attach{,_cpu}(9) and - internal to
the gtaskqueue implementation - taskqgroup_attach_deferred{,_cpu}()
not work across architectures. Yet in turn, iflib(4) as gtaskqueue
consumer so far doesn't fit architectures where interrupt numbers
aren't globally unique.
However, at least for intr_setaffinity(..., CPU_WHICH_IRQ, ...) as
employed by the gtaskqueue implementation to bind an interrupt to a
particular CPU, using bus_bind_intr(9) instead is equivalent from
a functional point of view, with bus_bind_intr(9) taking the device
and interrupt resource arguments required for uniquely specifying a
device interrupt.
Thus, change the gtaskqueue implementation to employ bus_bind_intr(9)
instead and intr_{g,s}etaffinity(9) to take the device and interrupt
resource arguments required respectively. This change also moves
struct grouptask from <sys/_task.h> to <sys/gtaskqueue.h> and wraps
struct gtask along with the gtask_fn_t typedef into #ifdef _KERNEL
as userland likes to include <sys/_task.h> or indirectly drags it
in - for better or worse also with _KERNEL defined -, which with
device_t and struct resource dependencies otherwise is no longer
as easily possible now.
The userland inclusion problem probably can be improved a bit by
introducing a _WANT_TASK (as well as a _WANT_MOUNT) akin to the
existing _WANT_PRISON etc., which is orthogonal to this change,
though, and likely needs an exp-run.
While at it:
- Change the gt_cpu member in the grouptask structure to be of type
int as used elsewhere for specifying CPUs (an int16_t may be too
narrow sooner or later),
- move the gtaskqueue_enqueue_fn typedef from <sys/gtaskqueue.h> to
the gtaskqueue implementation as it's only used and needed there,
- change the GTASK_INIT macro to use "gtask" rather than "task" as
argument given that it actually operates on a struct gtask rather
than a struct task, and
- let subr_gtaskqueue.c consistently use __func__ to print function
names.
Reported by: mmel
Reviewed by: mmel
Differential Revision: https://reviews.freebsd.org/D19139
2019-02-12 21:23:59 +00:00
|
|
|
device_t dev;
|
2016-05-18 04:35:58 +00:00
|
|
|
struct grouptask *gtask;
|
|
|
|
struct taskqgroup *tqg;
|
|
|
|
iflib_filter_info_t info;
|
2016-08-12 21:29:44 +00:00
|
|
|
gtask_fn_t *fn;
|
2017-12-20 01:03:34 +00:00
|
|
|
int tqrid, err;
|
2017-03-13 22:53:06 +00:00
|
|
|
driver_filter_t *intr_fast;
|
2016-05-18 04:35:58 +00:00
|
|
|
void *q;
|
|
|
|
|
|
|
|
info = &ctx->ifc_filter_info;
|
2016-10-18 13:29:30 +00:00
|
|
|
tqrid = rid;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
switch (type) {
|
|
|
|
/* XXX merge tx/rx for netmap? */
|
|
|
|
case IFLIB_INTR_TX:
|
|
|
|
q = &ctx->ifc_txqs[qid];
|
|
|
|
info = &ctx->ifc_txqs[qid].ift_filter_info;
|
|
|
|
gtask = &ctx->ifc_txqs[qid].ift_task;
|
2017-09-16 02:41:38 +00:00
|
|
|
tqg = qgroup_if_io_tqg;
|
2016-05-18 04:35:58 +00:00
|
|
|
fn = _task_fn_tx;
|
2017-03-13 22:53:06 +00:00
|
|
|
intr_fast = iflib_fast_intr;
|
2016-11-18 04:19:21 +00:00
|
|
|
GROUPTASK_INIT(gtask, 0, fn, q);
|
2018-05-16 21:03:22 +00:00
|
|
|
ctx->ifc_flags |= IFC_NETMAP_TX_IRQ;
|
2016-05-18 04:35:58 +00:00
|
|
|
break;
|
|
|
|
case IFLIB_INTR_RX:
|
|
|
|
q = &ctx->ifc_rxqs[qid];
|
|
|
|
info = &ctx->ifc_rxqs[qid].ifr_filter_info;
|
|
|
|
gtask = &ctx->ifc_rxqs[qid].ifr_task;
|
2017-09-16 02:41:38 +00:00
|
|
|
tqg = qgroup_if_io_tqg;
|
2016-05-18 04:35:58 +00:00
|
|
|
fn = _task_fn_rx;
|
2017-09-16 02:41:38 +00:00
|
|
|
intr_fast = iflib_fast_intr;
|
2020-02-11 18:57:07 +00:00
|
|
|
NET_GROUPTASK_INIT(gtask, 0, fn, q);
|
2017-03-13 22:53:06 +00:00
|
|
|
break;
|
|
|
|
case IFLIB_INTR_RXTX:
|
|
|
|
q = &ctx->ifc_rxqs[qid];
|
|
|
|
info = &ctx->ifc_rxqs[qid].ifr_filter_info;
|
|
|
|
gtask = &ctx->ifc_rxqs[qid].ifr_task;
|
2017-09-16 02:41:38 +00:00
|
|
|
tqg = qgroup_if_io_tqg;
|
2017-03-13 22:53:06 +00:00
|
|
|
fn = _task_fn_rx;
|
|
|
|
intr_fast = iflib_fast_intr_rxtx;
|
2020-02-11 18:57:07 +00:00
|
|
|
NET_GROUPTASK_INIT(gtask, 0, fn, q);
|
2016-05-18 04:35:58 +00:00
|
|
|
break;
|
|
|
|
case IFLIB_INTR_ADMIN:
|
|
|
|
q = ctx;
|
2016-11-18 04:19:21 +00:00
|
|
|
tqrid = -1;
|
2016-05-18 04:35:58 +00:00
|
|
|
info = &ctx->ifc_filter_info;
|
|
|
|
gtask = &ctx->ifc_admin_task;
|
2017-09-16 02:41:38 +00:00
|
|
|
tqg = qgroup_if_config_tqg;
|
2016-05-18 04:35:58 +00:00
|
|
|
fn = _task_fn_admin;
|
2017-03-13 22:53:06 +00:00
|
|
|
intr_fast = iflib_fast_intr_ctx;
|
2016-05-18 04:35:58 +00:00
|
|
|
break;
|
|
|
|
default:
|
o Use iflib_fast_intr_rxtx() also for "legacy" interrupts, i. e. INTx and
MSI. Unlike as with iflib_fast_intr_ctx(), the former will also enqueue
_task_fn_tx() in addition to _task_fn_rx() if appropriate, bringing TCP
TX throughput of EM-class devices on par with the MSI-X case and, thus,
close to wirespeed/pre-iflib(4) times again. [1]
Note that independently of the interrupt type, the UDP performance with
these MACs still is abysmal and nowhere near to where it was before the
conversion of em(4) to iflib(4).
o In iflib_init_locked(), announce which free list failed to set up.
o In _task_fn_tx() when running netmap(4), issue ifdi_intr_enable instead
of the ifdi_tx_queue_intr_enable method in case of a "legacy" interrupt
as the latter is valid with MSI-X only.
o Instead of adding the missing - and apparently convoluted enough that a
DBG_COUNTER_INC was put into a wrong spot in _task_fn_rx() - checks for
ifdi_{r,t}x_queue_intr_enable being available in the MSI-X case also to
iflib_fast_intr_rxtx(), factor these out to iflib_device_register() and
make the checks fail gracefully rather than panic. This avoids invoking
the checks at runtime over and over again in iflib_fast_intr_rxtx() and
_task_fn_{r,t}x() - even if it's just in case of INVARIANTS - and makes
these functions more readable.
o In iflib_rx_structures_setup(), only initialize LRO resources if device
and driver have LRO capability in order to not waste memory. Also, free
the LRO resources again if setting them up fails for one of the queues.
However, don't bother invoking iflib_rx_sds_free() in that case because
iflib_rx_structures_setup() doesn't call iflib_rxsd_alloc() either (and
iflib_{device,pseudo}_register() will issue iflib_rx_sds_free() in case
of failure via iflib_rx_structures_free(), but there definitely is some
asymmetry left to be fixed, though).
o Similarly, free LRO resources again in iflib_rx_structures_free().
o In iflib_irq_set_affinity(), handle get_core_offset() errors gracefully
instead of panicking (but only in case of INVARIANTS). This is a follow-
up to r344132, as such driver bugs shouldn't be fatal.
o Likewise, handle unknown iflib_intr_type_t in iflib_irq_alloc_generic()
gracefully, too.
o Bring yet more sanity to iflib_msix_init():
- If the device doesn't provide enough MSI-X vectors or not all vectors
can be allocated so the expected number of queues in addition to admin
interrupts can't be supported, try MSI next (and then INTx) as proper
MSI-X vector distribution can't be assured in such cases. In essence,
this change brings r254008 forward to iflib(4). Also, this is the fix
alluded to in the commit message of r343934.
- If the MSI-X allocation has failed, don't prematurely announce MSI is
going to be used as the latter in fact may not be available either.
- When falling back to MSI, only release the MSI-X table resource again
if it was allocated in iflib_msix_init(), i. e. isn't supplied by the
driver, in the first place.
o In mp_ndesc_handler(), handle unknown type arguments gracefully, too.
PR: 235031 (likely) [1]
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D20175
2019-05-07 08:28:35 +00:00
|
|
|
device_printf(ctx->ifc_dev, "%s: unknown net intr type\n",
|
|
|
|
__func__);
|
|
|
|
return (EINVAL);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
info->ifi_filter = filter;
|
|
|
|
info->ifi_filter_arg = filter_arg;
|
|
|
|
info->ifi_task = gtask;
|
2017-03-13 22:53:06 +00:00
|
|
|
info->ifi_ctx = q;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
Make taskqgroup_attach{,_cpu}(9) work across architectures
So far, intr_{g,s}etaffinity(9) take a single int for identifying
a device interrupt. This approach doesn't work on all architectures
supported, as a single int isn't sufficient to globally specify a
device interrupt. In particular, with multiple interrupt controllers
in one system as found on e. g. arm and arm64 machines, an interrupt
number as returned by rman_get_start(9) may be only unique relative
to the bus and, thus, interrupt controller, a certain device hangs
off from.
In turn, this makes taskqgroup_attach{,_cpu}(9) and - internal to
the gtaskqueue implementation - taskqgroup_attach_deferred{,_cpu}()
not work across architectures. Yet in turn, iflib(4) as gtaskqueue
consumer so far doesn't fit architectures where interrupt numbers
aren't globally unique.
However, at least for intr_setaffinity(..., CPU_WHICH_IRQ, ...) as
employed by the gtaskqueue implementation to bind an interrupt to a
particular CPU, using bus_bind_intr(9) instead is equivalent from
a functional point of view, with bus_bind_intr(9) taking the device
and interrupt resource arguments required for uniquely specifying a
device interrupt.
Thus, change the gtaskqueue implementation to employ bus_bind_intr(9)
instead and intr_{g,s}etaffinity(9) to take the device and interrupt
resource arguments required respectively. This change also moves
struct grouptask from <sys/_task.h> to <sys/gtaskqueue.h> and wraps
struct gtask along with the gtask_fn_t typedef into #ifdef _KERNEL
as userland likes to include <sys/_task.h> or indirectly drags it
in - for better or worse also with _KERNEL defined -, which with
device_t and struct resource dependencies otherwise is no longer
as easily possible now.
The userland inclusion problem probably can be improved a bit by
introducing a _WANT_TASK (as well as a _WANT_MOUNT) akin to the
existing _WANT_PRISON etc., which is orthogonal to this change,
though, and likely needs an exp-run.
While at it:
- Change the gt_cpu member in the grouptask structure to be of type
int as used elsewhere for specifying CPUs (an int16_t may be too
narrow sooner or later),
- move the gtaskqueue_enqueue_fn typedef from <sys/gtaskqueue.h> to
the gtaskqueue implementation as it's only used and needed there,
- change the GTASK_INIT macro to use "gtask" rather than "task" as
argument given that it actually operates on a struct gtask rather
than a struct task, and
- let subr_gtaskqueue.c consistently use __func__ to print functions
names.
Reported by: mmel
Reviewed by: mmel
Differential Revision: https://reviews.freebsd.org/D19139
2019-02-12 21:23:59 +00:00
|
|
|
dev = ctx->ifc_dev;
|
2017-03-13 22:53:06 +00:00
|
|
|
err = _iflib_irq_alloc(ctx, irq, rid, intr_fast, NULL, info, name);
|
2016-11-18 04:19:21 +00:00
|
|
|
if (err != 0) {
|
Make taskqgroup_attach{,_cpu}(9) work across architectures
So far, intr_{g,s}etaffinity(9) take a single int for identifying
a device interrupt. This approach doesn't work on all architectures
supported, as a single int isn't sufficient to globally specify a
device interrupt. In particular, with multiple interrupt controllers
in one system as found on e. g. arm and arm64 machines, an interrupt
number as returned by rman_get_start(9) may be only unique relative
to the bus and, thus, interrupt controller, a certain device hangs
off from.
In turn, this makes taskqgroup_attach{,_cpu}(9) and - internal to
the gtaskqueue implementation - taskqgroup_attach_deferred{,_cpu}()
not work across architectures. Yet in turn, iflib(4) as gtaskqueue
consumer so far doesn't fit architectures where interrupt numbers
aren't globally unique.
However, at least for intr_setaffinity(..., CPU_WHICH_IRQ, ...) as
employed by the gtaskqueue implementation to bind an interrupt to a
particular CPU, using bus_bind_intr(9) instead is equivalent from
a functional point of view, with bus_bind_intr(9) taking the device
and interrupt resource arguments required for uniquely specifying a
device interrupt.
Thus, change the gtaskqueue implementation to employ bus_bind_intr(9)
instead and intr_{g,s}etaffinity(9) to take the device and interrupt
resource arguments required respectively. This change also moves
struct grouptask from <sys/_task.h> to <sys/gtaskqueue.h> and wraps
struct gtask along with the gtask_fn_t typedef into #ifdef _KERNEL
as userland likes to include <sys/_task.h> or indirectly drags it
in - for better or worse also with _KERNEL defined -, which with
device_t and struct resource dependencies otherwise is no longer
as easily possible now.
The userland inclusion problem probably can be improved a bit by
introducing a _WANT_TASK (as well as a _WANT_MOUNT) akin to the
existing _WANT_PRISON etc., which is orthogonal to this change,
though, and likely needs an exp-run.
While at it:
- Change the gt_cpu member in the grouptask structure to be of type
int as used elsewhere for specifying CPUs (an int16_t may be too
narrow sooner or later),
- move the gtaskqueue_enqueue_fn typedef from <sys/gtaskqueue.h> to
the gtaskqueue implementation as it's only used and needed there,
- change the GTASK_INIT macro to use "gtask" rather than "task" as
argument given that it actually operates on a struct gtask rather
than a struct task, and
- let subr_gtaskqueue.c consistently use __func__ to print functions
names.
Reported by: mmel
Reviewed by: mmel
Differential Revision: https://reviews.freebsd.org/D19139
2019-02-12 21:23:59 +00:00
|
|
|
device_printf(dev, "_iflib_irq_alloc failed %d\n", err);
|
2016-05-18 04:35:58 +00:00
|
|
|
return (err);
|
2016-11-18 04:19:21 +00:00
|
|
|
}
|
|
|
|
if (type == IFLIB_INTR_ADMIN)
|
|
|
|
return (0);
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
if (tqrid != -1) {
|
iflib: Improve mapping of TX/RX queues to CPUs
iflib now supports mapping each (TX,RX) queue pair to the same CPU
(default), to separate CPUs, or to a pair of physical and logical CPUs
that share the same L2 cache. The mapping mechanism supports unequal
numbers of TX and RX queues, with the excess queues always being
mapped to consecutive physical CPUs. When the platform cannot
distinguish between physical and logical CPUs, all are treated as
physical CPUs. See the comment on get_cpuid_for_queue() for the
entire matrix.
The following device-specific tunables influence the mapping process:
dev.<device>.<unit>.iflib.core_offset (existing)
dev.<device>.<unit>.iflib.separate_txrx (existing)
dev.<device>.<unit>.iflib.use_logical_cores (new)
The following new, read-only sysctls provide visibility of the mapping
results:
dev.<device>.<unit>.iflib.{t,r}xq<n>.cpu
When an iflib driver allocates TX softirqs without providing reference
RX IRQs, iflib now binds those TX softirqs to CPUs using the above
mapping mechanism (that is, treats them as if they were TX IRQs).
Previously, such bindings were left up to the grouptaskqueue code and
thus fell outside of the iflib CPU mapping strategy.
Reviewed by: kbowling
Tested by: olivier, pkelsey
MFC after: 3 weeks
Differential Revision: https://reviews.freebsd.org/D24094
2021-04-26 04:25:59 +00:00
|
|
|
err = iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg, q,
|
|
|
|
name);
|
2017-12-20 01:03:34 +00:00
|
|
|
if (err)
|
|
|
|
return (err);
|
2016-10-18 13:12:19 +00:00
|
|
|
} else {
|
Make taskqgroup_attach{,_cpu}(9) work across architectures
So far, intr_{g,s}etaffinity(9) take a single int for identifying
a device interrupt. This approach doesn't work on all architectures
supported, as a single int isn't sufficient to globally specify a
device interrupt. In particular, with multiple interrupt controllers
in one system as found on e. g. arm and arm64 machines, an interrupt
number as returned by rman_get_start(9) may be only unique relative
to the bus and, thus, interrupt controller, a certain device hangs
off from.
In turn, this makes taskqgroup_attach{,_cpu}(9) and - internal to
the gtaskqueue implementation - taskqgroup_attach_deferred{,_cpu}()
not work across architectures. Yet in turn, iflib(4) as gtaskqueue
consumer so far doesn't fit architectures where interrupt numbers
aren't globally unique.
However, at least for intr_setaffinity(..., CPU_WHICH_IRQ, ...) as
employed by the gtaskqueue implementation to bind an interrupt to a
particular CPU, using bus_bind_intr(9) instead is equivalent from
a functional point of view, with bus_bind_intr(9) taking the device
and interrupt resource arguments required for uniquely specifying a
device interrupt.
Thus, change the gtaskqueue implementation to employ bus_bind_intr(9)
instead and intr_{g,s}etaffinity(9) to take the device and interrupt
resource arguments required respectively. This change also moves
struct grouptask from <sys/_task.h> to <sys/gtaskqueue.h> and wraps
struct gtask along with the gtask_fn_t typedef into #ifdef _KERNEL
as userland likes to include <sys/_task.h> or indirectly drags it
in - for better or worse also with _KERNEL defined -, which with
device_t and struct resource dependencies otherwise is no longer
as easily possible now.
The userland inclusion problem probably can be improved a bit by
introducing a _WANT_TASK (as well as a _WANT_MOUNT) akin to the
existing _WANT_PRISON etc., which is orthogonal to this change,
though, and likely needs an exp-run.
While at it:
- Change the gt_cpu member in the grouptask structure to be of type
int as used elsewhere for specifying CPUs (an int16_t may be too
narrow sooner or later),
- move the gtaskqueue_enqueue_fn typedef from <sys/gtaskqueue.h> to
the gtaskqueue implementation as it's only used and needed there,
- change the GTASK_INIT macro to use "gtask" rather than "task" as
argument given that it actually operates on a struct gtask rather
than a struct task, and
- let subr_gtaskqueue.c consistently use __func__ to print functions
names.
Reported by: mmel
Reviewed by: mmel
Differential Revision: https://reviews.freebsd.org/D19139
2019-02-12 21:23:59 +00:00
|
|
|
taskqgroup_attach(tqg, gtask, q, dev, irq->ii_res, name);
|
2016-10-18 13:12:19 +00:00
|
|
|
}
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2018-10-12 22:40:54 +00:00
|
|
|
iflib_softirq_alloc_generic(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type, void *arg, int qid, const char *name)
|
2016-05-18 04:35:58 +00:00
|
|
|
{
|
iflib: Improve mapping of TX/RX queues to CPUs
iflib now supports mapping each (TX,RX) queue pair to the same CPU
(default), to separate CPUs, or to a pair of physical and logical CPUs
that share the same L2 cache. The mapping mechanism supports unequal
numbers of TX and RX queues, with the excess queues always being
mapped to consecutive physical CPUs. When the platform cannot
distinguish between physical and logical CPUs, all are treated as
physical CPUs. See the comment on get_cpuid_for_queue() for the
entire matrix.
The following device-specific tunables influence the mapping process:
dev.<device>.<unit>.iflib.core_offset (existing)
dev.<device>.<unit>.iflib.separate_txrx (existing)
dev.<device>.<unit>.iflib.use_logical_cores (new)
The following new, read-only sysctls provide visibility of the mapping
results:
dev.<device>.<unit>.iflib.{t,r}xq<n>.cpu
When an iflib driver allocates TX softirqs without providing reference
RX IRQs, iflib now binds those TX softirqs to CPUs using the above
mapping mechanism (that is, treats them as if they were TX IRQs).
Previously, such bindings were left up to the grouptaskqueue code and
thus fell outside of the iflib CPU mapping strategy.
Reviewed by: kbowling
Tested by: olivier, pkelsey
MFC after: 3 weeks
Differential Revision: https://reviews.freebsd.org/D24094
2021-04-26 04:25:59 +00:00
|
|
|
device_t dev;
|
2016-05-18 04:35:58 +00:00
|
|
|
struct grouptask *gtask;
|
|
|
|
struct taskqgroup *tqg;
|
2016-08-12 21:29:44 +00:00
|
|
|
gtask_fn_t *fn;
|
2016-05-18 04:35:58 +00:00
|
|
|
void *q;
|
2017-12-20 01:03:34 +00:00
|
|
|
int err;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
switch (type) {
|
|
|
|
case IFLIB_INTR_TX:
|
|
|
|
q = &ctx->ifc_txqs[qid];
|
|
|
|
gtask = &ctx->ifc_txqs[qid].ift_task;
|
2017-09-16 02:41:38 +00:00
|
|
|
tqg = qgroup_if_io_tqg;
|
2016-05-18 04:35:58 +00:00
|
|
|
fn = _task_fn_tx;
|
2020-02-12 09:19:47 +00:00
|
|
|
GROUPTASK_INIT(gtask, 0, fn, q);
|
2016-05-18 04:35:58 +00:00
|
|
|
break;
|
|
|
|
case IFLIB_INTR_RX:
|
|
|
|
q = &ctx->ifc_rxqs[qid];
|
|
|
|
gtask = &ctx->ifc_rxqs[qid].ifr_task;
|
2017-09-16 02:41:38 +00:00
|
|
|
tqg = qgroup_if_io_tqg;
|
2016-05-18 04:35:58 +00:00
|
|
|
fn = _task_fn_rx;
|
2020-02-12 09:19:47 +00:00
|
|
|
NET_GROUPTASK_INIT(gtask, 0, fn, q);
|
2016-05-18 04:35:58 +00:00
|
|
|
break;
|
|
|
|
case IFLIB_INTR_IOV:
|
|
|
|
q = ctx;
|
|
|
|
gtask = &ctx->ifc_vflr_task;
|
2017-09-16 02:41:38 +00:00
|
|
|
tqg = qgroup_if_config_tqg;
|
2016-05-18 04:35:58 +00:00
|
|
|
fn = _task_fn_iov;
|
2020-02-12 09:19:47 +00:00
|
|
|
GROUPTASK_INIT(gtask, 0, fn, q);
|
2016-05-18 04:35:58 +00:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
panic("unknown net intr type");
|
|
|
|
}
|
iflib: Improve mapping of TX/RX queues to CPUs
iflib now supports mapping each (TX,RX) queue pair to the same CPU
(default), to separate CPUs, or to a pair of physical and logical CPUs
that share the same L2 cache. The mapping mechanism supports unequal
numbers of TX and RX queues, with the excess queues always being
mapped to consecutive physical CPUs. When the platform cannot
distinguish between physical and logical CPUs, all are treated as
physical CPUs. See the comment on get_cpuid_for_queue() for the
entire matrix.
The following device-specific tunables influence the mapping process:
dev.<device>.<unit>.iflib.core_offset (existing)
dev.<device>.<unit>.iflib.separate_txrx (existing)
dev.<device>.<unit>.iflib.use_logical_cores (new)
The following new, read-only sysctls provide visibility of the mapping
results:
dev.<device>.<unit>.iflib.{t,r}xq<n>.cpu
When an iflib driver allocates TX softirqs without providing reference
RX IRQs, iflib now binds those TX softirqs to CPUs using the above
mapping mechanism (that is, treats them as if they were TX IRQs).
Previously, such bindings were left up to the grouptaskqueue code and
thus fell outside of the iflib CPU mapping strategy.
Reviewed by: kbowling
Tested by: olivier, pkelsey
MFC after: 3 weeks
Differential Revision: https://reviews.freebsd.org/D24094
2021-04-26 04:25:59 +00:00
|
|
|
err = iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg, q, name);
|
|
|
|
if (err) {
|
|
|
|
dev = ctx->ifc_dev;
|
|
|
|
taskqgroup_attach(tqg, gtask, q, dev, irq ? irq->ii_res : NULL,
|
|
|
|
name);
|
2017-12-20 01:03:34 +00:00
|
|
|
}
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
iflib_irq_free(if_ctx_t ctx, if_irq_t irq)
|
|
|
|
{
|
2019-01-30 13:21:26 +00:00
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
if (irq->ii_tag)
|
|
|
|
bus_teardown_intr(ctx->ifc_dev, irq->ii_res, irq->ii_tag);
|
|
|
|
|
|
|
|
if (irq->ii_res)
|
2019-01-30 13:21:26 +00:00
|
|
|
bus_release_resource(ctx->ifc_dev, SYS_RES_IRQ,
|
|
|
|
rman_get_rid(irq->ii_res), irq->ii_res);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2018-05-29 21:56:39 +00:00
|
|
|
iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filter_arg, int *rid, const char *name)
|
2016-05-18 04:35:58 +00:00
|
|
|
{
|
|
|
|
iflib_txq_t txq = ctx->ifc_txqs;
|
|
|
|
iflib_rxq_t rxq = ctx->ifc_rxqs;
|
|
|
|
if_irq_t irq = &ctx->ifc_legacy_irq;
|
|
|
|
iflib_filter_info_t info;
|
Make taskqgroup_attach{,_cpu}(9) work across architectures
So far, intr_{g,s}etaffinity(9) take a single int for identifying
a device interrupt. This approach doesn't work on all architectures
supported, as a single int isn't sufficient to globally specify a
device interrupt. In particular, with multiple interrupt controllers
in one system as found on e. g. arm and arm64 machines, an interrupt
number as returned by rman_get_start(9) may be only unique relative
to the bus and, thus, interrupt controller, a certain device hangs
off from.
In turn, this makes taskqgroup_attach{,_cpu}(9) and - internal to
the gtaskqueue implementation - taskqgroup_attach_deferred{,_cpu}()
not work across architectures. Yet in turn, iflib(4) as gtaskqueue
consumer so far doesn't fit architectures where interrupt numbers
aren't globally unique.
However, at least for intr_setaffinity(..., CPU_WHICH_IRQ, ...) as
employed by the gtaskqueue implementation to bind an interrupt to a
particular CPU, using bus_bind_intr(9) instead is equivalent from
a functional point of view, with bus_bind_intr(9) taking the device
and interrupt resource arguments required for uniquely specifying a
device interrupt.
Thus, change the gtaskqueue implementation to employ bus_bind_intr(9)
instead and intr_{g,s}etaffinity(9) to take the device and interrupt
resource arguments required respectively. This change also moves
struct grouptask from <sys/_task.h> to <sys/gtaskqueue.h> and wraps
struct gtask along with the gtask_fn_t typedef into #ifdef _KERNEL
as userland likes to include <sys/_task.h> or indirectly drags it
in - for better or worse also with _KERNEL defined -, which with
device_t and struct resource dependencies otherwise is no longer
as easily possible now.
The userland inclusion problem probably can be improved a bit by
introducing a _WANT_TASK (as well as a _WANT_MOUNT) akin to the
existing _WANT_PRISON etc., which is orthogonal to this change,
though, and likely needs an exp-run.
While at it:
- Change the gt_cpu member in the grouptask structure to be of type
int as used elswhere for specifying CPUs (an int16_t may be too
narrow sooner or later),
- move the gtaskqueue_enqueue_fn typedef from <sys/gtaskqueue.h> to
the gtaskqueue implementation as it's only used and needed there,
- change the GTASK_INIT macro to use "gtask" rather than "task" as
argument given that it actually operates on a struct gtask rather
than a struct task, and
- let subr_gtaskqueue.c consistently use __func__ to print functions
names.
Reported by: mmel
Reviewed by: mmel
Differential Revision: https://reviews.freebsd.org/D19139
2019-02-12 21:23:59 +00:00
|
|
|
device_t dev;
|
2016-05-18 04:35:58 +00:00
|
|
|
struct grouptask *gtask;
|
Make taskqgroup_attach{,_cpu}(9) work across architectures
So far, intr_{g,s}etaffinity(9) take a single int for identifying
a device interrupt. This approach doesn't work on all architectures
supported, as a single int isn't sufficient to globally specify a
device interrupt. In particular, with multiple interrupt controllers
in one system as found on e. g. arm and arm64 machines, an interrupt
number as returned by rman_get_start(9) may be only unique relative
to the bus and, thus, interrupt controller, a certain device hangs
off from.
In turn, this makes taskqgroup_attach{,_cpu}(9) and - internal to
the gtaskqueue implementation - taskqgroup_attach_deferred{,_cpu}()
not work across architectures. Yet in turn, iflib(4) as gtaskqueue
consumer so far doesn't fit architectures where interrupt numbers
aren't globally unique.
However, at least for intr_setaffinity(..., CPU_WHICH_IRQ, ...) as
employed by the gtaskqueue implementation to bind an interrupt to a
particular CPU, using bus_bind_intr(9) instead is equivalent from
a functional point of view, with bus_bind_intr(9) taking the device
and interrupt resource arguments required for uniquely specifying a
device interrupt.
Thus, change the gtaskqueue implementation to employ bus_bind_intr(9)
instead and intr_{g,s}etaffinity(9) to take the device and interrupt
resource arguments required respectively. This change also moves
struct grouptask from <sys/_task.h> to <sys/gtaskqueue.h> and wraps
struct gtask along with the gtask_fn_t typedef into #ifdef _KERNEL
as userland likes to include <sys/_task.h> or indirectly drags it
in - for better or worse also with _KERNEL defined -, which with
device_t and struct resource dependencies otherwise is no longer
as easily possible now.
The userland inclusion problem probably can be improved a bit by
introducing a _WANT_TASK (as well as a _WANT_MOUNT) akin to the
existing _WANT_PRISON etc., which is orthogonal to this change,
though, and likely needs an exp-run.
While at it:
- Change the gt_cpu member in the grouptask structure to be of type
int as used elswhere for specifying CPUs (an int16_t may be too
narrow sooner or later),
- move the gtaskqueue_enqueue_fn typedef from <sys/gtaskqueue.h> to
the gtaskqueue implementation as it's only used and needed there,
- change the GTASK_INIT macro to use "gtask" rather than "task" as
argument given that it actually operates on a struct gtask rather
than a struct task, and
- let subr_gtaskqueue.c consistently use __func__ to print functions
names.
Reported by: mmel
Reviewed by: mmel
Differential Revision: https://reviews.freebsd.org/D19139
2019-02-12 21:23:59 +00:00
|
|
|
struct resource *res;
|
2016-05-18 04:35:58 +00:00
|
|
|
struct taskqgroup *tqg;
|
|
|
|
void *q;
|
2019-06-15 11:07:41 +00:00
|
|
|
int err, tqrid;
|
2019-09-30 15:59:07 +00:00
|
|
|
bool rx_only;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
q = &ctx->ifc_rxqs[0];
|
|
|
|
info = &rxq[0].ifr_filter_info;
|
|
|
|
gtask = &rxq[0].ifr_task;
|
2017-09-16 02:41:38 +00:00
|
|
|
tqg = qgroup_if_io_tqg;
|
2019-06-15 11:07:41 +00:00
|
|
|
tqrid = *rid;
|
2019-09-30 15:59:07 +00:00
|
|
|
rx_only = (ctx->ifc_sctx->isc_flags & IFLIB_SINGLE_IRQ_RX_ONLY) != 0;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
ctx->ifc_flags |= IFC_LEGACY;
|
|
|
|
info->ifi_filter = filter;
|
|
|
|
info->ifi_filter_arg = filter_arg;
|
|
|
|
info->ifi_task = gtask;
|
2019-09-30 15:59:07 +00:00
|
|
|
info->ifi_ctx = rx_only ? ctx : q;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
Make taskqgroup_attach{,_cpu}(9) work across architectures
So far, intr_{g,s}etaffinity(9) take a single int for identifying
a device interrupt. This approach doesn't work on all architectures
supported, as a single int isn't sufficient to globally specify a
device interrupt. In particular, with multiple interrupt controllers
in one system as found on e. g. arm and arm64 machines, an interrupt
number as returned by rman_get_start(9) may be only unique relative
to the bus and, thus, interrupt controller, a certain device hangs
off from.
In turn, this makes taskqgroup_attach{,_cpu}(9) and - internal to
the gtaskqueue implementation - taskqgroup_attach_deferred{,_cpu}()
not work across architectures. Yet in turn, iflib(4) as gtaskqueue
consumer so far doesn't fit architectures where interrupt numbers
aren't globally unique.
However, at least for intr_setaffinity(..., CPU_WHICH_IRQ, ...) as
employed by the gtaskqueue implementation to bind an interrupt to a
particular CPU, using bus_bind_intr(9) instead is equivalent from
a functional point of view, with bus_bind_intr(9) taking the device
and interrupt resource arguments required for uniquely specifying a
device interrupt.
Thus, change the gtaskqueue implementation to employ bus_bind_intr(9)
instead and intr_{g,s}etaffinity(9) to take the device and interrupt
resource arguments required respectively. This change also moves
struct grouptask from <sys/_task.h> to <sys/gtaskqueue.h> and wraps
struct gtask along with the gtask_fn_t typedef into #ifdef _KERNEL
as userland likes to include <sys/_task.h> or indirectly drags it
in - for better or worse also with _KERNEL defined -, which with
device_t and struct resource dependencies otherwise is no longer
as easily possible now.
The userland inclusion problem probably can be improved a bit by
introducing a _WANT_TASK (as well as a _WANT_MOUNT) akin to the
existing _WANT_PRISON etc., which is orthogonal to this change,
though, and likely needs an exp-run.
While at it:
- Change the gt_cpu member in the grouptask structure to be of type
int as used elswhere for specifying CPUs (an int16_t may be too
narrow sooner or later),
- move the gtaskqueue_enqueue_fn typedef from <sys/gtaskqueue.h> to
the gtaskqueue implementation as it's only used and needed there,
- change the GTASK_INIT macro to use "gtask" rather than "task" as
argument given that it actually operates on a struct gtask rather
than a struct task, and
- let subr_gtaskqueue.c consistently use __func__ to print functions
names.
Reported by: mmel
Reviewed by: mmel
Differential Revision: https://reviews.freebsd.org/D19139
2019-02-12 21:23:59 +00:00
|
|
|
dev = ctx->ifc_dev;
|
2016-05-18 04:35:58 +00:00
|
|
|
/* We allocate a single interrupt resource */
|
2019-09-30 15:59:07 +00:00
|
|
|
err = _iflib_irq_alloc(ctx, irq, tqrid, rx_only ? iflib_fast_intr_ctx :
|
|
|
|
iflib_fast_intr_rxtx, NULL, info, name);
|
|
|
|
if (err != 0)
|
2016-05-18 04:35:58 +00:00
|
|
|
return (err);
|
2020-02-12 09:19:47 +00:00
|
|
|
NET_GROUPTASK_INIT(gtask, 0, _task_fn_rx, q);
|
Make taskqgroup_attach{,_cpu}(9) work across architectures
So far, intr_{g,s}etaffinity(9) take a single int for identifying
a device interrupt. This approach doesn't work on all architectures
supported, as a single int isn't sufficient to globally specify a
device interrupt. In particular, with multiple interrupt controllers
in one system as found on e. g. arm and arm64 machines, an interrupt
number as returned by rman_get_start(9) may be only unique relative
to the bus and, thus, interrupt controller, a certain device hangs
off from.
In turn, this makes taskqgroup_attach{,_cpu}(9) and - internal to
the gtaskqueue implementation - taskqgroup_attach_deferred{,_cpu}()
not work across architectures. Yet in turn, iflib(4) as gtaskqueue
consumer so far doesn't fit architectures where interrupt numbers
aren't globally unique.
However, at least for intr_setaffinity(..., CPU_WHICH_IRQ, ...) as
employed by the gtaskqueue implementation to bind an interrupt to a
particular CPU, using bus_bind_intr(9) instead is equivalent from
a functional point of view, with bus_bind_intr(9) taking the device
and interrupt resource arguments required for uniquely specifying a
device interrupt.
Thus, change the gtaskqueue implementation to employ bus_bind_intr(9)
instead and intr_{g,s}etaffinity(9) to take the device and interrupt
resource arguments required respectively. This change also moves
struct grouptask from <sys/_task.h> to <sys/gtaskqueue.h> and wraps
struct gtask along with the gtask_fn_t typedef into #ifdef _KERNEL
as userland likes to include <sys/_task.h> or indirectly drags it
in - for better or worse also with _KERNEL defined -, which with
device_t and struct resource dependencies otherwise is no longer
as easily possible now.
The userland inclusion problem probably can be improved a bit by
introducing a _WANT_TASK (as well as a _WANT_MOUNT) akin to the
existing _WANT_PRISON etc., which is orthogonal to this change,
though, and likely needs an exp-run.
While at it:
- Change the gt_cpu member in the grouptask structure to be of type
int as used elswhere for specifying CPUs (an int16_t may be too
narrow sooner or later),
- move the gtaskqueue_enqueue_fn typedef from <sys/gtaskqueue.h> to
the gtaskqueue implementation as it's only used and needed there,
- change the GTASK_INIT macro to use "gtask" rather than "task" as
argument given that it actually operates on a struct gtask rather
than a struct task, and
- let subr_gtaskqueue.c consistently use __func__ to print functions
names.
Reported by: mmel
Reviewed by: mmel
Differential Revision: https://reviews.freebsd.org/D19139
2019-02-12 21:23:59 +00:00
|
|
|
res = irq->ii_res;
|
|
|
|
taskqgroup_attach(tqg, gtask, q, dev, res, name);
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
GROUPTASK_INIT(&txq->ift_task, 0, _task_fn_tx, txq);
|
Make taskqgroup_attach{,_cpu}(9) work across architectures
So far, intr_{g,s}etaffinity(9) take a single int for identifying
a device interrupt. This approach doesn't work on all architectures
supported, as a single int isn't sufficient to globally specify a
device interrupt. In particular, with multiple interrupt controllers
in one system as found on e. g. arm and arm64 machines, an interrupt
number as returned by rman_get_start(9) may be only unique relative
to the bus and, thus, interrupt controller, a certain device hangs
off from.
In turn, this makes taskqgroup_attach{,_cpu}(9) and - internal to
the gtaskqueue implementation - taskqgroup_attach_deferred{,_cpu}()
not work across architectures. Yet in turn, iflib(4) as gtaskqueue
consumer so far doesn't fit architectures where interrupt numbers
aren't globally unique.
However, at least for intr_setaffinity(..., CPU_WHICH_IRQ, ...) as
employed by the gtaskqueue implementation to bind an interrupt to a
particular CPU, using bus_bind_intr(9) instead is equivalent from
a functional point of view, with bus_bind_intr(9) taking the device
and interrupt resource arguments required for uniquely specifying a
device interrupt.
Thus, change the gtaskqueue implementation to employ bus_bind_intr(9)
instead and intr_{g,s}etaffinity(9) to take the device and interrupt
resource arguments required respectively. This change also moves
struct grouptask from <sys/_task.h> to <sys/gtaskqueue.h> and wraps
struct gtask along with the gtask_fn_t typedef into #ifdef _KERNEL
as userland likes to include <sys/_task.h> or indirectly drags it
in - for better or worse also with _KERNEL defined -, which with
device_t and struct resource dependencies otherwise is no longer
as easily possible now.
The userland inclusion problem probably can be improved a bit by
introducing a _WANT_TASK (as well as a _WANT_MOUNT) akin to the
existing _WANT_PRISON etc., which is orthogonal to this change,
though, and likely needs an exp-run.
While at it:
- Change the gt_cpu member in the grouptask structure to be of type
int as used elsewhere for specifying CPUs (an int16_t may be too
narrow sooner or later),
- move the gtaskqueue_enqueue_fn typedef from <sys/gtaskqueue.h> to
the gtaskqueue implementation as it's only used and needed there,
- change the GTASK_INIT macro to use "gtask" rather than "task" as
argument given that it actually operates on a struct gtask rather
than a struct task, and
- let subr_gtaskqueue.c consistently use __func__ to print function
names.
Reported by: mmel
Reviewed by: mmel
Differential Revision: https://reviews.freebsd.org/D19139
2019-02-12 21:23:59 +00:00
|
|
|
taskqgroup_attach(qgroup_if_io_tqg, &txq->ift_task, txq, dev, res,
|
|
|
|
"tx");
|
2016-05-18 04:35:58 +00:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
iflib_led_create(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
|
|
|
|
ctx->ifc_led_dev = led_create(iflib_led_func, ctx,
|
2017-03-13 22:53:06 +00:00
|
|
|
device_get_nameunit(ctx->ifc_dev));
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
iflib_tx_intr_deferred(if_ctx_t ctx, int txqid)
|
|
|
|
{
|
|
|
|
|
|
|
|
GROUPTASK_ENQUEUE(&ctx->ifc_txqs[txqid].ift_task);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
iflib_rx_intr_deferred(if_ctx_t ctx, int rxqid)
|
|
|
|
{
|
|
|
|
|
|
|
|
GROUPTASK_ENQUEUE(&ctx->ifc_rxqs[rxqid].ifr_task);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
iflib_admin_intr_deferred(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
|
2020-03-24 17:54:34 +00:00
|
|
|
MPASS(ctx->ifc_admin_task.gt_taskqueue != NULL);
|
2016-05-18 04:35:58 +00:00
|
|
|
GROUPTASK_ENQUEUE(&ctx->ifc_admin_task);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
iflib_iov_intr_deferred(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
|
|
|
|
GROUPTASK_ENQUEUE(&ctx->ifc_vflr_task);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2019-06-15 11:07:41 +00:00
|
|
|
iflib_io_tqg_attach(struct grouptask *gt, void *uniq, int cpu, const char *name)
|
2016-05-18 04:35:58 +00:00
|
|
|
{
|
|
|
|
|
Make taskqgroup_attach{,_cpu}(9) work across architectures
So far, intr_{g,s}etaffinity(9) take a single int for identifying
a device interrupt. This approach doesn't work on all architectures
supported, as a single int isn't sufficient to globally specify a
device interrupt. In particular, with multiple interrupt controllers
in one system as found on e. g. arm and arm64 machines, an interrupt
number as returned by rman_get_start(9) may be only unique relative
to the bus and, thus, interrupt controller, a certain device hangs
off from.
In turn, this makes taskqgroup_attach{,_cpu}(9) and - internal to
the gtaskqueue implementation - taskqgroup_attach_deferred{,_cpu}()
not work across architectures. Yet in turn, iflib(4) as gtaskqueue
consumer so far doesn't fit architectures where interrupt numbers
aren't globally unique.
However, at least for intr_setaffinity(..., CPU_WHICH_IRQ, ...) as
employed by the gtaskqueue implementation to bind an interrupt to a
particular CPU, using bus_bind_intr(9) instead is equivalent from
a functional point of view, with bus_bind_intr(9) taking the device
and interrupt resource arguments required for uniquely specifying a
device interrupt.
Thus, change the gtaskqueue implementation to employ bus_bind_intr(9)
instead and intr_{g,s}etaffinity(9) to take the device and interrupt
resource arguments required respectively. This change also moves
struct grouptask from <sys/_task.h> to <sys/gtaskqueue.h> and wraps
struct gtask along with the gtask_fn_t typedef into #ifdef _KERNEL
as userland likes to include <sys/_task.h> or indirectly drags it
in - for better or worse also with _KERNEL defined -, which with
device_t and struct resource dependencies otherwise is no longer
as easily possible now.
The userland inclusion problem probably can be improved a bit by
introducing a _WANT_TASK (as well as a _WANT_MOUNT) akin to the
existing _WANT_PRISON etc., which is orthogonal to this change,
though, and likely needs an exp-run.
While at it:
- Change the gt_cpu member in the grouptask structure to be of type
int as used elswhere for specifying CPUs (an int16_t may be too
narrow sooner or later),
- move the gtaskqueue_enqueue_fn typedef from <sys/gtaskqueue.h> to
the gtaskqueue implementation as it's only used and needed there,
- change the GTASK_INIT macro to use "gtask" rather than "task" as
argument given that it actually operates on a struct gtask rather
than a struct task, and
- let subr_gtaskqueue.c consistently use __func__ to print functions
names.
Reported by: mmel
Reviewed by: mmel
Differential Revision: https://reviews.freebsd.org/D19139
2019-02-12 21:23:59 +00:00
|
|
|
taskqgroup_attach_cpu(qgroup_if_io_tqg, gt, uniq, cpu, NULL, NULL,
|
|
|
|
name);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2018-05-03 17:02:31 +00:00
|
|
|
iflib_config_gtask_init(void *ctx, struct grouptask *gtask, gtask_fn_t *fn,
|
|
|
|
const char *name)
|
2016-05-18 04:35:58 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
GROUPTASK_INIT(gtask, 0, fn, ctx);
|
Make taskqgroup_attach{,_cpu}(9) work across architectures
So far, intr_{g,s}etaffinity(9) take a single int for identifying
a device interrupt. This approach doesn't work on all architectures
supported, as a single int isn't sufficient to globally specify a
device interrupt. In particular, with multiple interrupt controllers
in one system as found on e. g. arm and arm64 machines, an interrupt
number as returned by rman_get_start(9) may be only unique relative
to the bus and, thus, interrupt controller, a certain device hangs
off from.
In turn, this makes taskqgroup_attach{,_cpu}(9) and - internal to
the gtaskqueue implementation - taskqgroup_attach_deferred{,_cpu}()
not work across architectures. Yet in turn, iflib(4) as gtaskqueue
consumer so far doesn't fit architectures where interrupt numbers
aren't globally unique.
However, at least for intr_setaffinity(..., CPU_WHICH_IRQ, ...) as
employed by the gtaskqueue implementation to bind an interrupt to a
particular CPU, using bus_bind_intr(9) instead is equivalent from
a functional point of view, with bus_bind_intr(9) taking the device
and interrupt resource arguments required for uniquely specifying a
device interrupt.
Thus, change the gtaskqueue implementation to employ bus_bind_intr(9)
instead and intr_{g,s}etaffinity(9) to take the device and interrupt
resource arguments required respectively. This change also moves
struct grouptask from <sys/_task.h> to <sys/gtaskqueue.h> and wraps
struct gtask along with the gtask_fn_t typedef into #ifdef _KERNEL
as userland likes to include <sys/_task.h> or indirectly drags it
in - for better or worse also with _KERNEL defined -, which with
device_t and struct resource dependencies otherwise is no longer
as easily possible now.
The userland inclusion problem probably can be improved a bit by
introducing a _WANT_TASK (as well as a _WANT_MOUNT) akin to the
existing _WANT_PRISON etc., which is orthogonal to this change,
though, and likely needs an exp-run.
While at it:
- Change the gt_cpu member in the grouptask structure to be of type
int as used elswhere for specifying CPUs (an int16_t may be too
narrow sooner or later),
- move the gtaskqueue_enqueue_fn typedef from <sys/gtaskqueue.h> to
the gtaskqueue implementation as it's only used and needed there,
- change the GTASK_INIT macro to use "gtask" rather than "task" as
argument given that it actually operates on a struct gtask rather
than a struct task, and
- let subr_gtaskqueue.c consistently use __func__ to print functions
names.
Reported by: mmel
Reviewed by: mmel
Differential Revision: https://reviews.freebsd.org/D19139
2019-02-12 21:23:59 +00:00
|
|
|
taskqgroup_attach(qgroup_if_config_tqg, gtask, gtask, NULL, NULL,
|
|
|
|
name);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2016-08-12 21:29:44 +00:00
|
|
|
iflib_config_gtask_deinit(struct grouptask *gtask)
|
|
|
|
{
|
|
|
|
|
2017-09-16 02:41:38 +00:00
|
|
|
taskqgroup_detach(qgroup_if_config_tqg, gtask);
|
2016-08-12 21:29:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
iflib_link_state_change(if_ctx_t ctx, int link_state, uint64_t baudrate)
|
2016-05-18 04:35:58 +00:00
|
|
|
{
|
|
|
|
if_t ifp = ctx->ifc_ifp;
|
|
|
|
iflib_txq_t txq = ctx->ifc_txqs;
|
|
|
|
|
|
|
|
if_setbaudrate(ifp, baudrate);
|
2018-04-12 14:35:37 +00:00
|
|
|
if (baudrate >= IF_Gbps(10)) {
|
|
|
|
STATE_LOCK(ctx);
|
2017-03-13 22:53:06 +00:00
|
|
|
ctx->ifc_flags |= IFC_PREFETCH;
|
2018-04-12 14:35:37 +00:00
|
|
|
STATE_UNLOCK(ctx);
|
|
|
|
}
|
2016-05-18 04:35:58 +00:00
|
|
|
/* If link down, disable watchdog */
|
|
|
|
if ((ctx->ifc_link_state == LINK_STATE_UP) && (link_state == LINK_STATE_DOWN)) {
|
|
|
|
for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxqsets; i++, txq++)
|
|
|
|
txq->ift_qstatus = IFLIB_QUEUE_IDLE;
|
|
|
|
}
|
|
|
|
ctx->ifc_link_state = link_state;
|
|
|
|
if_link_state_change(ifp, link_state);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq)
|
|
|
|
{
|
|
|
|
int credits;
|
2017-01-02 00:56:33 +00:00
|
|
|
#ifdef INVARIANTS
|
|
|
|
int credits_pre = txq->ift_cidx_processed;
|
2017-03-13 22:53:06 +00:00
|
|
|
#endif
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2019-01-16 05:44:14 +00:00
|
|
|
bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
|
|
|
|
BUS_DMASYNC_POSTREAD);
|
2017-03-13 22:53:06 +00:00
|
|
|
if ((credits = ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, true)) == 0)
|
2016-05-18 04:35:58 +00:00
|
|
|
return (0);
|
|
|
|
|
|
|
|
txq->ift_processed += credits;
|
|
|
|
txq->ift_cidx_processed += credits;
|
|
|
|
|
2017-01-02 00:56:33 +00:00
|
|
|
MPASS(credits_pre + credits == txq->ift_cidx_processed);
|
2016-05-18 04:35:58 +00:00
|
|
|
if (txq->ift_cidx_processed >= txq->ift_size)
|
|
|
|
txq->ift_cidx_processed -= txq->ift_size;
|
|
|
|
return (credits);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2017-03-13 22:53:06 +00:00
|
|
|
iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, qidx_t cidx, qidx_t budget)
|
2016-05-18 04:35:58 +00:00
|
|
|
{
|
Further correct and optimize the bus_dma(9) usage of iflib(4):
o Correct the obvious bugs in the netmap(4) parts:
- No longer check for the existence of DMA maps as bus_dma(9)
is used unconditionally in iflib(4) since r341095.
- Supply the correct DMA tag and map pairs to bus_dma(9)
functions (see also the commit message of r343753).
- In iflib_netmap_timer_adjust(), add synchronization of the
TX descriptors before calling the ift_txd_credits_update
method as the latter evaluates the TX descriptors possibly
updated by the MAC.
- In _task_fn_tx(), wrap the netmap(4)-specific bits in
#ifdef DEV_NETMAP just as done in _task_fn_admin() and
_task_fn_rx() respectively.
o In iflib_fast_intr_rxtx(), synchronize the TX rather than
the RX descriptors before calling the ift_txd_credits_update
method (see also above).
o There's no need to synchronize an RX buffer that is going to
be recycled in iflib_rxd_pkt_get(), yet; it's sufficient to
do that as late as passing RX buffers to the MAC via the
ift_rxd_refill method. Hence, combine that synchronization
with the synchronization of new buffers into a common spot
in _iflib_fl_refill().
o There's no need to synchronize the RX descriptors of a free
list in preparation of the MAC updating their statuses with
every invocation of rxd_frag_to_sd(); it's enough to do this
once before handing control over to the MAC, i. e. before
calling ift_rxd_flush method in _iflib_fl_refill(), which
already performs the necessary synchronization.
o Given that the ift_rxd_available method evaluates the RX
descriptors which possibly have been altered by the MAC,
synchronize as appropriate beforehand. Most notably this
is now done in iflib_rxd_avail(), which in turn means that
we don't need to issue the same synchronization yet again
before calling the ift_rxd_pkt_get method in iflib_rxeof().
o In iflib_txd_db_check(), synchronize the TX descriptors
before handing them over to the MAC for transmission via
the ift_txd_flush method.
o In iflib_encap(), move the TX buffer synchronization after
the invocation of the ift_txd_encap() method. If the MAC
driver fails to encapsulate the packet and we retry with
a defragmented mbuf chain or finally fail, the cycles for
TX buffer synchronization have been wasted. Synchronizing
afterwards matches what non-iflib(4) drivers typically do
and is sufficient as the MAC will not actually start with
the transmission before - in this case - the ift_txd_flush
method is called.
Moreover, for the latter reason the synchronization of the
TX descriptors in iflib_encap() can go as it's enough to
synchronize them before passing control over to the MAC by
issuing the ift_txd_flush() method (see above).
o In iflib_txq_can_drain(), only synchronize TX descriptors
if the ift_txd_credits_update method accessing these is
actually called.
Differential Revision: https://reviews.freebsd.org/D19081
2019-02-12 21:08:44 +00:00
|
|
|
iflib_fl_t fl;
|
|
|
|
u_int i;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
Further correct and optimize the bus_dma(9) usage of iflib(4):
o Correct the obvious bugs in the netmap(4) parts:
- No longer check for the existence of DMA maps as bus_dma(9)
is used unconditionally in iflib(4) since r341095.
- Supply the correct DMA tag and map pairs to bus_dma(9)
functions (see also the commit message of r343753).
- In iflib_netmap_timer_adjust(), add synchronization of the
TX descriptors before calling the ift_txd_credits_update
method as the latter evaluates the TX descriptors possibly
updated by the MAC.
- In _task_fn_tx(), wrap the netmap(4)-specific bits in
#ifdef DEV_NETMAP just as done in _task_fn_admin() and
_task_fn_rx() respectively.
o In iflib_fast_intr_rxtx(), synchronize the TX rather than
the RX descriptors before calling the ift_txd_credits_update
method (see also above).
o There's no need to synchronize an RX buffer that is going to
be recycled in iflib_rxd_pkt_get(), yet; it's sufficient to
do that as late as passing RX buffers to the MAC via the
ift_rxd_refill method. Hence, combine that synchronization
with the synchronization of new buffers into a common spot
in _iflib_fl_refill().
o There's no need to synchronize the RX descriptors of a free
list in preparation of the MAC updating their statuses with
every invocation of rxd_frag_to_sd(); it's enough to do this
once before handing control over to the MAC, i. e. before
calling ift_rxd_flush method in _iflib_fl_refill(), which
already performs the necessary synchronization.
o Given that the ift_rxd_available method evaluates the RX
descriptors which possibly have been altered by the MAC,
synchronize as appropriate beforehand. Most notably this
is now done in iflib_rxd_avail(), which in turn means that
we don't need to issue the same synchronization yet again
before calling the ift_rxd_pkt_get method in iflib_rxeof().
o In iflib_txd_db_check(), synchronize the TX descriptors
before handing them over to the MAC for transmission via
the ift_txd_flush method.
o In iflib_encap(), move the TX buffer synchronization after
the invocation of the ift_txd_encap() method. If the MAC
driver fails to encapsulate the packet and we retry with
a defragmented mbuf chain or finally fail, the cycles for
TX buffer synchronization have been wasted. Synchronizing
afterwards matches what non-iflib(4) drivers typically do
and is sufficient as the MAC will not actually start with
the transmission before - in this case - the ift_txd_flush
method is called.
Moreover, for the latter reason the synchronization of the
TX descriptors in iflib_encap() can go as it's enough to
synchronize them before passing control over to the MAC by
issuing the ift_txd_flush() method (see above).
o In iflib_txq_can_drain(), only synchronize TX descriptors
if the ift_txd_credits_update method accessing these is
actually called.
Differential Revision: https://reviews.freebsd.org/D19081
2019-02-12 21:08:44 +00:00
|
|
|
for (i = 0, fl = &rxq->ifr_fl[0]; i < rxq->ifr_nfl; i++, fl++)
|
|
|
|
bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
|
|
|
|
BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
|
2016-08-12 21:29:44 +00:00
|
|
|
return (ctx->isc_rxd_available(ctx->ifc_softc, rxq->ifr_id, cidx,
|
|
|
|
budget));
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
iflib_add_int_delay_sysctl(if_ctx_t ctx, const char *name,
|
|
|
|
const char *description, if_int_delay_info_t info,
|
|
|
|
int offset, int value)
|
|
|
|
{
|
|
|
|
info->iidi_ctx = ctx;
|
|
|
|
info->iidi_offset = offset;
|
|
|
|
info->iidi_value = value;
|
|
|
|
SYSCTL_ADD_PROC(device_get_sysctl_ctx(ctx->ifc_dev),
|
|
|
|
SYSCTL_CHILDREN(device_get_sysctl_tree(ctx->ifc_dev)),
|
2020-02-26 14:26:36 +00:00
|
|
|
OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
|
2016-05-18 04:35:58 +00:00
|
|
|
info, 0, iflib_sysctl_int_delay, "I", description);
|
|
|
|
}
|
|
|
|
|
2018-05-03 17:02:31 +00:00
|
|
|
struct sx *
|
2016-05-18 04:35:58 +00:00
|
|
|
iflib_ctx_lock_get(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
|
2018-05-03 17:02:31 +00:00
|
|
|
return (&ctx->ifc_ctx_sx);
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
iflib_msix_init(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
device_t dev = ctx->ifc_dev;
|
|
|
|
if_shared_ctx_t sctx = ctx->ifc_sctx;
|
|
|
|
if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
|
o Use iflib_fast_intr_rxtx() also for "legacy" interrupts, i. e. INTx and
MSI. Unlike as with iflib_fast_intr_ctx(), the former will also enqueue
_task_fn_tx() in addition to _task_fn_rx() if appropriate, bringing TCP
TX throughput of EM-class devices on par with the MSI-X case and, thus,
close to wirespeed/pre-iflib(4) times again. [1]
Note that independently of the interrupt type, the UDP performance with
these MACs still is abysmal and nowhere near to where it was before the
conversion of em(4) to iflib(4).
o In iflib_init_locked(), announce which free list failed to set up.
o In _task_fn_tx() when running netmap(4), issue ifdi_intr_enable instead
of the ifdi_tx_queue_intr_enable method in case of a "legacy" interrupt
as the latter is valid with MSI-X only.
o Instead of adding the missing - and apparently convoluted enough that a
DBG_COUNTER_INC was put into a wrong spot in _task_fn_rx() - checks for
ifdi_{r,t}x_queue_intr_enable being available in the MSI-X case also to
iflib_fast_intr_rxtx(), factor these out to iflib_device_register() and
make the checks fail gracefully rather than panic. This avoids invoking
the checks at runtime over and over again in iflib_fast_intr_rxtx() and
_task_fn_{r,t}x() - even if it's just in case of INVARIANTS - and makes
these functions more readable.
o In iflib_rx_structures_setup(), only initialize LRO resources if device
and driver have LRO capability in order to not waste memory. Also, free
the LRO resources again if setting them up fails for one of the queues.
However, don't bother invoking iflib_rx_sds_free() in that case because
iflib_rx_structures_setup() doesn't call iflib_rxsd_alloc() either (and
iflib_{device,pseudo}_register() will issue iflib_rx_sds_free() in case
of failure via iflib_rx_structures_free(), but there definitely is some
asymmetry left to be fixed, though).
o Similarly, free LRO resources again in iflib_rx_structures_free().
o In iflib_irq_set_affinity(), handle get_core_offset() errors gracefully
instead of panicking (but only in case of INVARIANTS). This is a follow-
up to r344132, as such driver bugs shouldn't be fatal.
o Likewise, handle unknown iflib_intr_type_t in iflib_irq_alloc_generic()
gracefully, too.
o Bring yet more sanity to iflib_msix_init():
- If the device doesn't provide enough MSI-X vectors or not all vectors
can be allocated so the expected number of queues in addition to admin
interrupts can't be supported, try MSI next (and then INTx) as proper
MSI-X vector distribution can't be assured in such cases. In essence,
this change brings r254008 forward to iflib(4). Also, this is the fix
alluded to in the commit message of r343934.
- If the MSI-X allocation has failed, don't prematurely announce MSI is
going to be used as the latter in fact may not be available either.
- When falling back to MSI, only release the MSI-X table resource again
if it was allocated in iflib_msix_init(), i. e. isn't supplied by the
driver, in the first place.
o In mp_ndesc_handler(), handle unknown type arguments gracefully, too.
PR: 235031 (likely) [1]
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D20175
2019-05-07 08:28:35 +00:00
|
|
|
int admincnt, bar, err, iflib_num_rx_queues, iflib_num_tx_queues;
|
|
|
|
int msgs, queuemsgs, queues, rx_queues, tx_queues, vectors;
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2017-11-16 18:52:58 +00:00
|
|
|
iflib_num_tx_queues = ctx->ifc_sysctl_ntxqs;
|
|
|
|
iflib_num_rx_queues = ctx->ifc_sysctl_nrxqs;
|
|
|
|
|
2019-01-30 13:21:26 +00:00
|
|
|
if (bootverbose)
|
|
|
|
device_printf(dev, "msix_init qsets capped at %d\n",
|
|
|
|
imax(scctx->isc_ntxqsets, scctx->isc_nrxqsets));
|
2016-08-12 21:29:44 +00:00
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
/* Override by tuneable */
|
2017-04-04 21:03:34 +00:00
|
|
|
if (scctx->isc_disable_msix)
|
2016-05-18 04:35:58 +00:00
|
|
|
goto msi;
|
|
|
|
|
2019-01-30 13:21:26 +00:00
|
|
|
/* First try MSI-X */
|
|
|
|
if ((msgs = pci_msix_count(dev)) == 0) {
|
|
|
|
if (bootverbose)
|
|
|
|
device_printf(dev, "MSI-X not supported or disabled\n");
|
|
|
|
goto msi;
|
|
|
|
}
|
o Use iflib_fast_intr_rxtx() also for "legacy" interrupts, i. e. INTx and
MSI. Unlike as with iflib_fast_intr_ctx(), the former will also enqueue
_task_fn_tx() in addition to _task_fn_rx() if appropriate, bringing TCP
TX throughput of EM-class devices on par with the MSI-X case and, thus,
close to wirespeed/pre-iflib(4) times again. [1]
Note that independently of the interrupt type, the UDP performance with
these MACs still is abysmal and nowhere near to where it was before the
conversion of em(4) to iflib(4).
o In iflib_init_locked(), announce which free list failed to set up.
o In _task_fn_tx() when running netmap(4), issue ifdi_intr_enable instead
of the ifdi_tx_queue_intr_enable method in case of a "legacy" interrupt
as the latter is valid with MSI-X only.
o Instead of adding the missing - and apparently convoluted enough that a
DBG_COUNTER_INC was put into a wrong spot in _task_fn_rx() - checks for
ifdi_{r,t}x_queue_intr_enable being available in the MSI-X case also to
iflib_fast_intr_rxtx(), factor these out to iflib_device_register() and
make the checks fail gracefully rather than panic. This avoids invoking
the checks at runtime over and over again in iflib_fast_intr_rxtx() and
_task_fn_{r,t}x() - even if it's just in case of INVARIANTS - and makes
these functions more readable.
o In iflib_rx_structures_setup(), only initialize LRO resources if device
and driver have LRO capability in order to not waste memory. Also, free
the LRO resources again if setting them up fails for one of the queues.
However, don't bother invoking iflib_rx_sds_free() in that case because
iflib_rx_structures_setup() doesn't call iflib_rxsd_alloc() either (and
iflib_{device,pseudo}_register() will issue iflib_rx_sds_free() in case
of failure via iflib_rx_structures_free(), but there definitely is some
asymmetry left to be fixed, though).
o Similarly, free LRO resources again in iflib_rx_structures_free().
o In iflib_irq_set_affinity(), handle get_core_offset() errors gracefully
instead of panicking (but only in case of INVARIANTS). This is a follow-
up to r344132, as such driver bugs shouldn't be fatal.
o Likewise, handle unknown iflib_intr_type_t in iflib_irq_alloc_generic()
gracefully, too.
o Bring yet more sanity to iflib_msix_init():
- If the device doesn't provide enough MSI-X vectors or not all vectors
can be allocated so the expected number of queues in addition to admin
interrupts can't be supported, try MSI next (and then INTx) as proper
MSI-X vector distribution can't be assured in such cases. In essence,
this change brings r254008 forward to iflib(4). Also, this is the fix
alluded to in the commit message of r343934.
- If the MSI-X allocation has failed, don't prematurely announce MSI is
going to be used as the latter in fact may not be available either.
- When falling back to MSI, only release the MSI-X table resource again
if it was allocated in iflib_msix_init(), i. e. isn't supplied by the
driver, in the first place.
o In mp_ndesc_handler(), handle unknown type arguments gracefully, too.
PR: 235031 (likely) [1]
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D20175
2019-05-07 08:28:35 +00:00
|
|
|
|
|
|
|
bar = ctx->ifc_softc_ctx.isc_msix_bar;
|
2016-05-18 04:35:58 +00:00
|
|
|
/*
|
|
|
|
* bar == -1 => "trust me I know what I'm doing"
|
|
|
|
* Some drivers are for hardware that is so shoddily
|
|
|
|
* documented that no one knows which bars are which
|
|
|
|
* so the developer has to map all bars. This hack
|
2019-01-30 13:21:26 +00:00
|
|
|
* allows shoddy garbage to use MSI-X in this framework.
|
2016-05-18 04:35:58 +00:00
|
|
|
*/
|
|
|
|
if (bar != -1) {
|
|
|
|
ctx->ifc_msix_mem = bus_alloc_resource_any(dev,
|
|
|
|
SYS_RES_MEMORY, &bar, RF_ACTIVE);
|
|
|
|
if (ctx->ifc_msix_mem == NULL) {
|
2019-01-30 13:21:26 +00:00
|
|
|
device_printf(dev, "Unable to map MSI-X table\n");
|
2016-05-18 04:35:58 +00:00
|
|
|
goto msi;
|
|
|
|
}
|
|
|
|
}
|
o Use iflib_fast_intr_rxtx() also for "legacy" interrupts, i. e. INTx and
MSI. Unlike as with iflib_fast_intr_ctx(), the former will also enqueue
_task_fn_tx() in addition to _task_fn_rx() if appropriate, bringing TCP
TX throughput of EM-class devices on par with the MSI-X case and, thus,
close to wirespeed/pre-iflib(4) times again. [1]
Note that independently of the interrupt type, the UDP performance with
these MACs still is abysmal and nowhere near to where it was before the
conversion of em(4) to iflib(4).
o In iflib_init_locked(), announce which free list failed to set up.
o In _task_fn_tx() when running netmap(4), issue ifdi_intr_enable instead
of the ifdi_tx_queue_intr_enable method in case of a "legacy" interrupt
as the latter is valid with MSI-X only.
o Instead of adding the missing - and apparently convoluted enough that a
DBG_COUNTER_INC was put into a wrong spot in _task_fn_rx() - checks for
ifdi_{r,t}x_queue_intr_enable being available in the MSI-X case also to
iflib_fast_intr_rxtx(), factor these out to iflib_device_register() and
make the checks fail gracefully rather than panic. This avoids invoking
the checks at runtime over and over again in iflib_fast_intr_rxtx() and
_task_fn_{r,t}x() - even if it's just in case of INVARIANTS - and makes
these functions more readable.
o In iflib_rx_structures_setup(), only initialize LRO resources if device
and driver have LRO capability in order to not waste memory. Also, free
the LRO resources again if setting them up fails for one of the queues.
However, don't bother invoking iflib_rx_sds_free() in that case because
iflib_rx_structures_setup() doesn't call iflib_rxsd_alloc() either (and
iflib_{device,pseudo}_register() will issue iflib_rx_sds_free() in case
of failure via iflib_rx_structures_free(), but there definitely is some
asymmetry left to be fixed, though).
o Similarly, free LRO resources again in iflib_rx_structures_free().
o In iflib_irq_set_affinity(), handle get_core_offset() errors gracefully
instead of panicing (but only in case of INVARIANTS). This is a follow-
up to r344132, as such driver bugs shouldn't be fatal.
o Likewise, handle unknown iflib_intr_type_t in iflib_irq_alloc_generic()
gracefully, too.
o Bring yet more sanity to iflib_msix_init():
- If the device doesn't provide enough MSI-X vectors or not all vectors
can be allocated so the expected number of queues in addition to admin
interrupts can't be supported, try MSI next (and then INTx) as proper
MSI-X vector distribution can't be assured in such cases. In essence,
this change brings r254008 forward to iflib(4). Also, this is the fix
alluded to in the commit message of r343934.
- If the MSI-X allocation has failed, don't prematurely announce MSI is
going to be used as the latter in fact may not be available either.
- When falling back to MSI, only release the MSI-X table resource again
if it was allocated in iflib_msix_init(), i. e. isn't supplied by the
driver, in the first place.
o In mp_ndesc_handler(), handle unknown type arguments gracefully, too.
PR: 235031 (likely) [1]
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D20175
2019-05-07 08:28:35 +00:00
|
|
|
|
|
|
|
admincnt = sctx->isc_admin_intrcnt;
|
2016-05-18 04:35:58 +00:00
|
|
|
#if IFLIB_DEBUG
|
|
|
|
/* use only 1 qset in debug mode */
|
|
|
|
queuemsgs = min(msgs - admincnt, 1);
|
|
|
|
#else
|
|
|
|
queuemsgs = msgs - admincnt;
|
|
|
|
#endif
|
|
|
|
#ifdef RSS
|
2017-11-29 18:14:57 +00:00
|
|
|
queues = imin(queuemsgs, rss_getnumbuckets());
|
2016-05-18 04:35:58 +00:00
|
|
|
#else
|
2017-11-29 18:14:57 +00:00
|
|
|
queues = queuemsgs;
|
2016-05-18 04:35:58 +00:00
|
|
|
#endif
|
2017-11-29 18:14:57 +00:00
|
|
|
queues = imin(CPU_COUNT(&ctx->ifc_cpus), queues);
|
2019-01-30 13:21:26 +00:00
|
|
|
if (bootverbose)
|
|
|
|
device_printf(dev,
|
|
|
|
"intr CPUs: %d queue msgs: %d admincnt: %d\n",
|
|
|
|
CPU_COUNT(&ctx->ifc_cpus), queuemsgs, admincnt);
|
2016-05-18 04:35:58 +00:00
|
|
|
#ifdef RSS
|
|
|
|
/* If we're doing RSS, clamp at the number of RSS buckets */
|
|
|
|
if (queues > rss_getnumbuckets())
|
|
|
|
queues = rss_getnumbuckets();
|
|
|
|
#endif
|
2016-08-12 21:29:44 +00:00
|
|
|
if (iflib_num_rx_queues > 0 && iflib_num_rx_queues < queuemsgs - admincnt)
|
|
|
|
rx_queues = iflib_num_rx_queues;
|
2016-05-18 04:35:58 +00:00
|
|
|
else
|
|
|
|
rx_queues = queues;
|
2017-11-16 18:52:58 +00:00
|
|
|
|
|
|
|
if (rx_queues > scctx->isc_nrxqsets)
|
|
|
|
rx_queues = scctx->isc_nrxqsets;
|
|
|
|
|
2016-08-12 21:29:44 +00:00
|
|
|
/*
|
|
|
|
* We want this to be all logical CPUs by default
|
|
|
|
*/
|
2016-05-18 04:35:58 +00:00
|
|
|
if (iflib_num_tx_queues > 0 && iflib_num_tx_queues < queues)
|
|
|
|
tx_queues = iflib_num_tx_queues;
|
|
|
|
else
|
2016-08-12 21:29:44 +00:00
|
|
|
tx_queues = mp_ncpus;
|
|
|
|
|
2017-11-16 18:52:58 +00:00
|
|
|
if (tx_queues > scctx->isc_ntxqsets)
|
|
|
|
tx_queues = scctx->isc_ntxqsets;
|
|
|
|
|
2016-08-12 21:29:44 +00:00
|
|
|
if (ctx->ifc_sysctl_qs_eq_override == 0) {
|
|
|
|
#ifdef INVARIANTS
|
|
|
|
if (tx_queues != rx_queues)
|
2018-10-12 22:40:54 +00:00
|
|
|
device_printf(dev,
|
|
|
|
"queue equality override not set, capping rx_queues at %d and tx_queues at %d\n",
|
|
|
|
min(rx_queues, tx_queues), min(rx_queues, tx_queues));
|
2016-08-12 21:29:44 +00:00
|
|
|
#endif
|
|
|
|
tx_queues = min(rx_queues, tx_queues);
|
|
|
|
rx_queues = min(rx_queues, tx_queues);
|
|
|
|
}
|
2016-05-18 04:35:58 +00:00
|
|
|
|
o Use iflib_fast_intr_rxtx() also for "legacy" interrupts, i. e. INTx and
MSI. Unlike as with iflib_fast_intr_ctx(), the former will also enqueue
_task_fn_tx() in addition to _task_fn_rx() if appropriate, bringing TCP
TX throughput of EM-class devices on par with the MSI-X case and, thus,
close to wirespeed/pre-iflib(4) times again. [1]
Note that independently of the interrupt type, the UDP performance with
these MACs still is abysmal and nowhere near to where it was before the
conversion of em(4) to iflib(4).
o In iflib_init_locked(), announce which free list failed to set up.
o In _task_fn_tx() when running netmap(4), issue ifdi_intr_enable instead
of the ifdi_tx_queue_intr_enable method in case of a "legacy" interrupt
as the latter is valid with MSI-X only.
o Instead of adding the missing - and apparently convoluted enough that a
DBG_COUNTER_INC was put into a wrong spot in _task_fn_rx() - checks for
ifdi_{r,t}x_queue_intr_enable being available in the MSI-X case also to
iflib_fast_intr_rxtx(), factor these out to iflib_device_register() and
make the checks fail gracefully rather than panic. This avoids invoking
the checks at runtime over and over again in iflib_fast_intr_rxtx() and
_task_fn_{r,t}x() - even if it's just in case of INVARIANTS - and makes
these functions more readable.
o In iflib_rx_structures_setup(), only initialize LRO resources if device
and driver have LRO capability in order to not waste memory. Also, free
the LRO resources again if setting them up fails for one of the queues.
However, don't bother invoking iflib_rx_sds_free() in that case because
iflib_rx_structures_setup() doesn't call iflib_rxsd_alloc() either (and
iflib_{device,pseudo}_register() will issue iflib_rx_sds_free() in case
of failure via iflib_rx_structures_free(), but there definitely is some
asymmetry left to be fixed, though).
o Similarly, free LRO resources again in iflib_rx_structures_free().
o In iflib_irq_set_affinity(), handle get_core_offset() errors gracefully
instead of panicking (but only in case of INVARIANTS). This is a follow-
up to r344132, as such driver bugs shouldn't be fatal.
o Likewise, handle unknown iflib_intr_type_t in iflib_irq_alloc_generic()
gracefully, too.
o Bring yet more sanity to iflib_msix_init():
- If the device doesn't provide enough MSI-X vectors or not all vectors
can be allocated so the expected number of queues in addition to admin
interrupts can't be supported, try MSI next (and then INTx) as proper
MSI-X vector distribution can't be assured in such cases. In essence,
this change brings r254008 forward to iflib(4). Also, this is the fix
alluded to in the commit message of r343934.
- If the MSI-X allocation has failed, don't prematurely announce MSI is
going to be used as the latter in fact may not be available either.
- When falling back to MSI, only release the MSI-X table resource again
if it was allocated in iflib_msix_init(), i. e. isn't supplied by the
driver, in the first place.
o In mp_ndesc_handler(), handle unknown type arguments gracefully, too.
PR: 235031 (likely) [1]
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D20175
2019-05-07 08:28:35 +00:00
|
|
|
vectors = rx_queues + admincnt;
|
|
|
|
if (msgs < vectors) {
|
|
|
|
device_printf(dev,
|
|
|
|
"insufficient number of MSI-X vectors "
|
|
|
|
"(supported %d, need %d)\n", msgs, vectors);
|
|
|
|
goto msi;
|
|
|
|
}
|
|
|
|
|
2019-05-06 20:56:41 +00:00
|
|
|
device_printf(dev, "Using %d RX queues %d TX queues\n", rx_queues,
|
|
|
|
tx_queues);
|
o Use iflib_fast_intr_rxtx() also for "legacy" interrupts, i. e. INTx and
MSI. Unlike as with iflib_fast_intr_ctx(), the former will also enqueue
_task_fn_tx() in addition to _task_fn_rx() if appropriate, bringing TCP
TX throughput of EM-class devices on par with the MSI-X case and, thus,
close to wirespeed/pre-iflib(4) times again. [1]
Note that independently of the interrupt type, the UDP performance with
these MACs still is abysmal and nowhere near to where it was before the
conversion of em(4) to iflib(4).
o In iflib_init_locked(), announce which free list failed to set up.
o In _task_fn_tx() when running netmap(4), issue ifdi_intr_enable instead
of the ifdi_tx_queue_intr_enable method in case of a "legacy" interrupt
as the latter is valid with MSI-X only.
o Instead of adding the missing - and apparently convoluted enough that a
DBG_COUNTER_INC was put into a wrong spot in _task_fn_rx() - checks for
ifdi_{r,t}x_queue_intr_enable being available in the MSI-X case also to
iflib_fast_intr_rxtx(), factor these out to iflib_device_register() and
make the checks fail gracefully rather than panic. This avoids invoking
the checks at runtime over and over again in iflib_fast_intr_rxtx() and
_task_fn_{r,t}x() - even if it's just in case of INVARIANTS - and makes
these functions more readable.
o In iflib_rx_structures_setup(), only initialize LRO resources if device
and driver have LRO capability in order to not waste memory. Also, free
the LRO resources again if setting them up fails for one of the queues.
However, don't bother invoking iflib_rx_sds_free() in that case because
iflib_rx_structures_setup() doesn't call iflib_rxsd_alloc() either (and
iflib_{device,pseudo}_register() will issue iflib_rx_sds_free() in case
of failure via iflib_rx_structures_free(), but there definitely is some
asymmetry left to be fixed, though).
o Similarly, free LRO resources again in iflib_rx_structures_free().
o In iflib_irq_set_affinity(), handle get_core_offset() errors gracefully
instead of panicking (but only in case of INVARIANTS). This is a follow-
up to r344132, as such driver bugs shouldn't be fatal.
o Likewise, handle unknown iflib_intr_type_t in iflib_irq_alloc_generic()
gracefully, too.
o Bring yet more sanity to iflib_msix_init():
- If the device doesn't provide enough MSI-X vectors or not all vectors
can be allocated so the expected number of queues in addition to admin
interrupts can't be supported, try MSI next (and then INTx) as proper
MSI-X vector distribution can't be assured in such cases. In essence,
this change brings r254008 forward to iflib(4). Also, this is the fix
alluded to in the commit message of r343934.
- If the MSI-X allocation has failed, don't prematurely announce MSI is
going to be used as the latter in fact may not be available either.
- When falling back to MSI, only release the MSI-X table resource again
if it was allocated in iflib_msix_init(), i. e. isn't supplied by the
driver, in the first place.
o In mp_ndesc_handler(), handle unknown type arguments gracefully, too.
PR: 235031 (likely) [1]
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D20175
2019-05-07 08:28:35 +00:00
|
|
|
msgs = vectors;
|
2016-05-18 04:35:58 +00:00
|
|
|
if ((err = pci_alloc_msix(dev, &vectors)) == 0) {
|
o Use iflib_fast_intr_rxtx() also for "legacy" interrupts, i. e. INTx and
MSI. Unlike as with iflib_fast_intr_ctx(), the former will also enqueue
_task_fn_tx() in addition to _task_fn_rx() if appropriate, bringing TCP
TX throughput of EM-class devices on par with the MSI-X case and, thus,
close to wirespeed/pre-iflib(4) times again. [1]
Note that independently of the interrupt type, the UDP performance with
these MACs still is abysmal and nowhere near to where it was before the
conversion of em(4) to iflib(4).
o In iflib_init_locked(), announce which free list failed to set up.
o In _task_fn_tx() when running netmap(4), issue ifdi_intr_enable instead
of the ifdi_tx_queue_intr_enable method in case of a "legacy" interrupt
as the latter is valid with MSI-X only.
o Instead of adding the missing - and apparently convoluted enough that a
DBG_COUNTER_INC was put into a wrong spot in _task_fn_rx() - checks for
ifdi_{r,t}x_queue_intr_enable being available in the MSI-X case also to
iflib_fast_intr_rxtx(), factor these out to iflib_device_register() and
make the checks fail gracefully rather than panic. This avoids invoking
the checks at runtime over and over again in iflib_fast_intr_rxtx() and
_task_fn_{r,t}x() - even if it's just in case of INVARIANTS - and makes
these functions more readable.
o In iflib_rx_structures_setup(), only initialize LRO resources if device
and driver have LRO capability in order to not waste memory. Also, free
the LRO resources again if setting them up fails for one of the queues.
However, don't bother invoking iflib_rx_sds_free() in that case because
iflib_rx_structures_setup() doesn't call iflib_rxsd_alloc() either (and
iflib_{device,pseudo}_register() will issue iflib_rx_sds_free() in case
of failure via iflib_rx_structures_free(), but there definitely is some
asymmetry left to be fixed, though).
o Similarly, free LRO resources again in iflib_rx_structures_free().
o In iflib_irq_set_affinity(), handle get_core_offset() errors gracefully
instead of panicking (but only in case of INVARIANTS). This is a follow-
up to r344132, as such driver bugs shouldn't be fatal.
o Likewise, handle unknown iflib_intr_type_t in iflib_irq_alloc_generic()
gracefully, too.
o Bring yet more sanity to iflib_msix_init():
- If the device doesn't provide enough MSI-X vectors or not all vectors
can be allocated so the expected number of queues in addition to admin
interrupts can't be supported, try MSI next (and then INTx) as proper
MSI-X vector distribution can't be assured in such cases. In essence,
this change brings r254008 forward to iflib(4). Also, this is the fix
alluded to in the commit message of r343934.
- If the MSI-X allocation has failed, don't prematurely announce MSI is
going to be used as the latter in fact may not be available either.
- When falling back to MSI, only release the MSI-X table resource again
if it was allocated in iflib_msix_init(), i. e. isn't supplied by the
driver, in the first place.
o In mp_ndesc_handler(), handle unknown type arguments gracefully, too.
PR: 235031 (likely) [1]
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D20175
2019-05-07 08:28:35 +00:00
|
|
|
if (vectors != msgs) {
|
|
|
|
device_printf(dev,
|
|
|
|
"Unable to allocate sufficient MSI-X vectors "
|
|
|
|
"(got %d, need %d)\n", vectors, msgs);
|
|
|
|
pci_release_msi(dev);
|
|
|
|
if (bar != -1) {
|
|
|
|
bus_release_resource(dev, SYS_RES_MEMORY, bar,
|
|
|
|
ctx->ifc_msix_mem);
|
|
|
|
ctx->ifc_msix_mem = NULL;
|
|
|
|
}
|
|
|
|
goto msi;
|
|
|
|
}
|
2019-01-30 13:21:26 +00:00
|
|
|
device_printf(dev, "Using MSI-X interrupts with %d vectors\n",
|
|
|
|
vectors);
|
2016-05-18 04:35:58 +00:00
|
|
|
scctx->isc_vectors = vectors;
|
|
|
|
scctx->isc_nrxqsets = rx_queues;
|
|
|
|
scctx->isc_ntxqsets = tx_queues;
|
|
|
|
scctx->isc_intr = IFLIB_INTR_MSIX;
|
2016-08-12 21:29:44 +00:00
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
return (vectors);
|
|
|
|
} else {
|
2018-10-12 22:40:54 +00:00
|
|
|
device_printf(dev,
|
o Use iflib_fast_intr_rxtx() also for "legacy" interrupts, i. e. INTx and
MSI. Unlike as with iflib_fast_intr_ctx(), the former will also enqueue
_task_fn_tx() in addition to _task_fn_rx() if appropriate, bringing TCP
TX throughput of EM-class devices on par with the MSI-X case and, thus,
close to wirespeed/pre-iflib(4) times again. [1]
Note that independently of the interrupt type, the UDP performance with
these MACs still is abysmal and nowhere near to where it was before the
conversion of em(4) to iflib(4).
o In iflib_init_locked(), announce which free list failed to set up.
o In _task_fn_tx() when running netmap(4), issue ifdi_intr_enable instead
of the ifdi_tx_queue_intr_enable method in case of a "legacy" interrupt
as the latter is valid with MSI-X only.
o Instead of adding the missing - and apparently convoluted enough that a
DBG_COUNTER_INC was put into a wrong spot in _task_fn_rx() - checks for
ifdi_{r,t}x_queue_intr_enable being available in the MSI-X case also to
iflib_fast_intr_rxtx(), factor these out to iflib_device_register() and
make the checks fail gracefully rather than panic. This avoids invoking
the checks at runtime over and over again in iflib_fast_intr_rxtx() and
_task_fn_{r,t}x() - even if it's just in case of INVARIANTS - and makes
these functions more readable.
o In iflib_rx_structures_setup(), only initialize LRO resources if device
and driver have LRO capability in order to not waste memory. Also, free
the LRO resources again if setting them up fails for one of the queues.
However, don't bother invoking iflib_rx_sds_free() in that case because
iflib_rx_structures_setup() doesn't call iflib_rxsd_alloc() either (and
iflib_{device,pseudo}_register() will issue iflib_rx_sds_free() in case
of failure via iflib_rx_structures_free(), but there definitely is some
asymmetry left to be fixed, though).
o Similarly, free LRO resources again in iflib_rx_structures_free().
o In iflib_irq_set_affinity(), handle get_core_offset() errors gracefully
instead of panicking (but only in case of INVARIANTS). This is a follow-
up to r344132, as such driver bugs shouldn't be fatal.
o Likewise, handle unknown iflib_intr_type_t in iflib_irq_alloc_generic()
gracefully, too.
o Bring yet more sanity to iflib_msix_init():
- If the device doesn't provide enough MSI-X vectors or not all vectors
can be allocated so the expected number of queues in addition to admin
interrupts can't be supported, try MSI next (and then INTx) as proper
MSI-X vector distribution can't be assured in such cases. In essence,
this change brings r254008 forward to iflib(4). Also, this is the fix
alluded to in the commit message of r343934.
- If the MSI-X allocation has failed, don't prematurely announce MSI is
going to be used as the latter in fact may not be available either.
- When falling back to MSI, only release the MSI-X table resource again
if it was allocated in iflib_msix_init(), i. e. isn't supplied by the
driver, in the first place.
o In mp_ndesc_handler(), handle unknown type arguments gracefully, too.
PR: 235031 (likely) [1]
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D20175
2019-05-07 08:28:35 +00:00
|
|
|
"failed to allocate %d MSI-X vectors, err: %d\n", vectors,
|
|
|
|
err);
|
|
|
|
if (bar != -1) {
|
|
|
|
bus_release_resource(dev, SYS_RES_MEMORY, bar,
|
|
|
|
ctx->ifc_msix_mem);
|
|
|
|
ctx->ifc_msix_mem = NULL;
|
|
|
|
}
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
o Use iflib_fast_intr_rxtx() also for "legacy" interrupts, i. e. INTx and
MSI. Unlike as with iflib_fast_intr_ctx(), the former will also enqueue
_task_fn_tx() in addition to _task_fn_rx() if appropriate, bringing TCP
TX throughput of EM-class devices on par with the MSI-X case and, thus,
close to wirespeed/pre-iflib(4) times again. [1]
Note that independently of the interrupt type, the UDP performance with
these MACs still is abysmal and nowhere near to where it was before the
conversion of em(4) to iflib(4).
o In iflib_init_locked(), announce which free list failed to set up.
o In _task_fn_tx() when running netmap(4), issue ifdi_intr_enable instead
of the ifdi_tx_queue_intr_enable method in case of a "legacy" interrupt
as the latter is valid with MSI-X only.
o Instead of adding the missing - and apparently convoluted enough that a
DBG_COUNTER_INC was put into a wrong spot in _task_fn_rx() - checks for
ifdi_{r,t}x_queue_intr_enable being available in the MSI-X case also to
iflib_fast_intr_rxtx(), factor these out to iflib_device_register() and
make the checks fail gracefully rather than panic. This avoids invoking
the checks at runtime over and over again in iflib_fast_intr_rxtx() and
_task_fn_{r,t}x() - even if it's just in case of INVARIANTS - and makes
these functions more readable.
o In iflib_rx_structures_setup(), only initialize LRO resources if device
and driver have LRO capability in order to not waste memory. Also, free
the LRO resources again if setting them up fails for one of the queues.
However, don't bother invoking iflib_rx_sds_free() in that case because
iflib_rx_structures_setup() doesn't call iflib_rxsd_alloc() either (and
iflib_{device,pseudo}_register() will issue iflib_rx_sds_free() in case
of failure via iflib_rx_structures_free(), but there definitely is some
asymmetry left to be fixed, though).
o Similarly, free LRO resources again in iflib_rx_structures_free().
o In iflib_irq_set_affinity(), handle get_core_offset() errors gracefully
instead of panicking (but only in case of INVARIANTS). This is a follow-
up to r344132, as such driver bugs shouldn't be fatal.
o Likewise, handle unknown iflib_intr_type_t in iflib_irq_alloc_generic()
gracefully, too.
o Bring yet more sanity to iflib_msix_init():
- If the device doesn't provide enough MSI-X vectors or not all vectors
can be allocated so the expected number of queues in addition to admin
interrupts can't be supported, try MSI next (and then INTx) as proper
MSI-X vector distribution can't be assured in such cases. In essence,
this change brings r254008 forward to iflib(4). Also, this is the fix
alluded to in the commit message of r343934.
- If the MSI-X allocation has failed, don't prematurely announce MSI is
going to be used as the latter in fact may not be available either.
- When falling back to MSI, only release the MSI-X table resource again
if it was allocated in iflib_msix_init(), i. e. isn't supplied by the
driver, in the first place.
o In mp_ndesc_handler(), handle unknown type arguments gracefully, too.
PR: 235031 (likely) [1]
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D20175
2019-05-07 08:28:35 +00:00
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
msi:
|
|
|
|
vectors = pci_msi_count(dev);
|
|
|
|
scctx->isc_nrxqsets = 1;
|
|
|
|
scctx->isc_ntxqsets = 1;
|
|
|
|
scctx->isc_vectors = vectors;
|
|
|
|
if (vectors == 1 && pci_alloc_msi(dev, &vectors) == 0) {
|
|
|
|
device_printf(dev,"Using an MSI interrupt\n");
|
|
|
|
scctx->isc_intr = IFLIB_INTR_MSI;
|
|
|
|
} else {
|
Assorted fixes to MSI-X/MSI/INTx setup in iflib(9):
- In iflib_msix_init(), VMMs with broken MSI-X activation are trying
to be worked around by manually enabling PCIM_MSIXCTRL_MSIX_ENABLE
before calling pci_alloc_msix(9). Apart from constituting a layering
violation, this has the problem of leaving PCIM_MSIXCTRL_MSIX_ENABLE
enabled when falling back to MSI or INTx when e. g. MSI-X is black-
listed and initially also when disabled via hw.pci.enable_msix. The
latter in turn was incorrectly worked around in r325166.
Since r310806, pci(4) itself has code to deal with broken MSI-X
handling of VMMs, so all of these workarounds in iflib(9) can go,
fixing non-working interrupts when falling back to MSI/INTx. In
any case, possibly further adjustments to broken MSI-X activation
of VMMs like enabling r310806 by default in VM environments need to
be placed into pci(4), not iflib(9). [1]
- Also remove the pci_enable_busmaster(9) call from iflib_msix_init(),
which is already more properly invoked from iflib_device_attach().
- When falling back to MSI/INTx, release the MSI-X BAR resource again.
- When falling back to INTx, ensure scctx->isc_vectors is set to 1 and
not to something higher from a device with more than one MSI message
supported.
- Make the nearby ring_state(s) stuff (static) const.
Discussed with: jhb at BSDCan 2018 [1]
Reviewed by: imp, jhb
Differential Revision: https://reviews.freebsd.org/D15729
2018-06-17 20:33:02 +00:00
|
|
|
scctx->isc_vectors = 1;
|
2016-05-18 04:35:58 +00:00
|
|
|
device_printf(dev,"Using a Legacy interrupt\n");
|
|
|
|
scctx->isc_intr = IFLIB_INTR_LEGACY;
|
|
|
|
}
|
|
|
|
|
|
|
|
return (vectors);
|
|
|
|
}
|
|
|
|
|
Assorted fixes to MSI-X/MSI/INTx setup in iflib(9):
- In iflib_msix_init(), VMMs with broken MSI-X activation are trying
to be worked around by manually enabling PCIM_MSIXCTRL_MSIX_ENABLE
before calling pci_alloc_msix(9). Apart from constituting a layering
violation, this has the problem of leaving PCIM_MSIXCTRL_MSIX_ENABLE
enabled when falling back to MSI or INTx when e. g. MSI-X is black-
listed and initially also when disabled via hw.pci.enable_msix. The
latter in turn was incorrectly worked around in r325166.
Since r310806, pci(4) itself has code to deal with broken MSI-X
handling of VMMs, so all of these workarounds in iflib(9) can go,
fixing non-working interrupts when falling back to MSI/INTx. In
any case, possibly further adjustments to broken MSI-X activation
of VMMs like enabling r310806 by default in VM environments need to
be placed into pci(4), not iflib(9). [1]
- Also remove the pci_enable_busmaster(9) call from iflib_msix_init(),
which is already more properly invoked from iflib_device_attach().
- When falling back to MSI/INTx, release the MSI-X BAR resource again.
- When falling back to INTx, ensure scctx->isc_vectors is set to 1 and
not to something higher from a device with more than one MSI message
supported.
- Make the nearby ring_state(s) stuff (static) const.
Discussed with: jhb at BSDCan 2018 [1]
Reviewed by: imp, jhb
Differential Revision: https://reviews.freebsd.org/D15729
2018-06-17 20:33:02 +00:00
|
|
|
static const char *ring_states[] = { "IDLE", "BUSY", "STALLED", "ABDICATED" };
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
static int
|
|
|
|
mp_ring_state_handler(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
|
|
|
int rc;
|
|
|
|
uint16_t *state = ((uint16_t *)oidp->oid_arg1);
|
|
|
|
struct sbuf *sb;
|
Assorted fixes to MSI-X/MSI/INTx setup in iflib(9):
- In iflib_msix_init(), VMMs with broken MSI-X activation are trying
to be worked around by manually enabling PCIM_MSIXCTRL_MSIX_ENABLE
before calling pci_alloc_msix(9). Apart from constituting a layering
violation, this has the problem of leaving PCIM_MSIXCTRL_MSIX_ENABLE
enabled when falling back to MSI or INTx when e. g. MSI-X is black-
listed and initially also when disabled via hw.pci.enable_msix. The
later in turn was incorrectly worked around in r325166.
Since r310806, pci(4) itself has code to deal with broken MSI-X
handling of VMMs, so all of these workarounds in iflib(9) can go,
fixing non-working interrupts when falling back to MSI/INTx. In
any case, possibly further adjustments to broken MSI-X activation
of VMMs like enabling r310806 by default in VM environments need to
be placed into pci(4), not iflib(9). [1]
- Also remove the pci_enable_busmaster(9) call from iflib_msix_init(),
which is already more properly invoked from iflib_device_attach().
- When falling back to MSI/INTx, release the MSI-X BAR resource again.
- When falling back to INTx, ensure scctx->isc_vectors is set to 1 and
not to something higher from a device with more than one MSI message
supported.
- Make the nearby ring_state(s) stuff (static) const.
Discussed with: jhb at BSDCan 2018 [1]
Reviewed by: imp, jhb
Differential Revision: https://reviews.freebsd.org/D15729
2018-06-17 20:33:02 +00:00
|
|
|
const char *ring_state = "UNKNOWN";
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
/* XXX needed ? */
|
|
|
|
rc = sysctl_wire_old_buffer(req, 0);
|
|
|
|
MPASS(rc == 0);
|
|
|
|
if (rc != 0)
|
|
|
|
return (rc);
|
|
|
|
sb = sbuf_new_for_sysctl(NULL, NULL, 80, req);
|
|
|
|
MPASS(sb != NULL);
|
|
|
|
if (sb == NULL)
|
|
|
|
return (ENOMEM);
|
|
|
|
if (state[3] <= 3)
|
|
|
|
ring_state = ring_states[state[3]];
|
|
|
|
|
|
|
|
sbuf_printf(sb, "pidx_head: %04hd pidx_tail: %04hd cidx: %04hd state: %s",
|
|
|
|
state[0], state[1], state[2], ring_state);
|
|
|
|
rc = sbuf_finish(sb);
|
|
|
|
sbuf_delete(sb);
|
|
|
|
return(rc);
|
|
|
|
}
|
|
|
|
|
2016-08-12 21:29:44 +00:00
|
|
|
enum iflib_ndesc_handler {
|
|
|
|
IFLIB_NTXD_HANDLER,
|
|
|
|
IFLIB_NRXD_HANDLER,
|
|
|
|
};
|
2016-05-18 04:35:58 +00:00
|
|
|
|
2016-08-12 21:29:44 +00:00
|
|
|
static int
|
|
|
|
mp_ndesc_handler(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
|
|
|
if_ctx_t ctx = (void *)arg1;
|
|
|
|
enum iflib_ndesc_handler type = arg2;
|
|
|
|
char buf[256] = {0};
|
2017-03-13 22:53:06 +00:00
|
|
|
qidx_t *ndesc;
|
2016-08-12 21:29:44 +00:00
|
|
|
char *p, *next;
|
|
|
|
int nqs, rc, i;
|
|
|
|
|
|
|
|
nqs = 8;
|
|
|
|
switch(type) {
|
|
|
|
case IFLIB_NTXD_HANDLER:
|
|
|
|
ndesc = ctx->ifc_sysctl_ntxds;
|
|
|
|
if (ctx->ifc_sctx)
|
|
|
|
nqs = ctx->ifc_sctx->isc_ntxqs;
|
|
|
|
break;
|
|
|
|
case IFLIB_NRXD_HANDLER:
|
|
|
|
ndesc = ctx->ifc_sysctl_nrxds;
|
|
|
|
if (ctx->ifc_sctx)
|
|
|
|
nqs = ctx->ifc_sctx->isc_nrxqs;
|
|
|
|
break;
|
2018-05-04 18:57:05 +00:00
|
|
|
default:
|
o Use iflib_fast_intr_rxtx() also for "legacy" interrupts, i. e. INTx and
MSI. Unlike as with iflib_fast_intr_ctx(), the former will also enqueue
_task_fn_tx() in addition to _task_fn_rx() if appropriate, bringing TCP
TX throughput of EM-class devices on par with the MSI-X case and, thus,
close to wirespeed/pre-iflib(4) times again. [1]
Note that independently of the interrupt type, the UDP performance with
these MACs still is abysmal and nowhere near to where it was before the
conversion of em(4) to iflib(4).
o In iflib_init_locked(), announce which free list failed to set up.
o In _task_fn_tx() when running netmap(4), issue ifdi_intr_enable instead
of the ifdi_tx_queue_intr_enable method in case of a "legacy" interrupt
as the latter is valid with MSI-X only.
o Instead of adding the missing - and apparently convoluted enough that a
DBG_COUNTER_INC was put into a wrong spot in _task_fn_rx() - checks for
ifdi_{r,t}x_queue_intr_enable being available in the MSI-X case also to
iflib_fast_intr_rxtx(), factor these out to iflib_device_register() and
make the checks fail gracefully rather than panic. This avoids invoking
the checks at runtime over and over again in iflib_fast_intr_rxtx() and
_task_fn_{r,t}x() - even if it's just in case of INVARIANTS - and makes
these functions more readable.
o In iflib_rx_structures_setup(), only initialize LRO resources if device
and driver have LRO capability in order to not waste memory. Also, free
the LRO resources again if setting them up fails for one of the queues.
However, don't bother invoking iflib_rx_sds_free() in that case because
iflib_rx_structures_setup() doesn't call iflib_rxsd_alloc() either (and
iflib_{device,pseudo}_register() will issue iflib_rx_sds_free() in case
of failure via iflib_rx_structures_free(), but there definitely is some
asymmetry left to be fixed, though).
o Similarly, free LRO resources again in iflib_rx_structures_free().
o In iflib_irq_set_affinity(), handle get_core_offset() errors gracefully
instead of panicing (but only in case of INVARIANTS). This is a follow-
up to r344132, as such driver bugs shouldn't be fatal.
o Likewise, handle unknown iflib_intr_type_t in iflib_irq_alloc_generic()
gracefully, too.
o Bring yet more sanity to iflib_msix_init():
- If the device doesn't provide enough MSI-X vectors or not all vectors
can be allocate so the expected number of queues in addition to admin
interrupts can't be supported, try MSI next (and then INTx) as proper
MSI-X vector distribution can't be assured in such cases. In essence,
this change brings r254008 forward to iflib(4). Also, this is the fix
alluded to in the commit message of r343934.
- If the MSI-X allocation has failed, don't prematurely announce MSI is
going to be used as the latter in fact may not be available either.
- When falling back to MSI, only release the MSI-X table resource again
if it was allocated in iflib_msix_init(), i. e. isn't supplied by the
driver, in the first place.
o In mp_ndesc_handler(), handle unknown type arguments gracefully, too.
PR: 235031 (likely) [1]
Reviewed by: shurd
Differential Revision: https://reviews.freebsd.org/D20175
2019-05-07 08:28:35 +00:00
|
|
|
printf("%s: unhandled type\n", __func__);
|
|
|
|
return (EINVAL);
|
2016-08-12 21:29:44 +00:00
|
|
|
}
|
|
|
|
if (nqs == 0)
|
|
|
|
nqs = 8;
|
|
|
|
|
|
|
|
for (i=0; i<8; i++) {
|
|
|
|
if (i >= nqs)
|
|
|
|
break;
|
|
|
|
if (i)
|
|
|
|
strcat(buf, ",");
|
|
|
|
sprintf(strchr(buf, 0), "%d", ndesc[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
rc = sysctl_handle_string(oidp, buf, sizeof(buf), req);
|
|
|
|
if (rc || req->newptr == NULL)
|
|
|
|
return rc;
|
|
|
|
|
|
|
|
for (i = 0, next = buf, p = strsep(&next, " ,"); i < 8 && p;
|
|
|
|
i++, p = strsep(&next, " ,")) {
|
|
|
|
ndesc[i] = strtoul(p, NULL, 10);
|
|
|
|
}
|
|
|
|
|
|
|
|
return(rc);
|
|
|
|
}
|
2016-05-18 04:35:58 +00:00
|
|
|
|
|
|
|
/* Size of the buffer used to format per-queue sysctl node names. */
#define NAME_BUFLEN 32
|
|
|
|
static void
|
|
|
|
iflib_add_device_sysctl_pre(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
device_t dev = iflib_get_dev(ctx);
|
|
|
|
struct sysctl_oid_list *child, *oid_list;
|
|
|
|
struct sysctl_ctx_list *ctx_list;
|
|
|
|
struct sysctl_oid *node;
|
|
|
|
|
|
|
|
ctx_list = device_get_sysctl_ctx(dev);
|
|
|
|
child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
|
|
|
|
ctx->ifc_sysctl_node = node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, "iflib",
|
2020-02-26 14:26:36 +00:00
|
|
|
CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "IFLIB fields");
|
2016-05-18 04:35:58 +00:00
|
|
|
oid_list = SYSCTL_CHILDREN(node);
|
|
|
|
|
2019-03-19 23:44:26 +00:00
|
|
|
SYSCTL_ADD_CONST_STRING(ctx_list, oid_list, OID_AUTO, "driver_version",
|
|
|
|
CTLFLAG_RD, ctx->ifc_sctx->isc_driver_version,
|
2016-08-12 21:29:44 +00:00
|
|
|
"driver version");
|
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_ntxqs",
|
|
|
|
CTLFLAG_RWTUN, &ctx->ifc_sysctl_ntxqs, 0,
|
|
|
|
"# of txqs to use, 0 => use default #");
|
|
|
|
SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_nrxqs",
|
2016-08-12 21:29:44 +00:00
|
|
|
CTLFLAG_RWTUN, &ctx->ifc_sysctl_nrxqs, 0,
|
|
|
|
"# of rxqs to use, 0 => use default #");
|
|
|
|
SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_qs_enable",
|
|
|
|
CTLFLAG_RWTUN, &ctx->ifc_sysctl_qs_eq_override, 0,
|
|
|
|
"permit #txq != #rxq");
|
2017-09-23 01:37:01 +00:00
|
|
|
SYSCTL_ADD_INT(ctx_list, oid_list, OID_AUTO, "disable_msix",
|
2017-04-04 21:03:34 +00:00
|
|
|
CTLFLAG_RWTUN, &ctx->ifc_softc_ctx.isc_disable_msix, 0,
|
2019-01-30 13:21:26 +00:00
|
|
|
"disable MSI-X (default 0)");
|
2017-09-23 01:37:01 +00:00
|
|
|
SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "rx_budget",
|
|
|
|
CTLFLAG_RWTUN, &ctx->ifc_sysctl_rx_budget, 0,
|
2019-05-06 20:56:41 +00:00
|
|
|
"set the RX budget");
|
2018-07-20 17:45:26 +00:00
|
|
|
SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "tx_abdicate",
|
|
|
|
CTLFLAG_RWTUN, &ctx->ifc_sysctl_tx_abdicate, 0,
|
2019-05-06 20:56:41 +00:00
|
|
|
"cause TX to abdicate instead of running to completion");
|
2019-04-25 21:24:56 +00:00
|
|
|
ctx->ifc_sysctl_core_offset = CORE_OFFSET_UNSPECIFIED;
|
|
|
|
SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "core_offset",
|
|
|
|
CTLFLAG_RDTUN, &ctx->ifc_sysctl_core_offset, 0,
|
|
|
|
"offset to start using cores at");
|
|
|
|
SYSCTL_ADD_U8(ctx_list, oid_list, OID_AUTO, "separate_txrx",
|
|
|
|
CTLFLAG_RDTUN, &ctx->ifc_sysctl_separate_txrx, 0,
|
|
|
|
"use separate cores for TX and RX");
|
iflib: Improve mapping of TX/RX queues to CPUs
iflib now supports mapping each (TX,RX) queue pair to the same CPU
(default), to separate CPUs, or to a pair of physical and logical CPUs
that share the same L2 cache. The mapping mechanism supports unequal
numbers of TX and RX queues, with the excess queues always being
mapped to consecutive physical CPUs. When the platform cannot
distinguish between physical and logical CPUs, all are treated as
physical CPUs. See the comment on get_cpuid_for_queue() for the
entire matrix.
The following device-specific tunables influence the mapping process:
dev.<device>.<unit>.iflib.core_offset (existing)
dev.<device>.<unit>.iflib.separate_txrx (existing)
dev.<device>.<unit>.iflib.use_logical_cores (new)
The following new, read-only sysctls provide visibility of the mapping
results:
dev.<device>.<unit>.iflib.{t,r}xq<n>.cpu
When an iflib driver allocates TX softirqs without providing reference
RX IRQs, iflib now binds those TX softirqs to CPUs using the above
mapping mechanism (that is, treats them as if they were TX IRQs).
Previously, such bindings were left up to the grouptaskqueue code and
thus fell outside of the iflib CPU mapping strategy.
Reviewed by: kbowling
Tested by: olivier, pkelsey
MFC after: 3 weeks
Differential Revision: https://reviews.freebsd.org/D24094
2021-04-26 04:25:59 +00:00
|
|
|
SYSCTL_ADD_U8(ctx_list, oid_list, OID_AUTO, "use_logical_cores",
|
|
|
|
CTLFLAG_RDTUN, &ctx->ifc_sysctl_use_logical_cores, 0,
|
|
|
|
"try to make use of logical cores for TX and RX");
|
2016-08-12 21:29:44 +00:00
|
|
|
|
|
|
|
/* XXX change for per-queue sizes */
|
|
|
|
SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_ntxds",
|
2020-02-26 14:26:36 +00:00
|
|
|
CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, ctx,
|
|
|
|
IFLIB_NTXD_HANDLER, mp_ndesc_handler, "A",
|
|
|
|
"list of # of TX descriptors to use, 0 = use default #");
|
2016-08-12 21:29:44 +00:00
|
|
|
SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_nrxds",
|
2020-02-26 14:26:36 +00:00
|
|
|
CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, ctx,
|
|
|
|
IFLIB_NRXD_HANDLER, mp_ndesc_handler, "A",
|
|
|
|
"list of # of RX descriptors to use, 0 = use default #");
|
2016-05-18 04:35:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
iflib_add_device_sysctl_post(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
if_shared_ctx_t sctx = ctx->ifc_sctx;
|
|
|
|
if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
|
|
|
|
device_t dev = iflib_get_dev(ctx);
|
|
|
|
struct sysctl_oid_list *child;
|
|
|
|
struct sysctl_ctx_list *ctx_list;
|
|
|
|
iflib_fl_t fl;
|
|
|
|
iflib_txq_t txq;
|
|
|
|
iflib_rxq_t rxq;
|
|
|
|
int i, j;
|
|
|
|
char namebuf[NAME_BUFLEN];
|
|
|
|
char *qfmt;
|
|
|
|
struct sysctl_oid *queue_node, *fl_node, *node;
|
|
|
|
struct sysctl_oid_list *queue_list, *fl_list;
|
|
|
|
ctx_list = device_get_sysctl_ctx(dev);
|
|
|
|
|
|
|
|
node = ctx->ifc_sysctl_node;
|
|
|
|
child = SYSCTL_CHILDREN(node);
|
|
|
|
|
|
|
|
if (scctx->isc_ntxqsets > 100)
|
|
|
|
qfmt = "txq%03d";
|
|
|
|
else if (scctx->isc_ntxqsets > 10)
|
|
|
|
qfmt = "txq%02d";
|
|
|
|
else
|
|
|
|
qfmt = "txq%d";
|
|
|
|
for (i = 0, txq = ctx->ifc_txqs; i < scctx->isc_ntxqsets; i++, txq++) {
|
|
|
|
snprintf(namebuf, NAME_BUFLEN, qfmt, i);
|
|
|
|
queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf,
|
2020-02-26 14:26:36 +00:00
|
|
|
CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Queue Name");
|
2016-05-18 04:35:58 +00:00
|
|
|
queue_list = SYSCTL_CHILDREN(queue_node);
|
iflib: Improve mapping of TX/RX queues to CPUs
iflib now supports mapping each (TX,RX) queue pair to the same CPU
(default), to separate CPUs, or to a pair of physical and logical CPUs
that share the same L2 cache. The mapping mechanism supports unequal
numbers of TX and RX queues, with the excess queues always being
mapped to consecutive physical CPUs. When the platform cannot
distinguish between physical and logical CPUs, all are treated as
physical CPUs. See the comment on get_cpuid_for_queue() for the
entire matrix.
The following device-specific tunables influence the mapping process:
dev.<device>.<unit>.iflib.core_offset (existing)
dev.<device>.<unit>.iflib.separate_txrx (existing)
dev.<device>.<unit>.iflib.use_logical_cores (new)
The following new, read-only sysctls provide visibility of the mapping
results:
dev.<device>.<unit>.iflib.{t,r}xq<n>.cpu
When an iflib driver allocates TX softirqs without providing reference
RX IRQs, iflib now binds those TX softirqs to CPUs using the above
mapping mechanism (that is, treats them as if they were TX IRQs).
Previously, such bindings were left up to the grouptaskqueue code and
thus fell outside of the iflib CPU mapping strategy.
Reviewed by: kbowling
Tested by: olivier, pkelsey
MFC after: 3 weeks
Differential Revision: https://reviews.freebsd.org/D24094
2021-04-26 04:25:59 +00:00
|
|
|
SYSCTL_ADD_INT(ctx_list, queue_list, OID_AUTO, "cpu",
|
|
|
|
CTLFLAG_RD,
|
|
|
|
&txq->ift_task.gt_cpu, 0, "cpu this queue is bound to");
|
2016-05-18 04:35:58 +00:00
|
|
|
#if MEMORY_LOGGING
|
|
|
|
SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_dequeued",
|
|
|
|
CTLFLAG_RD,
|
|
|
|
&txq->ift_dequeued, "total mbufs freed");
|
|
|
|
SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_enqueued",
|
|
|
|
CTLFLAG_RD,
|
|
|
|
&txq->ift_enqueued, "total mbufs enqueued");
|
|
|
|
#endif
|
|
|
|
SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "mbuf_defrag",
|
|
|
|
CTLFLAG_RD,
|
|
|
|
&txq->ift_mbuf_defrag, "# of times m_defrag was called");
|
|
|
|
SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "m_pullups",
|
|
|
|
CTLFLAG_RD,
|
|
|
|
&txq->ift_pullups, "# of times m_pullup was called");
|
|
|
|
SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "mbuf_defrag_failed",
|
|
|
|
CTLFLAG_RD,
|
|
|
|
&txq->ift_mbuf_defrag_failed, "# of times m_defrag failed");
|
|
|
|
SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "no_desc_avail",
|
|
|
|
CTLFLAG_RD,
|
2016-08-12 21:29:44 +00:00
|
|
|
&txq->ift_no_desc_avail, "# of times no descriptors were available");
|
2016-05-18 04:35:58 +00:00
|
|
|
SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "tx_map_failed",
|
|
|
|
CTLFLAG_RD,
|
2019-05-06 20:56:41 +00:00
|
|
|
&txq->ift_map_failed, "# of times DMA map failed");
|
2016-05-18 04:35:58 +00:00
|
|
|
SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txd_encap_efbig",
|
|
|
|
CTLFLAG_RD,
|
|
|
|
&txq->ift_txd_encap_efbig, "# of times txd_encap returned EFBIG");
|
|
|
|
SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "no_tx_dma_setup",
|
|
|
|
CTLFLAG_RD,
|
|
|
|
&txq->ift_no_tx_dma_setup, "# of times map failed for other than EFBIG");
|
|
|
|
SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_pidx",
|
|
|
|
CTLFLAG_RD,
|
|
|
|
&txq->ift_pidx, 1, "Producer Index");
|
|
|
|
SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_cidx",
|
|
|
|
CTLFLAG_RD,
|
|
|
|
&txq->ift_cidx, 1, "Consumer Index");
|
|
|
|
SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_cidx_processed",
|
|
|
|
CTLFLAG_RD,
|
|
|
|
&txq->ift_cidx_processed, 1, "Consumer Index seen by credit update");
|
|
|
|
SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_in_use",
|
|
|
|
CTLFLAG_RD,
|
|
|
|
&txq->ift_in_use, 1, "descriptors in use");
|
|
|
|
SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_processed",
|
|
|
|
CTLFLAG_RD,
|
|
|
|
&txq->ift_processed, "descriptors procesed for clean");
|
|
|
|
SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_cleaned",
|
|
|
|
CTLFLAG_RD,
|
|
|
|
&txq->ift_cleaned, "total cleaned");
|
|
|
|
SYSCTL_ADD_PROC(ctx_list, queue_list, OID_AUTO, "ring_state",
|
2020-02-26 14:26:36 +00:00
|
|
|
CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
|
|
|
|
__DEVOLATILE(uint64_t *, &txq->ift_br->state), 0,
|
|
|
|
mp_ring_state_handler, "A", "soft ring state");
|
2016-05-18 04:35:58 +00:00
|
|
|
SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_enqueues",
|
2017-03-13 22:53:06 +00:00
|
|
|
CTLFLAG_RD, &txq->ift_br->enqueues,
|
2016-05-18 04:35:58 +00:00
|
|
|
"# of enqueues to the mp_ring for this queue");
|
|
|
|
SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_drops",
|
2017-03-13 22:53:06 +00:00
|
|
|
CTLFLAG_RD, &txq->ift_br->drops,
|
2016-05-18 04:35:58 +00:00
|
|
|
"# of drops in the mp_ring for this queue");
|
|
|
|
SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_starts",
|
2017-03-13 22:53:06 +00:00
|
|
|
CTLFLAG_RD, &txq->ift_br->starts,
|
2016-05-18 04:35:58 +00:00
|
|
|
"# of normal consumer starts in the mp_ring for this queue");
|
|
|
|
SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_stalls",
|
2017-03-13 22:53:06 +00:00
|
|
|
CTLFLAG_RD, &txq->ift_br->stalls,
|
2016-05-18 04:35:58 +00:00
|
|
|
"# of consumer stalls in the mp_ring for this queue");
|
|
|
|
SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_restarts",
|
2017-03-13 22:53:06 +00:00
|
|
|
CTLFLAG_RD, &txq->ift_br->restarts,
|
2016-05-18 04:35:58 +00:00
|
|
|
"# of consumer restarts in the mp_ring for this queue");
|
|
|
|
SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_abdications",
|
2017-03-13 22:53:06 +00:00
|
|
|
CTLFLAG_RD, &txq->ift_br->abdications,
|
2016-05-18 04:35:58 +00:00
|
|
|
"# of consumer abdications in the mp_ring for this queue");
|
|
|
|
}
|
|
|
|
|
|
|
|
if (scctx->isc_nrxqsets > 100)
|
|
|
|
qfmt = "rxq%03d";
|
|
|
|
else if (scctx->isc_nrxqsets > 10)
|
|
|
|
qfmt = "rxq%02d";
|
|
|
|
else
|
|
|
|
qfmt = "rxq%d";
|
|
|
|
for (i = 0, rxq = ctx->ifc_rxqs; i < scctx->isc_nrxqsets; i++, rxq++) {
|
|
|
|
snprintf(namebuf, NAME_BUFLEN, qfmt, i);
|
|
|
|
queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf,
|
2020-02-26 14:26:36 +00:00
|
|
|
CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Queue Name");
|
2016-05-18 04:35:58 +00:00
|
|
|
queue_list = SYSCTL_CHILDREN(queue_node);
|
iflib: Improve mapping of TX/RX queues to CPUs
iflib now supports mapping each (TX,RX) queue pair to the same CPU
(default), to separate CPUs, or to a pair of physical and logical CPUs
that share the same L2 cache. The mapping mechanism supports unequal
numbers of TX and RX queues, with the excess queues always being
mapped to consecutive physical CPUs. When the platform cannot
distinguish between physical and logical CPUs, all are treated as
physical CPUs. See the comment on get_cpuid_for_queue() for the
entire matrix.
The following device-specific tunables influence the mapping process:
dev.<device>.<unit>.iflib.core_offset (existing)
dev.<device>.<unit>.iflib.separate_txrx (existing)
dev.<device>.<unit>.iflib.use_logical_cores (new)
The following new, read-only sysctls provide visibility of the mapping
results:
dev.<device>.<unit>.iflib.{t,r}xq<n>.cpu
When an iflib driver allocates TX softirqs without providing reference
RX IRQs, iflib now binds those TX softirqs to CPUs using the above
mapping mechanism (that is, treats them as if they were TX IRQs).
Previously, such bindings were left up to the grouptaskqueue code and
thus fell outside of the iflib CPU mapping strategy.
Reviewed by: kbowling
Tested by: olivier, pkelsey
MFC after: 3 weeks
Differential Revision: https://reviews.freebsd.org/D24094
2021-04-26 04:25:59 +00:00
|
|
|
SYSCTL_ADD_INT(ctx_list, queue_list, OID_AUTO, "cpu",
|
|
|
|
CTLFLAG_RD,
|
|
|
|
&rxq->ifr_task.gt_cpu, 0, "cpu this queue is bound to");
|
2016-08-12 21:29:44 +00:00
|
|
|
if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
|
2016-05-18 04:35:58 +00:00
|
|
|
SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "rxq_cq_cidx",
|
|
|
|
CTLFLAG_RD,
|
|
|
|
&rxq->ifr_cq_cidx, 1, "Consumer Index");
|
|
|
|
}
|
2016-11-18 04:19:21 +00:00
|
|
|
|
2016-05-18 04:35:58 +00:00
|
|
|
for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) {
|
|
|
|
snprintf(namebuf, NAME_BUFLEN, "rxq_fl%d", j);
|
|
|
|
fl_node = SYSCTL_ADD_NODE(ctx_list, queue_list, OID_AUTO, namebuf,
|
2020-02-26 14:26:36 +00:00
|
|
|
CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "freelist Name");
|
2016-05-18 04:35:58 +00:00
|
|
|
fl_list = SYSCTL_CHILDREN(fl_node);
|
|
|
|
SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "pidx",
|
|
|
|
CTLFLAG_RD,
|
|
|
|
&fl->ifl_pidx, 1, "Producer Index");
|
|
|
|
SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "cidx",
|
|
|
|
CTLFLAG_RD,
|
|
|
|
&fl->ifl_cidx, 1, "Consumer Index");
|
|
|
|
SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "credits",
|
|
|
|
CTLFLAG_RD,
|
|
|
|
&fl->ifl_credits, 1, "credits available");
|
2020-03-14 19:56:46 +00:00
|
|
|
SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "buf_size",
|
|
|
|
CTLFLAG_RD,
|
|
|
|
&fl->ifl_buf_size, 1, "buffer size");
|
2016-05-18 04:35:58 +00:00
|
|
|
#if MEMORY_LOGGING
|
|
|
|
SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_m_enqueued",
|
|
|
|
CTLFLAG_RD,
|
|
|
|
&fl->ifl_m_enqueued, "mbufs allocated");
|
|
|
|
SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_m_dequeued",
|
|
|
|
CTLFLAG_RD,
|
|
|
|
&fl->ifl_m_dequeued, "mbufs freed");
|
|
|
|
SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_cl_enqueued",
|
|
|
|
CTLFLAG_RD,
|
|
|
|
&fl->ifl_cl_enqueued, "clusters allocated");
|
|
|
|
SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_cl_dequeued",
|
|
|
|
CTLFLAG_RD,
|
|
|
|
&fl->ifl_cl_dequeued, "clusters freed");
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
2017-03-13 22:53:06 +00:00
|
|
|
|
2018-10-12 22:40:54 +00:00
|
|
|
void
|
|
|
|
iflib_request_reset(if_ctx_t ctx)
|
|
|
|
{
|
|
|
|
|
|
|
|
STATE_LOCK(ctx);
|
|
|
|
ctx->ifc_flags |= IFC_DO_RESET;
|
|
|
|
STATE_UNLOCK(ctx);
|
|
|
|
}
|
|
|
|
|
2017-03-13 22:53:06 +00:00
|
|
|
#ifndef __NO_STRICT_ALIGNMENT
|
|
|
|
static struct mbuf *
|
|
|
|
iflib_fixup_rx(struct mbuf *m)
|
|
|
|
{
|
|
|
|
struct mbuf *n;
|
|
|
|
|
|
|
|
if (m->m_len <= (MCLBYTES - ETHER_HDR_LEN)) {
|
|
|
|
bcopy(m->m_data, m->m_data + ETHER_HDR_LEN, m->m_len);
|
|
|
|
m->m_data += ETHER_HDR_LEN;
|
|
|
|
n = m;
|
|
|
|
} else {
|
|
|
|
MGETHDR(n, M_NOWAIT, MT_DATA);
|
|
|
|
if (n == NULL) {
|
|
|
|
m_freem(m);
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
bcopy(m->m_data, n->m_data, ETHER_HDR_LEN);
|
|
|
|
m->m_data += ETHER_HDR_LEN;
|
|
|
|
m->m_len -= ETHER_HDR_LEN;
|
|
|
|
n->m_len = ETHER_HDR_LEN;
|
|
|
|
M_MOVE_PKTHDR(n, m);
|
|
|
|
n->m_next = m;
|
|
|
|
}
|
|
|
|
return (n);
|
|
|
|
}
|
|
|
|
#endif
|
2018-05-06 00:57:52 +00:00
|
|
|
|
2019-10-17 16:23:03 +00:00
|
|
|
#ifdef DEBUGNET
|
2018-05-06 00:57:52 +00:00
|
|
|
static void
|
2019-10-17 16:23:03 +00:00
|
|
|
iflib_debugnet_init(if_t ifp, int *nrxr, int *ncl, int *clsize)
|
2018-05-06 00:57:52 +00:00
|
|
|
{
|
|
|
|
if_ctx_t ctx;
|
|
|
|
|
|
|
|
ctx = if_getsoftc(ifp);
|
|
|
|
CTX_LOCK(ctx);
|
|
|
|
*nrxr = NRXQSETS(ctx);
|
|
|
|
*ncl = ctx->ifc_rxqs[0].ifr_fl->ifl_size;
|
|
|
|
*clsize = ctx->ifc_rxqs[0].ifr_fl->ifl_buf_size;
|
|
|
|
CTX_UNLOCK(ctx);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2019-10-17 16:23:03 +00:00
|
|
|
iflib_debugnet_event(if_t ifp, enum debugnet_ev event)
|
2018-05-06 00:57:52 +00:00
|
|
|
{
|
|
|
|
if_ctx_t ctx;
|
|
|
|
if_softc_ctx_t scctx;
|
|
|
|
iflib_fl_t fl;
|
|
|
|
iflib_rxq_t rxq;
|
|
|
|
int i, j;
|
|
|
|
|
|
|
|
ctx = if_getsoftc(ifp);
|
|
|
|
scctx = &ctx->ifc_softc_ctx;
|
|
|
|
|
|
|
|
switch (event) {
|
2019-10-17 16:23:03 +00:00
|
|
|
case DEBUGNET_START:
|
2018-05-06 00:57:52 +00:00
|
|
|
for (i = 0; i < scctx->isc_nrxqsets; i++) {
|
|
|
|
rxq = &ctx->ifc_rxqs[i];
|
|
|
|
for (j = 0; j < rxq->ifr_nfl; j++) {
|
|
|
|
fl = rxq->ifr_fl;
|
|
|
|
fl->ifl_zone = m_getzone(fl->ifl_buf_size);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
iflib_no_tx_batch = 1;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2019-10-17 16:23:03 +00:00
|
|
|
iflib_debugnet_transmit(if_t ifp, struct mbuf *m)
|
2018-05-06 00:57:52 +00:00
|
|
|
{
|
|
|
|
if_ctx_t ctx;
|
|
|
|
iflib_txq_t txq;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
ctx = if_getsoftc(ifp);
|
|
|
|
if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
|
|
|
|
IFF_DRV_RUNNING)
|
|
|
|
return (EBUSY);
|
|
|
|
|
|
|
|
txq = &ctx->ifc_txqs[0];
|
|
|
|
error = iflib_encap(txq, &m);
|
|
|
|
if (error == 0)
|
2020-12-19 01:08:33 +00:00
|
|
|
(void)iflib_txd_db_check(txq, true);
|
2018-05-06 00:57:52 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2019-10-17 16:23:03 +00:00
|
|
|
iflib_debugnet_poll(if_t ifp, int count)
|
2018-05-06 00:57:52 +00:00
|
|
|
{
|
2020-01-23 01:27:58 +00:00
|
|
|
struct epoch_tracker et;
|
2018-05-06 00:57:52 +00:00
|
|
|
if_ctx_t ctx;
|
|
|
|
if_softc_ctx_t scctx;
|
|
|
|
iflib_txq_t txq;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
ctx = if_getsoftc(ifp);
|
|
|
|
scctx = &ctx->ifc_softc_ctx;
|
|
|
|
|
|
|
|
if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
|
|
|
|
IFF_DRV_RUNNING)
|
|
|
|
return (EBUSY);
|
|
|
|
|
|
|
|
txq = &ctx->ifc_txqs[0];
|
|
|
|
(void)iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
|
|
|
|
|
2020-01-23 01:27:58 +00:00
|
|
|
NET_EPOCH_ENTER(et);
|
2018-05-06 00:57:52 +00:00
|
|
|
for (i = 0; i < scctx->isc_nrxqsets; i++)
|
|
|
|
(void)iflib_rxeof(&ctx->ifc_rxqs[i], 16 /* XXX */);
|
2020-01-23 01:27:58 +00:00
|
|
|
NET_EPOCH_EXIT(et);
|
2018-05-06 00:57:52 +00:00
|
|
|
return (0);
|
|
|
|
}
|
2019-10-17 16:23:03 +00:00
|
|
|
#endif /* DEBUGNET */
|