diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile
index e5fa955f6cc9..0d5a780970e4 100644
--- a/share/man/man4/Makefile
+++ b/share/man/man4/Makefile
@@ -253,6 +253,7 @@ MAN=	aac.4 \
 	net80211.4 \
 	netgraph.4 \
 	netintro.4 \
+	netmap.4 \
 	${_nfe.4} \
 	${_nfsmb.4} \
 	ng_async.4 \
diff --git a/share/man/man4/netmap.4 b/share/man/man4/netmap.4
new file mode 100644
index 000000000000..8b646f9fa070
--- /dev/null
+++ b/share/man/man4/netmap.4
@@ -0,0 +1,300 @@
+.\" Copyright (c) 2011 Matteo Landi, Luigi Rizzo, Universita` di Pisa
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" This document is derived in part from the enet man page (enet.4)
+.\" distributed with 4.3BSD Unix.
+.\"
+.\" $FreeBSD$
+.\" $Id: netmap.4 9662 2011-11-16 13:18:06Z luigi $
+.\" (derived from stable/8/share/man/man4/bpf.4 181694 2008-08-13 17:45:06Z ed $)
+.\"
+.Dd November 16, 2011
+.Dt NETMAP 4
+.Os
+.Sh NAME
+.Nm netmap
+.Nd a framework for fast packet I/O
+.Sh SYNOPSIS
+.Cd device netmap
+.Sh DESCRIPTION
+.Nm
+is a framework for fast and safe access to network devices
+(reaching 14.88 Mpps at less than 1 GHz).
+.Nm
+uses memory-mapped buffers and metadata
+(buffer indexes and lengths) to communicate with the kernel,
+which is in charge of validating information through
+.Pa ioctl()
+and
+.Pa select()/poll() .
+.Nm
+can exploit the parallelism in multiqueue devices and
+multicore systems.
+.Pp
+.Nm
+requires explicit support in device drivers.
+For a list of supported devices, see the end of this manual page.
+.Sh OPERATION
+.Nm
+clients must first
+.Pa open("/dev/netmap") ,
+and then issue an
+.Pa ioctl(...,NIOCREGIF,...)
+to bind the file descriptor to a network device.
+.Pp
+When a device is put in
+.Nm
+mode, its data path is disconnected from the host stack.
+The processes owning the file descriptor
+can exchange packets with the device, or with the host stack,
+through an mmapped memory region that contains pre-allocated
+buffers and metadata.
+.Pp
+Non-blocking I/O is done with special
+.Pa ioctl()'s ,
+whereas the file descriptor can be passed to
+.Pa select()/poll()
+to be notified about incoming packets or available transmit buffers.
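+.Pp
+A minimal session sketch follows (error handling omitted; the
+structures, requests and macros used here are described in the
+sections below):
+.Bd -literal
+struct nmreq nmr;
+
+fd = open("/dev/netmap", O_RDWR);
+bzero(&nmr, sizeof(nmr));
+strcpy(nmr.nr_name, "em0");	/* device to bind */
+ioctl(fd, NIOCREGIF, &nmr);	/* put em0 in netmap mode */
+mem = mmap(0, nmr.nr_memsize,	/* map buffers and rings */
+    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+.Ed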
+.Ss Data structures
+All data structures for all devices in
+.Nm
+mode are in a memory
+region shared by the kernel and all processes
+that open
+.Pa /dev/netmap
+(NOTE: visibility may be restricted in future implementations).
+All references between the shared data structures
+are relative (offsets or indexes).  Some macros help convert
+them into actual pointers.
+.Pp
+The data structures in shared memory are the following:
+.Pp
+.Bl -tag -width XXX
+.It Dv struct netmap_if (one per interface)
+indicates the number of rings supported by an interface, their
+sizes, and the offsets of the
+.Pa netmap_rings
+associated with the interface.
+The offset of a
+.Pa struct netmap_if
+in the shared memory region is indicated by the
+.Pa nr_offset
+field in the structure returned by
+.Pa NIOCREGIF
+(see below).
+.Bd -literal
+struct netmap_if {
+    char          ni_name[IFNAMSIZ]; /* name of the interface. */
+    const u_int   ni_num_queues;     /* number of hw ring pairs */
+    const ssize_t ring_ofs[];        /* offset of tx and rx rings */
+};
+.Ed
+.It Dv struct netmap_ring (one per ring)
+contains the index of the current read or write slot (cur),
+the number of slots available for reception or transmission (avail),
+and an array of
+.Pa slots
+describing the buffers.
+There is one ring pair for each of the N hardware ring pairs
+supported by the card (numbered 0..N-1), plus
+one ring pair (numbered N) for packets from/to the host stack.
+.Bd -literal
+struct netmap_ring {
+    const ssize_t      buf_ofs;
+    const uint32_t     num_slots;   /* number of slots in the ring. */
+    uint32_t           avail;       /* number of usable slots */
+    uint32_t           cur;         /* 'current' index for the user side */
+
+    const uint16_t     nr_buf_size;
+    uint16_t           flags;
+    struct netmap_slot slot[0];     /* array of slots. */
+};
+.Ed
+.It Dv struct netmap_slot (one per packet)
+contains the metadata for a packet: a buffer index (buf_idx),
+a buffer length (len), and some flags.
+.Bd -literal
+struct netmap_slot {
+    uint32_t buf_idx; /* buffer index */
+    uint16_t len;     /* packet length */
+    uint16_t flags;   /* buf changed, etc. */
+#define NS_BUF_CHANGED 0x0001 /* must resync, buffer changed */
+#define NS_REPORT      0x0002 /* tell hw to report results
+                               * e.g. by generating an interrupt
+                               */
+};
+.Ed
+.It Dv packet buffers
+are fixed-size (approximately 2k) buffers allocated by the kernel
+that contain packet data.  Buffer addresses are computed through
+macros.
+.El
+.Pp
+Some macros support access to objects in the shared memory
+region.  In particular:
+.Bd -literal
+struct netmap_if *nifp;
+...
+struct netmap_ring *txring = NETMAP_TXRING(nifp, i);
+struct netmap_ring *rxring = NETMAP_RXRING(nifp, i);
+int idx = txring->slot[txring->cur].buf_idx;
+char *buf = NETMAP_BUF(txring, idx);
+.Ed
+.Ss IOCTLS
+.Nm
+supports a few ioctl() requests: some synchronize the state of
+the rings between the kernel and user processes, while others
+query and configure the interface.
+The former do not require any argument, whereas the latter
+use a
+.Pa struct nmreq
+defined as follows:
+.Bd -literal
+struct nmreq {
+    char     nr_name[IFNAMSIZ];
+    uint32_t nr_offset;   /* nifp offset in the shared region */
+    uint32_t nr_memsize;  /* size of the shared region */
+    uint32_t nr_numdescs; /* descriptors per queue */
+    uint16_t nr_numqueues;
+    uint16_t nr_ringid;   /* ring(s) we care about */
+#define NETMAP_HW_RING    0x4000 /* low bits indicate one hw ring */
+#define NETMAP_SW_RING    0x2000 /* we process the sw ring */
+#define NETMAP_NO_TX_POLL 0x1000 /* no gratuitous txsync on poll */
+#define NETMAP_RING_MASK  0xfff  /* the actual ring number */
+};
+.Ed
+A device descriptor obtained through
+.Pa /dev/netmap
+also supports the ioctl requests supported by ordinary
+network devices.
+.Pp
+The netmap-specific
+.Xr ioctl 2
+command codes below are defined in
+.In net/netmap.h
+and are:
+.Bl -tag -width XXXX
+.It Dv NIOCGINFO
+returns information about the interface named in nr_name.
+On return, nr_memsize indicates the size of the shared netmap
+memory region (this is device-independent),
+nr_numdescs indicates how many buffers are in a ring, and
+nr_numqueues indicates the number of rings supported by the hardware.
+.Pp
+If the device does not support netmap, the ioctl returns EINVAL.
+.It Dv NIOCREGIF
+puts the interface named in nr_name into netmap mode, disconnecting
+it from the host stack, and/or defines which rings are controlled
+through this file descriptor.
+On return, it gives the same info as NIOCGINFO, and nr_ringid
+indicates the identity of the rings controlled through the file
+descriptor.
+.Pp
+Possible values for nr_ringid are
+.Bl -tag -width XXXXX
+.It 0
+default, all hardware rings
+.It NETMAP_SW_RING
+the ``host rings'' connecting to the host stack
+.It NETMAP_HW_RING + i
+the i-th hardware ring
+.El
+By default, a
+.Xr poll 2
+or
+.Xr select 2
+call pushes out any pending packets on the transmit ring, even if
+no write events are specified.
+The feature can be disabled by or-ing
+.Dv NETMAP_NO_TX_POLL
+into nr_ringid.
+Normally you should leave this feature enabled, unless you are
+using separate file descriptors for the send and receive rings,
+because otherwise packets are pushed out only when NIOCTXSYNC is
+called or the send queue is full.
+.Pp
+.Pa NIOCREGIF
+can be used multiple times to change the association of a
+file descriptor to a ring pair, always within the same device.
+.It Dv NIOCUNREGIF
+brings an interface back to normal mode.
+.It Dv NIOCTXSYNC
+tells the hardware of new packets to transmit, and updates the
+number of slots available for transmission.
+.It Dv NIOCRXSYNC
+tells the hardware of consumed packets, and asks for newly available
+packets.
+.El
+.Ss SYSTEM CALLS
+.Nm
+uses
+.Xr select 2
+and
+.Xr poll 2
+to wake up processes when significant events occur.
+.Sh EXAMPLES
+The following code implements a traffic generator:
+.Pp
+.Bd -literal -compact
+#include <net/netmap.h>
+#include <net/netmap_user.h>
+struct netmap_if *nifp;
+struct netmap_ring *ring;
+struct nmreq nmr;
+struct pollfd fds;
+
+fd = open("/dev/netmap", O_RDWR);
+bzero(&nmr, sizeof(nmr));
+strcpy(nmr.nr_name, "ix0");
+ioctl(fd, NIOCREGIF, &nmr);
+p = mmap(0, nmr.nr_memsize, PROT_READ | PROT_WRITE,
+    MAP_SHARED, fd, 0);
+nifp = NETMAP_IF(p, nmr.nr_offset);
+ring = NETMAP_TXRING(nifp, 0);
+fds.fd = fd;
+fds.events = POLLOUT;
+for (;;) {
+    poll(&fds, 1, -1);
+    while (ring->avail-- > 0) {
+        i = ring->cur;
+        buf = NETMAP_BUF(ring, ring->slot[i].buf_idx);
+        ... prepare packet in buf ...
+        ring->slot[i].len = ... packet length ...
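+        /* NETMAP_RING_NEXT(r, i) returns the slot index
+         * after i, wrapping to 0 at the end of the ring */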
+ ring->cur = NETMAP_RING_NEXT(ring, i); + } +} +.Ed +.Sh SUPPORTED INTERFACES +.Nm +supports the following interfaces: +.Xr em 4 , +.Xr ixgbe 4 , +.Xr re 4 , +.Sh AUTHORS +The +.Nm +framework has been designed and implemented by +.An Luigi Rizzo +and +.An Matteo Landi +in 2011 at the Universita` di Pisa. diff --git a/sys/dev/netmap/head.diff b/sys/dev/netmap/head.diff new file mode 100644 index 000000000000..51a8e34e74d1 --- /dev/null +++ b/sys/dev/netmap/head.diff @@ -0,0 +1,654 @@ +Index: conf/NOTES +=================================================================== +--- conf/NOTES (revision 227552) ++++ conf/NOTES (working copy) +@@ -799,6 +799,12 @@ + # option. DHCP requires bpf. + device bpf + ++# The `netmap' device implements memory-mapped access to network ++# devices from userspace, enabling wire-speed packet capture and ++# generation even at 10Gbit/s. Requires support in the device ++# driver. Supported drivers are ixgbe, e1000, re. ++device netmap ++ + # The `disc' device implements a minimal network interface, + # which throws away all packets sent and never receives any. It is + # included for testing and benchmarking purposes. +Index: conf/files +=================================================================== +--- conf/files (revision 227552) ++++ conf/files (working copy) +@@ -1507,6 +1507,7 @@ + dev/my/if_my.c optional my + dev/ncv/ncr53c500.c optional ncv + dev/ncv/ncr53c500_pccard.c optional ncv pccard ++dev/netmap/netmap.c optional netmap + dev/nge/if_nge.c optional nge + dev/nxge/if_nxge.c optional nxge + dev/nxge/xgehal/xgehal-device.c optional nxge +Index: conf/options +=================================================================== +--- conf/options (revision 227552) ++++ conf/options (working copy) +@@ -689,6 +689,7 @@ + + # various 'device presence' options. 
+ DEV_BPF opt_bpf.h ++DEV_NETMAP opt_global.h + DEV_MCA opt_mca.h + DEV_CARP opt_carp.h + DEV_SPLASH opt_splash.h +Index: dev/e1000/if_igb.c +=================================================================== +--- dev/e1000/if_igb.c (revision 227552) ++++ dev/e1000/if_igb.c (working copy) +@@ -369,6 +369,9 @@ + &igb_rx_process_limit, 0, + "Maximum number of received packets to process at a time, -1 means unlimited"); + ++#ifdef DEV_NETMAP ++#include ++#endif /* DEV_NETMAP */ + /********************************************************************* + * Device identification routine + * +@@ -664,6 +667,9 @@ + adapter->led_dev = led_create(igb_led_func, adapter, + device_get_nameunit(dev)); + ++#ifdef DEV_NETMAP ++ igb_netmap_attach(adapter); ++#endif /* DEV_NETMAP */ + INIT_DEBUGOUT("igb_attach: end"); + + return (0); +@@ -742,6 +748,9 @@ + + callout_drain(&adapter->timer); + ++#ifdef DEV_NETMAP ++ netmap_detach(adapter->ifp); ++#endif /* DEV_NETMAP */ + igb_free_pci_resources(adapter); + bus_generic_detach(dev); + if_free(ifp); +@@ -3212,6 +3221,10 @@ + struct adapter *adapter = txr->adapter; + struct igb_tx_buffer *txbuf; + int i; ++#ifdef DEV_NETMAP ++ struct netmap_slot *slot = netmap_reset(NA(adapter->ifp), ++ NR_TX, txr->me, 0); ++#endif + + /* Clear the old descriptor contents */ + IGB_TX_LOCK(txr); +@@ -3231,6 +3244,13 @@ + m_freem(txbuf->m_head); + txbuf->m_head = NULL; + } ++#ifdef DEV_NETMAP ++ if (slot) { ++ netmap_load_map(txr->txtag, txbuf->map, ++ NMB(slot), adapter->rx_mbuf_sz); ++ slot++; ++ } ++#endif /* DEV_NETMAP */ + /* clear the watch index */ + txbuf->next_eop = -1; + } +@@ -3626,6 +3646,19 @@ + + IGB_TX_LOCK_ASSERT(txr); + ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ struct netmap_adapter *na = NA(ifp); ++ ++ selwakeuppri(&na->tx_rings[txr->me].si, PI_NET); ++ IGB_TX_UNLOCK(txr); ++ IGB_CORE_LOCK(adapter); ++ selwakeuppri(&na->tx_rings[na->num_queues + 1].si, PI_NET); ++ IGB_CORE_UNLOCK(adapter); ++ IGB_TX_LOCK(txr); // the caller is supposed to own the lock ++ return FALSE; ++ } ++#endif /* DEV_NETMAP */ + if (txr->tx_avail == adapter->num_tx_desc) { + txr->queue_status = IGB_QUEUE_IDLE; + return FALSE; +@@ -3949,6 +3982,10 @@ + bus_dma_segment_t pseg[1], hseg[1]; + struct lro_ctrl *lro = &rxr->lro; + int rsize, nsegs, error = 0; ++#ifdef DEV_NETMAP ++ struct netmap_slot *slot = netmap_reset(NA(rxr->adapter->ifp), ++ NR_RX, rxr->me, 0); ++#endif + + adapter = rxr->adapter; + dev = adapter->dev; +@@ -3974,6 +4011,18 @@ + struct mbuf *mh, *mp; + + rxbuf = &rxr->rx_buffers[j]; ++#ifdef DEV_NETMAP ++ if (slot) { ++ netmap_load_map(rxr->ptag, ++ rxbuf->pmap, NMB(slot), ++ adapter->rx_mbuf_sz); ++ /* Update descriptor */ ++ rxr->rx_base[j].read.pkt_addr = ++ htole64(vtophys(NMB(slot))); ++ slot++; ++ continue; ++ } ++#endif /* DEV_NETMAP */ + if (rxr->hdr_split == FALSE) + goto skip_head; + +@@ -4436,6 +4485,19 @@ + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ struct netmap_adapter *na = NA(ifp); ++ ++ selwakeuppri(&na->rx_rings[rxr->me].si, PI_NET); ++ IGB_RX_UNLOCK(rxr); ++ IGB_CORE_LOCK(adapter); ++ selwakeuppri(&na->rx_rings[na->num_queues + 1].si, PI_NET); ++ IGB_CORE_UNLOCK(adapter); ++ return (0); ++ } ++#endif /* DEV_NETMAP */ ++ + /* Main clean loop */ + for (i = rxr->next_to_check; count != 0;) { + struct mbuf *sendmp, *mh, *mp; +Index: dev/e1000/if_lem.c 
+=================================================================== +--- dev/e1000/if_lem.c (revision 227552) ++++ dev/e1000/if_lem.c (working copy) +@@ -316,6 +316,10 @@ + /* Global used in WOL setup with multiport cards */ + static int global_quad_port_a = 0; + ++#ifdef DEV_NETMAP ++#include ++#endif /* DEV_NETMAP */ ++ + /********************************************************************* + * Device identification routine + * +@@ -646,6 +650,9 @@ + adapter->led_dev = led_create(lem_led_func, adapter, + device_get_nameunit(dev)); + ++#ifdef DEV_NETMAP ++ lem_netmap_attach(adapter); ++#endif /* DEV_NETMAP */ + INIT_DEBUGOUT("lem_attach: end"); + + return (0); +@@ -724,6 +731,9 @@ + callout_drain(&adapter->timer); + callout_drain(&adapter->tx_fifo_timer); + ++#ifdef DEV_NETMAP ++ netmap_detach(ifp); ++#endif /* DEV_NETMAP */ + lem_free_pci_resources(adapter); + bus_generic_detach(dev); + if_free(ifp); +@@ -2637,6 +2647,9 @@ + lem_setup_transmit_structures(struct adapter *adapter) + { + struct em_buffer *tx_buffer; ++#ifdef DEV_NETMAP ++ struct netmap_slot *slot = netmap_reset(NA(adapter->ifp), NR_TX, 0, 0); ++#endif + + /* Clear the old ring contents */ + bzero(adapter->tx_desc_base, +@@ -2650,6 +2663,15 @@ + bus_dmamap_unload(adapter->txtag, tx_buffer->map); + m_freem(tx_buffer->m_head); + tx_buffer->m_head = NULL; ++#ifdef DEV_NETMAP ++ if (slot) { ++ /* reload the map for netmap mode */ ++ netmap_load_map(adapter->txtag, ++ tx_buffer->map, NMB(slot), ++ NA(adapter->ifp)->buff_size); ++ slot++; ++ } ++#endif /* DEV_NETMAP */ + tx_buffer->next_eop = -1; + } + +@@ -2951,6 +2973,12 @@ + + EM_TX_LOCK_ASSERT(adapter); + ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ selwakeuppri(&NA(ifp)->tx_rings[0].si, PI_NET); ++ return; ++ } ++#endif /* DEV_NETMAP */ + if (adapter->num_tx_desc_avail == adapter->num_tx_desc) + return; + +@@ -3181,6 +3209,9 @@ + { + struct em_buffer *rx_buffer; + int i, error; ++#ifdef DEV_NETMAP ++ struct netmap_slot *slot = netmap_reset(NA(adapter->ifp), NR_RX, 0, 0); ++#endif + + /* Reset descriptor ring */ + bzero(adapter->rx_desc_base, +@@ -3200,6 +3231,18 @@ + + /* Allocate new ones. 
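+	   In netmap mode the DEV_NETMAP block below skips mbuf
+	   allocation and points each descriptor at a preallocated
+	   netmap buffer instead.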
*/ + for (i = 0; i < adapter->num_rx_desc; i++) { ++#ifdef DEV_NETMAP ++ if (slot) { ++ netmap_load_map(adapter->rxtag, ++ rx_buffer->map, NMB(slot), ++ NA(adapter->ifp)->buff_size); ++ /* Update descriptor */ ++ adapter->rx_desc_base[i].buffer_addr = ++ htole64(vtophys(NMB(slot))); ++ slot++; ++ continue; ++ } ++#endif /* DEV_NETMAP */ + error = lem_get_buf(adapter, i); + if (error) + return (error); +@@ -3407,6 +3450,14 @@ + bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map, + BUS_DMASYNC_POSTREAD); + ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ selwakeuppri(&NA(ifp)->rx_rings[0].si, PI_NET); ++ EM_RX_UNLOCK(adapter); ++ return (0); ++ } ++#endif /* DEV_NETMAP */ ++ + if (!((current_desc->status) & E1000_RXD_STAT_DD)) { + if (done != NULL) + *done = rx_sent; +Index: dev/e1000/if_em.c +=================================================================== +--- dev/e1000/if_em.c (revision 227552) ++++ dev/e1000/if_em.c (working copy) +@@ -399,6 +399,10 @@ + /* Global used in WOL setup with multiport cards */ + static int global_quad_port_a = 0; + ++#ifdef DEV_NETMAP ++#include ++#endif /* DEV_NETMAP */ ++ + /********************************************************************* + * Device identification routine + * +@@ -714,6 +718,9 @@ + + adapter->led_dev = led_create(em_led_func, adapter, + device_get_nameunit(dev)); ++#ifdef DEV_NETMAP ++ em_netmap_attach(adapter); ++#endif /* DEV_NETMAP */ + + INIT_DEBUGOUT("em_attach: end"); + +@@ -785,6 +792,10 @@ + ether_ifdetach(adapter->ifp); + callout_drain(&adapter->timer); + ++#ifdef DEV_NETMAP ++ netmap_detach(ifp); ++#endif /* DEV_NETMAP */ ++ + em_free_pci_resources(adapter); + bus_generic_detach(dev); + if_free(ifp); +@@ -3213,6 +3224,10 @@ + struct adapter *adapter = txr->adapter; + struct em_buffer *txbuf; + int i; ++#ifdef DEV_NETMAP ++ struct netmap_slot *slot = netmap_reset(NA(adapter->ifp), ++ NR_TX, txr->me, 0); ++#endif + + /* Clear the old descriptor contents */ + EM_TX_LOCK(txr); +@@ -3232,6 +3247,16 @@ + m_freem(txbuf->m_head); + txbuf->m_head = NULL; + } ++#ifdef DEV_NETMAP ++ if (slot) { ++ /* reload the map for netmap mode */ ++ netmap_load_map(txr->txtag, ++ txbuf->map, NMB(slot), ++ adapter->rx_mbuf_sz); ++ slot++; ++ } ++#endif /* DEV_NETMAP */ ++ + /* clear the watch index */ + txbuf->next_eop = -1; + } +@@ -3682,6 +3707,12 @@ + struct ifnet *ifp = adapter->ifp; + + EM_TX_LOCK_ASSERT(txr); ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ selwakeuppri(&NA(ifp)->tx_rings[txr->me].si, PI_NET); ++ return (FALSE); ++ } ++#endif /* DEV_NETMAP */ + + /* No work, make sure watchdog is off */ + if (txr->tx_avail == adapter->num_tx_desc) { +@@ -3978,6 +4009,33 @@ + if (++j == adapter->num_rx_desc) + j = 0; + } ++#ifdef DEV_NETMAP ++ { ++ /* slot is NULL if we are not in netmap mode */ ++ struct netmap_slot *slot = netmap_reset(NA(adapter->ifp), ++ NR_RX, rxr->me, rxr->next_to_check); ++ /* ++ * we need to restore all buffer addresses in the ring as they might ++ * be in the wrong state if we are exiting from netmap mode. ++ */ ++ for (j = 0; j != adapter->num_rx_desc; ++j) { ++ void *addr; ++ rxbuf = &rxr->rx_buffers[j]; ++ if (rxbuf->m_head == NULL && !slot) ++ continue; ++ addr = slot ? NMB(slot) : rxbuf->m_head->m_data; ++ // XXX load or reload ? 
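++			// (netmap_load_map() assumes an empty map; if a
++			// buffer was already bound, netmap_reload_map(),
++			// which unloads first, would be the safer choice)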
++ netmap_load_map(rxr->rxtag, rxbuf->map, addr, adapter->rx_mbuf_sz); ++ /* Update descriptor */ ++ rxr->rx_base[j].buffer_addr = htole64(vtophys(addr)); ++ bus_dmamap_sync(rxr->rxtag, rxbuf->map, BUS_DMASYNC_PREREAD); ++ if (slot) ++ slot++; ++ } ++ /* Setup our descriptor indices */ ++ NA(adapter->ifp)->rx_rings[rxr->me].nr_hwcur = rxr->next_to_check; ++ } ++#endif /* DEV_NETMAP */ + + fail: + rxr->next_to_refresh = i; +@@ -4247,6 +4305,14 @@ + + EM_RX_LOCK(rxr); + ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ selwakeuppri(&NA(ifp)->rx_rings[rxr->me].si, PI_NET); ++ EM_RX_UNLOCK(rxr); ++ return (0); ++ } ++#endif /* DEV_NETMAP */ ++ + for (i = rxr->next_to_check, processed = 0; count != 0;) { + + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) +Index: dev/re/if_re.c +=================================================================== +--- dev/re/if_re.c (revision 227552) ++++ dev/re/if_re.c (working copy) +@@ -291,6 +291,10 @@ + static void re_setwol (struct rl_softc *); + static void re_clrwol (struct rl_softc *); + ++#ifdef DEV_NETMAP ++#include ++#endif /* !DEV_NETMAP */ ++ + #ifdef RE_DIAG + static int re_diag (struct rl_softc *); + #endif +@@ -1583,6 +1587,9 @@ + */ + ifp->if_data.ifi_hdrlen = sizeof(struct ether_vlan_header); + ++#ifdef DEV_NETMAP ++ re_netmap_attach(sc); ++#endif /* DEV_NETMAP */ + #ifdef RE_DIAG + /* + * Perform hardware diagnostic on the original RTL8169. +@@ -1778,6 +1785,9 @@ + bus_dma_tag_destroy(sc->rl_ldata.rl_stag); + } + ++#ifdef DEV_NETMAP ++ netmap_detach(ifp); ++#endif /* DEV_NETMAP */ + if (sc->rl_parent_tag) + bus_dma_tag_destroy(sc->rl_parent_tag); + +@@ -1952,6 +1962,9 @@ + sc->rl_ldata.rl_tx_desc_cnt * sizeof(struct rl_desc)); + for (i = 0; i < sc->rl_ldata.rl_tx_desc_cnt; i++) + sc->rl_ldata.rl_tx_desc[i].tx_m = NULL; ++#ifdef DEV_NETMAP ++ re_netmap_tx_init(sc); ++#endif /* DEV_NETMAP */ + /* Set EOR. 
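+	   The RL_TDESC_CMD_EOR bit on the last descriptor tells
+	   the chip to wrap back to the start of the ring.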
*/ + desc = &sc->rl_ldata.rl_tx_list[sc->rl_ldata.rl_tx_desc_cnt - 1]; + desc->rl_cmdstat |= htole32(RL_TDESC_CMD_EOR); +@@ -1979,6 +1992,9 @@ + if ((error = re_newbuf(sc, i)) != 0) + return (error); + } ++#ifdef DEV_NETMAP ++ re_netmap_rx_init(sc); ++#endif /* DEV_NETMAP */ + + /* Flush the RX descriptors */ + +@@ -2035,6 +2051,12 @@ + RL_LOCK_ASSERT(sc); + + ifp = sc->rl_ifp; ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ selwakeuppri(&NA(ifp)->rx_rings->si, PI_NET); ++ return 0; ++ } ++#endif /* DEV_NETMAP */ + if (ifp->if_mtu > RL_MTU && (sc->rl_flags & RL_FLAG_JUMBOV2) != 0) + jumbo = 1; + else +@@ -2276,6 +2298,12 @@ + return; + + ifp = sc->rl_ifp; ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ selwakeuppri(&NA(ifp)->tx_rings[0].si, PI_NET); ++ return; ++ } ++#endif /* DEV_NETMAP */ + /* Invalidate the TX descriptor list */ + bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag, + sc->rl_ldata.rl_tx_list_map, +@@ -2794,6 +2822,20 @@ + + sc = ifp->if_softc; + ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ struct netmap_kring *kring = &NA(ifp)->tx_rings[0]; ++ if (sc->rl_ldata.rl_tx_prodidx != kring->nr_hwcur) { ++ /* kick the tx unit */ ++ CSR_WRITE_1(sc, sc->rl_txstart, RL_TXSTART_START); ++#ifdef RE_TX_MODERATION ++ CSR_WRITE_4(sc, RL_TIMERCNT, 1); ++#endif ++ sc->rl_watchdog_timer = 5; ++ } ++ return; ++ } ++#endif /* DEV_NETMAP */ + if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != + IFF_DRV_RUNNING || (sc->rl_flags & RL_FLAG_LINK) == 0) + return; +Index: dev/ixgbe/ixgbe.c +=================================================================== +--- dev/ixgbe/ixgbe.c (revision 227552) ++++ dev/ixgbe/ixgbe.c (working copy) +@@ -313,6 +313,10 @@ + static int fdir_pballoc = 1; + #endif + ++#ifdef DEV_NETMAP ++#include ++#endif /* DEV_NETMAP */ ++ + /********************************************************************* + * Device identification routine + * +@@ -578,6 +582,9 @@ + + ixgbe_add_hw_stats(adapter); + ++#ifdef DEV_NETMAP ++ ixgbe_netmap_attach(adapter); ++#endif /* DEV_NETMAP */ + INIT_DEBUGOUT("ixgbe_attach: end"); + return (0); + err_late: +@@ -652,6 +659,9 @@ + + ether_ifdetach(adapter->ifp); + callout_drain(&adapter->timer); ++#ifdef DEV_NETMAP ++ netmap_detach(adapter->ifp); ++#endif /* DEV_NETMAP */ + ixgbe_free_pci_resources(adapter); + bus_generic_detach(dev); + if_free(adapter->ifp); +@@ -1719,6 +1729,7 @@ + if (++i == adapter->num_tx_desc) + i = 0; + ++ // XXX should we sync each buffer ? 
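++	// (probably not needed: txsync does a BUS_DMASYNC_PREWRITE
++	// on each buffer before handing it to the hardware)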
+ txbuf->m_head = NULL; + txbuf->eop_index = -1; + } +@@ -2813,6 +2824,10 @@ + struct adapter *adapter = txr->adapter; + struct ixgbe_tx_buf *txbuf; + int i; ++#ifdef DEV_NETMAP ++ struct netmap_slot *slot = netmap_reset(NA(adapter->ifp), ++ NR_TX, txr->me, 0); ++#endif + + /* Clear the old ring contents */ + IXGBE_TX_LOCK(txr); +@@ -2832,6 +2847,13 @@ + m_freem(txbuf->m_head); + txbuf->m_head = NULL; + } ++#ifdef DEV_NETMAP ++ if (slot) { ++ netmap_load_map(txr->txtag, txbuf->map, ++ NMB(slot), adapter->rx_mbuf_sz); ++ slot++; ++ } ++#endif /* DEV_NETMAP */ + /* Clear the EOP index */ + txbuf->eop_index = -1; + } +@@ -3310,6 +3332,20 @@ + + mtx_assert(&txr->tx_mtx, MA_OWNED); + ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ struct netmap_adapter *na = NA(ifp); ++ ++ selwakeuppri(&na->tx_rings[txr->me].si, PI_NET); ++ IXGBE_TX_UNLOCK(txr); ++ IXGBE_CORE_LOCK(adapter); ++ selwakeuppri(&na->tx_rings[na->num_queues + 1].si, PI_NET); ++ IXGBE_CORE_UNLOCK(adapter); ++ IXGBE_TX_LOCK(txr); // the caller is supposed to own the lock ++ return (FALSE); ++ } ++#endif /* DEV_NETMAP */ ++ + if (txr->tx_avail == adapter->num_tx_desc) { + txr->queue_status = IXGBE_QUEUE_IDLE; + return FALSE; +@@ -3698,6 +3734,10 @@ + bus_dma_segment_t pseg[1], hseg[1]; + struct lro_ctrl *lro = &rxr->lro; + int rsize, nsegs, error = 0; ++#ifdef DEV_NETMAP ++ struct netmap_slot *slot = netmap_reset(NA(rxr->adapter->ifp), ++ NR_RX, rxr->me, 0); ++#endif /* DEV_NETMAP */ + + adapter = rxr->adapter; + ifp = adapter->ifp; +@@ -3721,6 +3761,18 @@ + struct mbuf *mh, *mp; + + rxbuf = &rxr->rx_buffers[j]; ++#ifdef DEV_NETMAP ++ if (slot) { ++ netmap_load_map(rxr->ptag, ++ rxbuf->pmap, NMB(slot), ++ adapter->rx_mbuf_sz); ++ /* Update descriptor */ ++ rxr->rx_base[j].read.pkt_addr = ++ htole64(vtophys(NMB(slot))); ++ slot++; ++ continue; ++ } ++#endif /* DEV_NETMAP */ + /* + ** Don't allocate mbufs if not + ** doing header split, its wasteful +@@ -4148,6 +4200,18 @@ + + IXGBE_RX_LOCK(rxr); + ++#ifdef DEV_NETMAP ++ if (ifp->if_capenable & IFCAP_NETMAP) { ++ struct netmap_adapter *na = NA(ifp); ++ ++ selwakeuppri(&na->rx_rings[rxr->me].si, PI_NET); ++ IXGBE_RX_UNLOCK(rxr); ++ IXGBE_CORE_LOCK(adapter); ++ selwakeuppri(&na->rx_rings[na->num_queues + 1].si, PI_NET); ++ IXGBE_CORE_UNLOCK(adapter); ++ return (0); ++ } ++#endif /* DEV_NETMAP */ + for (i = rxr->next_to_check; count != 0;) { + struct mbuf *sendmp, *mh, *mp; + u32 rsc, ptype; diff --git a/sys/dev/netmap/if_em_netmap.h b/sys/dev/netmap/if_em_netmap.h new file mode 100644 index 000000000000..0e220e755d68 --- /dev/null +++ b/sys/dev/netmap/if_em_netmap.h @@ -0,0 +1,383 @@ +/* + * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + * $Id: if_em_netmap.h 9662 2011-11-16 13:18:06Z luigi $ + * + * netmap changes for if_em. + */ + +#include +#include +#include +#include /* vtophys ? */ +#include + +static void em_netmap_block_tasks(struct adapter *); +static void em_netmap_unblock_tasks(struct adapter *); +static int em_netmap_reg(struct ifnet *, int onoff); +static int em_netmap_txsync(void *, u_int, int); +static int em_netmap_rxsync(void *, u_int, int); +static void em_netmap_lock_wrapper(void *, int, u_int); + +static void +em_netmap_attach(struct adapter *adapter) +{ + struct netmap_adapter na; + + bzero(&na, sizeof(na)); + + na.ifp = adapter->ifp; + na.separate_locks = 1; + na.num_tx_desc = adapter->num_tx_desc; + na.num_rx_desc = adapter->num_rx_desc; + na.nm_txsync = em_netmap_txsync; + na.nm_rxsync = em_netmap_rxsync; + na.nm_lock = em_netmap_lock_wrapper; + na.nm_register = em_netmap_reg; + /* + * adapter->rx_mbuf_sz is set by SIOCSETMTU, but in netmap mode + * we allocate the buffers on the first register. So we must + * disallow a SIOCSETMTU when if_capenable & IFCAP_NETMAP is set. + */ + na.buff_size = MCLBYTES; + netmap_attach(&na, adapter->num_queues); +} + + +/* + * wrapper to export locks to the generic code + */ +static void +em_netmap_lock_wrapper(void *_a, int what, u_int queueid) +{ + struct adapter *adapter = _a; + + ASSERT(queueid < adapter->num_queues); + switch (what) { + case NETMAP_CORE_LOCK: + EM_CORE_LOCK(adapter); + break; + case NETMAP_CORE_UNLOCK: + EM_CORE_UNLOCK(adapter); + break; + case NETMAP_TX_LOCK: + EM_TX_LOCK(&adapter->tx_rings[queueid]); + break; + case NETMAP_TX_UNLOCK: + EM_TX_UNLOCK(&adapter->tx_rings[queueid]); + break; + case NETMAP_RX_LOCK: + EM_RX_LOCK(&adapter->rx_rings[queueid]); + break; + case NETMAP_RX_UNLOCK: + EM_RX_UNLOCK(&adapter->rx_rings[queueid]); + break; + } +} + + +static void +em_netmap_block_tasks(struct adapter *adapter) +{ + if (adapter->msix > 1) { /* MSIX */ + int i; + struct tx_ring *txr = adapter->tx_rings; + struct rx_ring *rxr = adapter->rx_rings; + + for (i = 0; i < adapter->num_queues; i++, txr++, rxr++) { + taskqueue_block(txr->tq); + taskqueue_drain(txr->tq, &txr->tx_task); + taskqueue_block(rxr->tq); + taskqueue_drain(rxr->tq, &rxr->rx_task); + } + } else { /* legacy */ + taskqueue_block(adapter->tq); + taskqueue_drain(adapter->tq, &adapter->link_task); + taskqueue_drain(adapter->tq, &adapter->que_task); + } +} + + +static void +em_netmap_unblock_tasks(struct adapter *adapter) +{ + if (adapter->msix > 1) { + struct tx_ring *txr = adapter->tx_rings; + struct rx_ring *rxr = adapter->rx_rings; + int i; + + for (i = 0; i < adapter->num_queues; i++) { + taskqueue_unblock(txr->tq); + taskqueue_unblock(rxr->tq); + } + } else { /* legacy */ + taskqueue_unblock(adapter->tq); + } +} + +/* + * register-unregister routine + */ +static int +em_netmap_reg(struct ifnet *ifp, int onoff) +{ + struct adapter *adapter = ifp->if_softc; + struct netmap_adapter *na = NA(ifp); + int error = 0; + + if (na == NULL) + 
return EINVAL; /* no netmap support here */ + + em_disable_intr(adapter); + + /* Tell the stack that the interface is no longer active */ + ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); + + em_netmap_block_tasks(adapter); + + if (onoff) { + ifp->if_capenable |= IFCAP_NETMAP; + + /* save if_transmit for later restore. + * XXX also if_start and if_qflush ? + */ + na->if_transmit = ifp->if_transmit; + ifp->if_transmit = netmap_start; + + em_init_locked(adapter); + if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { + error = ENOMEM; + goto fail; + } + } else { +fail: + /* restore if_transmit */ + ifp->if_transmit = na->if_transmit; + ifp->if_capenable &= ~IFCAP_NETMAP; + em_init_locked(adapter); /* also enable intr */ + + } + em_netmap_unblock_tasks(adapter); + return (error); +} + +/* + * Reconcile hardware and user view of the transmit ring, see + * ixgbe.c for details. + */ +static int +em_netmap_txsync(void *a, u_int ring_nr, int do_lock) +{ + struct adapter *adapter = a; + struct tx_ring *txr = &adapter->tx_rings[ring_nr]; + struct netmap_adapter *na = NA(adapter->ifp); + struct netmap_kring *kring = &na->tx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + int j, k, n, lim = kring->nkr_num_slots - 1; + + /* generate an interrupt approximately every half ring */ + int report_frequency = kring->nkr_num_slots >> 1; + + k = ring->cur; + if ( (kring->nr_kflags & NR_REINIT) || k > lim) + return netmap_ring_reinit(kring); + + if (do_lock) + EM_TX_LOCK(txr); + bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, + BUS_DMASYNC_POSTREAD); + + /* record completed transmissions TODO + * + * instead of using TDH, we could read the transmitted status bit. + */ + j = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr)); + if (j >= kring->nkr_num_slots) { /* XXX can happen */ + D("TDH wrap %d", j); + j -= kring->nkr_num_slots; + } + int delta = j - txr->next_to_clean; + if (delta) { + /* new transmissions were completed, increment + ring->nr_hwavail. */ + if (delta < 0) + delta += kring->nkr_num_slots; + txr->next_to_clean = j; + kring->nr_hwavail += delta; + } + + /* update avail to what the hardware knows */ + ring->avail = kring->nr_hwavail; + + j = kring->nr_hwcur; + if (j != k) { /* we have packets to send */ + n = 0; + while (j != k) { + struct netmap_slot *slot = &ring->slot[j]; + struct e1000_tx_desc *curr = &txr->tx_base[j]; + struct em_buffer *txbuf = &txr->tx_buffers[j]; + int flags = ((slot->flags & NS_REPORT) || + j == 0 || j == report_frequency) ? + E1000_TXD_CMD_RS : 0; + void *addr = NMB(slot); + int len = slot->len; + if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { + if (do_lock) + EM_TX_UNLOCK(txr); + return netmap_ring_reinit(kring); + } + + slot->flags &= ~NS_REPORT; + curr->upper.data = 0; + curr->lower.data = + htole32( + adapter->txd_cmd | + (E1000_TXD_CMD_EOP | flags) | + slot->len); + if (slot->flags & NS_BUF_CHANGED) { + curr->buffer_addr = htole64(vtophys(addr)); + /* buffer has changed, unload and reload map */ + netmap_reload_map(txr->txtag, txbuf->map, + addr, na->buff_size); + slot->flags &= ~NS_BUF_CHANGED; + } + + bus_dmamap_sync(txr->txtag, txbuf->map, + BUS_DMASYNC_PREWRITE); + j = (j == lim) ? 
0 : j + 1; + n++; + } + kring->nr_hwcur = ring->cur; + + /* decrease avail by number of sent packets */ + ring->avail -= n; + kring->nr_hwavail = ring->avail; + + bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + + E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), + ring->cur); + } + if (do_lock) + EM_TX_UNLOCK(txr); + return 0; +} + +/* + * Reconcile kernel and user view of the receive ring, see ixgbe.c + */ +static int +em_netmap_rxsync(void *a, u_int ring_nr, int do_lock) +{ + struct adapter *adapter = a; + struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; + struct netmap_adapter *na = NA(adapter->ifp); + struct netmap_kring *kring = &na->rx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + int j, k, n, lim = kring->nkr_num_slots - 1; + + k = ring->cur; + if ( (kring->nr_kflags & NR_REINIT) || k > lim) + return netmap_ring_reinit(kring); + + if (do_lock) + EM_RX_LOCK(rxr); + /* XXX check sync modes */ + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + + /* acknowledge all the received packets. */ + j = rxr->next_to_check; + for (n = 0; ; n++) { + struct e1000_rx_desc *curr = &rxr->rx_base[j]; + + if ((curr->status & E1000_RXD_STAT_DD) == 0) + break; + ring->slot[j].len = le16toh(curr->length); + bus_dmamap_sync(rxr->tag, rxr->rx_buffers[j].map, + BUS_DMASYNC_POSTREAD); + j = (j == lim) ? 0 : j + 1; + } + if (n) { + rxr->next_to_check = j; + kring->nr_hwavail += n; + } + + /* skip past packets that userspace has already processed: + * making them available for reception. + * advance nr_hwcur and issue a bus_dmamap_sync on the + * buffers so it is safe to write to them. + * Also increase nr_hwavail + */ + j = kring->nr_hwcur; + if (j != k) { /* userspace has read some packets. */ + n = 0; + while (j != k) { + struct netmap_slot *slot = &ring->slot[j]; + struct e1000_rx_desc *curr = &rxr->rx_base[j]; + struct em_buffer *rxbuf = &rxr->rx_buffers[j]; + void *addr = NMB(slot); + + if (addr == netmap_buffer_base) { /* bad buf */ + if (do_lock) + EM_RX_UNLOCK(rxr); + return netmap_ring_reinit(kring); + } + + curr->status = 0; + if (slot->flags & NS_BUF_CHANGED) { + curr->buffer_addr = htole64(vtophys(addr)); + /* buffer has changed, unload and reload map */ + netmap_reload_map(rxr->rxtag, rxbuf->map, + addr, na->buff_size); + slot->flags &= ~NS_BUF_CHANGED; + } + + bus_dmamap_sync(rxr->rxtag, rxbuf->map, + BUS_DMASYNC_PREREAD); + + j = (j == lim) ? 0 : j + 1; + n++; + } + kring->nr_hwavail -= n; + kring->nr_hwcur = ring->cur; + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + /* + * IMPORTANT: we must leave one free slot in the ring, + * so move j back by one unit + */ + j = (j == 0) ? lim : j - 1; + E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), j); + } + /* tell userspace that there are new packets */ + ring->avail = kring->nr_hwavail ; + if (do_lock) + EM_RX_UNLOCK(rxr); + return 0; +} diff --git a/sys/dev/netmap/if_igb_netmap.h b/sys/dev/netmap/if_igb_netmap.h new file mode 100644 index 000000000000..0c147063b211 --- /dev/null +++ b/sys/dev/netmap/if_igb_netmap.h @@ -0,0 +1,378 @@ +/* + * Copyright (C) 2011 Universita` di Pisa. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + * $Id: if_igb_netmap.h 9662 2011-11-16 13:18:06Z luigi $ + * + * netmap modifications for igb + * contribured by Ahmed Kooli + */ + +#include +#include +#include +#include /* vtophys ? */ +#include + +static int igb_netmap_reg(struct ifnet *, int onoff); +static int igb_netmap_txsync(void *, u_int, int); +static int igb_netmap_rxsync(void *, u_int, int); +static void igb_netmap_lock_wrapper(void *, int, u_int); + + +static void +igb_netmap_attach(struct adapter *adapter) +{ + struct netmap_adapter na; + + bzero(&na, sizeof(na)); + + na.ifp = adapter->ifp; + na.separate_locks = 1; + na.num_tx_desc = adapter->num_tx_desc; + na.num_rx_desc = adapter->num_rx_desc; + na.nm_txsync = igb_netmap_txsync; + na.nm_rxsync = igb_netmap_rxsync; + na.nm_lock = igb_netmap_lock_wrapper; + na.nm_register = igb_netmap_reg; + /* + * adapter->rx_mbuf_sz is set by SIOCSETMTU, but in netmap mode + * we allocate the buffers on the first register. So we must + * disallow a SIOCSETMTU when if_capenable & IFCAP_NETMAP is set. + */ + na.buff_size = MCLBYTES; + netmap_attach(&na, adapter->num_queues); +} + + +/* + * wrapper to export locks to the generic code + */ +static void +igb_netmap_lock_wrapper(void *_a, int what, u_int queueid) +{ + struct adapter *adapter = _a; + + ASSERT(queueid < adapter->num_queues); + switch (what) { + case NETMAP_CORE_LOCK: + IGB_CORE_LOCK(adapter); + break; + case NETMAP_CORE_UNLOCK: + IGB_CORE_UNLOCK(adapter); + break; + case NETMAP_TX_LOCK: + IGB_TX_LOCK(&adapter->tx_rings[queueid]); + break; + case NETMAP_TX_UNLOCK: + IGB_TX_UNLOCK(&adapter->tx_rings[queueid]); + break; + case NETMAP_RX_LOCK: + IGB_RX_LOCK(&adapter->rx_rings[queueid]); + break; + case NETMAP_RX_UNLOCK: + IGB_RX_UNLOCK(&adapter->rx_rings[queueid]); + break; + } +} + + +/* + * support for netmap register/unregisted. We are already under core lock. + * only called on the first init or the last unregister. 
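+ * Intermediate NIOCREGIF calls, which only rebind the file
+ * descriptor to different rings on the same device, do not
+ * reach this function.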
+ */ +static int +igb_netmap_reg(struct ifnet *ifp, int onoff) +{ + struct adapter *adapter = ifp->if_softc; + struct netmap_adapter *na = NA(ifp); + int error = 0; + + if (!na) + return EINVAL; + + igb_disable_intr(adapter); + + /* Tell the stack that the interface is no longer active */ + ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); + + if (onoff) { + ifp->if_capenable |= IFCAP_NETMAP; + + /* save if_transmit to restore it later */ + na->if_transmit = ifp->if_transmit; + ifp->if_transmit = netmap_start; + + igb_init_locked(adapter); + if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { + error = ENOMEM; + goto fail; + } + } else { +fail: + /* restore if_transmit */ + ifp->if_transmit = na->if_transmit; + ifp->if_capenable &= ~IFCAP_NETMAP; + igb_init_locked(adapter); /* also enables intr */ + } + return (error); +} + + +/* + * Reconcile kernel and user view of the transmit ring. + * + * Userspace has filled tx slots up to cur (excluded). + * The last unused slot previously known to the kernel was nr_hwcur, + * and the last interrupt reported nr_hwavail slots available + * (using the special value -1 to indicate idle transmit ring). + * The function must first update avail to what the kernel + * knows, subtract the newly used slots (cur - nr_hwcur) + * from both avail and nr_hwavail, and set nr_hwcur = cur + * issuing a dmamap_sync on all slots. + * + * Check parameters in the struct netmap_ring. + * We don't use avail, only check for bogus values. + * Make sure cur is valid, and same goes for buffer indexes and lengths. + * To avoid races, read the values once, and never use those from + * the ring afterwards. + */ +static int +igb_netmap_txsync(void *a, u_int ring_nr, int do_lock) +{ + struct adapter *adapter = a; + struct tx_ring *txr = &adapter->tx_rings[ring_nr]; + struct netmap_adapter *na = NA(adapter->ifp); + struct netmap_kring *kring = &na->tx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + int j, k, n, lim = kring->nkr_num_slots - 1; + + /* generate an interrupt approximately every half ring */ + int report_frequency = kring->nkr_num_slots >> 1; + + k = ring->cur; /* ring is not protected by any lock */ + if ( (kring->nr_kflags & NR_REINIT) || k > lim) + return netmap_ring_reinit(kring); + + if (do_lock) + IGB_TX_LOCK(txr); + bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, + BUS_DMASYNC_POSTREAD); + + /* record completed transmissions. TODO + * + * Instead of reading from the TDH register, we could and try to check + * the status bit of descriptor packets. + */ + j = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr)); + if (j >= kring->nkr_num_slots) /* XXX can it happen ? */ + j -= kring->nkr_num_slots; + int delta = j - txr->next_to_clean; + if (delta) { + /* new tx were completed */ + if (delta < 0) + delta += kring->nkr_num_slots; + txr->next_to_clean = j; + kring->nr_hwavail += delta; + } + + /* update avail to what the hardware knows */ + ring->avail = kring->nr_hwavail; + + j = kring->nr_hwcur; + if (j != k) { /* we have new packets to send */ + u32 olinfo_status = 0; + n = 0; + + /* 82575 needs the queue index added */ + if (adapter->hw.mac.type == e1000_82575) + olinfo_status |= txr->me << 4; + + while (j != k) { + struct netmap_slot *slot = &ring->slot[j]; + struct igb_tx_buffer *txbuf = &txr->tx_buffers[j]; + union e1000_adv_tx_desc *curr = + (union e1000_adv_tx_desc *)&txr->tx_base[j]; + void *addr = NMB(slot); + int flags = ((slot->flags & NS_REPORT) || + j == 0 || j == report_frequency) ? 
+ E1000_ADVTXD_DCMD_RS : 0; + int len = slot->len; + + if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { + if (do_lock) + IGB_TX_UNLOCK(txr); + return netmap_ring_reinit(kring); + } + + slot->flags &= ~NS_REPORT; + curr->read.buffer_addr = htole64(vtophys(addr)); + curr->read.olinfo_status = + htole32(olinfo_status | + (len<< E1000_ADVTXD_PAYLEN_SHIFT)); + curr->read.cmd_type_len = + htole32(len | E1000_ADVTXD_DTYP_DATA | + E1000_ADVTXD_DCMD_IFCS | + E1000_ADVTXD_DCMD_DEXT | + E1000_ADVTXD_DCMD_EOP | flags); + if (slot->flags & NS_BUF_CHANGED) { + /* buffer has changed, unload and reload map */ + netmap_reload_map(txr->txtag, txbuf->map, + addr, na->buff_size); + slot->flags &= ~NS_BUF_CHANGED; + } + + bus_dmamap_sync(txr->txtag, txbuf->map, + BUS_DMASYNC_PREWRITE); + j = (j == lim) ? 0 : j + 1; + n++; + } + kring->nr_hwcur = k; + + /* decrease avail by number of sent packets */ + ring->avail -= n; + kring->nr_hwavail = ring->avail; + + /* Set the watchdog */ + txr->queue_status = IGB_QUEUE_WORKING; + txr->watchdog_time = ticks; + + bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + + E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), k); + } + if (do_lock) + IGB_TX_UNLOCK(txr); + return 0; +} + + +/* + * Reconcile kernel and user view of the receive ring. + * + * Userspace has read rx slots up to cur (excluded). + * The last unread slot previously known to the kernel was nr_hwcur, + * and the last interrupt reported nr_hwavail slots available. + * We must subtract the newly consumed slots (cur - nr_hwcur) + * from nr_hwavail, clearing the descriptors for the next + * read, tell the hardware that they are available, + * and set nr_hwcur = cur and avail = nr_hwavail. + * issuing a dmamap_sync on all slots. + */ +static int +igb_netmap_rxsync(void *a, u_int ring_nr, int do_lock) +{ + struct adapter *adapter = a; + struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; + struct netmap_adapter *na = NA(adapter->ifp); + struct netmap_kring *kring = &na->rx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + int j, k, n, lim = kring->nkr_num_slots - 1; + + k = ring->cur; /* ring is not protected by any lock */ + if ( (kring->nr_kflags & NR_REINIT) || k > lim) + return netmap_ring_reinit(kring); + + if (do_lock) + IGB_RX_LOCK(rxr); + + /* Sync the ring. */ + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + + j = rxr->next_to_check; + for (n = 0; ; n++) { + union e1000_adv_rx_desc *curr = &rxr->rx_base[j]; + uint32_t staterr = le32toh(curr->wb.upper.status_error); + + if ((staterr & E1000_RXD_STAT_DD) == 0) + break; + ring->slot[j].len = le16toh(curr->wb.upper.length); + + bus_dmamap_sync(rxr->ptag, + rxr->rx_buffers[j].pmap, BUS_DMASYNC_POSTREAD); + j = (j == lim) ? 0 : j + 1; + } + if (n) { + rxr->next_to_check = j; + kring->nr_hwavail += n; + if (kring->nr_hwavail >= lim - 10) { + ND("rx ring %d almost full %d", ring_nr, kring->nr_hwavail); + } + } + + /* skip past packets that userspace has already processed, + * making them available for reception. + * advance nr_hwcur and issue a bus_dmamap_sync on the + * buffers so it is safe to write to them. + * Also increase nr_hwavail + */ + j = kring->nr_hwcur; + if (j != k) { /* userspace has read some packets. 
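+	    Walk the slots from nr_hwcur to cur, clear each
+	    descriptor and hand its buffer back to the NIC.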
*/ + n = 0; + while (j != k) { + struct netmap_slot *slot = ring->slot + j; + union e1000_adv_rx_desc *curr = &rxr->rx_base[j]; + struct igb_rx_buf *rxbuf = rxr->rx_buffers + j; + void *addr = NMB(slot); + + if (addr == netmap_buffer_base) { /* bad buf */ + if (do_lock) + IGB_RX_UNLOCK(rxr); + return netmap_ring_reinit(kring); + } + + curr->wb.upper.status_error = 0; + curr->read.pkt_addr = htole64(vtophys(addr)); + if (slot->flags & NS_BUF_CHANGED) { + netmap_reload_map(rxr->ptag, rxbuf->pmap, + addr, na->buff_size); + slot->flags &= ~NS_BUF_CHANGED; + } + + bus_dmamap_sync(rxr->ptag, rxbuf->pmap, + BUS_DMASYNC_PREREAD); + + j = (j == lim) ? 0 : j + 1; + n++; + } + kring->nr_hwavail -= n; + kring->nr_hwcur = ring->cur; + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + /* IMPORTANT: we must leave one free slot in the ring, + * so move j back by one unit + */ + j = (j == 0) ? lim : j - 1; + E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), j); + } + /* tell userspace that there are new packets */ + ring->avail = kring->nr_hwavail ; + if (do_lock) + IGB_RX_UNLOCK(rxr); + return 0; +} diff --git a/sys/dev/netmap/if_lem_netmap.h b/sys/dev/netmap/if_lem_netmap.h new file mode 100644 index 000000000000..a8f34989bcc4 --- /dev/null +++ b/sys/dev/netmap/if_lem_netmap.h @@ -0,0 +1,344 @@ +/* + * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + * $Id: if_lem_netmap.h 9662 2011-11-16 13:18:06Z luigi $ + * + * netmap support for if_lem.c + */ + +#include +#include +#include +#include /* vtophys ? 
*/ +#include + +static int lem_netmap_reg(struct ifnet *, int onoff); +static int lem_netmap_txsync(void *, u_int, int); +static int lem_netmap_rxsync(void *, u_int, int); +static void lem_netmap_lock_wrapper(void *, int, u_int); + + +SYSCTL_NODE(_dev, OID_AUTO, lem, CTLFLAG_RW, 0, "lem card"); + +static void +lem_netmap_attach(struct adapter *adapter) +{ + struct netmap_adapter na; + + bzero(&na, sizeof(na)); + + na.ifp = adapter->ifp; + na.separate_locks = 1; + na.num_tx_desc = adapter->num_tx_desc; + na.num_rx_desc = adapter->num_rx_desc; + na.nm_txsync = lem_netmap_txsync; + na.nm_rxsync = lem_netmap_rxsync; + na.nm_lock = lem_netmap_lock_wrapper; + na.nm_register = lem_netmap_reg; + na.buff_size = MCLBYTES; + netmap_attach(&na, 1); +} + + +static void +lem_netmap_lock_wrapper(void *_a, int what, u_int ringid) +{ + struct adapter *adapter = _a; + + /* only one ring here so ignore the ringid */ + switch (what) { + case NETMAP_CORE_LOCK: + EM_CORE_LOCK(adapter); + break; + case NETMAP_CORE_UNLOCK: + EM_CORE_UNLOCK(adapter); + break; + case NETMAP_TX_LOCK: + EM_TX_LOCK(adapter); + break; + case NETMAP_TX_UNLOCK: + EM_TX_UNLOCK(adapter); + break; + case NETMAP_RX_LOCK: + EM_RX_LOCK(adapter); + break; + case NETMAP_RX_UNLOCK: + EM_RX_UNLOCK(adapter); + break; + } +} + + +/* + * Reconcile kernel and user view of the transmit ring. see ixgbe.c + */ +static int +lem_netmap_txsync(void *a, u_int ring_nr, int do_lock) +{ + struct adapter *adapter = a; + struct netmap_adapter *na = NA(adapter->ifp); + struct netmap_kring *kring = &na->tx_rings[0]; + struct netmap_ring *ring = kring->ring; + int j, k, n, lim = kring->nkr_num_slots - 1; + + /* generate an interrupt approximately every half ring */ + int report_frequency = kring->nkr_num_slots >> 1; + + k = ring->cur; + if ( (kring->nr_kflags & NR_REINIT) || k > lim) + return netmap_ring_reinit(kring); + + if (do_lock) + EM_TX_LOCK(adapter); + bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, + BUS_DMASYNC_POSTREAD); + + /* record completed transmissions TODO + * + * instead of using TDH, we could read the transmitted status bit. + */ + j = E1000_READ_REG(&adapter->hw, E1000_TDH(0)); + if (j >= kring->nkr_num_slots) { /* can it happen ? */ + D("bad TDH %d", j); + j -= kring->nkr_num_slots; + } + int delta = j - adapter->next_tx_to_clean; + if (delta) { + if (delta < 0) + delta += kring->nkr_num_slots; + adapter->next_tx_to_clean = j; + kring->nr_hwavail += delta; + } + + /* update avail to what the hardware knows */ + ring->avail = kring->nr_hwavail; + + j = kring->nr_hwcur; + if (j != k) { /* we have new packets to send */ + n = 0; + while (j != k) { + struct netmap_slot *slot = &ring->slot[j]; + struct e1000_tx_desc *curr = &adapter->tx_desc_base[j]; + struct em_buffer *txbuf = &adapter->tx_buffer_area[j]; + void *addr = NMB(slot); + int flags = ((slot->flags & NS_REPORT) || + j == 0 || j == report_frequency) ? + E1000_TXD_CMD_RS : 0; + int len = slot->len; + + if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { + if (do_lock) + EM_TX_UNLOCK(adapter); + return netmap_ring_reinit(kring); + } + + curr->upper.data = 0; + /* always interrupt. 
XXX make it conditional */ + curr->lower.data = + htole32( adapter->txd_cmd | len | + (E1000_TXD_CMD_EOP | flags) ); + if (slot->flags & NS_BUF_CHANGED) { + curr->buffer_addr = htole64(vtophys(addr)); + /* buffer has changed, unload and reload map */ + netmap_reload_map(adapter->txtag, txbuf->map, + addr, na->buff_size); + slot->flags &= ~NS_BUF_CHANGED; + } + + bus_dmamap_sync(adapter->txtag, txbuf->map, + BUS_DMASYNC_PREWRITE); + j = (j == lim) ? 0 : j + 1; + n++; + } + kring->nr_hwcur = ring->cur; + + /* decrease avail by number of sent packets */ + ring->avail -= n; + kring->nr_hwavail = ring->avail; + + bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + + E1000_WRITE_REG(&adapter->hw, E1000_TDT(0), ring->cur); + } + if (do_lock) + EM_TX_UNLOCK(adapter); + return 0; +} + + +/* + * Reconcile kernel and user view of the receive ring. see ixgbe.c + */ +static int +lem_netmap_rxsync(void *a, u_int ring_nr, int do_lock) +{ + struct adapter *adapter = a; + struct netmap_adapter *na = NA(adapter->ifp); + struct netmap_kring *kring = &na->rx_rings[0]; + struct netmap_ring *ring = kring->ring; + int j, k, n, lim = kring->nkr_num_slots - 1; + + k = ring->cur; + if ( (kring->nr_kflags & NR_REINIT) || k > lim) + return netmap_ring_reinit(kring); + + if (do_lock) + EM_RX_LOCK(adapter); + /* XXX check sync modes */ + bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + + /* acknowldge all the received packets. */ + j = adapter->next_rx_desc_to_check; + for (n = 0; ; n++) { + struct e1000_rx_desc *curr = &adapter->rx_desc_base[j]; + int len = le16toh(adapter->rx_desc_base[j].length) - 4; // CRC + + if ((curr->status & E1000_RXD_STAT_DD) == 0) + break; + + if (len < 0) { + D("bogus pkt size at %d", j); + len = 0; + } + ring->slot[j].len = len; + bus_dmamap_sync(adapter->rxtag, adapter->rx_buffer_area[j].map, + BUS_DMASYNC_POSTREAD); + j = (j == lim) ? 0 : j + 1; + } + if (n) { + adapter->next_rx_desc_to_check = j; + kring->nr_hwavail += n; + } + + /* skip past packets that userspace has already processed, + * making them available for reception. We don't need to set + * the length as it is the same for all slots. + */ + j = kring->nr_hwcur; + if (j != k) { /* userspace has read some packets. */ + n = 0; + while (j != k) { + struct netmap_slot *slot = &ring->slot[j]; + struct e1000_rx_desc *curr = &adapter->rx_desc_base[j]; + struct em_buffer *rxbuf = &adapter->rx_buffer_area[j]; + void *addr = NMB(slot); + + if (addr == netmap_buffer_base) { /* bad buf */ + if (do_lock) + EM_RX_UNLOCK(adapter); + return netmap_ring_reinit(kring); + } + curr = &adapter->rx_desc_base[j]; + curr->status = 0; + if (slot->flags & NS_BUF_CHANGED) { + curr->buffer_addr = htole64(vtophys(addr)); + /* buffer has changed, unload and reload map */ + netmap_reload_map(adapter->rxtag, rxbuf->map, + addr, na->buff_size); + slot->flags &= ~NS_BUF_CHANGED; + } + + bus_dmamap_sync(adapter->rxtag, rxbuf->map, + BUS_DMASYNC_PREREAD); + + j = (j == lim) ? 0 : j + 1; + n++; + } + kring->nr_hwavail -= n; + kring->nr_hwcur = ring->cur; + bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map, + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + /* + * IMPORTANT: we must leave one free slot in the ring, + * so move j back by one unit + */ + j = (j == 0) ? 
lim : j - 1; + E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), j); + } + + /* tell userspace that there are new packets */ + ring->avail = kring->nr_hwavail ; + if (do_lock) + EM_RX_UNLOCK(adapter); + return 0; +} + + +/* + * Register/unregister routine + */ +static int +lem_netmap_reg(struct ifnet *ifp, int onoff) +{ + struct adapter *adapter = ifp->if_softc; + struct netmap_adapter *na = NA(ifp); + int error = 0; + + if (!na) + return EINVAL; + + lem_disable_intr(adapter); + + /* Tell the stack that the interface is no longer active */ + ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); + + /* lem_netmap_block_tasks(adapter); */ +#ifndef EM_LEGACY_IRQ + taskqueue_block(adapter->tq); + taskqueue_drain(adapter->tq, &adapter->rxtx_task); + taskqueue_drain(adapter->tq, &adapter->link_task); +#endif /* !EM_LEGCY_IRQ */ + if (onoff) { + ifp->if_capenable |= IFCAP_NETMAP; + + /* save if_transmit to restore it when exiting. + * XXX what about if_start and if_qflush ? + */ + na->if_transmit = ifp->if_transmit; + ifp->if_transmit = netmap_start; + + lem_init_locked(adapter); + if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { + error = ENOMEM; + goto fail; + } + } else { +fail: + /* restore non-netmap mode */ + ifp->if_transmit = na->if_transmit; + ifp->if_capenable &= ~IFCAP_NETMAP; + lem_init_locked(adapter); /* also enables intr */ + } + +#ifndef EM_LEGACY_IRQ + taskqueue_unblock(adapter->tq); +#endif /* !EM_LEGCY_IRQ */ + + return (error); +} diff --git a/sys/dev/netmap/if_re_netmap.h b/sys/dev/netmap/if_re_netmap.h new file mode 100644 index 000000000000..efccf3a795bc --- /dev/null +++ b/sys/dev/netmap/if_re_netmap.h @@ -0,0 +1,415 @@ +/* + * Copyright (C) 2011 Luigi Rizzo. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + * $Id: if_re_netmap.h 9662 2011-11-16 13:18:06Z luigi $ + * + * netmap support for if_re + */ + +#include +#include +#include +#include /* vtophys ? 
 */
+#include <dev/netmap/netmap_kern.h>
+
+static int re_netmap_reg(struct ifnet *, int onoff);
+static int re_netmap_txsync(void *, u_int, int);
+static int re_netmap_rxsync(void *, u_int, int);
+static void re_netmap_lock_wrapper(void *, int, u_int);
+
+static void
+re_netmap_attach(struct rl_softc *sc)
+{
+	struct netmap_adapter na;
+
+	bzero(&na, sizeof(na));
+
+	na.ifp = sc->rl_ifp;
+	na.separate_locks = 0;
+	na.num_tx_desc = sc->rl_ldata.rl_tx_desc_cnt;
+	na.num_rx_desc = sc->rl_ldata.rl_rx_desc_cnt;
+	na.nm_txsync = re_netmap_txsync;
+	na.nm_rxsync = re_netmap_rxsync;
+	na.nm_lock = re_netmap_lock_wrapper;
+	na.nm_register = re_netmap_reg;
+	na.buff_size = MCLBYTES;
+	netmap_attach(&na, 1);
+}
+
+
+/*
+ * Wrapper to export locks to the generic netmap code.
+ * This driver has a single lock, so the per-ring tx/rx locks
+ * should never be requested here.
+ */
+static void
+re_netmap_lock_wrapper(void *_a, int what, u_int queueid)
+{
+	struct rl_softc *adapter = _a;
+
+	switch (what) {
+	case NETMAP_CORE_LOCK:
+		RL_LOCK(adapter);
+		break;
+	case NETMAP_CORE_UNLOCK:
+		RL_UNLOCK(adapter);
+		break;
+
+	case NETMAP_TX_LOCK:
+	case NETMAP_RX_LOCK:
+	case NETMAP_TX_UNLOCK:
+	case NETMAP_RX_UNLOCK:
+		D("invalid lock call %d, no tx/rx locks here", what);
+		break;
+	}
+}
+
+
+/*
+ * Support for netmap register/unregister. We are already under core lock.
+ * Only called on the first register or the last unregister.
+ */
+static int
+re_netmap_reg(struct ifnet *ifp, int onoff)
+{
+	struct rl_softc *adapter = ifp->if_softc;
+	struct netmap_adapter *na = NA(ifp);
+	int error = 0;
+
+	if (!na)
+		return EINVAL;
+	/* Tell the stack that the interface is no longer active */
+	ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
+
+	re_stop(adapter);
+
+	if (onoff) {
+		ifp->if_capenable |= IFCAP_NETMAP;
+
+		/* save if_transmit so it can be restored on unregister */
+		na->if_transmit = ifp->if_transmit;
+		/* XXX if_start and if_qflush ??? */
+		ifp->if_transmit = netmap_start;
+
+		re_init_locked(adapter);
+
+		if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) {
+			error = ENOMEM;
+			goto fail;
+		}
+	} else {
+fail:
+		/* restore if_transmit */
+		ifp->if_transmit = na->if_transmit;
+		ifp->if_capenable &= ~IFCAP_NETMAP;
+		re_init_locked(adapter);	/* also enables intr */
+	}
+	return (error);
+}
+
+
+/*
+ * Reconcile kernel and user view of the transmit ring.
+ *
+ * Userspace has filled tx slots up to cur (excluded).
+ * The last unused slot previously known to the kernel was nr_hwcur,
+ * and the last interrupt reported nr_hwavail slots available
+ * (using the special value -1 to indicate an idle transmit ring).
+ * The function must first update avail to what the kernel
+ * knows (translating the -1 to nkr_num_slots - 1),
+ * subtract the newly used slots (cur - nr_hwcur)
+ * from both avail and nr_hwavail, set nr_hwcur = cur,
+ * and issue a dmamap_sync on all slots.
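+ *
+ * In code form, the bookkeeping described above is roughly
+ * (a sketch only; kring/ring named as in the function below):
+ *
+ *	n = ring->cur - kring->nr_hwcur;	/* newly filled slots */
+ *	if (n < 0)
+ *		n += kring->nkr_num_slots;	/* index wrapped around */
+ *	kring->nr_hwavail -= n;
+ *	kring->nr_hwcur = ring->cur;
+ *	ring->avail = kring->nr_hwavail;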
+ */ +static int +re_netmap_txsync(void *a, u_int ring_nr, int do_lock) +{ + struct rl_softc *sc = a; + struct rl_txdesc *txd = sc->rl_ldata.rl_tx_desc; + struct netmap_adapter *na = NA(sc->rl_ifp); + struct netmap_kring *kring = &na->tx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + int j, k, n, lim = kring->nkr_num_slots - 1; + + k = ring->cur; + if ( (kring->nr_kflags & NR_REINIT) || k > lim) + return netmap_ring_reinit(kring); + + if (do_lock) + RL_LOCK(sc); + + /* Sync the TX descriptor list */ + bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag, + sc->rl_ldata.rl_tx_list_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + + /* record completed transmissions */ + for (n = 0, j = sc->rl_ldata.rl_tx_considx; + j != sc->rl_ldata.rl_tx_prodidx; + n++, j = RL_TX_DESC_NXT(sc, j)) { + uint32_t cmdstat = + le32toh(sc->rl_ldata.rl_tx_list[j].rl_cmdstat); + if (cmdstat & RL_TDESC_STAT_OWN) + break; + } + if (n > 0) { + sc->rl_ldata.rl_tx_considx = j; + sc->rl_ldata.rl_tx_free += n; + kring->nr_hwavail += n; + } + + /* update avail to what the hardware knows */ + ring->avail = kring->nr_hwavail; + + /* we trust prodidx, not hwcur */ + j = kring->nr_hwcur = sc->rl_ldata.rl_tx_prodidx; + if (j != k) { /* we have new packets to send */ + n = 0; + while (j != k) { + struct netmap_slot *slot = &ring->slot[j]; + struct rl_desc *desc = &sc->rl_ldata.rl_tx_list[j]; + int cmd = slot->len | RL_TDESC_CMD_EOF | + RL_TDESC_CMD_OWN | RL_TDESC_CMD_SOF ; + void *addr = NMB(slot); + int len = slot->len; + + if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { + if (do_lock) + RL_UNLOCK(sc); + return netmap_ring_reinit(kring); + } + + if (j == lim) /* mark end of ring */ + cmd |= RL_TDESC_CMD_EOR; + + if (slot->flags & NS_BUF_CHANGED) { + uint64_t paddr = vtophys(addr); + desc->rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); + desc->rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); + /* buffer has changed, unload and reload map */ + netmap_reload_map(sc->rl_ldata.rl_tx_mtag, + txd[j].tx_dmamap, addr, na->buff_size); + slot->flags &= ~NS_BUF_CHANGED; + } + slot->flags &= ~NS_REPORT; + desc->rl_cmdstat = htole32(cmd); + bus_dmamap_sync(sc->rl_ldata.rl_tx_mtag, + txd[j].tx_dmamap, BUS_DMASYNC_PREWRITE); + j = (j == lim) ? 0 : j + 1; + n++; + } + sc->rl_ldata.rl_tx_prodidx = kring->nr_hwcur = ring->cur; + + /* decrease avail by number of sent packets */ + ring->avail -= n; + kring->nr_hwavail = ring->avail; + + bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag, + sc->rl_ldata.rl_tx_list_map, + BUS_DMASYNC_PREWRITE|BUS_DMASYNC_PREREAD); + + /* start ? */ + CSR_WRITE_1(sc, sc->rl_txstart, RL_TXSTART_START); + } + if (do_lock) + RL_UNLOCK(sc); + return 0; +} + + +/* + * Reconcile kernel and user view of the receive ring. + * + * Userspace has read rx slots up to cur (excluded). + * The last unread slot previously known to the kernel was nr_hwcur, + * and the last interrupt reported nr_hwavail slots available. + * We must subtract the newly consumed slots (cur - nr_hwcur) + * from nr_hwavail, clearing the descriptors for the next + * read, tell the hardware that they are available, + * and set nr_hwcur = cur and avail = nr_hwavail. + * issuing a dmamap_sync on all slots. 
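+ *
+ * The application side of this contract is a loop of the form
+ * below (a sketch; consume() is a placeholder, the macros are
+ * those documented in netmap(4)):
+ *
+ *	uint32_t lim = ring->num_slots - 1;
+ *
+ *	while (ring->avail > 0) {
+ *		struct netmap_slot *slot = &ring->slot[ring->cur];
+ *		consume(NETMAP_BUF(ring, slot->buf_idx), slot->len);
+ *		ring->cur = (ring->cur == lim) ? 0 : ring->cur + 1;
+ *		ring->avail--;
+ *	}
+ *	ioctl(fd, NIOCRXSYNC, NULL);	/* or wait in poll() */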
+ */ +static int +re_netmap_rxsync(void *a, u_int ring_nr, int do_lock) +{ + struct rl_softc *sc = a; + struct rl_rxdesc *rxd = sc->rl_ldata.rl_rx_desc; + struct netmap_adapter *na = NA(sc->rl_ifp); + struct netmap_kring *kring = &na->rx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + int j, k, n, lim = kring->nkr_num_slots - 1; + + k = ring->cur; + if ( (kring->nr_kflags & NR_REINIT) || k > lim) + return netmap_ring_reinit(kring); + + if (do_lock) + RL_LOCK(sc); + /* XXX check sync modes */ + bus_dmamap_sync(sc->rl_ldata.rl_rx_list_tag, + sc->rl_ldata.rl_rx_list_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + + /* + * The device uses all the buffers in the ring, so we need + * another termination condition in addition to RL_RDESC_STAT_OWN + * cleared (all buffers could have it cleared. The easiest one + * is to limit the amount of data reported up to 'lim' + */ + j = sc->rl_ldata.rl_rx_prodidx; + for (n = kring->nr_hwavail; n < lim ; n++) { + struct rl_desc *cur_rx = &sc->rl_ldata.rl_rx_list[j]; + uint32_t rxstat = le32toh(cur_rx->rl_cmdstat); + uint32_t total_len; + + if ((rxstat & RL_RDESC_STAT_OWN) != 0) + break; + total_len = rxstat & sc->rl_rxlenmask; + /* XXX subtract crc */ + total_len = (total_len < 4) ? 0 : total_len - 4; + kring->ring->slot[j].len = total_len; + /* sync was in re_newbuf() */ + bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag, + rxd[j].rx_dmamap, BUS_DMASYNC_POSTREAD); + j = RL_RX_DESC_NXT(sc, j); + } + if (n != kring->nr_hwavail) { + sc->rl_ldata.rl_rx_prodidx = j; + sc->rl_ifp->if_ipackets += n - kring->nr_hwavail; + kring->nr_hwavail = n; + } + + /* skip past packets that userspace has already processed, + * making them available for reception. + * advance nr_hwcur and issue a bus_dmamap_sync on the + * buffers so it is safe to write to them. + * Also increase nr_hwavail + */ + j = kring->nr_hwcur; + if (j != k) { /* userspace has read some packets. */ + n = 0; + while (j != k) { + struct netmap_slot *slot = ring->slot + j; + struct rl_desc *desc = &sc->rl_ldata.rl_rx_list[j]; + int cmd = na->buff_size | RL_RDESC_CMD_OWN; + void *addr = NMB(slot); + + if (addr == netmap_buffer_base) { /* bad buf */ + if (do_lock) + RL_UNLOCK(sc); + return netmap_ring_reinit(kring); + } + + if (j == lim) /* mark end of ring */ + cmd |= RL_RDESC_CMD_EOR; + + desc->rl_cmdstat = htole32(cmd); + slot->flags &= ~NS_REPORT; + if (slot->flags & NS_BUF_CHANGED) { + uint64_t paddr = vtophys(addr); + desc->rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); + desc->rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); + netmap_reload_map(sc->rl_ldata.rl_rx_mtag, + rxd[j].rx_dmamap, addr, na->buff_size); + slot->flags &= ~NS_BUF_CHANGED; + } + bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag, + rxd[j].rx_dmamap, BUS_DMASYNC_PREREAD); + j = (j == lim) ? 
0 : j + 1; + n++; + } + kring->nr_hwavail -= n; + kring->nr_hwcur = k; + /* Flush the RX DMA ring */ + + bus_dmamap_sync(sc->rl_ldata.rl_rx_list_tag, + sc->rl_ldata.rl_rx_list_map, + BUS_DMASYNC_PREWRITE|BUS_DMASYNC_PREREAD); + } + /* tell userspace that there are new packets */ + ring->avail = kring->nr_hwavail ; + if (do_lock) + RL_UNLOCK(sc); + return 0; +} + +static void +re_netmap_tx_init(struct rl_softc *sc) +{ + struct rl_txdesc *txd; + struct rl_desc *desc; + int i; + struct netmap_adapter *na = NA(sc->rl_ifp); + struct netmap_slot *slot = netmap_reset(na, NR_TX, 0, 0); + + /* slot is NULL if we are not in netmap mode */ + if (!slot) + return; + /* in netmap mode, overwrite addresses and maps */ + txd = sc->rl_ldata.rl_tx_desc; + desc = sc->rl_ldata.rl_tx_list; + + for (i = 0; i < sc->rl_ldata.rl_tx_desc_cnt; i++) { + void *addr = NMB(slot+i); + uint64_t paddr = vtophys(addr); + + desc[i].rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); + desc[i].rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); + netmap_load_map(sc->rl_ldata.rl_tx_mtag, + txd[i].tx_dmamap, addr, na->buff_size); + } +} + +static void +re_netmap_rx_init(struct rl_softc *sc) +{ + /* slot is NULL if we are not in netmap mode */ + struct netmap_adapter *na = NA(sc->rl_ifp); + struct netmap_slot *slot = netmap_reset(na, NR_RX, 0, 0); + struct rl_desc *desc = sc->rl_ldata.rl_rx_list; + uint32_t cmdstat; + int i; + + if (!slot) + return; + + for (i = 0; i < sc->rl_ldata.rl_rx_desc_cnt; i++) { + void *addr = NMB(slot+i); + uint64_t paddr = vtophys(addr); + + desc[i].rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); + desc[i].rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); + cmdstat = slot[i].len = na->buff_size; // XXX + if (i == sc->rl_ldata.rl_rx_desc_cnt - 1) + cmdstat |= RL_RDESC_CMD_EOR; + desc[i].rl_cmdstat = htole32(cmdstat | RL_RDESC_CMD_OWN); + + netmap_reload_map(sc->rl_ldata.rl_rx_mtag, + sc->rl_ldata.rl_rx_desc[i].rx_dmamap, + addr, na->buff_size); + } +} diff --git a/sys/dev/netmap/ixgbe_netmap.h b/sys/dev/netmap/ixgbe_netmap.h new file mode 100644 index 000000000000..a4d5491d67f1 --- /dev/null +++ b/sys/dev/netmap/ixgbe_netmap.h @@ -0,0 +1,376 @@ +/* + * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/* + * $FreeBSD$ + * $Id: ixgbe_netmap.h 9662 2011-11-16 13:18:06Z luigi $ + * + * netmap modifications for ixgbe + */ + +#include +#include +// #include +// #include /* vtophys ? */ +#include + +static int ixgbe_netmap_reg(struct ifnet *, int onoff); +static int ixgbe_netmap_txsync(void *, u_int, int); +static int ixgbe_netmap_rxsync(void *, u_int, int); +static void ixgbe_netmap_lock_wrapper(void *, int, u_int); + + +SYSCTL_NODE(_dev, OID_AUTO, ixgbe, CTLFLAG_RW, 0, "ixgbe card"); + +static void +ixgbe_netmap_attach(struct adapter *adapter) +{ + struct netmap_adapter na; + + bzero(&na, sizeof(na)); + + na.ifp = adapter->ifp; + na.separate_locks = 1; + na.num_tx_desc = adapter->num_tx_desc; + na.num_rx_desc = adapter->num_rx_desc; + na.nm_txsync = ixgbe_netmap_txsync; + na.nm_rxsync = ixgbe_netmap_rxsync; + na.nm_lock = ixgbe_netmap_lock_wrapper; + na.nm_register = ixgbe_netmap_reg; + /* + * adapter->rx_mbuf_sz is set by SIOCSETMTU, but in netmap mode + * we allocate the buffers on the first register. So we must + * disallow a SIOCSETMTU when if_capenable & IFCAP_NETMAP is set. + */ + na.buff_size = MCLBYTES; + netmap_attach(&na, adapter->num_queues); +} + + +/* + * wrapper to export locks to the generic code + */ +static void +ixgbe_netmap_lock_wrapper(void *_a, int what, u_int queueid) +{ + struct adapter *adapter = _a; + + ASSERT(queueid < adapter->num_queues); + switch (what) { + case NETMAP_CORE_LOCK: + IXGBE_CORE_LOCK(adapter); + break; + case NETMAP_CORE_UNLOCK: + IXGBE_CORE_UNLOCK(adapter); + break; + case NETMAP_TX_LOCK: + IXGBE_TX_LOCK(&adapter->tx_rings[queueid]); + break; + case NETMAP_TX_UNLOCK: + IXGBE_TX_UNLOCK(&adapter->tx_rings[queueid]); + break; + case NETMAP_RX_LOCK: + IXGBE_RX_LOCK(&adapter->rx_rings[queueid]); + break; + case NETMAP_RX_UNLOCK: + IXGBE_RX_UNLOCK(&adapter->rx_rings[queueid]); + break; + } +} + + +/* + * support for netmap register/unregisted. We are already under core lock. + * only called on the first init or the last unregister. + */ +static int +ixgbe_netmap_reg(struct ifnet *ifp, int onoff) +{ + struct adapter *adapter = ifp->if_softc; + struct netmap_adapter *na = NA(ifp); + int error = 0; + + if (!na) + return EINVAL; + + ixgbe_disable_intr(adapter); + + /* Tell the stack that the interface is no longer active */ + ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); + + if (onoff) { + ifp->if_capenable |= IFCAP_NETMAP; + + /* save if_transmit to restore it later */ + na->if_transmit = ifp->if_transmit; + ifp->if_transmit = netmap_start; + + ixgbe_init_locked(adapter); + if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { + error = ENOMEM; + goto fail; + } + } else { +fail: + /* restore if_transmit */ + ifp->if_transmit = na->if_transmit; + ifp->if_capenable &= ~IFCAP_NETMAP; + ixgbe_init_locked(adapter); /* also enables intr */ + } + return (error); +} + + +/* + * Reconcile kernel and user view of the transmit ring. + * + * Userspace has filled tx slots up to cur (excluded). + * The last unused slot previously known to the kernel was nr_hwcur, + * and the last interrupt reported nr_hwavail slots available + * (using the special value -1 to indicate idle transmit ring). + * The function must first update avail to what the kernel + * knows, subtract the newly used slots (cur - nr_hwcur) + * from both avail and nr_hwavail, and set nr_hwcur = cur + * issuing a dmamap_sync on all slots. + * + * Check parameters in the struct netmap_ring. + * We don't use avail, only check for bogus values. 
+ * Make sure cur is valid, and same goes for buffer indexes and lengths. + * To avoid races, read the values once, and never use those from + * the ring afterwards. + */ +static int +ixgbe_netmap_txsync(void *a, u_int ring_nr, int do_lock) +{ + struct adapter *adapter = a; + struct tx_ring *txr = &adapter->tx_rings[ring_nr]; + struct netmap_adapter *na = NA(adapter->ifp); + struct netmap_kring *kring = &na->tx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + int j, k, n = 0, lim = kring->nkr_num_slots - 1; + + /* generate an interrupt approximately every half ring */ + int report_frequency = kring->nkr_num_slots >> 1; + + k = ring->cur; /* ring is not protected by any lock */ + if ( (kring->nr_kflags & NR_REINIT) || k > lim) + return netmap_ring_reinit(kring); + + if (do_lock) + IXGBE_TX_LOCK(txr); + bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, + BUS_DMASYNC_POSTREAD); + + /* update avail to what the hardware knows */ + ring->avail = kring->nr_hwavail; + + j = kring->nr_hwcur; + if (j != k) { /* we have new packets to send */ + while (j != k) { + struct netmap_slot *slot = &ring->slot[j]; + struct ixgbe_tx_buf *txbuf = &txr->tx_buffers[j]; + union ixgbe_adv_tx_desc *curr = &txr->tx_base[j]; + void *addr = NMB(slot); + int flags = ((slot->flags & NS_REPORT) || + j == 0 || j == report_frequency) ? + IXGBE_TXD_CMD_RS : 0; + int len = slot->len; + + if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { + if (do_lock) + IXGBE_TX_UNLOCK(txr); + return netmap_ring_reinit(kring); + } + + slot->flags &= ~NS_REPORT; + curr->read.buffer_addr = htole64(vtophys(addr)); + curr->read.olinfo_status = 0; + curr->read.cmd_type_len = + htole32(txr->txd_cmd | len | + (IXGBE_ADVTXD_DTYP_DATA | + IXGBE_ADVTXD_DCMD_IFCS | + IXGBE_TXD_CMD_EOP | flags) ); + if (slot->flags & NS_BUF_CHANGED) { + /* buffer has changed, unload and reload map */ + netmap_reload_map(txr->txtag, txbuf->map, + addr, na->buff_size); + slot->flags &= ~NS_BUF_CHANGED; + } + + bus_dmamap_sync(txr->txtag, txbuf->map, + BUS_DMASYNC_PREWRITE); + j = (j == lim) ? 0 : j + 1; + n++; + } + kring->nr_hwcur = k; + + /* decrease avail by number of sent packets */ + ring->avail -= n; + kring->nr_hwavail = ring->avail; + + bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + + IXGBE_WRITE_REG(&adapter->hw, IXGBE_TDT(txr->me), k); + } + + if (n == 0 || kring->nr_hwavail < 1) { + /* record completed transmissions. TODO + * + * The datasheet discourages the use of TDH to find out the + * number of sent packets; the right way to do so, is to check + * the DD bit inside the status of a packet descriptor. On the + * other hand, we avoid to set the `report status' bit for + * *all* outgoing packets (kind of interrupt mitigation), + * consequently the DD bit is not guaranteed to be set for all + * the packets: thats way, for the moment we continue to use + * TDH. + */ + j = IXGBE_READ_REG(&adapter->hw, IXGBE_TDH(ring_nr)); + if (j >= kring->nkr_num_slots) { /* XXX can happen */ + D("TDH wrap %d", j); + j -= kring->nkr_num_slots; + } + int delta = j - txr->next_to_clean; + if (delta) { + /* new transmissions were completed, increment + ring->nr_hwavail. */ + if (delta < 0) + delta += kring->nkr_num_slots; + txr->next_to_clean = j; + kring->nr_hwavail += delta; + ring->avail = kring->nr_hwavail; + } + } + + if (do_lock) + IXGBE_TX_UNLOCK(txr); + return 0; +} + + +/* + * Reconcile kernel and user view of the receive ring. + * + * Userspace has read rx slots up to cur (excluded). 
+ * The last unread slot previously known to the kernel was nr_hwcur, + * and the last interrupt reported nr_hwavail slots available. + * We must subtract the newly consumed slots (cur - nr_hwcur) + * from nr_hwavail, clearing the descriptors for the next + * read, tell the hardware that they are available, + * and set nr_hwcur = cur and avail = nr_hwavail. + * issuing a dmamap_sync on all slots. + */ +static int +ixgbe_netmap_rxsync(void *a, u_int ring_nr, int do_lock) +{ + struct adapter *adapter = a; + struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; + struct netmap_adapter *na = NA(adapter->ifp); + struct netmap_kring *kring = &na->rx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + int j, k, n, lim = kring->nkr_num_slots - 1; + + k = ring->cur; /* ring is not protected by any lock */ + if ( (kring->nr_kflags & NR_REINIT) || k > lim) + return netmap_ring_reinit(kring); + + if (do_lock) + IXGBE_RX_LOCK(rxr); + /* XXX check sync modes */ + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + + j = rxr->next_to_check; + for (n = 0; ; n++) { + union ixgbe_adv_rx_desc *curr = &rxr->rx_base[j]; + uint32_t staterr = le32toh(curr->wb.upper.status_error); + + if ((staterr & IXGBE_RXD_STAT_DD) == 0) + break; + ring->slot[j].len = le16toh(curr->wb.upper.length); + bus_dmamap_sync(rxr->ptag, + rxr->rx_buffers[j].pmap, BUS_DMASYNC_POSTREAD); + j = (j == lim) ? 0 : j + 1; + } + if (n) { + rxr->next_to_check = j; + kring->nr_hwavail += n; + if (kring->nr_hwavail >= lim - 10) { + ND("rx ring %d almost full %d", ring_nr, kring->nr_hwavail); + } + } + + /* skip past packets that userspace has already processed, + * making them available for reception. + * advance nr_hwcur and issue a bus_dmamap_sync on the + * buffers so it is safe to write to them. + * Also increase nr_hwavail + */ + j = kring->nr_hwcur; + if (j != k) { /* userspace has read some packets. */ + n = 0; + while (j != k) { + struct netmap_slot *slot = ring->slot + j; + union ixgbe_adv_rx_desc *curr = &rxr->rx_base[j]; + struct ixgbe_rx_buf *rxbuf = rxr->rx_buffers + j; + void *addr = NMB(slot); + + if (addr == netmap_buffer_base) { /* bad buf */ + if (do_lock) + IXGBE_RX_UNLOCK(rxr); + return netmap_ring_reinit(kring); + } + + curr->wb.upper.status_error = 0; + curr->read.pkt_addr = htole64(vtophys(addr)); + if (slot->flags & NS_BUF_CHANGED) { + netmap_reload_map(rxr->ptag, rxbuf->pmap, + addr, na->buff_size); + slot->flags &= ~NS_BUF_CHANGED; + } + + bus_dmamap_sync(rxr->ptag, rxbuf->pmap, + BUS_DMASYNC_PREREAD); + + j = (j == lim) ? 0 : j + 1; + n++; + } + kring->nr_hwavail -= n; + kring->nr_hwcur = ring->cur; + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + /* IMPORTANT: we must leave one free slot in the ring, + * so move j back by one unit + */ + j = (j == 0) ? lim : j - 1; + IXGBE_WRITE_REG(&adapter->hw, IXGBE_RDT(rxr->me), j); + } + /* tell userspace that there are new packets */ + ring->avail = kring->nr_hwavail ; + if (do_lock) + IXGBE_RX_UNLOCK(rxr); + return 0; +} diff --git a/sys/dev/netmap/netmap.c b/sys/dev/netmap/netmap.c new file mode 100644 index 000000000000..7645a4e6e32b --- /dev/null +++ b/sys/dev/netmap/netmap.c @@ -0,0 +1,1762 @@ +/* + * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + * $Id: netmap.c 9662 2011-11-16 13:18:06Z luigi $ + * + * This module supports memory mapped access to network devices, + * see netmap(4). + * + * The module uses a large, memory pool allocated by the kernel + * and accessible as mmapped memory by multiple userspace threads/processes. + * The memory pool contains packet buffers and "netmap rings", + * i.e. user-accessible copies of the interface's queues. + * + * Access to the network card works like this: + * 1. a process/thread issues one or more open() on /dev/netmap, to create + * select()able file descriptor on which events are reported. + * 2. on each descriptor, the process issues an ioctl() to identify + * the interface that should report events to the file descriptor. + * 3. on each descriptor, the process issues an mmap() request to + * map the shared memory region within the process' address space. + * The list of interesting queues is indicated by a location in + * the shared memory region. + * 4. using the functions in the netmap(4) userspace API, a process + * can look up the occupation state of a queue, access memory buffers, + * and retrieve received packets or enqueue packets to transmit. + * 5. using some ioctl()s the process can synchronize the userspace view + * of the queue with the actual status in the kernel. This includes both + * receiving the notification of new packets, and transmitting new + * packets on the output interface. + * 6. select() or poll() can be used to wait for events on individual + * transmit or receive queues (or all queues for a given interface). + */ + +#include /* prerequisite */ +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include /* defines used in kernel.h */ +#include /* types used in module initialization */ +#include /* cdevsw struct */ +#include /* uio struct */ +#include +#include /* struct socket */ +#include +#include /* PROT_EXEC */ +#include +#include /* vtophys */ +#include /* vtophys */ +#include /* sockaddrs */ +#include +#include +#include +#include +#include /* BIOCIMMEDIATE */ +#include +#include +#include /* bus_dmamap_* */ + +MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map"); + +/* + * lock and unlock for the netmap memory allocator + */ +#define NMA_LOCK() mtx_lock(&netmap_mem_d->nm_mtx); +#define NMA_UNLOCK() mtx_unlock(&netmap_mem_d->nm_mtx); + +/* + * Default amount of memory pre-allocated by the module. 
+ * We start with a large size and then shrink our demand + * according to what is avalable when the module is loaded. + * At the moment the block is contiguous, but we can easily + * restrict our demand to smaller units (16..64k) + */ +#define NETMAP_MEMORY_SIZE (64 * 1024 * PAGE_SIZE) +static void * netmap_malloc(size_t size, const char *msg); +static void netmap_free(void *addr, const char *msg); + +/* + * Allocator for a pool of packet buffers. For each buffer we have + * one entry in the bitmap to signal the state. Allocation scans + * the bitmap, but since this is done only on attach, we are not + * too worried about performance + * XXX if we need to allocate small blocks, a translation + * table is used both for kernel virtual address and physical + * addresses. + */ +struct netmap_buf_pool { + u_int total_buffers; /* total buffers. */ + u_int free; + u_int bufsize; + char *base; /* buffer base address */ + uint32_t *bitmap; /* one bit per buffer, 1 means free */ +}; +struct netmap_buf_pool nm_buf_pool; +/* XXX move these two vars back into netmap_buf_pool */ +u_int netmap_total_buffers; +char *netmap_buffer_base; + +/* user-controlled variables */ +int netmap_verbose; + +static int no_timestamp; /* don't timestamp on rxsync */ + +SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args"); +SYSCTL_INT(_dev_netmap, OID_AUTO, verbose, + CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode"); +SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp, + CTLFLAG_RW, &no_timestamp, 0, "no_timestamp"); +SYSCTL_INT(_dev_netmap, OID_AUTO, total_buffers, + CTLFLAG_RD, &nm_buf_pool.total_buffers, 0, "total_buffers"); +SYSCTL_INT(_dev_netmap, OID_AUTO, free_buffers, + CTLFLAG_RD, &nm_buf_pool.free, 0, "free_buffers"); + +/* + * Allocate n buffers from the ring, and fill the slot. + * Buffer 0 is the 'junk' buffer. + */ +static void +netmap_new_bufs(struct netmap_buf_pool *p, struct netmap_slot *slot, u_int n) +{ + uint32_t bi = 0; /* index in the bitmap */ + uint32_t mask, j, i = 0; /* slot counter */ + + if (n > p->free) { + D("only %d out of %d buffers available", i, n); + return; + } + /* termination is guaranteed by p->free */ + while (i < n && p->free > 0) { + uint32_t cur = p->bitmap[bi]; + if (cur == 0) { /* bitmask is fully used */ + bi++; + continue; + } + /* locate a slot */ + for (j = 0, mask = 1; (cur & mask) == 0; j++, mask <<= 1) ; + p->bitmap[bi] &= ~mask; /* slot in use */ + p->free--; + slot[i].buf_idx = bi*32+j; + slot[i].len = p->bufsize; + slot[i].flags = NS_BUF_CHANGED; + i++; + } + ND("allocated %d buffers, %d available", n, p->free); +} + + +static void +netmap_free_buf(struct netmap_buf_pool *p, uint32_t i) +{ + uint32_t pos, mask; + if (i >= p->total_buffers) { + D("invalid free index %d", i); + return; + } + pos = i / 32; + mask = 1 << (i % 32); + if (p->bitmap[pos] & mask) { + D("slot %d already free", i); + return; + } + p->bitmap[pos] |= mask; + p->free++; +} + + +/* Descriptor of the memory objects handled by our memory allocator. */ +struct netmap_mem_obj { + TAILQ_ENTRY(netmap_mem_obj) nmo_next; /* next object in the + chain. */ + int nmo_used; /* flag set on used memory objects. */ + size_t nmo_size; /* size of the memory area reserved for the + object. */ + void *nmo_data; /* pointer to the memory area. */ +}; + +/* Wrap our memory objects to make them ``chainable``. */ +TAILQ_HEAD(netmap_mem_obj_h, netmap_mem_obj); + + +/* Descriptor of our custom memory allocator. */ +struct netmap_mem_d { + struct mtx nm_mtx; /* lock used to handle the chain of memory + objects. 
*/ + struct netmap_mem_obj_h nm_molist; /* list of memory objects */ + size_t nm_size; /* total amount of memory used for rings etc. */ + size_t nm_totalsize; /* total amount of allocated memory + (the difference is used for buffers) */ + size_t nm_buf_start; /* offset of packet buffers. + This is page-aligned. */ + size_t nm_buf_len; /* total memory for buffers */ + void *nm_buffer; /* pointer to the whole pre-allocated memory + area. */ +}; + + +/* Structure associated to each thread which registered an interface. */ +struct netmap_priv_d { + struct netmap_if *np_nifp; /* netmap interface descriptor. */ + + struct ifnet *np_ifp; /* device for which we hold a reference */ + int np_ringid; /* from the ioctl */ + u_int np_qfirst, np_qlast; /* range of rings to scan */ + uint16_t np_txpoll; +}; + + +static struct cdev *netmap_dev; /* /dev/netmap character device. */ +static struct netmap_mem_d *netmap_mem_d; /* Our memory allocator. */ + + +static d_mmap_t netmap_mmap; +static d_ioctl_t netmap_ioctl; +static d_poll_t netmap_poll; + +#ifdef NETMAP_KEVENT +static d_kqfilter_t netmap_kqfilter; +#endif + +static struct cdevsw netmap_cdevsw = { + .d_version = D_VERSION, + .d_name = "netmap", + .d_mmap = netmap_mmap, + .d_ioctl = netmap_ioctl, + .d_poll = netmap_poll, +#ifdef NETMAP_KEVENT + .d_kqfilter = netmap_kqfilter, +#endif +}; + +#ifdef NETMAP_KEVENT +static int netmap_kqread(struct knote *, long); +static int netmap_kqwrite(struct knote *, long); +static void netmap_kqdetach(struct knote *); + +static struct filterops netmap_read_filterops = { + .f_isfd = 1, + .f_attach = NULL, + .f_detach = netmap_kqdetach, + .f_event = netmap_kqread, +}; + +static struct filterops netmap_write_filterops = { + .f_isfd = 1, + .f_attach = NULL, + .f_detach = netmap_kqdetach, + .f_event = netmap_kqwrite, +}; + +/* + * support for the kevent() system call. + * + * This is the kevent filter, and is executed each time a new event + * is triggered on the device. This function execute some operation + * depending on the received filter. + * + * The implementation should test the filters and should implement + * filter operations we are interested on (a full list in /sys/event.h). + * + * On a match we should: + * - set kn->kn_fop + * - set kn->kn_hook + * - call knlist_add() to deliver the event to the application. + * + * Return 0 if the event should be delivered to the application. + */ +static int +netmap_kqfilter(struct cdev *dev, struct knote *kn) +{ + /* declare variables needed to read/write */ + + switch(kn->kn_filter) { + case EVFILT_READ: + if (netmap_verbose) + D("%s kqfilter: EVFILT_READ" ifp->if_xname); + + /* read operations */ + kn->kn_fop = &netmap_read_filterops; + break; + + case EVFILT_WRITE: + if (netmap_verbose) + D("%s kqfilter: EVFILT_WRITE" ifp->if_xname); + + /* write operations */ + kn->kn_fop = &netmap_write_filterops; + break; + + default: + if (netmap_verbose) + D("%s kqfilter: invalid filter" ifp->if_xname); + return(EINVAL); + } + + kn->kn_hook = 0;// + knlist_add(&netmap_sc->tun_rsel.si_note, kn, 0); + + return (0); +} +#endif /* NETMAP_KEVENT */ + +/* + * File descriptor's private data destructor. + * + * Call nm_register(ifp,0) to stop netmap mode on the interface and + * revert to normal operation. We expect that np_ifp has not gone. 
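+ *
+ * In driver terms the pairing is (a sketch):
+ *
+ *	na->nm_register(ifp, 1);	/* NIOCREGIF, first instance */
+ *	...
+ *	na->nm_register(ifp, 0);	/* here, on the last close */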
+ */ +static void +netmap_dtor(void *data) +{ + struct netmap_priv_d *priv = data; + struct ifnet *ifp = priv->np_ifp; + struct netmap_adapter *na = NA(ifp); + struct netmap_if *nifp = priv->np_nifp; + + if (0) + printf("%s starting for %p ifp %p\n", __FUNCTION__, priv, + priv ? priv->np_ifp : NULL); + + na->nm_lock(ifp->if_softc, NETMAP_CORE_LOCK, 0); + + na->refcount--; + if (na->refcount <= 0) { /* last instance */ + u_int i; + + D("deleting last netmap instance for %s", ifp->if_xname); + /* + * there is a race here with *_netmap_task() and + * netmap_poll(), which don't run under NETMAP_CORE_LOCK. + * na->refcount == 0 && na->ifp->if_capenable & IFCAP_NETMAP + * (aka NETMAP_DELETING(na)) are a unique marker that the + * device is dying. + * Before destroying stuff we sleep a bit, and then complete + * the job. NIOCREG should realize the condition and + * loop until they can continue; the other routines + * should check the condition at entry and quit if + * they cannot run. + */ + na->nm_lock(ifp->if_softc, NETMAP_CORE_UNLOCK, 0); + tsleep(na, 0, "NIOCUNREG", 4); + na->nm_lock(ifp->if_softc, NETMAP_CORE_LOCK, 0); + na->nm_register(ifp, 0); /* off, clear IFCAP_NETMAP */ + /* Wake up any sleeping threads. netmap_poll will + * then return POLLERR + */ + for (i = 0; i < na->num_queues + 2; i++) { + selwakeuppri(&na->tx_rings[i].si, PI_NET); + selwakeuppri(&na->rx_rings[i].si, PI_NET); + } + /* release all buffers */ + NMA_LOCK(); + for (i = 0; i < na->num_queues + 1; i++) { + int j, lim; + struct netmap_ring *ring; + + ND("tx queue %d", i); + ring = na->tx_rings[i].ring; + lim = na->tx_rings[i].nkr_num_slots; + for (j = 0; j < lim; j++) + netmap_free_buf(&nm_buf_pool, + ring->slot[j].buf_idx); + + ND("rx queue %d", i); + ring = na->rx_rings[i].ring; + lim = na->rx_rings[i].nkr_num_slots; + for (j = 0; j < lim; j++) + netmap_free_buf(&nm_buf_pool, + ring->slot[j].buf_idx); + } + NMA_UNLOCK(); + netmap_free(na->tx_rings[0].ring, "shadow rings"); + wakeup(na); + } + netmap_free(nifp, "nifp"); + + na->nm_lock(ifp->if_softc, NETMAP_CORE_UNLOCK, 0); + + if_rele(ifp); + + bzero(priv, sizeof(*priv)); /* XXX for safety */ + free(priv, M_DEVBUF); +} + + + +/* + * Create and return a new ``netmap_if`` object, and possibly also + * rings and packet buffors. + * + * Return NULL on failure. + */ +static void * +netmap_if_new(const char *ifname, struct netmap_adapter *na) +{ + struct netmap_if *nifp; + struct netmap_ring *ring; + char *buff; + u_int i, len, ofs; + u_int n = na->num_queues + 1; /* shorthand, include stack queue */ + + /* + * the descriptor is followed inline by an array of offsets + * to the tx and rx rings in the shared memory region. + */ + len = sizeof(struct netmap_if) + 2 * n * sizeof(ssize_t); + nifp = netmap_malloc(len, "nifp"); + if (nifp == NULL) + return (NULL); + + /* initialize base fields */ + *(int *)(uintptr_t)&nifp->ni_num_queues = na->num_queues; + strncpy(nifp->ni_name, ifname, IFNAMSIZ); + + (na->refcount)++; /* XXX atomic ? we are under lock */ + if (na->refcount > 1) + goto final; + + /* + * If this is the first instance, allocate the shadow rings and + * buffers for this card (one for each hw queue, one for the host). + * The rings are contiguous, but have variable size. 
+ * The entire block is reachable at + * na->tx_rings[0].ring + */ + + len = n * (2 * sizeof(struct netmap_ring) + + (na->num_tx_desc + na->num_rx_desc) * + sizeof(struct netmap_slot) ); + buff = netmap_malloc(len, "shadow rings"); + if (buff == NULL) { + D("failed to allocate %d bytes for %s shadow ring", + len, ifname); +error: + (na->refcount)--; + netmap_free(nifp, "nifp, rings failed"); + return (NULL); + } + /* do we have the bufers ? we are in need of num_tx_desc buffers for + * each tx ring and num_tx_desc buffers for each rx ring. */ + len = n * (na->num_tx_desc + na->num_rx_desc); + NMA_LOCK(); + if (nm_buf_pool.free < len) { + NMA_UNLOCK(); + netmap_free(buff, "not enough bufs"); + goto error; + } + /* + * in the kring, store the pointers to the shared rings + * and initialize the rings. We are under NMA_LOCK(). + */ + ofs = 0; + for (i = 0; i < n; i++) { + struct netmap_kring *kring; + int numdesc; + + /* Transmit rings */ + kring = &na->tx_rings[i]; + numdesc = na->num_tx_desc; + bzero(kring, sizeof(*kring)); + kring->na = na; + + ring = kring->ring = (struct netmap_ring *)(buff + ofs); + *(ssize_t *)(uintptr_t)&ring->buf_ofs = + nm_buf_pool.base - (char *)ring; + ND("txring[%d] at %p ofs %d", i, ring, ring->buf_ofs); + *(int *)(int *)(uintptr_t)&ring->num_slots = + kring->nkr_num_slots = numdesc; + + /* + * IMPORTANT: + * Always keep one slot empty, so we can detect new + * transmissions comparing cur and nr_hwcur (they are + * the same only if there are no new transmissions). + */ + ring->avail = kring->nr_hwavail = numdesc - 1; + ring->cur = kring->nr_hwcur = 0; + netmap_new_bufs(&nm_buf_pool, ring->slot, numdesc); + + ofs += sizeof(struct netmap_ring) + + numdesc * sizeof(struct netmap_slot); + + /* Receive rings */ + kring = &na->rx_rings[i]; + numdesc = na->num_rx_desc; + bzero(kring, sizeof(*kring)); + kring->na = na; + + ring = kring->ring = (struct netmap_ring *)(buff + ofs); + *(ssize_t *)(uintptr_t)&ring->buf_ofs = + nm_buf_pool.base - (char *)ring; + ND("rxring[%d] at %p offset %d", i, ring, ring->buf_ofs); + *(int *)(int *)(uintptr_t)&ring->num_slots = + kring->nkr_num_slots = numdesc; + ring->cur = kring->nr_hwcur = 0; + ring->avail = kring->nr_hwavail = 0; /* empty */ + netmap_new_bufs(&nm_buf_pool, ring->slot, numdesc); + ofs += sizeof(struct netmap_ring) + + numdesc * sizeof(struct netmap_slot); + } + NMA_UNLOCK(); + for (i = 0; i < n+1; i++) { + // XXX initialize the selrecord structs. + } +final: + /* + * fill the slots for the rx and tx queues. They contain the offset + * between the ring and nifp, so the information is usable in + * userspace to reach the ring from the nifp. + */ + for (i = 0; i < n; i++) { + char *base = (char *)nifp; + *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i] = + (char *)na->tx_rings[i].ring - base; + *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i+n] = + (char *)na->rx_rings[i].ring - base; + } + return (nifp); +} + + +/* + * mmap(2) support for the "netmap" device. + * + * Expose all the memory previously allocated by our custom memory + * allocator: this way the user has only to issue a single mmap(2), and + * can work on all the data structures flawlessly. + * + * Return 0 on success, -1 otherwise. + */ +static int +#if __FreeBSD_version < 900000 +netmap_mmap(__unused struct cdev *dev, vm_offset_t offset, vm_paddr_t *paddr, + int nprot) +#else +netmap_mmap(__unused struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr, + int nprot, __unused vm_memattr_t *memattr) +#endif +{ + if (nprot & PROT_EXEC) + return (-1); // XXX -1 or EINVAL ? 
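+	/*
+	 * This serves the single mmap() that a client issues after
+	 * NIOCREGIF, e.g. (sketch):
+	 *
+	 *	mmap(0, memsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	 *
+	 * The pre-allocated region is physically contiguous (see the
+	 * allocator notes above), so a flat offset-to-physical
+	 * translation is sufficient.
+	 */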
+ ND("request for offset 0x%x", (uint32_t)offset); + *paddr = vtophys(netmap_mem_d->nm_buffer) + offset; + + return (0); +} + + +/* + * handler for synchronization of the queues from/to the host + */ +static void +netmap_sync_to_host(struct netmap_adapter *na) +{ + struct netmap_kring *kring = &na->tx_rings[na->num_queues]; + struct netmap_ring *ring = kring->ring; + struct mbuf *head = NULL, *tail = NULL, *m; + u_int n, lim = kring->nkr_num_slots - 1; + + na->nm_lock(na->ifp->if_softc, NETMAP_CORE_LOCK, 0); + + /* Take packets from hwcur to cur and pass them up. + * In case of no buffers we give up. At the end of the loop, + * the queue is drained in all cases. + */ + for (n = kring->nr_hwcur; n != ring->cur;) { + struct netmap_slot *slot = &ring->slot[n]; + + n = (n == lim) ? 0 : n + 1; + if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE) { + D("bad pkt at %d len %d", n, slot->len); + continue; + } + m = m_devget(NMB(slot), slot->len, 0, na->ifp, NULL); + + if (m == NULL) + break; + if (tail) + tail->m_nextpkt = m; + else + head = m; + tail = m; + m->m_nextpkt = NULL; + } + kring->nr_hwcur = ring->cur; + kring->nr_hwavail = ring->avail = lim; + na->nm_lock(na->ifp->if_softc, NETMAP_CORE_UNLOCK, 0); + + /* send packets up, outside the lock */ + while ((m = head) != NULL) { + head = head->m_nextpkt; + m->m_nextpkt = NULL; + m->m_pkthdr.rcvif = na->ifp; + if (netmap_verbose & NM_VERB_HOST) + D("sending up pkt %p size %d", m, m->m_pkthdr.len); + (na->ifp->if_input)(na->ifp, m); + } +} + +/* + * This routine also does the selrecord if called from the poll handler + * (we know because td != NULL). + */ +static void +netmap_sync_from_host(struct netmap_adapter *na, struct thread *td) +{ + struct netmap_kring *kring = &na->rx_rings[na->num_queues]; + struct netmap_ring *ring = kring->ring; + int delta; + + na->nm_lock(na->ifp->if_softc, NETMAP_CORE_LOCK, 0); + + /* skip past packets processed by userspace, + * and then sync cur/avail with hwcur/hwavail + */ + delta = ring->cur - kring->nr_hwcur; + if (delta < 0) + delta += kring->nkr_num_slots; + kring->nr_hwavail -= delta; + kring->nr_hwcur = ring->cur; + ring->avail = kring->nr_hwavail; + if (ring->avail == 0 && td) + selrecord(td, &kring->si); + if (ring->avail && (netmap_verbose & NM_VERB_HOST)) + D("%d pkts from stack", ring->avail); + na->nm_lock(na->ifp->if_softc, NETMAP_CORE_UNLOCK, 0); +} + + +/* + * get a refcounted reference to an interface. + * Return ENXIO if the interface does not exist, EINVAL if netmap + * is not supported by the interface. + * If successful, hold a reference. + */ +static int +get_ifp(const char *name, struct ifnet **ifp) +{ + *ifp = ifunit_ref(name); + if (*ifp == NULL) + return (ENXIO); + /* can do this if the capability exists and if_pspare[0] + * points to the netmap descriptor. + */ + if ((*ifp)->if_capabilities & IFCAP_NETMAP && NA(*ifp)) + return 0; /* valid pointer, we hold the refcount */ + if_rele(*ifp); + return EINVAL; // not NETMAP capable +} + + +/* + * Error routine called when txsync/rxsync detects an error. + * Can't do much more than resetting cur = hwcur, avail = hwavail. + * Return 1 on reinit. 
+ */ +int +netmap_ring_reinit(struct netmap_kring *kring) +{ + struct netmap_ring *ring = kring->ring; + u_int i, lim = kring->nkr_num_slots - 1; + int errors = 0; + + D("called for %s", kring->na->ifp->if_xname); + if (ring->cur > lim) + errors++; + for (i = 0; i <= lim; i++) { + u_int idx = ring->slot[i].buf_idx; + u_int len = ring->slot[i].len; + if (idx < 2 || idx >= netmap_total_buffers) { + if (!errors++) + D("bad buffer at slot %d idx %d len %d ", i, idx, len); + ring->slot[i].buf_idx = 0; + ring->slot[i].len = 0; + } else if (len > NETMAP_BUF_SIZE) { + ring->slot[i].len = 0; + if (!errors++) + D("bad len %d at slot %d idx %d", + len, i, idx); + } + } + if (errors) { + int pos = kring - kring->na->tx_rings; + int n = kring->na->num_queues + 2; + + D("total %d errors", errors); + errors++; + D("%s %s[%d] reinit, cur %d -> %d avail %d -> %d", + kring->na->ifp->if_xname, + pos < n ? "TX" : "RX", pos < n ? pos : pos - n, + ring->cur, kring->nr_hwcur, + ring->avail, kring->nr_hwavail); + ring->cur = kring->nr_hwcur; + ring->avail = kring->nr_hwavail; + ring->flags |= NR_REINIT; + kring->na->flags |= NR_REINIT; + } + return (errors ? 1 : 0); +} + +/* + * Clean the reinit flag for our rings. + * XXX at the moment, clear for all rings + */ +static void +netmap_clean_reinit(struct netmap_adapter *na) +{ + //struct netmap_kring *kring; + u_int i; + + na->flags &= ~NR_REINIT; + D("--- NR_REINIT reset on %s", na->ifp->if_xname); + for (i = 0; i < na->num_queues + 1; i++) { + na->tx_rings[i].ring->flags &= ~NR_REINIT; + na->rx_rings[i].ring->flags &= ~NR_REINIT; + } +} + +/* + * Set the ring ID. For devices with a single queue, a request + * for all rings is the same as a single ring. + */ +static int +netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid) +{ + struct ifnet *ifp = priv->np_ifp; + struct netmap_adapter *na = NA(ifp); + void *adapter = na->ifp->if_softc; /* shorthand */ + u_int i = ringid & NETMAP_RING_MASK; + /* first time we don't lock */ + int need_lock = (priv->np_qfirst != priv->np_qlast); + + if ( (ringid & NETMAP_HW_RING) && i >= na->num_queues) { + D("invalid ring id %d", i); + return (EINVAL); + } + if (need_lock) + na->nm_lock(adapter, NETMAP_CORE_LOCK, 0); + priv->np_ringid = ringid; + if (ringid & NETMAP_SW_RING) { + priv->np_qfirst = na->num_queues; + priv->np_qlast = na->num_queues + 1; + } else if (ringid & NETMAP_HW_RING) { + priv->np_qfirst = i; + priv->np_qlast = i + 1; + } else { + priv->np_qfirst = 0; + priv->np_qlast = na->num_queues; + } + priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1; + if (need_lock) + na->nm_lock(adapter, NETMAP_CORE_UNLOCK, 0); + if (ringid & NETMAP_SW_RING) + D("ringid %s set to SW RING", ifp->if_xname); + else if (ringid & NETMAP_HW_RING) + D("ringid %s set to HW RING %d", ifp->if_xname, + priv->np_qfirst); + else + D("ringid %s set to all %d HW RINGS", ifp->if_xname, + priv->np_qlast); + return 0; +} + +/* + * ioctl(2) support for the "netmap" device. + * + * Following a list of accepted commands: + * - NIOCGINFO + * - SIOCGIFADDR just for convenience + * - NIOCREGIF + * - NIOCUNREGIF + * - NIOCTXSYNC + * - NIOCRXSYNC + * + * Return 0 on success, errno otherwise. 
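+ *
+ * A typical client setup, as described in netmap(4), is sketched
+ * below ("em0" and the omitted error handling are illustrative):
+ *
+ *	struct nmreq req;
+ *	int fd = open("/dev/netmap", O_RDWR);
+ *
+ *	bzero(&req, sizeof(req));
+ *	strncpy(req.nr_name, "em0", sizeof(req.nr_name));
+ *	ioctl(fd, NIOCREGIF, &req);	/* bind fd, enter netmap mode */
+ *	char *mem = mmap(0, req.nr_memsize,
+ *	    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ *	struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset);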
+ */ +static int +netmap_ioctl(__unused struct cdev *dev, u_long cmd, caddr_t data, + __unused int fflag, __unused struct thread *td) +{ + struct netmap_priv_d *priv = NULL; + struct ifnet *ifp; + struct nmreq *nmr = (struct nmreq *) data; + struct netmap_adapter *na; + void *adapter; + int error; + u_int i; + struct netmap_if *nifp; + + error = devfs_get_cdevpriv((void **)&priv); + if (error != ENOENT && error != 0) + return (error); + + error = 0; /* Could be ENOENT */ + switch (cmd) { + case NIOCGINFO: /* return capabilities etc */ + /* memsize is always valid */ + nmr->nr_memsize = netmap_mem_d->nm_totalsize; + nmr->nr_offset = 0; + nmr->nr_numrings = 0; + nmr->nr_numslots = 0; + if (nmr->nr_name[0] == '\0') /* just get memory info */ + break; + error = get_ifp(nmr->nr_name, &ifp); /* get a refcount */ + if (error) + break; + na = NA(ifp); /* retrieve netmap_adapter */ + nmr->nr_numrings = na->num_queues; + nmr->nr_numslots = na->num_tx_desc; + if_rele(ifp); /* return the refcount */ + break; + + case NIOCREGIF: + if (priv != NULL) /* thread already registered */ + return netmap_set_ringid(priv, nmr->nr_ringid); + /* find the interface and a reference */ + error = get_ifp(nmr->nr_name, &ifp); /* keep reference */ + if (error) + break; + na = NA(ifp); /* retrieve netmap adapter */ + adapter = na->ifp->if_softc; /* shorthand */ + /* + * Allocate the private per-thread structure. + * XXX perhaps we can use a blocking malloc ? + */ + priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, + M_NOWAIT | M_ZERO); + if (priv == NULL) { + error = ENOMEM; + if_rele(ifp); /* return the refcount */ + break; + } + + + for (i = 10; i > 0; i--) { + na->nm_lock(adapter, NETMAP_CORE_LOCK, 0); + if (!NETMAP_DELETING(na)) + break; + na->nm_lock(adapter, NETMAP_CORE_UNLOCK, 0); + tsleep(na, 0, "NIOCREGIF", hz/10); + } + if (i == 0) { + D("too many NIOCREGIF attempts, give up"); + error = EINVAL; + free(priv, M_DEVBUF); + if_rele(ifp); /* return the refcount */ + break; + } + + priv->np_ifp = ifp; /* store the reference */ + error = netmap_set_ringid(priv, nmr->nr_ringid); + if (error) + goto error; + priv->np_nifp = nifp = netmap_if_new(nmr->nr_name, na); + if (nifp == NULL) { /* allocation failed */ + error = ENOMEM; + } else if (ifp->if_capenable & IFCAP_NETMAP) { + /* was already set */ + } else { + /* Otherwise set the card in netmap mode + * and make it use the shared buffers. + */ + error = na->nm_register(ifp, 1); /* mode on */ + if (error) { + /* + * do something similar to netmap_dtor(). + */ + netmap_free(na->tx_rings[0].ring, "rings, reg.failed"); + free(na->tx_rings, M_DEVBUF); + na->tx_rings = na->rx_rings = NULL; + na->refcount--; + netmap_free(nifp, "nifp, rings failed"); + nifp = NULL; + } + } + + if (error) { /* reg. failed, release priv and ref */ +error: + na->nm_lock(adapter, NETMAP_CORE_UNLOCK, 0); + free(priv, M_DEVBUF); + if_rele(ifp); /* return the refcount */ + break; + } + + na->nm_lock(adapter, NETMAP_CORE_UNLOCK, 0); + error = devfs_set_cdevpriv(priv, netmap_dtor); + + if (error != 0) { + /* could not assign the private storage for the + * thread, call the destructor explicitly. 
			 */
+			netmap_dtor(priv);
+			break;
+		}
+
+		/* return the offset of the netmap_if object */
+		nmr->nr_numrings = na->num_queues;
+		nmr->nr_numslots = na->num_tx_desc;
+		nmr->nr_memsize = netmap_mem_d->nm_totalsize;
+		nmr->nr_offset =
+			((char *) nifp - (char *) netmap_mem_d->nm_buffer);
+		break;
+
+	case NIOCUNREGIF:
+		if (priv == NULL)
+			return (ENXIO);
+
+		/* the interface is unregistered inside the
+		   destructor of the private data. */
+		devfs_clear_cdevpriv();
+		break;
+
+	case NIOCTXSYNC:
+	case NIOCRXSYNC:
+		if (priv == NULL)
+			return (ENXIO);
+		ifp = priv->np_ifp;	/* we have a reference */
+		na = NA(ifp);	/* retrieve netmap adapter */
+		adapter = ifp->if_softc;	/* shorthand */
+
+		if (na->flags & NR_REINIT)
+			netmap_clean_reinit(na);
+
+		if (priv->np_qfirst == na->num_queues) {
+			/* queues to/from host */
+			if (cmd == NIOCTXSYNC)
+				netmap_sync_to_host(na);
+			else
+				netmap_sync_from_host(na, NULL);
+			return error;
+		}
+
+		for (i = priv->np_qfirst; i < priv->np_qlast; i++) {
+			if (cmd == NIOCTXSYNC) {
+				struct netmap_kring *kring = &na->tx_rings[i];
+				if (netmap_verbose & NM_VERB_TXSYNC)
+					D("sync tx ring %d cur %d hwcur %d",
+					    i, kring->ring->cur,
+					    kring->nr_hwcur);
+				na->nm_txsync(adapter, i, 1 /* do lock */);
+				if (netmap_verbose & NM_VERB_TXSYNC)
+					D("after sync tx ring %d cur %d hwcur %d",
+					    i, kring->ring->cur,
+					    kring->nr_hwcur);
+			} else {
+				na->nm_rxsync(adapter, i, 1 /* do lock */);
+				microtime(&na->rx_rings[i].ring->ts);
+			}
+		}
+
+		break;
+
+	case BIOCIMMEDIATE:
+	case BIOCGHDRCMPLT:
+	case BIOCSHDRCMPLT:
+	case BIOCSSEESENT:
+		D("ignore BIOCIMMEDIATE/BIOCGHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT");
+		break;
+
+	default:
+	    {
+		/*
+		 * allow device-specific ioctls to pass through
+		 */
+		struct socket so;
+		bzero(&so, sizeof(so));
+		error = get_ifp(nmr->nr_name, &ifp);	/* keep reference */
+		if (error)
+			break;
+		so.so_vnet = ifp->if_vnet;
+		// so->so_proto not null.
+		error = ifioctl(&so, cmd, data, td);
+		if_rele(ifp);
+	    }
+	}
+
+	return (error);
+}
+
+
+/*
+ * select(2) and poll(2) handlers for the "netmap" device.
+ *
+ * Can be called for one or more queues.
+ * Return the event mask corresponding to ready events.
+ * If there are no ready events, do a selrecord on either individual
+ * selfd or on the global one.
+ * Device-dependent parts (locking and sync of tx/rx rings)
+ * are done through callbacks.
+ */
+static int
+netmap_poll(__unused struct cdev *dev, int events, struct thread *td)
+{
+	struct netmap_priv_d *priv = NULL;
+	struct netmap_adapter *na;
+	struct ifnet *ifp;
+	struct netmap_kring *kring;
+	u_int i, check_all, want_tx, want_rx, revents = 0;
+	void *adapter;
+
+	if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL)
+		return POLLERR;
+
+	ifp = priv->np_ifp;
+	// XXX check for deleting() ?
+	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0)
+		return POLLERR;
+
+	if (netmap_verbose & 0x8000)
+		D("device %s events 0x%x", ifp->if_xname, events);
+	want_tx = events & (POLLOUT | POLLWRNORM);
+	want_rx = events & (POLLIN | POLLRDNORM);
+
+	adapter = ifp->if_softc;
+	na = NA(ifp);	/* retrieve netmap adapter */
+
+	/* pending reinit, report up as a poll error. Pending
+	 * reads and writes are lost.
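+	 * A consumer should treat the resulting POLLERR as a hint to
+	 * re-read cur and avail from its rings (netmap_ring_reinit()
+	 * resets them to the kernel view) before trying again.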
+ */ + if (na->flags & NR_REINIT) { + netmap_clean_reinit(na); + revents |= POLLERR; + } + /* how many queues we are scanning */ + i = priv->np_qfirst; + if (i == na->num_queues) { /* from/to host */ + if (priv->np_txpoll || want_tx) { + /* push any packets up, then we are always ready */ + kring = &na->tx_rings[i]; + netmap_sync_to_host(na); + revents |= want_tx; + } + if (want_rx) { + kring = &na->rx_rings[i]; + if (kring->ring->avail == 0) + netmap_sync_from_host(na, td); + if (kring->ring->avail > 0) { + revents |= want_rx; + } + } + return (revents); + } + + /* + * check_all is set if the card has more than one queue and + * the client is polling all of them. If true, we sleep on + * the "global" selfd, otherwise we sleep on individual selfd + * (we can only sleep on one of them per direction). + * The interrupt routine in the driver should always wake on + * the individual selfd, and also on the global one if the card + * has more than one ring. + * + * If the card has only one lock, we just use that. + * If the card has separate ring locks, we just use those + * unless we are doing check_all, in which case the whole + * loop is wrapped by the global lock. + * We acquire locks only when necessary: if poll is called + * when buffers are available, we can just return without locks. + * + * rxsync() is only called if we run out of buffers on a POLLIN. + * txsync() is called if we run out of buffers on POLLOUT, or + * there are pending packets to send. The latter can be disabled + * passing NETMAP_NO_TX_POLL in the NIOCREG call. + */ + check_all = (i + 1 != priv->np_qlast); + + /* + * core_lock indicates what to do with the core lock. + * The core lock is used when either the card has no individual + * locks, or it has individual locks but we are cheking all + * rings so we need the core lock to avoid missing wakeup events. + * + * It has three possible states: + * NO_CL we don't need to use the core lock, e.g. + * because we are protected by individual locks. + * NEED_CL we need the core lock. In this case, when we + * call the lock routine, move to LOCKED_CL + * to remember to release the lock once done. + * LOCKED_CL core lock is set, so we need to release it. + */ + enum {NO_CL, NEED_CL, LOCKED_CL }; + int core_lock = (check_all || !na->separate_locks) ? + NEED_CL:NO_CL; + /* + * We start with a lock free round which is good if we have + * data available. If this fails, then lock and call the sync + * routines. + */ + for (i = priv->np_qfirst; want_rx && i < priv->np_qlast; i++) { + kring = &na->rx_rings[i]; + if (kring->ring->avail > 0) { + revents |= want_rx; + want_rx = 0; /* also breaks the loop */ + } + } + for (i = priv->np_qfirst; want_tx && i < priv->np_qlast; i++) { + kring = &na->tx_rings[i]; + if (kring->ring->avail > 0) { + revents |= want_tx; + want_tx = 0; /* also breaks the loop */ + } + } + + /* + * If we to push packets out (priv->np_txpoll) or want_tx is + * still set, we do need to run the txsync calls (on all rings, + * to avoid that the tx rings stall). 
+ */ + if (priv->np_txpoll || want_tx) { + for (i = priv->np_qfirst; i < priv->np_qlast; i++) { + kring = &na->tx_rings[i]; + if (!want_tx && kring->ring->cur == kring->nr_hwcur) + continue; + if (core_lock == NEED_CL) { + na->nm_lock(adapter, NETMAP_CORE_LOCK, 0); + core_lock = LOCKED_CL; + } + if (na->separate_locks) + na->nm_lock(adapter, NETMAP_TX_LOCK, i); + if (netmap_verbose & NM_VERB_TXSYNC) + D("send %d on %s %d", + kring->ring->cur, + ifp->if_xname, i); + if (na->nm_txsync(adapter, i, 0 /* no lock */)) + revents |= POLLERR; + + if (want_tx) { + if (kring->ring->avail > 0) { + /* stop at the first ring. We don't risk + * starvation. + */ + revents |= want_tx; + want_tx = 0; + } else if (!check_all) + selrecord(td, &kring->si); + } + if (na->separate_locks) + na->nm_lock(adapter, NETMAP_TX_UNLOCK, i); + } + } + + /* + * now if want_rx is still set we need to lock and rxsync. + * Do it on all rings because otherwise we starve. + */ + if (want_rx) { + for (i = priv->np_qfirst; i < priv->np_qlast; i++) { + kring = &na->rx_rings[i]; + if (core_lock == NEED_CL) { + na->nm_lock(adapter, NETMAP_CORE_LOCK, 0); + core_lock = LOCKED_CL; + } + if (na->separate_locks) + na->nm_lock(adapter, NETMAP_RX_LOCK, i); + + if (na->nm_rxsync(adapter, i, 0 /* no lock */)) + revents |= POLLERR; + if (no_timestamp == 0 || + kring->ring->flags & NR_TIMESTAMP) + microtime(&kring->ring->ts); + + if (kring->ring->avail > 0) + revents |= want_rx; + else if (!check_all) + selrecord(td, &kring->si); + if (na->separate_locks) + na->nm_lock(adapter, NETMAP_RX_UNLOCK, i); + } + } + if (check_all && revents == 0) { + i = na->num_queues + 1; /* the global queue */ + if (want_tx) + selrecord(td, &na->tx_rings[i].si); + if (want_rx) + selrecord(td, &na->rx_rings[i].si); + } + if (core_lock == LOCKED_CL) + na->nm_lock(adapter, NETMAP_CORE_UNLOCK, 0); + + return (revents); +} + +/*------- driver support routines ------*/ + +/* + * Initialize a ``netmap_adapter`` object created by driver on attach. + * We allocate a block of memory with room for a struct netmap_adapter + * plus two sets of N+2 struct netmap_kring (where N is the number + * of hardware rings): + * krings 0..N-1 are for the hardware queues. + * kring N is for the host stack queue + * kring N+1 is only used for the selinfo for all queues. + * Return 0 on success, ENOMEM otherwise. + */ +int +netmap_attach(struct netmap_adapter *na, int num_queues) +{ + int n = num_queues + 2; + int size = sizeof(*na) + 2 * n * sizeof(struct netmap_kring); + void *buf; + struct ifnet *ifp = na->ifp; + + if (ifp == NULL) { + D("ifp not set, giving up"); + return EINVAL; + } + na->refcount = 0; + na->num_queues = num_queues; + + buf = malloc(size, M_DEVBUF, M_NOWAIT | M_ZERO); + if (buf) { + ifp->if_pspare[0] = buf; + na->tx_rings = (void *)((char *)buf + sizeof(*na)); + na->rx_rings = na->tx_rings + n; + bcopy(na, buf, sizeof(*na)); + ifp->if_capabilities |= IFCAP_NETMAP; + } + D("%s for %s", buf ? "ok" : "failed", ifp->if_xname); + + return (buf ? 0 : ENOMEM); +} + + +/* + * Free the allocated memory linked to the given ``netmap_adapter`` + * object. 
+ */ +void +netmap_detach(struct ifnet *ifp) +{ + u_int i; + struct netmap_adapter *na = NA(ifp); + + if (!na) + return; + + for (i = 0; i < na->num_queues + 2; i++) { + knlist_destroy(&na->tx_rings[i].si.si_note); + knlist_destroy(&na->rx_rings[i].si.si_note); + } + bzero(na, sizeof(*na)); + ifp->if_pspare[0] = NULL; + free(na, M_DEVBUF); +} + + +/* + * intercept packets coming from the network stack and present + * them to netmap as incoming packets on a separate ring. + * We are not locked when called. + */ +int +netmap_start(struct ifnet *ifp, struct mbuf *m) +{ + struct netmap_adapter *na = NA(ifp); + u_int i, len, n = na->num_queues; + int error = EBUSY; + struct netmap_kring *kring = &na->rx_rings[n]; + struct netmap_slot *slot; + + len = m->m_pkthdr.len; + if (netmap_verbose & NM_VERB_HOST) + D("%s packet %d len %d from the stack", ifp->if_xname, + kring->nr_hwcur + kring->nr_hwavail, len); + na->nm_lock(ifp->if_softc, NETMAP_CORE_LOCK, 0); + if (kring->nr_hwavail >= (int)kring->nkr_num_slots - 1) { + D("stack ring %s full\n", ifp->if_xname); + goto done; /* no space */ + } + if (len > na->buff_size) { + D("drop packet size %d > %d", len, na->buff_size); + goto done; /* too long for us */ + } + + /* compute the insert position */ + i = kring->nr_hwcur + kring->nr_hwavail; + if (i >= kring->nkr_num_slots) + i -= kring->nkr_num_slots; + slot = &kring->ring->slot[i]; + m_copydata(m, 0, len, NMB(slot)); + slot->len = len; + kring->nr_hwavail++; + if (netmap_verbose & NM_VERB_HOST) + D("wake up host ring %s %d", na->ifp->if_xname, na->num_queues); + selwakeuppri(&kring->si, PI_NET); + error = 0; +done: + na->nm_lock(ifp->if_softc, NETMAP_CORE_UNLOCK, 0); + + /* release the mbuf in either cases of success or failure. As an + * alternative, put the mbuf in a free list and free the list + * only when really necessary. + */ + m_freem(m); + + return (error); +} + + +/* + * netmap_reset() is called by the driver routines when reinitializing + * a ring. The driver is in charge of locking to protect the kring. + * If netmap mode is not set just return NULL. + * Otherwise set NR_REINIT (in the ring and in na) to signal + * that a ring has been reinitialized, + * set cur = hwcur = 0 and avail = hwavail = num_slots - 1 . + * IT IS IMPORTANT to leave one slot free even in the tx ring because + * we rely on cur=hwcur only for empty rings. + * These are good defaults but can be overridden later in the device + * specific code if, after a reinit, the ring does not start from 0 + * (e.g. if_em.c does this). + * + * XXX we shouldn't be touching the ring, but there is a + * race anyways and this is our best option. + * + * XXX setting na->flags makes the syscall code faster, as there is + * only one place to check. On the other hand, we will need a better + * way to notify multiple threads that rings have been reset. + * One way is to increment na->rst_count at each ring reset. + * Each thread in its own priv structure will keep a matching counter, + * and on a reset will acknowledge and clean its own rings. + */ +struct netmap_slot * +netmap_reset(struct netmap_adapter *na, enum txrx tx, int n, + u_int new_cur) +{ + struct netmap_kring *kring; + struct netmap_ring *ring; + struct netmap_slot *slot; + u_int i; + + if (na == NULL) + return NULL; /* no netmap support here */ + if (!(na->ifp->if_capenable & IFCAP_NETMAP)) + return NULL; /* nothing to reinitialize */ + kring = tx == NR_TX ? 
na->tx_rings + n : na->rx_rings + n; + ring = kring->ring; + if (tx == NR_TX) { + /* + * The last argument is the new value of next_to_clean. + * + * In the TX ring, we have P pending transmissions (from + * next_to_clean to nr_hwcur) followed by nr_hwavail free slots. + * Generally we can use all the slots in the ring so + * P = ring_size - nr_hwavail hence (modulo ring_size): + * next_to_clean == nr_hwcur + nr_hwavail + * + * If, upon a reset, nr_hwavail == ring_size and next_to_clean + * does not change we have nothing to report. Otherwise some + * pending packets may be lost, or newly injected packets will. + */ + /* if hwcur does not change, nothing to report. + * otherwise remember the change so perhaps we can + * shift the block at the next reinit + */ + if (new_cur == kring->nr_hwcur && + kring->nr_hwavail == kring->nkr_num_slots - 1) { + /* all ok */ + D("+++ NR_REINIT ok on %s TX[%d]", na->ifp->if_xname, n); + } else { + D("+++ NR_REINIT set on %s TX[%d]", na->ifp->if_xname, n); + } + ring->flags |= NR_REINIT; + na->flags |= NR_REINIT; + ring->avail = kring->nr_hwavail = kring->nkr_num_slots - 1; + ring->cur = kring->nr_hwcur = new_cur; + } else { + /* + * The last argument is the next free slot. + * In the RX ring we have nr_hwavail full buffers starting + * from nr_hwcur. + * If nr_hwavail == 0 and nr_hwcur does not change we are ok + * otherwise we might be in trouble as the buffers are + * changing. + */ + if (new_cur == kring->nr_hwcur && kring->nr_hwavail == 0) { + /* all ok */ + D("+++ NR_REINIT ok on %s RX[%d]", na->ifp->if_xname, n); + } else { + D("+++ NR_REINIT set on %s RX[%d]", na->ifp->if_xname, n); + } + ring->flags |= NR_REINIT; + na->flags |= NR_REINIT; + ring->avail = kring->nr_hwavail = 0; /* no data */ + ring->cur = kring->nr_hwcur = new_cur; + } + + slot = ring->slot; + /* + * Check that buffer indexes are correct. If we find a + * bogus value we are a bit in trouble because we cannot + * recover easily. Best we can do is (probably) persistently + * reset the ring. + */ + for (i = 0; i < kring->nkr_num_slots; i++) { + if (slot[i].buf_idx >= netmap_total_buffers) { + D("invalid buf_idx %d at slot %d", slot[i].buf_idx, i); + slot[i].buf_idx = 0; /* XXX reset */ + } + /* XXX we don't really need to set the length */ + slot[i].len = 0; + } + /* wakeup possible waiters, both on the ring and on the global + * selfd. Perhaps a bit early now but the device specific + * routine is locked so hopefully we won't have a race. + */ + selwakeuppri(&kring->si, PI_NET); + selwakeuppri(&kring[na->num_queues + 1 - n].si, PI_NET); + return kring->ring->slot; +} + +static void +ns_dmamap_cb(__unused void *arg, __unused bus_dma_segment_t * segs, + __unused int nseg, __unused int error) +{ +} + +/* unload a bus_dmamap and create a new one. Used when the + * buffer in the slot is changed. + * XXX buflen is probably not needed, buffers have constant size. + */ +void +netmap_reload_map(bus_dma_tag_t tag, bus_dmamap_t map, + void *buf, bus_size_t buflen) +{ + bus_addr_t paddr; + bus_dmamap_unload(tag, map); + bus_dmamap_load(tag, map, buf, buflen, ns_dmamap_cb, &paddr, + BUS_DMA_NOWAIT); +} + +void +netmap_load_map(bus_dma_tag_t tag, bus_dmamap_t map, + void *buf, bus_size_t buflen) +{ + bus_addr_t paddr; + bus_dmamap_load(tag, map, buf, buflen, ns_dmamap_cb, &paddr, + BUS_DMA_NOWAIT); +} + +/*------ netmap memory allocator -------*/ +/* + * Request for a chunk of memory. 
+ *
+ * Memory objects are arranged into a list, hence we need to walk this
+ * list until we find an object with the needed amount of data free.
+ * This sounds inefficient, but since allocations are performed only
+ * once, at configuration time, the linear scan is perfectly acceptable.
+ *
+ * Return NULL on failure.
+ */
+static void *
+netmap_malloc(size_t size, __unused const char *msg)
+{
+	struct netmap_mem_obj *mem_obj, *new_mem_obj;
+	void *ret = NULL;
+
+	NMA_LOCK();
+	TAILQ_FOREACH(mem_obj, &netmap_mem_d->nm_molist, nmo_next) {
+		if (mem_obj->nmo_used != 0 || mem_obj->nmo_size < size)
+			continue;
+
+		new_mem_obj = malloc(sizeof(struct netmap_mem_obj), M_NETMAP,
+				     M_WAITOK | M_ZERO);
+		TAILQ_INSERT_BEFORE(mem_obj, new_mem_obj, nmo_next);
+
+		new_mem_obj->nmo_used = 1;
+		new_mem_obj->nmo_size = size;
+		new_mem_obj->nmo_data = mem_obj->nmo_data;
+		memset(new_mem_obj->nmo_data, 0, new_mem_obj->nmo_size);
+
+		mem_obj->nmo_size -= size;
+		mem_obj->nmo_data = (char *) mem_obj->nmo_data + size;
+		if (mem_obj->nmo_size == 0) {
+			TAILQ_REMOVE(&netmap_mem_d->nm_molist, mem_obj,
+				     nmo_next);
+			free(mem_obj, M_NETMAP);
+		}
+
+		ret = new_mem_obj->nmo_data;
+
+		break;
+	}
+	NMA_UNLOCK();
+	ND("%s: %d bytes at %p", msg, size, ret);
+
+	return (ret);
+}
+
+/*
+ * Return the memory to the allocator.
+ *
+ * While freeing a memory object, we try to merge adjacent chunks in
+ * order to reduce memory fragmentation.
+ */
+static void
+netmap_free(void *addr, const char *msg)
+{
+	size_t size;
+	struct netmap_mem_obj *cur, *prev, *next;
+
+	if (addr == NULL) {
+		D("NULL addr for %s", msg);
+		return;
+	}
+
+	NMA_LOCK();
+	TAILQ_FOREACH(cur, &netmap_mem_d->nm_molist, nmo_next) {
+		if (cur->nmo_data == addr && cur->nmo_used)
+			break;
+	}
+	if (cur == NULL) {
+		NMA_UNLOCK();
+		D("invalid addr %s %p", msg, addr);
+		return;
+	}
+
+	size = cur->nmo_size;
+	cur->nmo_used = 0;
+
+	/* merge current chunk of memory with the previous one,
+	   if present. */
+	prev = TAILQ_PREV(cur, netmap_mem_obj_h, nmo_next);
+	if (prev && prev->nmo_used == 0) {
+		TAILQ_REMOVE(&netmap_mem_d->nm_molist, cur, nmo_next);
+		prev->nmo_size += cur->nmo_size;
+		free(cur, M_NETMAP);
+		cur = prev;
+	}
+
+	/* merge with the next one */
+	next = TAILQ_NEXT(cur, nmo_next);
+	if (next && next->nmo_used == 0) {
+		TAILQ_REMOVE(&netmap_mem_d->nm_molist, next, nmo_next);
+		cur->nmo_size += next->nmo_size;
+		free(next, M_NETMAP);
+	}
+	NMA_UNLOCK();
+	ND("freed %s %d bytes at %p", msg, size, addr);
+}
+
+
+/*
+ * Initialize the memory allocator.
+ *
+ * Create the descriptor for the memory, allocate the pool of memory
+ * and initialize the list of memory objects with a single chunk
+ * containing the whole pre-allocated memory marked as free.
+ *
+ * Start with a large size, then halve as needed if we fail to
+ * allocate the block. While halving, always add one extra page
+ * because buffers 0 and 1 are used for special purposes.
+ * Return 0 on success, errno otherwise.
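+ *
+ * (A worked example of the halving loop below: if the first
+ * contigmalloc() cannot be satisfied, the allocator retries with
+ * sz/2 plus the recomputed extra space, then sz/4, and so on,
+ * giving up with ENOMEM once sz drops below 1 MB.)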
+ */ +static int +netmap_memory_init(void) +{ + struct netmap_mem_obj *mem_obj; + void *buf = NULL; + int i, n, sz = NETMAP_MEMORY_SIZE; + int extra_sz = 0; // space for rings and two spare buffers + + for (; !buf && sz >= 1<<20; sz >>=1) { + extra_sz = sz/200; + extra_sz = (extra_sz + 2*PAGE_SIZE - 1) & ~(PAGE_SIZE-1); + buf = contigmalloc(sz + extra_sz, + M_NETMAP, + M_WAITOK | M_ZERO, + 0, /* low address */ + -1UL, /* high address */ + PAGE_SIZE, /* alignment */ + 0 /* boundary */ + ); + } + if (buf == NULL) + return (ENOMEM); + sz += extra_sz; + netmap_mem_d = malloc(sizeof(struct netmap_mem_d), M_NETMAP, + M_WAITOK | M_ZERO); + mtx_init(&netmap_mem_d->nm_mtx, "netmap memory allocator lock", NULL, + MTX_DEF); + TAILQ_INIT(&netmap_mem_d->nm_molist); + netmap_mem_d->nm_buffer = buf; + netmap_mem_d->nm_totalsize = sz; + + /* + * A buffer takes 2k, a slot takes 8 bytes + ring overhead, + * so the ratio is 200:1. In other words, we can use 1/200 of + * the memory for the rings, and the rest for the buffers, + * and be sure we never run out. + */ + netmap_mem_d->nm_size = sz/200; + netmap_mem_d->nm_buf_start = + (netmap_mem_d->nm_size + PAGE_SIZE - 1) & ~(PAGE_SIZE-1); + netmap_mem_d->nm_buf_len = sz - netmap_mem_d->nm_buf_start; + + nm_buf_pool.base = netmap_mem_d->nm_buffer; + nm_buf_pool.base += netmap_mem_d->nm_buf_start; + netmap_buffer_base = nm_buf_pool.base; + D("netmap_buffer_base %p (offset %d)", + netmap_buffer_base, netmap_mem_d->nm_buf_start); + /* number of buffers, they all start as free */ + + netmap_total_buffers = nm_buf_pool.total_buffers = + netmap_mem_d->nm_buf_len / NETMAP_BUF_SIZE; + nm_buf_pool.bufsize = NETMAP_BUF_SIZE; + + D("Have %d MB, use %dKB for rings, %d buffers at %p", + (sz >> 20), (netmap_mem_d->nm_size >> 10), + nm_buf_pool.total_buffers, nm_buf_pool.base); + + /* allocate and initialize the bitmap. Entry 0 is considered + * always busy (used as default when there are no buffers left). + */ + n = (nm_buf_pool.total_buffers + 31) / 32; + nm_buf_pool.bitmap = malloc(sizeof(uint32_t) * n, M_NETMAP, + M_WAITOK | M_ZERO); + nm_buf_pool.bitmap[0] = ~3; /* slot 0 and 1 always busy */ + for (i = 1; i < n; i++) + nm_buf_pool.bitmap[i] = ~0; + nm_buf_pool.free = nm_buf_pool.total_buffers - 2; + + mem_obj = malloc(sizeof(struct netmap_mem_obj), M_NETMAP, + M_WAITOK | M_ZERO); + TAILQ_INSERT_HEAD(&netmap_mem_d->nm_molist, mem_obj, nmo_next); + mem_obj->nmo_used = 0; + mem_obj->nmo_size = netmap_mem_d->nm_size; + mem_obj->nmo_data = netmap_mem_d->nm_buffer; + + return (0); +} + + +/* + * Finalize the memory allocator. + * + * Free all the memory objects contained inside the list, and deallocate + * the pool of memory; finally free the memory allocator descriptor. + */ +static void +netmap_memory_fini(void) +{ + struct netmap_mem_obj *mem_obj; + + while (!TAILQ_EMPTY(&netmap_mem_d->nm_molist)) { + mem_obj = TAILQ_FIRST(&netmap_mem_d->nm_molist); + TAILQ_REMOVE(&netmap_mem_d->nm_molist, mem_obj, nmo_next); + if (mem_obj->nmo_used == 1) { + printf("netmap: leaked %d bytes at %p\n", + mem_obj->nmo_size, + mem_obj->nmo_data); + } + free(mem_obj, M_NETMAP); + } + contigfree(netmap_mem_d->nm_buffer, netmap_mem_d->nm_totalsize, M_NETMAP); + // XXX mutex_destroy(nm_mtx); + free(netmap_mem_d, M_NETMAP); +} + + +/* + * Module loader. + * + * Create the /dev/netmap device and initialize all global + * variables. + * + * Return 0 on success, errno on failure. 
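+ *
+ * (Illustratively, when netmap is built as a module this path is
+ * reached via "kldload netmap", after which /dev/netmap becomes
+ * available to clients.)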
+ */ +static int +netmap_init(void) +{ + int error; + + + error = netmap_memory_init(); + if (error != 0) { + printf("netmap: unable to initialize the memory allocator."); + return (error); + } + printf("netmap: loaded module with %d Mbytes\n", + netmap_mem_d->nm_totalsize >> 20); + + netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660, + "netmap"); + + return (0); +} + + +/* + * Module unloader. + * + * Free all the memory, and destroy the ``/dev/netmap`` device. + */ +static void +netmap_fini(void) +{ + destroy_dev(netmap_dev); + + netmap_memory_fini(); + + printf("netmap: unloaded module.\n"); +} + + +/* + * Kernel entry point. + * + * Initialize/finalize the module and return. + * + * Return 0 on success, errno on failure. + */ +static int +netmap_loader(__unused struct module *module, int event, __unused void *arg) +{ + int error = 0; + + switch (event) { + case MOD_LOAD: + error = netmap_init(); + break; + + case MOD_UNLOAD: + netmap_fini(); + break; + + default: + error = EOPNOTSUPP; + break; + } + + return (error); +} + + +DEV_MODULE(netmap, netmap_loader, NULL); diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h new file mode 100644 index 000000000000..5434609c447b --- /dev/null +++ b/sys/dev/netmap/netmap_kern.h @@ -0,0 +1,221 @@ +/* + * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + * $Id: netmap_kern.h 9662 2011-11-16 13:18:06Z luigi $ + * + * The header contains the definitions of constants and function + * prototypes used only in kernelspace. + */ + +#ifndef _NET_NETMAP_KERN_H_ +#define _NET_NETMAP_KERN_H_ + +#ifdef MALLOC_DECLARE +MALLOC_DECLARE(M_NETMAP); +#endif + +#define ND(format, ...) +#define D(format, ...) \ + do { \ + struct timeval __xxts; \ + microtime(&__xxts); \ + printf("%03d.%06d %s [%d] " format "\n",\ + (int)__xxts.tv_sec % 1000, (int)__xxts.tv_usec, \ + __FUNCTION__, __LINE__, ##__VA_ARGS__); \ + } while (0) + +struct netmap_adapter; + +/* + * private, kernel view of a ring. + * + * XXX 20110627-todo + * The index in the NIC and netmap ring is offset by nkr_hwofs slots. + * This is so that, on a reset, buffers owned by userspace are not + * modified by the kernel. 
In particular:
+ * RX rings: the next empty buffer (hwcur + hwavail + hwofs) coincides with
+ *	the next empty buffer as known by the hardware (next_to_check or so).
+ * TX rings: hwcur + hwofs coincides with next_to_send
+ */
+struct netmap_kring {
+	struct netmap_ring *ring;
+	u_int nr_hwcur;
+	int nr_hwavail;
+	u_int nr_kflags;
+	u_int nkr_num_slots;
+
+	u_int nkr_hwofs;	/* offset between NIC and netmap ring */
+	struct netmap_adapter *na;	 // debugging
+	struct selinfo si; /* poll/select wait queue */
+};
+
+/*
+ * This struct is part of and extends the 'struct adapter' (or
+ * equivalent) device descriptor. It contains all fields needed to
+ * support netmap operation.
+ */
+struct netmap_adapter {
+	int refcount; /* number of user-space descriptors using this
+			 interface, which is equal to the number of
+			 struct netmap_if objs in the mapped region. */
+
+	int separate_locks; /* set if the interface supports different
+			       locks for rx, tx and core. */
+
+	u_int num_queues; /* number of tx/rx queue pairs: this is
+			   a duplicate field needed to simplify the
+			   signature of ``netmap_detach``. */
+
+	u_int num_tx_desc; /* number of descriptors in each queue */
+	u_int num_rx_desc;
+	u_int buff_size;
+
+	u_int	flags;	/* NR_REINIT */
+	/* tx_rings and rx_rings are private but allocated
+	 * as a contiguous chunk of memory. Each array has
+	 * N+1 entries, for the adapter queues and for the host queue.
+	 */
+	struct netmap_kring *tx_rings; /* array of TX rings. */
+	struct netmap_kring *rx_rings; /* array of RX rings. */
+
+	/* copy of if_qflush and if_transmit pointers, to intercept
+	 * packets from the network stack when netmap is active.
+	 * XXX probably if_qflush is not necessary.
+	 */
+	void	(*if_qflush)(struct ifnet *);
+	int	(*if_transmit)(struct ifnet *, struct mbuf *);
+
+	/* references to the ifnet and device routines, used by
+	 * the generic netmap functions.
+	 */
+	struct ifnet *ifp; /* adapter is ifp->if_softc */
+
+	int (*nm_register)(struct ifnet *, int onoff);
+	void (*nm_lock)(void *, int what, u_int ringid);
+	int (*nm_txsync)(void *, u_int ring, int lock);
+	int (*nm_rxsync)(void *, u_int ring, int lock);
+};
+
+/*
+ * The combination of "enable" (ifp->if_capenable & IFCAP_NETMAP)
+ * and refcount gives the status of the interface, namely:
+ *
+ *	enable	refcount	Status
+ *
+ *	FALSE	0		normal operation
+ *	FALSE	!= 0		-- (impossible)
+ *	TRUE	1		netmap mode
+ *	TRUE	0		being deleted.
+ */
+
+#define NETMAP_DELETING(_na)  (   ((_na)->refcount == 0) &&	\
+	( (_na)->ifp->if_capenable & IFCAP_NETMAP) )
+
+/*
+ * parameters for (*nm_lock)(adapter, what, index)
+ */
+enum {
+	NETMAP_NO_LOCK = 0,
+	NETMAP_CORE_LOCK, NETMAP_CORE_UNLOCK,
+	NETMAP_TX_LOCK, NETMAP_TX_UNLOCK,
+	NETMAP_RX_LOCK, NETMAP_RX_UNLOCK,
+};
+
+/*
+ * The following are support routines used by individual drivers to
+ * support netmap operation.
+ *
+ * netmap_attach() initializes a struct netmap_adapter, allocating the
+ *	struct netmap_ring's and the struct selinfo.
+ *
+ * netmap_detach() frees the memory allocated by netmap_attach().
+ *
+ * netmap_start() replaces the if_transmit routine of the interface,
+ *	and is used to intercept packets coming from the stack.
+ *
+ * netmap_load_map/netmap_reload_map are helper routines to set/reset
+ *	the dmamap for a packet buffer
+ *
+ * netmap_reset() is a helper routine to be called in the driver
+ *	when reinitializing a ring.
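+ *
+ * For illustration, a driver attach path would use them roughly as
+ * follows (a sketch only: "adapter" stands for the device softc and
+ * the foo_* callbacks are hypothetical device-specific routines, not
+ * part of this header):
+ *
+ *	struct netmap_adapter na;
+ *
+ *	bzero(&na, sizeof(na));
+ *	na.ifp = adapter->ifp;
+ *	na.separate_locks = 0;
+ *	na.num_tx_desc = adapter->num_tx_desc;
+ *	na.num_rx_desc = adapter->num_rx_desc;
+ *	na.nm_register = foo_netmap_reg;
+ *	na.nm_txsync = foo_netmap_txsync;
+ *	na.nm_rxsync = foo_netmap_rxsync;
+ *	na.nm_lock = foo_netmap_lock;
+ *	netmap_attach(&na, adapter->num_queues);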
+ */ +int netmap_attach(struct netmap_adapter *, int); +void netmap_detach(struct ifnet *); +int netmap_start(struct ifnet *, struct mbuf *); +enum txrx { NR_RX = 0, NR_TX = 1 }; +struct netmap_slot *netmap_reset(struct netmap_adapter *na, + enum txrx tx, int n, u_int new_cur); +void netmap_load_map(bus_dma_tag_t tag, bus_dmamap_t map, + void *buf, bus_size_t buflen); +void netmap_reload_map(bus_dma_tag_t tag, bus_dmamap_t map, + void *buf, bus_size_t buflen); +int netmap_ring_reinit(struct netmap_kring *); + +/* + * XXX eventually, get rid of netmap_total_buffers and netmap_buffer_base + * in favour of the structure + */ +// struct netmap_buf_pool; +// extern struct netmap_buf_pool nm_buf_pool; +extern u_int netmap_total_buffers; +extern char *netmap_buffer_base; +extern int netmap_verbose; // XXX debugging +enum { /* verbose flags */ + NM_VERB_ON = 1, /* generic verbose */ + NM_VERB_HOST = 0x2, /* verbose host stack */ + NM_VERB_RXSYNC = 0x10, /* verbose on rxsync/txsync */ + NM_VERB_TXSYNC = 0x20, + NM_VERB_RXINTR = 0x100, /* verbose on rx/tx intr (driver) */ + NM_VERB_TXINTR = 0x200, + NM_VERB_NIC_RXSYNC = 0x1000, /* verbose on rx/tx intr (driver) */ + NM_VERB_NIC_TXSYNC = 0x2000, +}; + +/* + * return a pointer to the struct netmap adapter from the ifp + */ +#define NA(_ifp) ((struct netmap_adapter *)(_ifp)->if_pspare[0]) + + +/* + * return the address of a buffer. + * XXX this is a special version with hardwired 2k bufs + * On error return netmap_buffer_base which is detected as a bad pointer. + */ +static inline char * +NMB(struct netmap_slot *slot) +{ + uint32_t i = slot->buf_idx; + return (i >= netmap_total_buffers) ? netmap_buffer_base : +#if NETMAP_BUF_SIZE == 2048 + netmap_buffer_base + (i << 11); +#else + netmap_buffer_base + (i *NETMAP_BUF_SIZE); +#endif +} + +#endif /* _NET_NETMAP_KERN_H_ */ diff --git a/sys/net/netmap.h b/sys/net/netmap.h new file mode 100644 index 000000000000..be9c686a49ed --- /dev/null +++ b/sys/net/netmap.h @@ -0,0 +1,281 @@ +/* + * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. + * + * 3. Neither the name of the authors nor the names of their contributors + * may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY MATTEO LANDI AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL MATTEO LANDI OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ * $Id: netmap.h 9662 2011-11-16 13:18:06Z luigi $
+ *
+ * This header contains the definitions of the constants and the
+ * structures needed by the ``netmap'' module, both kernel and
+ * userspace.
+ */
+
+#ifndef _NET_NETMAP_H_
+#define _NET_NETMAP_H_
+
+/*
+ * --- Netmap data structures ---
+ *
+ * The data structures used by netmap are shown below. Those in
+ * capital letters are in an mmap()ed area shared with userspace,
+ * while others are private to the kernel.
+ * Shared structures do not contain pointers but only relative
+ * offsets, so that addressing is portable between kernel and userspace.
+ *
+ * The 'softc' of each interface is extended with a struct netmap_adapter
+ * containing information to support netmap operation. In addition to
+ * the fixed fields, it has two pointers to reach the arrays of
+ * 'struct netmap_kring' which in turn reaches the various
+ * struct netmap_ring, shared with userspace.
+
+  softc
+ +----------------+
+ | standard fields|
+ | if_pspare[0] ----------+
+ +----------------+       |
+                          |
+ +----------------+<------+
+ |(netmap_adapter)|
+ |                |                             netmap_kring
+ | tx_rings *--------------------------------->+-------------+
+ |                |      netmap_kring          | ring    *--------> ...
+ | rx_rings *--------->+--------------+        | nr_hwcur    |
+ +----------------+    | ring    *-------+     | nr_hwavail  |
+                       | nr_hwcur     |  |     | selinfo     |
+                       | nr_hwavail   |  |     +-------------+
+                       | selinfo      |  |     |     ...     |
+                       +--------------+  |     (na_num_rings+1 entries)
+                       |     ....     |  |
+                       (na_num_rings+1 entries)
+                       |              |  |
+                       +--------------+  |
+                                         |     NETMAP_RING
+                                         +---->+-------------+
+                                              /| cur         |
+    NETMAP_IF  (nifp, one per file desc.)   /  | avail       |
+     +---------------+                     /   | buf_ofs     |
+     | ni_num_queues |                    /    +=============+
+     |               |                   /     | buf_idx     | slot[0]
+     |               |                  /      | len, flags  |
+     |               |                 /       +-------------+
+     +===============+                /        | buf_idx     | slot[1]
+     | txring_ofs[0] | (rel.to nifp)-'         | len, flags  |
+     | txring_ofs[1] |                         +-------------+
+     (num_rings+1 entries)                     (nr_num_slots entries)
+     | txring_ofs[n] |                         | buf_idx     | slot[n-1]
+     +---------------+                         | len, flags  |
+     | rxring_ofs[0] |                         +-------------+
+     | rxring_ofs[1] |
+     (num_rings+1 entries)
+     | rxring_ofs[n] |
+     +---------------+
+
+ * The NETMAP_RING is the shadow ring that mirrors the NIC rings.
+ * Each slot has the index of a buffer, its length and some flags.
+ * In user space, the buffer address is computed as
+ *	(char *)ring + buf_ofs + index*NETMAP_BUF_SIZE
+ * In the kernel, buffers do not necessarily need to be contiguous,
+ * and the virtual and physical addresses are derived through
+ * a lookup table. When userspace wants to use a different buffer
+ * in a location, it must set the NS_BUF_CHANGED flag to make
+ * sure that the kernel updates the hardware ring and other fields
+ * (bus_dmamap, etc.) as needed.
+ *
+ * Normally the driver is not requested to report the result of
+ * transmissions (this can dramatically speed up operation).
+ * However the user may request to report completion by setting
+ * NS_REPORT.
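+ *
+ * As an illustration of the intended use (a sketch only: have_packet()
+ * and next_packet() are hypothetical application helpers, not part of
+ * the API), a sender typically loops as follows:
+ *
+ *	struct netmap_ring *ring = NETMAP_TXRING(nifp, 0);
+ *	uint32_t i = ring->cur;
+ *
+ *	while (ring->avail > 0 && have_packet()) {
+ *		struct netmap_slot *slot = &ring->slot[i];
+ *		char *buf = NETMAP_BUF(ring, slot->buf_idx);
+ *
+ *		slot->len = next_packet(buf);	// fill the buffer
+ *		i = NETMAP_RING_NEXT(ring, i);
+ *		ring->avail--;
+ *	}
+ *	ring->cur = i;
+ *	ioctl(fd, NIOCTXSYNC, NULL);	// or wait for POLLOUT in poll()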
+ */
+struct netmap_slot {
+	uint32_t buf_idx; /* buffer index */
+	uint16_t len;	/* packet length, to be copied to/from the hw ring */
+	uint16_t flags;	/* buf changed, etc. */
+#define NS_BUF_CHANGED	0x0001	/* must resync the map, buffer changed */
+#define NS_REPORT	0x0002	/* ask the hardware to report results
+				 * e.g. by generating an interrupt
+				 */
+};
+
+/*
+ * Netmap representation of a TX or RX ring (also known as "queue").
+ * This is a queue implemented as a fixed-size circular array.
+ * At the software level, two fields are important: avail and cur.
+ *
+ * In TX rings:
+ *	avail	indicates the number of slots available for transmission.
+ *		It is decremented by the application when it appends a
+ *		packet, and set to nr_hwavail (see below) on a
+ *		NIOCTXSYNC to reflect the actual state of the queue
+ *		(keeping track of completed transmissions).
+ *	cur	indicates the empty slot to use for the next packet
+ *		to send (i.e. the "tail" of the queue).
+ *		It is incremented by the application.
+ *
+ * The kernel side of netmap uses two additional fields in its own
+ * private ring structure, netmap_kring:
+ *	nr_hwcur	is a copy of nr_cur on an NIOCTXSYNC.
+ *	nr_hwavail	is the number of slots known as available by the
+ *		hardware. It is updated on an INTR (inc by the
+ *		number of packets sent) and on a NIOCTXSYNC
+ *		(decreased by nr_cur - nr_hwcur)
+ *		As a special case, nr_hwavail is -1 if the transmit
+ *		side is idle (no pending transmits).
+ *
+ * In RX rings:
+ *	avail	is the number of packets available (possibly 0).
+ *		It is decremented by the software when it consumes
+ *		a packet, and set to nr_hwavail on a NIOCRXSYNC
+ *	cur	indicates the first slot that contains a packet
+ *		(the "head" of the queue).
+ *		It is incremented by the software when it consumes
+ *		a packet.
+ *
+ * The kernel side of netmap uses two additional fields in the kring:
+ *	nr_hwcur	is a copy of nr_cur on an NIOCRXSYNC
+ *	nr_hwavail	is the number of packets available. It is updated
+ *		on INTR (inc by the number of new packets arrived)
+ *		and on NIOCRXSYNC (decreased by nr_cur - nr_hwcur).
+ *
+ * DATA OWNERSHIP/LOCKING:
+ *	The netmap_ring is owned by the user program and it is only
+ *	accessed or modified in the upper half of the kernel during
+ *	a system call.
+ *
+ *	The netmap_kring is only modified by the upper half of the kernel.
+ */
+struct netmap_ring {
+	/*
+	 * buf_ofs is meant to be used through macros.
+	 * It contains the offset of the buffer region from this
+	 * descriptor.
+	 */
+	const ssize_t	buf_ofs;
+	const uint32_t	num_slots;	/* number of slots in the ring. */
+	uint32_t	avail;		/* number of usable slots */
+	uint32_t	cur;		/* 'current' r/w position */
+
+	const uint16_t	nr_buf_size;
+	uint16_t	flags;
+	/*
+	 * When a ring is reinitialized, the kernel sets kflags.
+	 * On exit from a syscall, if the flag is found set, we
+	 * also reinitialize the nr_* variables. The kflag is then
+	 * unconditionally copied to nr_flags and cleared.
+	 */
+#define NR_REINIT	0x0001		/* ring reinitialized! */
+#define NR_TIMESTAMP	0x0002		/* set timestamp on *sync() */
+
+	struct timeval	ts;		/* time of last *sync() */
+
+	/* the slots follow. This struct has variable size */
+	struct netmap_slot slot[0];	/* array of slots. */
+};
+
+
+/*
+ * Netmap representation of an interface and its queue(s).
+ * There is one netmap_if for each file descriptor on which we want
+ * to select/poll. We assume that each interface has the same number
+ * of receive and transmit queues.
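+ * (For example, on an adapter with two hardware queue pairs the
+ * mapped region contains three tx and three rx netmap rings:
+ * indexes 0 and 1 map the hardware queues, and index 2 is the
+ * pair connected to the host stack.)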
+ * select/poll operates on one or all pairs depending on the value of + * nmr_queueid passed on the ioctl. + */ +struct netmap_if { + char ni_name[IFNAMSIZ]; /* name of the interface. */ + const u_int ni_version; /* API version, currently unused */ + const u_int ni_num_queues; /* number of queue pairs (TX/RX). */ + const u_int ni_rx_queues; /* if zero, use ni_num_queues */ + /* + * the following array contains the offset of the + * each netmap ring from this structure. The first num_queues+1 + * refer to the tx rings, the next n+1 refer to the rx rings. + * The area is filled up by the kernel on NIOCREG, + * and then only read by userspace code. + * entries 0..ni_num_queues-1 indicate the hardware queues, + * entry ni_num_queues is the queue from/to the stack. + */ + const ssize_t ring_ofs[0]; +}; + +#ifndef IFCAP_NETMAP /* this should go in net/if.h */ +#define IFCAP_NETMAP 0x100000 +#endif + +#ifndef NIOCREGIF +/* + * ioctl names and related fields + * + * NIOCGINFO takes a struct ifreq, the interface name is the input, + * the outputs are number of queues and number of descriptor + * for each queue (useful to set number of threads etc.). + * + * NIOCREGIF takes an interface name within a struct ifreq, + * and activates netmap mode on the interface (if possible). + * + * NIOCUNREGIF unregisters the interface associated to the fd. + * + * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues, + * whose identity is set in NIOCREGIF through nr_ringid + */ + +/* + * struct nmreq overlays a struct ifreq + */ +struct nmreq { + char nr_name[IFNAMSIZ]; + uint32_t nr_version; /* API version (unused) */ + uint32_t nr_offset; /* nifp offset in the shared region */ + uint32_t nr_memsize; /* size of the shared region */ + uint32_t nr_numslots; /* descriptors per queue */ + uint16_t nr_numrings; + uint16_t nr_ringid; /* ring(s) we care about */ +#define NETMAP_HW_RING 0x4000 /* low bits indicate one hw ring */ +#define NETMAP_SW_RING 0x2000 /* we process the sw ring */ +#define NETMAP_NO_TX_POLL 0x1000 /* no gratuitous txsync on poll */ +#define NETMAP_RING_MASK 0xfff /* the ring number */ +}; + +/* + * default buf size is 2048, but it may make sense to have + * it shorter for better cache usage. + */ + +#define NETMAP_BUF_SIZE (2048) +#define NIOCGINFO _IOWR('i', 145, struct nmreq) /* return IF info */ +#define NIOCREGIF _IOWR('i', 146, struct nmreq) /* interface register */ +#define NIOCUNREGIF _IO('i', 147) /* interface unregister */ +#define NIOCTXSYNC _IO('i', 148) /* sync tx queues */ +#define NIOCRXSYNC _IO('i', 149) /* sync rx queues */ +#endif /* !NIOCREGIF */ + +#endif /* _NET_NETMAP_H_ */ diff --git a/sys/net/netmap_user.h b/sys/net/netmap_user.h new file mode 100644 index 000000000000..c9443b89e43f --- /dev/null +++ b/sys/net/netmap_user.h @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. + * + * 3. 
Neither the name of the authors nor the names of their contributors + * may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY MATTEO LANDI AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL MATTEO LANDI OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + * $Id: netmap_user.h 9495 2011-10-18 15:28:23Z luigi $ + * + * This header contains the macros used to manipulate netmap structures + * and packets in userspace. See netmap(4) for more information. + * + * The address of the struct netmap_if, say nifp, is determined + * by the value returned from ioctl(.., NIOCREG, ...) and the mmap + * region: + * ioctl(fd, NIOCREG, &req); + * mem = mmap(0, ... ); + * nifp = NETMAP_IF(mem, req.nr_nifp); + * (so simple, we could just do it manually) + * + * From there: + * struct netmap_ring *NETMAP_TXRING(nifp, index) + * struct netmap_ring *NETMAP_RXRING(nifp, index) + * we can access ring->nr_cur, ring->nr_avail, ring->nr_flags + * + * ring->slot[i] gives us the i-th slot (we can access + * directly plen, flags, bufindex) + * + * char *buf = NETMAP_BUF(ring, index) returns a pointer to + * the i-th buffer + * + * Since rings are circular, we have macros to compute the next index + * i = NETMAP_RING_NEXT(ring, i); + */ + +#ifndef _NET_NETMAP_USER_H_ +#define _NET_NETMAP_USER_H_ + +#define NETMAP_IF(b, o) (struct netmap_if *)((char *)(b) + (o)) + +#define NETMAP_TXRING(nifp, index) \ + ((struct netmap_ring *)((char *)(nifp) + \ + (nifp)->ring_ofs[index] ) ) + +#define NETMAP_RXRING(nifp, index) \ + ((struct netmap_ring *)((char *)(nifp) + \ + (nifp)->ring_ofs[index + (nifp)->ni_num_queues+1] ) ) + +#if NETMAP_BUF_SIZE != 2048 +#error cannot handle odd size +#define NETMAP_BUF(ring, index) \ + ((char *)(ring) + (ring)->buf_ofs + ((index)*NETMAP_BUF_SIZE)) +#else +#define NETMAP_BUF(ring, index) \ + ((char *)(ring) + (ring)->buf_ofs + ((index)<<11)) +#endif + +#define NETMAP_RING_NEXT(r, i) \ + ((i)+1 == (r)->num_slots ? 0 : (i) + 1 ) + +/* + * Return 1 if the given tx ring is empty. + * + * @r netmap_ring descriptor pointer. + * Special case, a negative value in hwavail indicates that the + * transmit queue is idle. + * XXX revise + */ +#define NETMAP_TX_RING_EMPTY(r) ((r)->avail >= (r)->num_slots - 1) + +#endif /* _NET_NETMAP_USER_H_ */ diff --git a/tools/tools/README b/tools/tools/README index 253b2e08f739..9c3db2fc5364 100644 --- a/tools/tools/README +++ b/tools/tools/README @@ -50,6 +50,7 @@ mfc Merge a directory from HEAD to a branch where it does not mid Create a Message-ID database for mailing lists. mwl Tools specific to the Marvell 88W8363 support ncpus Count the number of processors +netmap Test applications for netmap(4) notescheck Check for missing devices and options in NOTES files. 
 npe		Tools specific to the Intel IXP4XXX NPE device
 nxge		A diagnostic tool for the nxge(4) driver
diff --git a/tools/tools/netmap/Makefile b/tools/tools/netmap/Makefile
new file mode 100644
index 000000000000..4b682e52a311
--- /dev/null
+++ b/tools/tools/netmap/Makefile
@@ -0,0 +1,25 @@
+#
+# $FreeBSD$
+#
+# For multiple programs using a single source file each,
+# we can just define 'progs' and create custom targets.
+PROGS	=	pkt-gen bridge testpcap libnetmap.so
+
+CLEANFILES = $(PROGS) pcap.o
+NO_MAN=
+CFLAGS += -Werror -Wall -nostdinc -I/usr/include -I../../../sys
+CFLAGS += -Wextra
+
+LDFLAGS += -lpthread -lpcap
+
+.include <bsd.prog.mk>
+.include <bsd.lib.mk>
+
+all: $(PROGS)
+
+testpcap: pcap.c libnetmap.so
+	$(CC) $(CFLAGS) -L. -lnetmap -o ${.TARGET} pcap.c
+
+libnetmap.so: pcap.c
+	$(CC) $(CFLAGS) -fpic -c ${.ALLSRC}
+	$(CC) -shared -o ${.TARGET} ${.ALLSRC:.c=.o}
diff --git a/tools/tools/netmap/README b/tools/tools/netmap/README
new file mode 100644
index 000000000000..9a1ba6096188
--- /dev/null
+++ b/tools/tools/netmap/README
@@ -0,0 +1,11 @@
+$FreeBSD$
+
+This directory contains examples that use netmap
+
+	pkt-gen		a packet sink/source using the netmap API
+
+	bridge		a two-port jumper wire, also using the native API
+
+	testpcap	a jumper wire using libnetmap (or libpcap)
+
+	click*		various click examples
diff --git a/tools/tools/netmap/bridge.c b/tools/tools/netmap/bridge.c
new file mode 100644
index 000000000000..2385a0811fb5
--- /dev/null
+++ b/tools/tools/netmap/bridge.c
@@ -0,0 +1,456 @@
+/*
+ * (C) 2011 Luigi Rizzo, Matteo Landi
+ *
+ * BSD license
+ *
+ * A netmap client to bridge two network interfaces
+ * (or one interface and the host stack).
+ *
+ * $FreeBSD$
+ */
+
+#include <errno.h>
+#include <signal.h>	/* signal */
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>	/* strcmp */
+#include <fcntl.h>	/* open */
+#include <unistd.h>	/* close */
+
+#include <sys/endian.h>	/* le64toh */
+#include <sys/mman.h>	/* PROT_* */
+#include <sys/ioctl.h>	/* ioctl */
+#include <machine/param.h>
+#include <sys/poll.h>
+#include <sys/socket.h>	/* sockaddr.. */
+#include <arpa/inet.h>	/* ntohs */
+
+#include <net/if.h>	/* ifreq */
+#include <net/ethernet.h>
+#include <net/netmap.h>
+#include <net/netmap_user.h>
+
+#include <netinet/in.h>	/* sockaddr_in */
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+int verbose = 0;
+
+/* debug support */
+#define ND(format, ...) {}
+#define D(format, ...)
do { \ + if (!verbose) break; \ + struct timeval _xxts; \ + gettimeofday(&_xxts, NULL); \ + fprintf(stderr, "%03d.%06d %s [%d] " format "\n", \ + (int)_xxts.tv_sec %1000, (int)_xxts.tv_usec, \ + __FUNCTION__, __LINE__, ##__VA_ARGS__); \ + } while (0) + + +char *version = "$Id: bridge.c 9642 2011-11-07 21:39:47Z luigi $"; + +static int do_abort = 0; + +/* + * info on a ring we handle + */ +struct my_ring { + const char *ifname; + int fd; + char *mem; /* userspace mmap address */ + u_int memsize; + u_int queueid; + u_int begin, end; /* first..last+1 rings to check */ + struct netmap_if *nifp; + struct netmap_ring *tx, *rx; /* shortcuts */ + + uint32_t if_flags; + uint32_t if_reqcap; + uint32_t if_curcap; +}; + +static void +sigint_h(__unused int sig) +{ + do_abort = 1; + signal(SIGINT, SIG_DFL); +} + + +static int +do_ioctl(struct my_ring *me, int what) +{ + struct ifreq ifr; + int error; + + bzero(&ifr, sizeof(ifr)); + strncpy(ifr.ifr_name, me->ifname, sizeof(ifr.ifr_name)); + switch (what) { + case SIOCSIFFLAGS: + ifr.ifr_flagshigh = me->if_flags >> 16; + ifr.ifr_flags = me->if_flags & 0xffff; + break; + case SIOCSIFCAP: + ifr.ifr_reqcap = me->if_reqcap; + ifr.ifr_curcap = me->if_curcap; + break; + } + error = ioctl(me->fd, what, &ifr); + if (error) { + D("ioctl error %d", what); + return error; + } + switch (what) { + case SIOCGIFFLAGS: + me->if_flags = (ifr.ifr_flagshigh << 16) | + (0xffff & ifr.ifr_flags); + if (verbose) + D("flags are 0x%x", me->if_flags); + break; + + case SIOCGIFCAP: + me->if_reqcap = ifr.ifr_reqcap; + me->if_curcap = ifr.ifr_curcap; + if (verbose) + D("curcap are 0x%x", me->if_curcap); + break; + } + return 0; +} + +/* + * open a device. if me->mem is null then do an mmap. + */ +static int +netmap_open(struct my_ring *me, int ringid) +{ + int fd, err, l; + struct nmreq req; + + me->fd = fd = open("/dev/netmap", O_RDWR); + if (fd < 0) { + D("Unable to open /dev/netmap"); + return (-1); + } + bzero(&req, sizeof(req)); + strncpy(req.nr_name, me->ifname, sizeof(req.nr_name)); + req.nr_ringid = ringid; + err = ioctl(fd, NIOCGINFO, &req); + if (err) { + D("cannot get info on %s", me->ifname); + goto error; + } + me->memsize = l = req.nr_memsize; + if (verbose) + D("memsize is %d MB", l>>20); + err = ioctl(fd, NIOCREGIF, &req); + if (err) { + D("Unable to register %s", me->ifname); + goto error; + } + + if (me->mem == NULL) { + me->mem = mmap(0, l, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0); + if (me->mem == MAP_FAILED) { + D("Unable to mmap"); + me->mem = NULL; + goto error; + } + } + + me->nifp = NETMAP_IF(me->mem, req.nr_offset); + me->queueid = ringid; + if (ringid & NETMAP_SW_RING) { + me->begin = req.nr_numrings; + me->end = me->begin + 1; + } else if (ringid & NETMAP_HW_RING) { + me->begin = ringid & NETMAP_RING_MASK; + me->end = me->begin + 1; + } else { + me->begin = 0; + me->end = req.nr_numrings; + } + me->tx = NETMAP_TXRING(me->nifp, me->begin); + me->rx = NETMAP_RXRING(me->nifp, me->begin); + return (0); +error: + close(me->fd); + return -1; +} + + +static int +netmap_close(struct my_ring *me) +{ + D(""); + if (me->mem) + munmap(me->mem, me->memsize); + ioctl(me->fd, NIOCUNREGIF, NULL); + close(me->fd); + return (0); +} + + +/* + * move up to 'limit' pkts from rxring to txring swapping buffers. + */ +static int +process_rings(struct netmap_ring *rxring, struct netmap_ring *txring, + u_int limit, const char *msg) +{ + u_int j, k, m = 0; + + /* print a warning if any of the ring flags is set (e.g. 
NR_REINIT) */
+	if (rxring->flags || txring->flags)
+		D("%s rxflags %x txflags %x",
+			msg, rxring->flags, txring->flags);
+	j = rxring->cur; /* RX */
+	k = txring->cur; /* TX */
+	if (rxring->avail < limit)
+		limit = rxring->avail;
+	if (txring->avail < limit)
+		limit = txring->avail;
+	m = limit;
+	while (limit-- > 0) {
+		struct netmap_slot *rs = &rxring->slot[j];
+		struct netmap_slot *ts = &txring->slot[k];
+		uint32_t pkt;
+
+		/* swap packets */
+		if (ts->buf_idx < 2 || rs->buf_idx < 2) {
+			D("wrong index rx[%d] = %d  -> tx[%d] = %d",
+				j, rs->buf_idx, k, ts->buf_idx);
+			sleep(2);
+		}
+		pkt = ts->buf_idx;
+		ts->buf_idx = rs->buf_idx;
+		rs->buf_idx = pkt;
+
+		/* copy the packet length. */
+		if (rs->len < 14 || rs->len > 2048)
+			D("wrong len %d rx[%d] -> tx[%d]", rs->len, j, k);
+		else if (verbose > 1)
+			D("send len %d rx[%d] -> tx[%d]", rs->len, j, k);
+		ts->len = rs->len;
+
+		/* report the buffer change. */
+		ts->flags |= NS_BUF_CHANGED;
+		rs->flags |= NS_BUF_CHANGED;
+		j = NETMAP_RING_NEXT(rxring, j);
+		k = NETMAP_RING_NEXT(txring, k);
+	}
+	rxring->avail -= m;
+	txring->avail -= m;
+	rxring->cur = j;
+	txring->cur = k;
+	if (verbose && m > 0)
+		D("sent %d packets to %p", m, txring);
+
+	return (m);
+}
+
+/* move packets from src to destination */
+static int
+move(struct my_ring *src, struct my_ring *dst, u_int limit)
+{
+	struct netmap_ring *txring, *rxring;
+	u_int m = 0, si = src->begin, di = dst->begin;
+	const char *msg = (src->queueid & NETMAP_SW_RING) ?
+		"host->net" : "net->host";
+
+	while (si < src->end && di < dst->end) {
+		rxring = NETMAP_RXRING(src->nifp, si);
+		txring = NETMAP_TXRING(dst->nifp, di);
+		ND("txring %p rxring %p", txring, rxring);
+		if (rxring->avail == 0) {
+			si++;
+			continue;
+		}
+		if (txring->avail == 0) {
+			di++;
+			continue;
+		}
+		m += process_rings(rxring, txring, limit, msg);
+	}
+
+	return (m);
+}
+
+/*
+ * how many packets on this set of queues ?
+ */
+static int
+howmany(struct my_ring *me, int tx)
+{
+	u_int i, tot = 0;
+
+	ND("me %p begin %d end %d", me, me->begin, me->end);
+	for (i = me->begin; i < me->end; i++) {
+		struct netmap_ring *ring = tx ?
+			NETMAP_TXRING(me->nifp, i) : NETMAP_RXRING(me->nifp, i);
+		tot += ring->avail;
+	}
+	if (0 && verbose && tot && !tx)
+		D("ring %s %s %s has %d avail at %d",
+			me->ifname, tx ? "tx": "rx",
+			me->end > me->nifp->ni_num_queues ?
+				"host":"net",
+			tot, NETMAP_TXRING(me->nifp, me->begin)->cur);
+	return tot;
+}
+
+/*
+ * bridge [-v] if1 [if2]
+ *
+ * If only one name, or the two interfaces are the same,
+ * bridges userland and the adapter. Otherwise bridge
+ * two interfaces.
+ */
+int
+main(int argc, char **argv)
+{
+	struct pollfd pollfd[2];
+	int i;
+	u_int burst = 1024;
+	struct my_ring me[2];
+
+	fprintf(stderr, "%s %s built %s %s\n",
+		argv[0], version, __DATE__, __TIME__);
+
+	bzero(me, sizeof(me));
+
+	while (argc > 1 && !strcmp(argv[1], "-v")) {
+		verbose++;
+		argv++;
+		argc--;
+	}
+
+	if (argc < 2 || argc > 4) {
+		D("Usage: %s IFNAME1 [IFNAME2 [BURST]]", argv[0]);
+		return (1);
+	}
+
+	/* setup netmap interface #1. */
+	me[0].ifname = argv[1];
+	if (argc == 2 || !strcmp(argv[1], argv[2])) {
+		D("same interface, endpoint 0 goes to host");
+		i = NETMAP_SW_RING;
+		me[1].ifname = argv[1];
+	} else {
+		/* two different interfaces.
Take all rings on if1 */ + i = 0; // all hw rings + me[1].ifname = argv[2]; + } + if (netmap_open(me, i)) + return (1); + me[1].mem = me[0].mem; /* copy the pointer, so only one mmap */ + if (netmap_open(me+1, 0)) + return (1); + + /* if bridging two interfaces, set promisc mode */ + if (i != NETMAP_SW_RING) { + do_ioctl(me, SIOCGIFFLAGS); + if ((me[0].if_flags & IFF_UP) == 0) { + D("%s is down, bringing up...", me[0].ifname); + me[0].if_flags |= IFF_UP; + } + me[0].if_flags |= IFF_PPROMISC; + do_ioctl(me, SIOCSIFFLAGS); + + do_ioctl(me+1, SIOCGIFFLAGS); + me[1].if_flags |= IFF_PPROMISC; + do_ioctl(me+1, SIOCSIFFLAGS); + + /* also disable checksums etc. */ + do_ioctl(me, SIOCGIFCAP); + me[0].if_reqcap = me[0].if_curcap; + me[0].if_reqcap &= ~(IFCAP_HWCSUM | IFCAP_TSO | IFCAP_TOE); + do_ioctl(me+0, SIOCSIFCAP); + } + do_ioctl(me+1, SIOCGIFFLAGS); + if ((me[1].if_flags & IFF_UP) == 0) { + D("%s is down, bringing up...", me[1].ifname); + me[1].if_flags |= IFF_UP; + } + do_ioctl(me+1, SIOCSIFFLAGS); + + do_ioctl(me+1, SIOCGIFCAP); + me[1].if_reqcap = me[1].if_curcap; + me[1].if_reqcap &= ~(IFCAP_HWCSUM | IFCAP_TSO | IFCAP_TOE); + do_ioctl(me+1, SIOCSIFCAP); + if (argc > 3) + burst = atoi(argv[3]); /* packets burst size. */ + + /* setup poll(2) variables. */ + memset(pollfd, 0, sizeof(pollfd)); + for (i = 0; i < 2; i++) { + pollfd[i].fd = me[i].fd; + pollfd[i].events = (POLLIN); + } + + D("Wait 2 secs for link to come up..."); + sleep(2); + D("Ready to go, %s 0x%x/%d <-> %s 0x%x/%d.", + me[0].ifname, me[0].queueid, me[0].nifp->ni_num_queues, + me[1].ifname, me[1].queueid, me[1].nifp->ni_num_queues); + + /* main loop */ + signal(SIGINT, sigint_h); + while (!do_abort) { + int n0, n1, ret; + pollfd[0].events = pollfd[1].events = 0; + pollfd[0].revents = pollfd[1].revents = 0; + n0 = howmany(me, 0); + n1 = howmany(me + 1, 0); + if (n0) + pollfd[1].events |= POLLOUT; + else + pollfd[0].events |= POLLIN; + if (n1) + pollfd[0].events |= POLLOUT; + else + pollfd[1].events |= POLLIN; + ret = poll(pollfd, 2, 2500); + if (ret <= 0 || verbose) + D("poll %s [0] ev %x %x rx %d@%d tx %d," + " [1] ev %x %x rx %d@%d tx %d", + ret <= 0 ? 
"timeout" : "ok", + pollfd[0].events, + pollfd[0].revents, + howmany(me, 0), + me[0].rx->cur, + howmany(me, 1), + pollfd[1].events, + pollfd[1].revents, + howmany(me+1, 0), + me[1].rx->cur, + howmany(me+1, 1) + ); + if (ret < 0) + continue; + if (pollfd[0].revents & POLLERR) { + D("error on fd0, rxcur %d@%d", + me[0].rx->avail, me[0].rx->cur); + } + if (pollfd[1].revents & POLLERR) { + D("error on fd1, rxcur %d@%d", + me[1].rx->avail, me[1].rx->cur); + } + if (pollfd[0].revents & POLLOUT) { + move(me + 1, me, burst); + // XXX we don't need the ioctl */ + // ioctl(me[0].fd, NIOCTXSYNC, NULL); + } + if (pollfd[1].revents & POLLOUT) { + move(me, me + 1, burst); + // XXX we don't need the ioctl */ + // ioctl(me[1].fd, NIOCTXSYNC, NULL); + } + } + D("exiting"); + netmap_close(me + 1); + netmap_close(me + 0); + + return (0); +} diff --git a/tools/tools/netmap/click-test.cfg b/tools/tools/netmap/click-test.cfg new file mode 100644 index 000000000000..fc5759f88b1e --- /dev/null +++ b/tools/tools/netmap/click-test.cfg @@ -0,0 +1,19 @@ +// +// $FreeBSD$ +// +// A sample test configuration for click +// +// +// create a switch + +myswitch :: EtherSwitch; + +// two input devices + +c0 :: FromDevice(ix0, PROMISC true); +c1 :: FromDevice(ix1, PROMISC true); + +// and now pass packets around + +c0[0] -> [0]sw[0] -> Queue(10000) -> ToDevice(ix0); +c1[0] -> [1]sw[1] -> Queue(10000) -> ToDevice(ix1); diff --git a/tools/tools/netmap/pcap.c b/tools/tools/netmap/pcap.c new file mode 100644 index 000000000000..f010b839bfb2 --- /dev/null +++ b/tools/tools/netmap/pcap.c @@ -0,0 +1,761 @@ +/* + * (C) 2011 Luigi Rizzo + * + * BSD license + * + * A simple library that maps some pcap functions onto netmap + * This is not 100% complete but enough to let tcpdump, trafshow + * and other apps work. + * + * $FreeBSD$ + */ + +#include +#include /* signal */ +#include +#include +#include /* strcmp */ +#include /* open */ +#include /* close */ + +#include /* le64toh */ +#include /* PROT_* */ +#include /* ioctl */ +#include +#include +#include /* sockaddr.. */ +#include /* ntohs */ + +#include /* ifreq */ +#include +#include +#include + +#include /* sockaddr_in */ + +#include +#include + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +char *version = "$Id$"; +int verbose = 0; + +/* debug support */ +#define ND(format, ...) do {} while (0) +#define D(format, ...) do { \ + if (verbose) \ + fprintf(stderr, "--- %s [%d] " format "\n", \ + __FUNCTION__, __LINE__, ##__VA_ARGS__); \ + } while (0) + + +/* + * We redefine here a number of structures that are in pcap.h + * so we can compile this file without the system header. + */ +#ifndef PCAP_ERRBUF_SIZE +#define PCAP_ERRBUF_SIZE 128 + +/* + * Each packet is accompanied by a header including the timestamp, + * captured size and actual size. + */ +struct pcap_pkthdr { + struct timeval ts; /* time stamp */ + uint32_t caplen; /* length of portion present */ + uint32_t len; /* length this packet (off wire) */ +}; + +typedef struct pcap_if pcap_if_t; + +/* + * Representation of an interface address. + */ +struct pcap_addr { + struct pcap_addr *next; + struct sockaddr *addr; /* address */ + struct sockaddr *netmask; /* netmask for the above */ + struct sockaddr *broadaddr; /* broadcast addr for the above */ + struct sockaddr *dstaddr; /* P2P dest. 
address for the above */ +}; + +struct pcap_if { + struct pcap_if *next; + char *name; /* name to hand to "pcap_open_live()" */ + char *description; /* textual description of interface, or NULL */ + struct pcap_addr *addresses; + uint32_t flags; /* PCAP_IF_ interface flags */ +}; + +/* + * We do not support stats (yet) + */ +struct pcap_stat { + u_int ps_recv; /* number of packets received */ + u_int ps_drop; /* number of packets dropped */ + u_int ps_ifdrop; /* drops by interface XXX not yet supported */ +#ifdef WIN32 + u_int bs_capt; /* number of packets that reach the app. */ +#endif /* WIN32 */ +}; + +typedef void pcap_t; +typedef enum { + PCAP_D_INOUT = 0, + PCAP_D_IN, + PCAP_D_OUT +} pcap_direction_t; + + + +typedef void (*pcap_handler)(u_char *user, + const struct pcap_pkthdr *h, const u_char *bytes); + +char errbuf[PCAP_ERRBUF_SIZE]; + +pcap_t *pcap_open_live(const char *device, int snaplen, + int promisc, int to_ms, char *errbuf); + +int pcap_findalldevs(pcap_if_t **alldevsp, char *errbuf); +void pcap_close(pcap_t *p); +int pcap_get_selectable_fd(pcap_t *p); +int pcap_dispatch(pcap_t *p, int cnt, pcap_handler callback, u_char *user); +int pcap_setnonblock(pcap_t *p, int nonblock, char *errbuf); +int pcap_setdirection(pcap_t *p, pcap_direction_t d); +char *pcap_lookupdev(char *errbuf); +int pcap_inject(pcap_t *p, const void *buf, size_t size); +int pcap_fileno(pcap_t *p); + +struct eproto { + const char *s; + u_short p; +}; +#endif /* !PCAP_ERRBUF_SIZE */ + +#ifdef __PIC__ +/* + * build as a shared library + */ + +char pcap_version[] = "libnetmap version 0.3"; + +/* + * Our equivalent of pcap_t + */ +struct my_ring { + struct nmreq nmr; + + int fd; + char *mem; /* userspace mmap address */ + u_int memsize; + u_int queueid; + u_int begin, end; /* first..last+1 rings to check */ + struct netmap_if *nifp; + + int snaplen; + char *errbuf; + int promisc; + int to_ms; + + struct pcap_pkthdr hdr; + + uint32_t if_flags; + uint32_t if_reqcap; + uint32_t if_curcap; + + struct pcap_stat st; + + char msg[PCAP_ERRBUF_SIZE]; +}; + + +static int +do_ioctl(struct my_ring *me, int what) +{ + struct ifreq ifr; + int error; + + bzero(&ifr, sizeof(ifr)); + strncpy(ifr.ifr_name, me->nmr.nr_name, sizeof(ifr.ifr_name)); + switch (what) { + case SIOCSIFFLAGS: + D("call SIOCSIFFLAGS 0x%x", me->if_flags); + ifr.ifr_flagshigh = (me->if_flags >> 16) & 0xffff; + ifr.ifr_flags = me->if_flags & 0xffff; + break; + case SIOCSIFCAP: + ifr.ifr_reqcap = me->if_reqcap; + ifr.ifr_curcap = me->if_curcap; + break; + } + error = ioctl(me->fd, what, &ifr); + if (error) { + D("ioctl 0x%x error %d", what, error); + return error; + } + switch (what) { + case SIOCSIFFLAGS: + case SIOCGIFFLAGS: + me->if_flags = (ifr.ifr_flagshigh << 16) | + (0xffff & ifr.ifr_flags); + D("flags are L 0x%x H 0x%x 0x%x", + (uint16_t)ifr.ifr_flags, + (uint16_t)ifr.ifr_flagshigh, me->if_flags); + break; + + case SIOCGIFCAP: + me->if_reqcap = ifr.ifr_reqcap; + me->if_curcap = ifr.ifr_curcap; + D("curcap are 0x%x", me->if_curcap); + break; + } + return 0; +} + + +/* + * open a device. if me->mem is null then do an mmap. 
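+ *
+ * (For reference, the sequence below is: open("/dev/netmap"),
+ * NIOCGINFO to learn the size of the shared region, NIOCREGIF to
+ * bind the file descriptor to the interface, then mmap() of the
+ * region unless a mapping already exists.)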
+ */ +static int +netmap_open(struct my_ring *me, int ringid) +{ + int fd, err, l; + u_int i; + struct nmreq req; + + me->fd = fd = open("/dev/netmap", O_RDWR); + if (fd < 0) { + D("Unable to open /dev/netmap"); + return (-1); + } + bzero(&req, sizeof(req)); + strncpy(req.nr_name, me->nmr.nr_name, sizeof(req.nr_name)); + req.nr_ringid = ringid; + err = ioctl(fd, NIOCGINFO, &req); + if (err) { + D("cannot get info on %s", me->nmr.nr_name); + goto error; + } + me->memsize = l = req.nr_memsize; + ND("memsize is %d MB", l>>20); + err = ioctl(fd, NIOCREGIF, &req); + if (err) { + D("Unable to register %s", me->nmr.nr_name); + goto error; + } + + if (me->mem == NULL) { + me->mem = mmap(0, l, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0); + if (me->mem == MAP_FAILED) { + D("Unable to mmap"); + me->mem = NULL; + goto error; + } + } + + me->nifp = NETMAP_IF(me->mem, req.nr_offset); + me->queueid = ringid; + if (ringid & NETMAP_SW_RING) { + me->begin = req.nr_numrings; + me->end = me->begin + 1; + } else if (ringid & NETMAP_HW_RING) { + me->begin = ringid & NETMAP_RING_MASK; + me->end = me->begin + 1; + } else { + me->begin = 0; + me->end = req.nr_numrings; + } + /* request timestamps for packets */ + for (i = me->begin; i < me->end; i++) { + struct netmap_ring *ring = NETMAP_RXRING(me->nifp, i); + ring->flags = NR_TIMESTAMP; + } + //me->tx = NETMAP_TXRING(me->nifp, 0); + return (0); +error: + close(me->fd); + return -1; +} + +/* + * There is a set of functions that tcpdump expects even if probably + * not used + */ +struct eproto eproto_db[] = { + { "ip", ETHERTYPE_IP }, + { "arp", ETHERTYPE_ARP }, + { (char *)0, 0 } +}; + + +int +pcap_findalldevs(pcap_if_t **alldevsp, __unused char *errbuf) +{ + struct ifaddrs *i_head, *i; + pcap_if_t *top = NULL, *cur; + struct pcap_addr *tail = NULL; + int l; + + D("listing all devs"); + *alldevsp = NULL; + i_head = NULL; + + if (getifaddrs(&i_head)) { + D("cannot get if addresses"); + return -1; + } + for (i = i_head; i; i = i->ifa_next) { + //struct ifaddrs *ifa; + struct pcap_addr *pca; + //struct sockaddr *sa; + + D("got interface %s", i->ifa_name); + if (!top || strcmp(top->name, i->ifa_name)) { + /* new interface */ + l = sizeof(*top) + strlen(i->ifa_name) + 1; + cur = calloc(1, l); + if (cur == NULL) { + D("no space for if descriptor"); + continue; + } + cur->name = (char *)(cur + 1); + //cur->flags = i->ifa_flags; + strcpy(cur->name, i->ifa_name); + cur->description = NULL; + cur->next = top; + top = cur; + tail = NULL; + } + /* now deal with addresses */ + D("%s addr family %d len %d %s %s", + top->name, + i->ifa_addr->sa_family, i->ifa_addr->sa_len, + i->ifa_netmask ? "Netmask" : "", + i->ifa_broadaddr ? "Broadcast" : ""); + l = sizeof(struct pcap_addr) + + (i->ifa_addr ? i->ifa_addr->sa_len:0) + + (i->ifa_netmask ? i->ifa_netmask->sa_len:0) + + (i->ifa_broadaddr? 
i->ifa_broadaddr->sa_len:0); + pca = calloc(1, l); + if (pca == NULL) { + D("no space for if addr"); + continue; + } +#define SA_NEXT(x) ((struct sockaddr *)((char *)(x) + (x)->sa_len)) + pca->addr = (struct sockaddr *)(pca + 1); + bcopy(i->ifa_addr, pca->addr, i->ifa_addr->sa_len); + if (i->ifa_netmask) { + pca->netmask = SA_NEXT(pca->addr); + bcopy(i->ifa_netmask, pca->netmask, i->ifa_netmask->sa_len); + if (i->ifa_broadaddr) { + pca->broadaddr = SA_NEXT(pca->netmask); + bcopy(i->ifa_broadaddr, pca->broadaddr, i->ifa_broadaddr->sa_len); + } + } + if (tail == NULL) { + top->addresses = pca; + } else { + tail->next = pca; + } + tail = pca; + + } + freeifaddrs(i_head); + *alldevsp = top; + return 0; +} + +void pcap_freealldevs(__unused pcap_if_t *alldevs) +{ + D("unimplemented"); +} + +char * +pcap_lookupdev(char *buf) +{ + D("%s", buf); + strcpy(buf, "/dev/netmap"); + return buf; +} + +pcap_t * +pcap_create(const char *source, char *errbuf) +{ + D("src %s (call open liveted)", source); + return pcap_open_live(source, 0, 1, 100, errbuf); +} + +int +pcap_activate(pcap_t *p) +{ + D("pcap %p running", p); + return 0; +} + +int +pcap_can_set_rfmon(__unused pcap_t *p) +{ + D(""); + return 0; /* no we can't */ +} + +int +pcap_set_snaplen(pcap_t *p, int snaplen) +{ + struct my_ring *me = p; + + D("len %d", snaplen); + me->snaplen = snaplen; + return 0; +} + +int +pcap_snapshot(pcap_t *p) +{ + struct my_ring *me = p; + + D("len %d", me->snaplen); + return me->snaplen; +} + +int +pcap_lookupnet(const char *device, uint32_t *netp, + uint32_t *maskp, __unused char *errbuf) +{ + + D("device %s", device); + inet_aton("10.0.0.255", (struct in_addr *)netp); + inet_aton("255.255.255.0",(struct in_addr *) maskp); + return 0; +} + +int +pcap_set_promisc(pcap_t *p, int promisc) +{ + struct my_ring *me = p; + + D("promisc %d", promisc); + if (do_ioctl(me, SIOCGIFFLAGS)) + D("SIOCGIFFLAGS failed"); + if (promisc) { + me->if_flags |= IFF_PPROMISC; + } else { + me->if_flags &= ~IFF_PPROMISC; + } + if (do_ioctl(me, SIOCSIFFLAGS)) + D("SIOCSIFFLAGS failed"); + return 0; +} + +int +pcap_set_timeout(pcap_t *p, int to_ms) +{ + struct my_ring *me = p; + + D("%d ms", to_ms); + me->to_ms = to_ms; + return 0; +} + +struct bpf_program; + +int +pcap_compile(__unused pcap_t *p, __unused struct bpf_program *fp, + const char *str, __unused int optimize, __unused uint32_t netmask) +{ + D("%s", str); + return 0; +} + +int +pcap_setfilter(__unused pcap_t *p, __unused struct bpf_program *fp) +{ + D(""); + return 0; +} + +int +pcap_datalink(__unused pcap_t *p) +{ + D(""); + return 1; // ethernet +} + +const char * +pcap_datalink_val_to_name(int dlt) +{ + D("%d", dlt); + return "DLT_EN10MB"; +} + +const char * +pcap_datalink_val_to_description(int dlt) +{ + D("%d", dlt); + return "Ethernet link"; +} + +struct pcap_stat; +int +pcap_stats(pcap_t *p, struct pcap_stat *ps) +{ + struct my_ring *me = p; + ND(""); + + me->st.ps_recv += 10; + *ps = me->st; + sprintf(me->msg, "stats not supported"); + return -1; +}; + +char * +pcap_geterr(pcap_t *p) +{ + struct my_ring *me = p; + + D(""); + return me->msg; +} + +pcap_t * +pcap_open_live(const char *device, __unused int snaplen, + int promisc, int to_ms, __unused char *errbuf) +{ + struct my_ring *me; + + D("request to open %s", device); + me = calloc(1, sizeof(*me)); + if (me == NULL) { + D("failed to allocate struct for %s", device); + return NULL; + } + strncpy(me->nmr.nr_name, device, sizeof(me->nmr.nr_name)); + if (netmap_open(me, 0)) { + D("error opening %s", device); + free(me); + 
return NULL; + } + me->to_ms = to_ms; + if (do_ioctl(me, SIOCGIFFLAGS)) + D("SIOCGIFFLAGS failed"); + if (promisc) { + me->if_flags |= IFF_PPROMISC; + if (do_ioctl(me, SIOCSIFFLAGS)) + D("SIOCSIFFLAGS failed"); + } + if (do_ioctl(me, SIOCGIFCAP)) + D("SIOCGIFCAP failed"); + me->if_reqcap &= ~(IFCAP_HWCSUM | IFCAP_TSO | IFCAP_TOE); + if (do_ioctl(me, SIOCSIFCAP)) + D("SIOCSIFCAP failed"); + + return (pcap_t *)me; +} + +void +pcap_close(pcap_t *p) +{ + struct my_ring *me = p; + + D(""); + if (!me) + return; + if (me->mem) + munmap(me->mem, me->memsize); + /* restore original flags ? */ + ioctl(me->fd, NIOCUNREGIF, NULL); + close(me->fd); + bzero(me, sizeof(*me)); + free(me); +} + +int +pcap_fileno(pcap_t *p) +{ + struct my_ring *me = p; + D("returns %d", me->fd); + return me->fd; +} + +int +pcap_get_selectable_fd(pcap_t *p) +{ + struct my_ring *me = p; + + ND(""); + return me->fd; +} + +int +pcap_setnonblock(__unused pcap_t *p, int nonblock, __unused char *errbuf) +{ + D("mode is %d", nonblock); + return 0; /* ignore */ +} + +int +pcap_setdirection(__unused pcap_t *p, __unused pcap_direction_t d) +{ + D(""); + return 0; /* ignore */ +}; + +int +pcap_dispatch(pcap_t *p, int cnt, pcap_handler callback, u_char *user) +{ + struct my_ring *me = p; + int got = 0; + u_int si; + + ND("cnt %d", cnt); + /* scan all rings */ + for (si = me->begin; si < me->end; si++) { + struct netmap_ring *ring = NETMAP_RXRING(me->nifp, si); + ND("ring has %d pkts", ring->avail); + if (ring->avail == 0) + continue; + me->hdr.ts = ring->ts; + while ((cnt == -1 || cnt != got) && ring->avail > 0) { + u_int i = ring->cur; + u_int idx = ring->slot[i].buf_idx; + if (idx < 2) { + D("%s bogus RX index %d at offset %d", + me->nifp->ni_name, idx, i); + sleep(2); + } + u_char *buf = (u_char *)NETMAP_BUF(ring, idx); + me->hdr.len = me->hdr.caplen = ring->slot[i].len; + // D("call %p len %d", p, me->hdr.len); + callback(user, &me->hdr, buf); + ring->cur = NETMAP_RING_NEXT(ring, i); + ring->avail--; + got++; + } + } + return got; +} + +int +pcap_inject(pcap_t *p, const void *buf, size_t size) +{ + struct my_ring *me = p; + u_int si; + + ND("cnt %d", cnt); + /* scan all rings */ + for (si = me->begin; si < me->end; si++) { + struct netmap_ring *ring = NETMAP_TXRING(me->nifp, si); + + ND("ring has %d pkts", ring->avail); + if (ring->avail == 0) + continue; + u_int i = ring->cur; + u_int idx = ring->slot[i].buf_idx; + if (idx < 2) { + D("%s bogus TX index %d at offset %d", + me->nifp->ni_name, idx, i); + sleep(2); + } + u_char *dst = (u_char *)NETMAP_BUF(ring, idx); + ring->slot[i].len = size; + bcopy(buf, dst, size); + ring->cur = NETMAP_RING_NEXT(ring, i); + ring->avail--; + // if (ring->avail == 0) ioctl(me->fd, NIOCTXSYNC, NULL); + return size; + } + errno = ENOBUFS; + return -1; +} + +int +pcap_loop(pcap_t *p, int cnt, pcap_handler callback, u_char *user) +{ + struct my_ring *me = p; + struct pollfd fds[1]; + int i; + + ND("cnt %d", cnt); + memset(fds, 0, sizeof(fds)); + fds[0].fd = me->fd; + fds[0].events = (POLLIN); + + while (cnt == -1 || cnt > 0) { + if (poll(fds, 1, me->to_ms) <= 0) { + D("poll error/timeout"); + continue; + } + i = pcap_dispatch(p, cnt, callback, user); + if (cnt > 0) + cnt -= i; + } + return 0; +} + +#endif /* __PIC__ */ + +#ifndef __PIC__ +void do_send(u_char *user, const struct pcap_pkthdr *h, const u_char *buf) +{ + pcap_inject((pcap_t *)user, buf, h->caplen); +} + +/* + * a simple pcap test program, bridge between two interfaces. 
+ */ +int +main(int argc, char **argv) +{ + pcap_t *p0, *p1; + int burst = 1024; + struct pollfd pollfd[2]; + + fprintf(stderr, "%s %s built %s %s\n", + argv[0], version, __DATE__, __TIME__); + + while (argc > 1 && !strcmp(argv[1], "-v")) { + verbose++; + argv++; + argc--; + } + + if (argc < 3 || argc > 4 || !strcmp(argv[1], argv[2])) { + D("Usage: %s IFNAME1 IFNAME2 [BURST]", argv[0]); + return (1); + } + if (argc > 3) + burst = atoi(argv[3]); + + p0 = pcap_open_live(argv[1], 0, 1, 100, NULL); + p1 = pcap_open_live(argv[2], 0, 1, 100, NULL); + D("%s", version); + D("open returns %p %p", p0, p1); + if (!p0 || !p1) + return(1); + bzero(pollfd, sizeof(pollfd)); + pollfd[0].fd = pcap_fileno(p0); + pollfd[1].fd = pcap_fileno(p1); + pollfd[0].events = pollfd[1].events = POLLIN; + for (;;) { + /* do i need to reset ? */ + pollfd[0].revents = pollfd[1].revents = 0; + int ret = poll(pollfd, 2, 1000); + if (ret <= 0 || verbose) + D("poll %s [0] ev %x %x [1] ev %x %x", + ret <= 0 ? "timeout" : "ok", + pollfd[0].events, + pollfd[0].revents, + pollfd[1].events, + pollfd[1].revents); + if (ret < 0) + continue; + if (pollfd[0].revents & POLLIN) + pcap_dispatch(p0, burst, do_send, p1); + if (pollfd[1].revents & POLLIN) + pcap_dispatch(p1, burst, do_send, p0); + } + + return (0); +} +#endif /* !__PIC__ */ diff --git a/tools/tools/netmap/pkt-gen.c b/tools/tools/netmap/pkt-gen.c new file mode 100644 index 000000000000..747bd9dde00b --- /dev/null +++ b/tools/tools/netmap/pkt-gen.c @@ -0,0 +1,1021 @@ +/* + * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + * $Id: pkt-gen.c 9638 2011-11-07 18:07:43Z luigi $ + * + * Example program to show how to build a multithreaded packet + * source/sink using the netmap device. + * + * In this example we create a programmable number of threads + * to take care of all the queues of the interface used to + * send or receive traffic. 
+ *
+ */
+
+const char *default_payload="netmap pkt-gen Luigi Rizzo and Matteo Landi\n"
+	"http://info.iet.unipi.it/~luigi/netmap/ ";
+
+#include <errno.h>
+#include <pthread.h>	/* pthread_* */
+#include <pthread_np.h>	/* pthread w/ affinity */
+#include <signal.h>	/* signal */
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>	/* strcmp */
+#include <fcntl.h>	/* open */
+#include <unistd.h>	/* close */
+#include <ifaddrs.h>	/* getifaddrs */
+
+#include <sys/mman.h>	/* PROT_* */
+#include <sys/ioctl.h>	/* ioctl */
+#include <sys/poll.h>
+#include <sys/socket.h>	/* sockaddr.. */
+#include <arpa/inet.h>	/* ntohs */
+#include <sys/param.h>
+#include <sys/cpuset.h>	/* cpu_set */
+#include <sys/sysctl.h>	/* sysctl */
+#include <sys/time.h>	/* timersub */
+
+#include <net/ethernet.h>
+#include <net/if.h>	/* ifreq */
+#include <net/if_dl.h>	/* LLADDR */
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/udp.h>
+
+#include <net/netmap.h>
+#include <net/netmap_user.h>
+#include <pcap.h>
+
+
+static inline int min(int a, int b) { return a < b ? a : b; }
+
+/* debug support */
+#define D(format, ...)				\
+	fprintf(stderr, "%s [%d] " format "\n",	\
+	    __FUNCTION__, __LINE__, ##__VA_ARGS__)
+
+#ifndef EXPERIMENTAL
+#define EXPERIMENTAL 0
+#endif
+
+int verbose = 0;
+#define MAX_QUEUES 64	/* no need to limit */
+
+#define SKIP_PAYLOAD 1	/* do not check payload. */
+
+#if EXPERIMENTAL
+/* Wrapper around `rdtsc' to take reliable timestamps flushing the pipeline */
+#define netmap_rdtsc(t)				\
+	do {					\
+		u_int __regs[4];		\
+						\
+		do_cpuid(0, __regs);		\
+		(t) = rdtsc();			\
+	} while (0)
+
+static __inline void
+do_cpuid(u_int ax, u_int *p)
+{
+	__asm __volatile("cpuid"
+	    : "=a" (p[0]), "=b" (p[1]), "=c" (p[2]), "=d" (p[3])
+	    : "0" (ax));
+}
+
+static __inline uint64_t
+rdtsc(void)
+{
+	uint64_t rv;
+
+	__asm __volatile("rdtsc" : "=A" (rv));
+	return (rv);
+}
+#define MAX_SAMPLES 100000
+#endif /* EXPERIMENTAL */
+
+
+struct pkt {
+	struct ether_header eh;
+	struct ip ip;
+	struct udphdr udp;
+	uint8_t body[NETMAP_BUF_SIZE];
+} __attribute__((__packed__));
+
+/*
+ * global arguments for all threads
+ */
+struct glob_arg {
+	const char *src_ip;
+	const char *dst_ip;
+	const char *src_mac;
+	const char *dst_mac;
+	int pkt_size;
+	int burst;
+	int npackets;	/* total packets to send */
+	int nthreads;
+	int cpus;
+	int use_pcap;
+	pcap_t *p;
+};
+
+struct mystat {
+	uint64_t containers[8];
+};
+
+/*
+ * Arguments for a new thread. The same structure is used by
+ * the source and the sink
+ */
+struct targ {
+	struct glob_arg *g;
+	int used;
+	int completed;
+	int fd;
+	struct nmreq nmr;
+	struct netmap_if *nifp;
+	uint16_t qfirst, qlast;	/* range of queues to scan */
+	uint64_t count;
+	struct timeval tic, toc;
+	int me;
+	pthread_t thread;
+	int affinity;
+
+	uint8_t dst_mac[6];
+	uint8_t src_mac[6];
+	u_int dst_mac_range;
+	u_int src_mac_range;
+	uint32_t dst_ip;
+	uint32_t src_ip;
+	u_int dst_ip_range;
+	u_int src_ip_range;
+
+	struct pkt pkt;
+};
+
+
+static struct targ *targs;
+static int global_nthreads;
+
+/* control-C handler */
+static void
+sigint_h(__unused int sig)
+{
+	for (int i = 0; i < global_nthreads; i++) {
+		/* cancel active threads. */
+		if (targs[i].used == 0)
+			continue;
+
+		D("Cancelling thread #%d", i);
+		pthread_cancel(targs[i].thread);
+		targs[i].used = 0;
+	}
+
+	signal(SIGINT, SIG_DFL);
+}
+
+
+/* sysctl wrapper to return the number of active CPUs */
+static int
+system_ncpus(void)
+{
+	int mib[2], ncpus;
+	size_t len;
+
+	mib[0] = CTL_HW;
+	mib[1] = HW_NCPU;
+	len = sizeof(ncpus);
+	sysctl(mib, 2, &ncpus, &len, NULL, 0);
+
+	return (ncpus);
+}
+
+/*
+ * locate the src mac address for our interface, put it
+ * into the user-supplied buffer. return 0 if ok, -1 on error.
+ */
+static int
+source_hwaddr(const char *ifname, char *buf)
+{
+	struct ifaddrs *ifaphead, *ifap;
+	int l = sizeof(ifap->ifa_name);
+
+	if (getifaddrs(&ifaphead) != 0) {
+		D("getifaddrs %s failed", ifname);
+		return (-1);
+	}
+
+	for (ifap = ifaphead; ifap; ifap = ifap->ifa_next) {
+		struct sockaddr_dl *sdl =
+			(struct sockaddr_dl *)ifap->ifa_addr;
+		uint8_t *mac;
+
+		if (!sdl || sdl->sdl_family != AF_LINK)
+			continue;
+		if (strncmp(ifap->ifa_name, ifname, l) != 0)
+			continue;
+		mac = (uint8_t *)LLADDR(sdl);
+		sprintf(buf, "%02x:%02x:%02x:%02x:%02x:%02x",
+		    mac[0], mac[1], mac[2],
+		    mac[3], mac[4], mac[5]);
+		if (verbose)
+			D("source hwaddr %s", buf);
+		break;
+	}
+	freeifaddrs(ifaphead);
+	return ifap ? 0 : -1;
+}
+
+
+/* set the thread affinity. */
+static int
+setaffinity(pthread_t me, int i)
+{
+	cpuset_t cpumask;
+
+	if (i == -1)
+		return 0;
+
+	/* Set thread affinity. */
+	CPU_ZERO(&cpumask);
+	CPU_SET(i, &cpumask);
+
+	if (pthread_setaffinity_np(me, sizeof(cpuset_t), &cpumask) != 0) {
+		D("Unable to set affinity");
+		return 1;
+	}
+	return 0;
+}
+
+/* Compute the checksum of the given ip header. */
+static uint16_t
+checksum(const void *data, uint16_t len)
+{
+	const uint8_t *addr = data;
+	uint32_t sum = 0;
+
+	while (len > 1) {
+		sum += addr[0] * 256 + addr[1];
+		addr += 2;
+		len -= 2;
+	}
+
+	if (len == 1)
+		sum += *addr * 256;
+
+	sum = (sum >> 16) + (sum & 0xffff);
+	sum += (sum >> 16);
+
+	sum = htons(sum);
+
+	return ~sum;
+}
+
+/*
+ * Fill a packet with some payload.
+ */
+static void
+initialize_packet(struct targ *targ)
+{
+	struct pkt *pkt = &targ->pkt;
+	struct ether_header *eh;
+	struct ip *ip;
+	struct udphdr *udp;
+	uint16_t paylen = targ->g->pkt_size - sizeof(*eh) - sizeof(*ip);
+	int i, l, l0 = strlen(default_payload);
+	char *p;
+
+	for (i = 0; i < paylen;) {
+		l = min(l0, paylen - i);
+		bcopy(default_payload, pkt->body + i, l);
+		i += l;
+	}
+	pkt->body[i-1] = '\0';
+
+	udp = &pkt->udp;
+	udp->uh_sport = htons(1234);
+	udp->uh_dport = htons(4321);
+	udp->uh_ulen = htons(paylen);
+	udp->uh_sum = 0; // checksum(udp, sizeof(*udp));
+
+	ip = &pkt->ip;
+	ip->ip_v = IPVERSION;
+	ip->ip_hl = 5;
+	ip->ip_id = 0;
+	ip->ip_tos = IPTOS_LOWDELAY;
+	ip->ip_len = ntohs(targ->g->pkt_size - sizeof(*eh));
+	ip->ip_id = 0;
+	ip->ip_off = htons(IP_DF); /* Don't fragment */
+	ip->ip_ttl = IPDEFTTL;
+	ip->ip_p = IPPROTO_UDP;
+	inet_aton(targ->g->src_ip, (struct in_addr *)&ip->ip_src);
+	inet_aton(targ->g->dst_ip, (struct in_addr *)&ip->ip_dst);
+	targ->dst_ip = ip->ip_dst.s_addr;
+	targ->src_ip = ip->ip_src.s_addr;
+	p = index(targ->g->dst_ip, '-');
+	if (p) {
+		targ->dst_ip_range = atoi(p+1);
+		D("dst-ip sweep %d addresses", targ->dst_ip_range);
+	}
+	ip->ip_sum = checksum(ip, sizeof(*ip));
+
+	eh = &pkt->eh;
+	bcopy(ether_aton(targ->g->src_mac), targ->src_mac, 6);
+	bcopy(targ->src_mac, eh->ether_shost, 6);
+	p = index(targ->g->src_mac, '-');
+	if (p)
+		targ->src_mac_range = atoi(p+1);
+
+	bcopy(ether_aton(targ->g->dst_mac), targ->dst_mac, 6);
+	bcopy(targ->dst_mac, eh->ether_dhost, 6);
+	p = index(targ->g->dst_mac, '-');
+	if (p)
+		targ->dst_mac_range = atoi(p+1);
+	eh->ether_type = htons(ETHERTYPE_IP);
+}
+
+/* Check the payload of the packet for errors (use it for debug).
+ * Look for consecutive ascii representations of the size of the packet.
+ */
+static void
+check_payload(char *p, int psize)
+{
+	char temp[64];
+	int n_read, size, sizelen;
+
+	/* get the length in ASCII of the length of the packet. */
+	sizelen = sprintf(temp, "%d", psize) + 1;	// include a whitespace
+
+	/* dummy payload. */
+	p += 14;	/* skip packet header. */
+	n_read = 14;
+	while (psize - n_read >= sizelen) {
+		sscanf(p, "%d", &size);
+		if (size != psize) {
+			D("Read %d instead of %d", size, psize);
+			break;
+		}
+
+		p += sizelen;
+		n_read += sizelen;
+	}
+}
+
+
+/*
+ * create and enqueue a batch of packets on a ring.
+ * On the last one set NS_REPORT to tell the driver to generate
+ * an interrupt when done.
+ */
+static int
+send_packets(struct netmap_ring *ring, struct pkt *pkt,
+	int size, u_int count, int fill_all)
+{
+	u_int sent, cur = ring->cur;
+
+	if (ring->avail < count)
+		count = ring->avail;
+
+	for (sent = 0; sent < count; sent++) {
+		struct netmap_slot *slot = &ring->slot[cur];
+		char *p = NETMAP_BUF(ring, slot->buf_idx);
+
+		if (fill_all)
+			memcpy(p, pkt, size);
+
+		slot->len = size;
+		if (sent == count - 1)
+			slot->flags |= NS_REPORT;
+		cur = NETMAP_RING_NEXT(ring, cur);
+	}
+	ring->avail -= sent;
+	ring->cur = cur;
+
+	return (sent);
+}
+
+static void *
+sender_body(void *data)
+{
+	struct targ *targ = (struct targ *) data;
+
+	struct pollfd fds[1];
+	struct netmap_if *nifp = targ->nifp;
+	struct netmap_ring *txring;
+	int i, n = targ->g->npackets / targ->g->nthreads, sent = 0;
+	int fill_all = 1;
+
+	if (setaffinity(targ->thread, targ->affinity))
+		goto quit;
+	/* setup poll(2) mechanism. */
+	memset(fds, 0, sizeof(fds));
+	fds[0].fd = targ->fd;
+	fds[0].events = (POLLOUT);
+
+	/* main loop.*/
+	gettimeofday(&targ->tic, NULL);
+	if (targ->g->use_pcap) {
+		int size = targ->g->pkt_size;
+		void *pkt = &targ->pkt;
+		pcap_t *p = targ->g->p;
+
+		for (; sent < n; sent++) {
+			if (pcap_inject(p, pkt, size) == -1)
+				break;
+		}
+	} else {
+		while (sent < n) {
+
+			/*
+			 * wait for available room in the send queue(s)
+			 */
+			if (poll(fds, 1, 2000) <= 0) {
+				D("poll error/timeout on queue %d", targ->me);
+				goto quit;
+			}
+			/*
+			 * scan our queues and send on those with room
+			 */
+			if (sent > 100000)
+				fill_all = 0;
+			for (i = targ->qfirst; i < targ->qlast; i++) {
+				int m, limit = MIN(n - sent, targ->g->burst);
+
+				txring = NETMAP_TXRING(nifp, i);
+				if (txring->avail == 0)
+					continue;
+				m = send_packets(txring, &targ->pkt, targ->g->pkt_size,
+				    limit, fill_all);
+				sent += m;
+				targ->count = sent;
+			}
+		}
+		/* Tell the interface that we have new packets. */
+		ioctl(fds[0].fd, NIOCTXSYNC, NULL);
+
+		/* final part: wait for all the TX queues to be empty. */
+		for (i = targ->qfirst; i < targ->qlast; i++) {
+			txring = NETMAP_TXRING(nifp, i);
+			while (!NETMAP_TX_RING_EMPTY(txring)) {
+				ioctl(fds[0].fd, NIOCTXSYNC, NULL);
+				usleep(1); /* wait 1 tick */
+			}
+		}
+	}
+
+	gettimeofday(&targ->toc, NULL);
+	targ->completed = 1;
+	targ->count = sent;
+
+quit:
+	/* reset the ``used`` flag. */
+	targ->used = 0;
+
+	return (NULL);
+}
+
+
+static void
+receive_pcap(u_char *user, __unused const struct pcap_pkthdr * h,
+	__unused const u_char * bytes)
+{
+	int *count = (int *)user;
+	(*count)++;
+}
+
+static int
+receive_packets(struct netmap_ring *ring, u_int limit, int skip_payload)
+{
+	u_int cur, rx;
+
+	cur = ring->cur;
+	if (ring->avail < limit)
+		limit = ring->avail;
+	for (rx = 0; rx < limit; rx++) {
+		struct netmap_slot *slot = &ring->slot[cur];
+		char *p = NETMAP_BUF(ring, slot->buf_idx);
+
+		if (!skip_payload)
+			check_payload(p, slot->len);
+
+		cur = NETMAP_RING_NEXT(ring, cur);
+	}
+	ring->avail -= rx;
+	ring->cur = cur;
+
+	return (rx);
+}
+
+static void *
+receiver_body(void *data)
+{
+	struct targ *targ = (struct targ *) data;
+	struct pollfd fds[1];
+	struct netmap_if *nifp = targ->nifp;
+	struct netmap_ring *rxring;
+	int i, received = 0;
+
+	if (setaffinity(targ->thread, targ->affinity))
+		goto quit;
+
+	/* setup poll(2) mechanism. */
+	memset(fds, 0, sizeof(fds));
+	fds[0].fd = targ->fd;
+	fds[0].events = (POLLIN);
+
+	/* unbounded wait for the first packet. */
+	for (;;) {
+		i = poll(fds, 1, 1000);
+		if (i > 0 && !(fds[0].revents & POLLERR))
+			break;
+		D("waiting for initial packets, poll returns %d %d", i, fds[0].revents);
+	}
+
+	/* main loop, exit after 1s silence */
+	gettimeofday(&targ->tic, NULL);
+	if (targ->g->use_pcap) {
+		for (;;) {
+			pcap_dispatch(targ->g->p, targ->g->burst, receive_pcap, NULL);
+		}
+	} else {
+		while (1) {
+			/* Once we started to receive packets, wait at most
+			 * one second before quitting. */
+			if (poll(fds, 1, 1 * 1000) <= 0) {
+				gettimeofday(&targ->toc, NULL);
+				targ->toc.tv_sec -= 1; /* Subtract timeout time. */
+				break;
+			}
+
+			for (i = targ->qfirst; i < targ->qlast; i++) {
+				int m;
+
+				rxring = NETMAP_RXRING(nifp, i);
+				if (rxring->avail == 0)
+					continue;
+
+				m = receive_packets(rxring, targ->g->burst,
+				    SKIP_PAYLOAD);
+				received += m;
+				targ->count = received;
+			}
+
+			// tell the card we have read the data
+			//ioctl(fds[0].fd, NIOCRXSYNC, NULL);
+		}
+	}
+
+	targ->completed = 1;
+	targ->count = received;
+
+quit:
+	/* reset the ``used`` flag. */
+	targ->used = 0;
+
+	return (NULL);
+}
+
+static void
+tx_output(uint64_t sent, int size, double delta)
+{
+	double amount = 8.0 * (1.0 * size * sent) / delta;
+	double pps = sent / delta;
+	char units[4] = { '\0', 'K', 'M', 'G' };
+	int aunit = 0, punit = 0;
+
+	while (amount >= 1000) {
+		amount /= 1000;
+		aunit += 1;
+	}
+	while (pps >= 1000) {
+		pps /= 1000;
+		punit += 1;
+	}
+
+	printf("Sent %llu packets, %d bytes each, in %.2f seconds.\n",
+	    (unsigned long long)sent, size, delta);
+	printf("Speed: %.2f%cpps. Bandwidth: %.2f%cbps.\n",
+	    pps, units[punit], amount, units[aunit]);
+}
+
+
+static void
+rx_output(uint64_t received, double delta)
+{
+	double pps = received / delta;
+	char units[4] = { '\0', 'K', 'M', 'G' };
+	int punit = 0;
+
+	while (pps >= 1000) {
+		pps /= 1000;
+		punit += 1;
+	}
+
+	printf("Received %llu packets, in %.2f seconds.\n",
+	    (unsigned long long)received, delta);
+	printf("Speed: %.2f%cpps.\n", pps, units[punit]);
+}
+
+static void
+usage(void)
+{
+	const char *cmd = "pkt-gen";
+	fprintf(stderr,
+	    "Usage:\n"
+	    "%s arguments\n"
+	    "\t-i interface		interface name\n"
+	    "\t-t pkts_to_send		also forces send mode\n"
+	    "\t-r pkts_to_receive	also forces receive mode\n"
+	    "\t-l pkt_size		in bytes excluding CRC\n"
+	    "\t-d dst-ip		end with %%n to sweep n addresses\n"
+	    "\t-s src-ip		end with %%n to sweep n addresses\n"
+	    "\t-D dst-mac		end with %%n to sweep n addresses\n"
+	    "\t-S src-mac		end with %%n to sweep n addresses\n"
+	    "\t-b burst size		testing, mostly\n"
+	    "\t-c cores		cores to use\n"
+	    "\t-p threads		processes/threads to use\n"
+	    "\t-P			use libpcap instead of netmap\n"
+	    "\t-T report_ms		milliseconds between reports\n"
+	    "\t-w wait_for_link_time	in seconds\n"
+	    "",
+	    cmd);
+
+	exit(0);
+}
+
+
+int
+main(int argc, char **argv)
+{
+	int i, fd;
+
+	struct glob_arg g;
+
+	struct nmreq nmr;
+	void *mmap_addr;	/* the mmap address */
+	void *(*td_body)(void *) = receiver_body;
+	int ch;
+	int report_interval = 1000;	/* report interval */
+	char *ifname = NULL;
+	int wait_link = 2;
+	int devqueues = 1;	/* how many device queues */
+
+	bzero(&g, sizeof(g));
+
+	g.src_ip = "10.0.0.1";
+	g.dst_ip = "10.1.0.1";
+	g.dst_mac = "ff:ff:ff:ff:ff:ff";
+	g.src_mac = NULL;
+	g.pkt_size = 60;
+	g.burst = 512;		// default
+	g.nthreads = 1;
+	g.cpus = 1;
+
+	while ( (ch = getopt(argc, argv,
+	    "i:t:r:l:d:s:D:S:b:c:p:PT:w:v")) != -1) {
+		switch(ch) {
+		default:
+			D("bad option %c %s", ch, optarg);
+			usage();
+			break;
+		case 'i':	/* interface */
+			ifname = optarg;
+			break;
+		case 't':	/* send */
+			td_body = sender_body;
+			g.npackets = atoi(optarg);
+			break;
+		case 'r':	/* receive */
+			td_body = receiver_body;
+			g.npackets = atoi(optarg);
+			break;
+		case 'l':	/* pkt_size */
+			g.pkt_size = atoi(optarg);
+			break;
+		case 'd':
+			g.dst_ip = optarg;
+			break;
+		case 's':
+			g.src_ip = optarg;
+			break;
+		case 'T':	/* report interval */
+			report_interval = atoi(optarg);
+			break;
+		case 'w':
+			wait_link = atoi(optarg);
+			break;
+		case 'b':	/* burst */
+			g.burst = atoi(optarg);
+			break;
+		case 'c':
+			g.cpus = atoi(optarg);
+			break;
+		case 'p':
+			g.nthreads = atoi(optarg);
+			break;
+
+		case 'P':
+			g.use_pcap = 1;
+			break;
+
+		case 'D':	/* destination mac */
+			g.dst_mac = optarg;
+			{
+				struct ether_addr *mac = ether_aton(g.dst_mac);
+				D("ether_aton(%s) gives %p", g.dst_mac, mac);
+			}
+			break;
+		case 'S':	/* source mac */
+			g.src_mac = optarg;
+			break;
+		case 'v':
+			verbose++;
+		}
+	}
+
+	if (ifname == NULL) {
+		D("missing ifname");
+		usage();
+	}
+	{
+		int n = system_ncpus();
+		if (g.cpus < 0 || g.cpus > n) {
+			D("%d cpus is too high, have only %d cpus", g.cpus, n);
+			usage();
+		}
+		if (g.cpus == 0)
+			g.cpus = n;
+	}
+	if (g.pkt_size < 16 || g.pkt_size > 1536) {
+		D("bad pktsize %d", g.pkt_size);
+		usage();
+	}
+
+	bzero(&nmr, sizeof(nmr));
+	/*
+	 * Open the netmap device to fetch the number of queues of our
+	 * interface.
+	 *
+	 * The first NIOCREGIF also detaches the card from the
+	 * protocol stack and may cause a reset of the card,
+	 * which in turn may take some time for the PHY to
+	 * reconfigure.
+ */ + fd = open("/dev/netmap", O_RDWR); + if (fd == -1) { + D("Unable to open /dev/netmap"); + // fail later + } else { + if ((ioctl(fd, NIOCGINFO, &nmr)) == -1) { + D("Unable to get if info without name"); + } else { + D("map size is %d Kb", nmr.nr_memsize >> 10); + } + bzero(&nmr, sizeof(nmr)); + strncpy(nmr.nr_name, ifname, sizeof(nmr.nr_name)); + if ((ioctl(fd, NIOCGINFO, &nmr)) == -1) { + D("Unable to get if info for %s", ifname); + } + devqueues = nmr.nr_numrings; + } + + /* validate provided nthreads. */ + if (g.nthreads < 1 || g.nthreads > devqueues) { + D("bad nthreads %d, have %d queues", g.nthreads, devqueues); + // continue, fail later + } + + if (td_body == sender_body && g.src_mac == NULL) { + static char mybuf[20] = "ff:ff:ff:ff:ff:ff"; + /* retrieve source mac address. */ + if (source_hwaddr(ifname, mybuf) == -1) { + D("Unable to retrieve source mac"); + // continue, fail later + } + g.src_mac = mybuf; + } + + /* + * Map the netmap shared memory: instead of issuing mmap() + * inside the body of the threads, we prefer to keep this + * operation here to simplify the thread logic. + */ + D("mmapping %d Kbytes", nmr.nr_memsize>>10); + mmap_addr = (struct netmap_d *) mmap(0, nmr.nr_memsize, + PROT_WRITE | PROT_READ, + MAP_SHARED, fd, 0); + if (mmap_addr == MAP_FAILED) { + D("Unable to mmap %d KB", nmr.nr_memsize >> 10); + // continue, fail later + } + + /* + * Register the interface on the netmap device: from now on, + * we can operate on the network interface without any + * interference from the legacy network stack. + * + * We decide to put the first interface registration here to + * give time to cards that take a long time to reset the PHY. + */ + if (ioctl(fd, NIOCREGIF, &nmr) == -1) { + D("Unable to register interface %s", ifname); + //continue, fail later + } + + + /* Print some debug information. */ + fprintf(stdout, + "%s %s: %d queues, %d threads and %d cpus.\n", + (td_body == sender_body) ? "Sending on" : "Receiving from", + ifname, + devqueues, + g.nthreads, + g.cpus); + if (td_body == sender_body) { + fprintf(stdout, "%s -> %s (%s -> %s)\n", + g.src_ip, g.dst_ip, + g.src_mac, g.dst_mac); + } + + /* Exit if something went wrong. */ + if (fd < 0) { + D("aborting"); + usage(); + } + + + /* Wait for PHY reset. */ + D("Wait %d secs for phy reset", wait_link); + sleep(wait_link); + D("Ready..."); + + /* Install ^C handler. */ + global_nthreads = g.nthreads; + signal(SIGINT, sigint_h); + + if (g.use_pcap) { + // XXX g.p = pcap_open_live(..); + } + + targs = calloc(g.nthreads, sizeof(*targs)); + /* + * Now create the desired number of threads, each one + * using a single descriptor. + */ + for (i = 0; i < g.nthreads; i++) { + struct netmap_if *tnifp; + struct nmreq tifreq; + int tfd; + + if (g.use_pcap) { + tfd = -1; + tnifp = NULL; + } else { + /* register interface. */ + tfd = open("/dev/netmap", O_RDWR); + if (tfd == -1) { + D("Unable to open /dev/netmap"); + continue; + } + + bzero(&tifreq, sizeof(tifreq)); + strncpy(tifreq.nr_name, ifname, sizeof(tifreq.nr_name)); + tifreq.nr_ringid = (g.nthreads > 1) ? (i | NETMAP_HW_RING) : 0; + + /* + * if we are acting as a receiver only, do not touch the transmit ring. + * This is not the default because many apps may use the interface + * in both directions, but a pure receiver does not. 
+			 */
+			if (td_body == receiver_body) {
+				tifreq.nr_ringid |= NETMAP_NO_TX_POLL;
+			}
+
+			if ((ioctl(tfd, NIOCREGIF, &tifreq)) == -1) {
+				D("Unable to register %s", ifname);
+				continue;
+			}
+			tnifp = NETMAP_IF(mmap_addr, tifreq.nr_offset);
+		}
+		/* start threads. */
+		bzero(&targs[i], sizeof(targs[i]));
+		targs[i].g = &g;
+		targs[i].used = 1;
+		targs[i].completed = 0;
+		targs[i].fd = tfd;
+		targs[i].nmr = tifreq;
+		targs[i].nifp = tnifp;
+		targs[i].qfirst = (g.nthreads > 1) ? i : 0;
+		targs[i].qlast = (g.nthreads > 1) ? i+1 : tifreq.nr_numrings;
+		targs[i].me = i;
+		targs[i].affinity = g.cpus ? i % g.cpus : -1;
+		if (td_body == sender_body) {
+			/* initialize the packet to send. */
+			initialize_packet(&targs[i]);
+		}
+
+		if (pthread_create(&targs[i].thread, NULL, td_body,
+		    &targs[i]) == -1) {
+			D("Unable to create thread %d", i);
+			targs[i].used = 0;
+		}
+	}
+
+	{
+	uint64_t my_count = 0, prev = 0;
+	uint64_t count = 0;
+	double delta_t;
+	struct timeval tic, toc;
+
+	gettimeofday(&toc, NULL);
+	for (;;) {
+		struct timeval now, delta;
+		uint64_t pps;
+		int done = 0;
+
+		delta.tv_sec = report_interval/1000;
+		delta.tv_usec = (report_interval%1000)*1000;
+		select(0, NULL, NULL, NULL, &delta);
+		gettimeofday(&now, NULL);
+		timersub(&now, &toc, &toc);
+		my_count = 0;
+		for (i = 0; i < g.nthreads; i++) {
+			my_count += targs[i].count;
+			if (targs[i].used == 0)
+				done++;
+		}
+		pps = toc.tv_sec * 1000000 + toc.tv_usec;
+		if (pps < 10000)
+			continue;
+		pps = (my_count - prev)*1000000 / pps;
+		D("%llu pps", (unsigned long long)pps);
+		prev = my_count;
+		toc = now;
+		if (done == g.nthreads)
+			break;
+	}
+
+	timerclear(&tic);
+	timerclear(&toc);
+	for (i = 0; i < g.nthreads; i++) {
+		/*
+		 * Join active threads, unregister interfaces and close
+		 * file descriptors.
+		 */
+		pthread_join(targs[i].thread, NULL);
+		ioctl(targs[i].fd, NIOCUNREGIF, &targs[i].nmr);
+		close(targs[i].fd);
+
+		if (targs[i].completed == 0)
+			continue;
+
+		/*
+		 * Collect threads output and extract information about
+		 * how long it took to send all the packets.
+		 */
+		count += targs[i].count;
+		if (!timerisset(&tic) || timercmp(&targs[i].tic, &tic, <))
+			tic = targs[i].tic;
+		if (!timerisset(&toc) || timercmp(&targs[i].toc, &toc, >))
+			toc = targs[i].toc;
+	}
+
+	/* print output. */
+	timersub(&toc, &tic, &toc);
+	delta_t = toc.tv_sec + 1e-6 * toc.tv_usec;
+	if (td_body == sender_body)
+		tx_output(count, g.pkt_size, delta_t);
+	else
+		rx_output(count, delta_t);
+	}
+
+	ioctl(fd, NIOCUNREGIF, &nmr);
+	munmap(mmap_addr, nmr.nr_memsize);
+	close(fd);
+
+	return (0);
+}
+/* end of file */
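
Editor's note: pcap.c and pkt-gen.c above follow the same open/register/mmap/poll
sequence. For quick reference, here it is reduced to a single-ring receive loop.
This is a minimal sketch against the API introduced in this commit (struct nmreq,
NIOCGINFO/NIOCREGIF, and the NETMAP_IF/NETMAP_RXRING/NETMAP_BUF/NETMAP_RING_NEXT
macros); rx_loop() and its one-ring assumption are illustrative, not part of the
commit, and error handling is abbreviated.

/*
 * Minimal single-ring receive loop (illustrative sketch, not part of
 * this commit). Mirrors netmap_open() and pcap_dispatch() above.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>	/* bzero */
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/poll.h>
#include <net/netmap.h>
#include <net/netmap_user.h>

int
rx_loop(const char *ifname)	/* hypothetical helper */
{
	struct nmreq req;
	struct netmap_if *nifp;
	struct netmap_ring *ring;
	struct pollfd fds;
	char *mem;
	int fd;

	fd = open("/dev/netmap", O_RDWR);
	if (fd < 0)
		return (-1);
	bzero(&req, sizeof(req));
	strncpy(req.nr_name, ifname, sizeof(req.nr_name));
	if (ioctl(fd, NIOCGINFO, &req) == -1 ||	/* fetch nr_memsize */
	    ioctl(fd, NIOCREGIF, &req) == -1)	/* detach from host stack */
		goto fail;
	mem = mmap(0, req.nr_memsize, PROT_WRITE | PROT_READ,
	    MAP_SHARED, fd, 0);
	if (mem == MAP_FAILED)
		goto fail;
	nifp = NETMAP_IF(mem, req.nr_offset);
	ring = NETMAP_RXRING(nifp, 0);	/* only scan hw ring 0 here */

	fds.fd = fd;
	fds.events = POLLIN;
	for (;;) {
		if (poll(&fds, 1, 1000) <= 0)
			continue;	/* timeout, try again */
		while (ring->avail > 0) {
			struct netmap_slot *slot = &ring->slot[ring->cur];
			char *buf = NETMAP_BUF(ring, slot->buf_idx);

			printf("got %u bytes at %p\n", slot->len, (void *)buf);
			ring->cur = NETMAP_RING_NEXT(ring, ring->cur);
			ring->avail--;
		}
	}
	/* NOTREACHED; a real client would NIOCUNREGIF and close(fd) */
fail:
	close(fd);
	return (-1);
}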
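
On the transmit side, the pattern used by send_packets() and pcap_inject()
condenses to the sketch below: fill consecutive slots, advance cur, decrement
avail, then issue NIOCTXSYNC so the kernel hands the new slots to the NIC.
tx_burst() is a hypothetical helper, not part of the commit; it assumes fd and
nifp were prepared as in rx_loop() above.

/*
 * Minimal transmit burst (illustrative sketch, not part of this commit).
 */
static int
tx_burst(int fd, struct netmap_if *nifp, const char *pkt, u_int len, u_int count)
{
	struct netmap_ring *ring = NETMAP_TXRING(nifp, 0);
	u_int cur = ring->cur, sent;

	if (ring->avail < count)	/* never exceed the free slots */
		count = ring->avail;
	for (sent = 0; sent < count; sent++) {
		struct netmap_slot *slot = &ring->slot[cur];

		memcpy(NETMAP_BUF(ring, slot->buf_idx), pkt, len);
		slot->len = len;
		if (sent == count - 1)
			slot->flags |= NS_REPORT;	/* interrupt when done */
		cur = NETMAP_RING_NEXT(ring, cur);
	}
	ring->avail -= sent;
	ring->cur = cur;
	ioctl(fd, NIOCTXSYNC, NULL);	/* pass the new slots to the NIC */
	return (sent);
}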