1
0
mirror of https://git.FreeBSD.org/src.git synced 2024-12-13 10:02:38 +00:00
freebsd/sys/net/if_llatbl.h

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

312 lines
11 KiB
C
Raw Normal View History

/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
2008-12-15 06:53:09 +00:00
* Copyright (c) 2004 Luigi Rizzo, Alessandro Cerri. All rights reserved.
* Copyright (c) 2004-2008 Qing Li. All rights reserved.
* Copyright (c) 2008 Kip Macy. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#ifndef _NET_IF_LLATBL_H_
#define _NET_IF_LLATBL_H_
#include <sys/_eventhandler.h>
2008-12-15 06:53:09 +00:00
#include <sys/_rwlock.h>
#include <netinet/in.h>
#include <sys/epoch.h>
#include <sys/ck.h>
2008-12-15 06:53:09 +00:00
struct ifnet;
struct sysctl_req;
struct rt_msghdr;
struct rt_addrinfo;
struct llentry;
CK_LIST_HEAD(llentries, llentry);
2008-12-15 06:53:09 +00:00
Implement interface link header precomputation API. Add if_requestencap() interface method which is capable of calculating various link headers for given interface. Right now there is support for INET/INET6/ARP llheader calculation (IFENCAP_LL type request). Other types are planned to support more complex calculation (L2 multipath lagg nexthops, tunnel encap nexthops, etc..). Reshape 'struct route' to be able to pass additional data (with is length) to prepend to mbuf. These two changes permits routing code to pass pre-calculated nexthop data (like L2 header for route w/gateway) down to the stack eliminating the need for other lookups. It also brings us closer to more complex scenarios like transparently handling MPLS nexthops and tunnel interfaces. Last, but not least, it removes layering violation introduced by flowtable code (ro_lle) and simplifies handling of existing if_output consumers. ARP/ND changes: Make arp/ndp stack pre-calculate link header upon installing/updating lle record. Interface link address change are handled by re-calculating headers for all lles based on if_lladdr event. After these changes, arpresolve()/nd6_resolve() returns full pre-calculated header for supported interfaces thus simplifying if_output(). Move these lookups to separate ether_resolve_addr() function which ether returs error or fully-prepared link header. Add <arp|nd6_>resolve_addr() compat versions to return link addresses instead of pre-calculated data. BPF changes: Raw bpf writes occupied _two_ cases: AF_UNSPEC and pseudo_AF_HDRCMPLT. Despite the naming, both of there have ther header "complete". The only difference is that interface source mac has to be filled by OS for AF_UNSPEC (controlled via BIOCGHDRCMPLT). This logic has to stay inside BPF and not pollute if_output() routines. Convert BPF to pass prepend data via new 'struct route' mechanism. Note that it does not change non-optimized if_output(): ro_prepend handling is purely optional. Side note: hackish pseudo_AF_HDRCMPLT is supported for ethernet and FDDI. It is not needed for ethernet anymore. The only remaining FDDI user is dev/pdq mostly untouched since 2007. FDDI support was eliminated from OpenBSD in 2013 (sys/net/if_fddisubr.c rev 1.65). Flowtable changes: Flowtable violates layering by saving (and not correctly managing) rtes/lles. Instead of passing lle pointer, pass pointer to pre-calculated header data from that lle. Differential Revision: https://reviews.freebsd.org/D4102
2015-12-31 05:03:27 +00:00
#define LLE_MAX_LINKHDR 24 /* Full IB header */
2008-12-15 06:53:09 +00:00
/*
* Code referencing llentry must at least hold
* a shared lock
*/
struct llentry {
CK_LIST_ENTRY(llentry) lle_next;
union {
struct in_addr addr4;
struct in6_addr addr6;
} r_l3addr;
Implement interface link header precomputation API. Add if_requestencap() interface method which is capable of calculating various link headers for given interface. Right now there is support for INET/INET6/ARP llheader calculation (IFENCAP_LL type request). Other types are planned to support more complex calculation (L2 multipath lagg nexthops, tunnel encap nexthops, etc..). Reshape 'struct route' to be able to pass additional data (with is length) to prepend to mbuf. These two changes permits routing code to pass pre-calculated nexthop data (like L2 header for route w/gateway) down to the stack eliminating the need for other lookups. It also brings us closer to more complex scenarios like transparently handling MPLS nexthops and tunnel interfaces. Last, but not least, it removes layering violation introduced by flowtable code (ro_lle) and simplifies handling of existing if_output consumers. ARP/ND changes: Make arp/ndp stack pre-calculate link header upon installing/updating lle record. Interface link address change are handled by re-calculating headers for all lles based on if_lladdr event. After these changes, arpresolve()/nd6_resolve() returns full pre-calculated header for supported interfaces thus simplifying if_output(). Move these lookups to separate ether_resolve_addr() function which ether returs error or fully-prepared link header. Add <arp|nd6_>resolve_addr() compat versions to return link addresses instead of pre-calculated data. BPF changes: Raw bpf writes occupied _two_ cases: AF_UNSPEC and pseudo_AF_HDRCMPLT. Despite the naming, both of there have ther header "complete". The only difference is that interface source mac has to be filled by OS for AF_UNSPEC (controlled via BIOCGHDRCMPLT). This logic has to stay inside BPF and not pollute if_output() routines. Convert BPF to pass prepend data via new 'struct route' mechanism. Note that it does not change non-optimized if_output(): ro_prepend handling is purely optional. Side note: hackish pseudo_AF_HDRCMPLT is supported for ethernet and FDDI. It is not needed for ethernet anymore. The only remaining FDDI user is dev/pdq mostly untouched since 2007. FDDI support was eliminated from OpenBSD in 2013 (sys/net/if_fddisubr.c rev 1.65). Flowtable changes: Flowtable violates layering by saving (and not correctly managing) rtes/lles. Instead of passing lle pointer, pass pointer to pre-calculated header data from that lle. Differential Revision: https://reviews.freebsd.org/D4102
2015-12-31 05:03:27 +00:00
char r_linkdata[LLE_MAX_LINKHDR]; /* L2 data */
uint8_t r_hdrlen; /* length for LL header */
uint8_t r_family; /* Upper layer proto family */
uint8_t spare0[2];
Remove LLE read lock from IPv4 fast path. LLE structure is mostly unchanged during its lifecycle. To be more specific, there are 2 things relevant for fast path lookup code: 1) link-level address change. Since r286722, these updates are performed under AFDATA WLOCK. 2) Some sort of feedback indicating that this particular entry is used so we re-send arp request to perform reachability verification instead of expiring entry. The only signal that is needed from fast path is something like binary yes/no. The latter is solved by the following changes: 1) introduce special r_skip_req field which is read lockless by fast path, but updated under (new) req_mutex mutex. If this field is non-zero, then fast path will acquire lock and set it back to 0. 2) introduce simple state machine: incomplete->reachable<->verify->deleted. Before that we implicitely had incomplete->reachable->deleted state machine, with V_arpt_keep between "reachable" and "deleted". Verification was performed in runtime 5 seconds before V_arpt_keep expire. This is changed to "change state to verify 5 seconds before V_arpt_keep, set r_skip_req to non-zero value and check it every second". If the value is zero - then send arp verification probe. These changes do not introduce any signifficant control plane overhead: typically lle callout timer would fire 1 time more each V_arpt_keep (1200s) for used lles and up to arp_maxtries (5) for dead lles. As a result, all packets towards "reachable" lle are handled by fast path without acquiring lle read lock. Additional "req_mutex" is needed because callout / arpresolve_slow() or eventhandler might keep LLE lock for signifficant amount of time, which might not be feasible for fast path locking (e.g. having rmlock as ether AFDATA or lltable own lock). Differential Revision: https://reviews.freebsd.org/D3688
2015-12-05 09:50:37 +00:00
uint16_t r_flags; /* LLE runtime flags */
uint16_t r_skip_req; /* feedback from fast path */
2008-12-15 06:53:09 +00:00
struct lltable *lle_tbl;
struct llentries *lle_head;
void (*lle_free)(struct llentry *);
2008-12-15 06:53:09 +00:00
struct mbuf *la_hold;
int la_numheld; /* # of packets currently held */
2008-12-15 06:53:09 +00:00
time_t la_expire;
uint16_t la_flags;
2008-12-15 06:53:09 +00:00
uint16_t la_asked;
uint16_t la_preempt;
int16_t ln_state; /* IPv6 has ND6_LLINFO_NOSTATE == -2 */
uint16_t ln_router;
2008-12-15 06:53:09 +00:00
time_t ln_ntick;
Remove LLE read lock from IPv6 fast path. LLE structure is mostly unchanged during its lifecycle: there are only 2 things relevant for fast path lookup code: 1) link-level address change. Since r286722, these updates are performed under AFDATA WLOCK. 2) Some sort of feedback indicating that this particular entry is used so we send NS to perform reachability verification instead of expiring entry. The only signal that is needed from fast path is something like binary yes/no. The latter is solved by the following changes: Special r_skip_req (introduced in D3688) value is used for fast path feedback. It is read lockless by fast path, but updated under req_mutex mutex. If this field is non-zero, then fast path will acquire lock and set it back to 0. After transitioning to STALE state, callout timer is armed to run each V_nd6_delay seconds to make sure that if packet was transmitted at the start of given interval, we would be able to switch to PROBE state in V_nd6_delay seconds as user expects. (in STALE state) timer is rescheduled until original V_nd6_gctimer expires keeping lle in STALE state (remaining timer value stored in lle_remtime). (in STALE state) timer is rescheduled if packet was transmitted less that V_nd6_delay seconds ago to make sure we transition to PROBE state exactly after V_n6_delay seconds. As a result, all packets towards lle in REACHABLE/STALE/PROBE states are handled by fast path without acquiring lle read lock. Differential Revision: https://reviews.freebsd.org/D3780
2015-12-13 07:39:49 +00:00
time_t lle_remtime; /* Real time remaining */
time_t lle_hittime; /* Time when r_skip_req was unset */
2008-12-15 06:53:09 +00:00
int lle_refcnt;
Implement interface link header precomputation API. Add if_requestencap() interface method which is capable of calculating various link headers for given interface. Right now there is support for INET/INET6/ARP llheader calculation (IFENCAP_LL type request). Other types are planned to support more complex calculation (L2 multipath lagg nexthops, tunnel encap nexthops, etc..). Reshape 'struct route' to be able to pass additional data (with is length) to prepend to mbuf. These two changes permits routing code to pass pre-calculated nexthop data (like L2 header for route w/gateway) down to the stack eliminating the need for other lookups. It also brings us closer to more complex scenarios like transparently handling MPLS nexthops and tunnel interfaces. Last, but not least, it removes layering violation introduced by flowtable code (ro_lle) and simplifies handling of existing if_output consumers. ARP/ND changes: Make arp/ndp stack pre-calculate link header upon installing/updating lle record. Interface link address change are handled by re-calculating headers for all lles based on if_lladdr event. After these changes, arpresolve()/nd6_resolve() returns full pre-calculated header for supported interfaces thus simplifying if_output(). Move these lookups to separate ether_resolve_addr() function which ether returs error or fully-prepared link header. Add <arp|nd6_>resolve_addr() compat versions to return link addresses instead of pre-calculated data. BPF changes: Raw bpf writes occupied _two_ cases: AF_UNSPEC and pseudo_AF_HDRCMPLT. Despite the naming, both of there have ther header "complete". The only difference is that interface source mac has to be filled by OS for AF_UNSPEC (controlled via BIOCGHDRCMPLT). This logic has to stay inside BPF and not pollute if_output() routines. Convert BPF to pass prepend data via new 'struct route' mechanism. Note that it does not change non-optimized if_output(): ro_prepend handling is purely optional. Side note: hackish pseudo_AF_HDRCMPLT is supported for ethernet and FDDI. It is not needed for ethernet anymore. The only remaining FDDI user is dev/pdq mostly untouched since 2007. FDDI support was eliminated from OpenBSD in 2013 (sys/net/if_fddisubr.c rev 1.65). Flowtable changes: Flowtable violates layering by saving (and not correctly managing) rtes/lles. Instead of passing lle pointer, pass pointer to pre-calculated header data from that lle. Differential Revision: https://reviews.freebsd.org/D4102
2015-12-31 05:03:27 +00:00
char *ll_addr; /* link-layer address */
CK_SLIST_HEAD(llentry_children_head,llentry) lle_children; /* child encaps */
CK_SLIST_ENTRY(llentry) lle_child_next; /* child encaps */
struct llentry *lle_parent; /* parent for a child */
CK_LIST_ENTRY(llentry) lle_chain; /* chain of deleted items */
struct callout lle_timer;
struct rwlock lle_lock;
Remove LLE read lock from IPv4 fast path. LLE structure is mostly unchanged during its lifecycle. To be more specific, there are 2 things relevant for fast path lookup code: 1) link-level address change. Since r286722, these updates are performed under AFDATA WLOCK. 2) Some sort of feedback indicating that this particular entry is used so we re-send arp request to perform reachability verification instead of expiring entry. The only signal that is needed from fast path is something like binary yes/no. The latter is solved by the following changes: 1) introduce special r_skip_req field which is read lockless by fast path, but updated under (new) req_mutex mutex. If this field is non-zero, then fast path will acquire lock and set it back to 0. 2) introduce simple state machine: incomplete->reachable<->verify->deleted. Before that we implicitely had incomplete->reachable->deleted state machine, with V_arpt_keep between "reachable" and "deleted". Verification was performed in runtime 5 seconds before V_arpt_keep expire. This is changed to "change state to verify 5 seconds before V_arpt_keep, set r_skip_req to non-zero value and check it every second". If the value is zero - then send arp verification probe. These changes do not introduce any signifficant control plane overhead: typically lle callout timer would fire 1 time more each V_arpt_keep (1200s) for used lles and up to arp_maxtries (5) for dead lles. As a result, all packets towards "reachable" lle are handled by fast path without acquiring lle read lock. Additional "req_mutex" is needed because callout / arpresolve_slow() or eventhandler might keep LLE lock for signifficant amount of time, which might not be feasible for fast path locking (e.g. having rmlock as ether AFDATA or lltable own lock). Differential Revision: https://reviews.freebsd.org/D3688
2015-12-05 09:50:37 +00:00
struct mtx req_mtx;
struct epoch_context lle_epoch_ctx;
2008-12-15 06:53:09 +00:00
};
#define LLE_WLOCK(lle) rw_wlock(&(lle)->lle_lock)
#define LLE_RLOCK(lle) rw_rlock(&(lle)->lle_lock)
#define LLE_WUNLOCK(lle) rw_wunlock(&(lle)->lle_lock)
#define LLE_RUNLOCK(lle) rw_runlock(&(lle)->lle_lock)
#define LLE_DOWNGRADE(lle) rw_downgrade(&(lle)->lle_lock)
#define LLE_TRY_UPGRADE(lle) rw_try_upgrade(&(lle)->lle_lock)
#define LLE_LOCK_INIT(lle) rw_init_flags(&(lle)->lle_lock, "lle", RW_DUPOK)
#define LLE_LOCK_DESTROY(lle) rw_destroy(&(lle)->lle_lock)
2008-12-15 06:53:09 +00:00
#define LLE_WLOCK_ASSERT(lle) rw_assert(&(lle)->lle_lock, RA_WLOCKED)
Remove LLE read lock from IPv4 fast path. LLE structure is mostly unchanged during its lifecycle. To be more specific, there are 2 things relevant for fast path lookup code: 1) link-level address change. Since r286722, these updates are performed under AFDATA WLOCK. 2) Some sort of feedback indicating that this particular entry is used so we re-send arp request to perform reachability verification instead of expiring entry. The only signal that is needed from fast path is something like binary yes/no. The latter is solved by the following changes: 1) introduce special r_skip_req field which is read lockless by fast path, but updated under (new) req_mutex mutex. If this field is non-zero, then fast path will acquire lock and set it back to 0. 2) introduce simple state machine: incomplete->reachable<->verify->deleted. Before that we implicitely had incomplete->reachable->deleted state machine, with V_arpt_keep between "reachable" and "deleted". Verification was performed in runtime 5 seconds before V_arpt_keep expire. This is changed to "change state to verify 5 seconds before V_arpt_keep, set r_skip_req to non-zero value and check it every second". If the value is zero - then send arp verification probe. These changes do not introduce any signifficant control plane overhead: typically lle callout timer would fire 1 time more each V_arpt_keep (1200s) for used lles and up to arp_maxtries (5) for dead lles. As a result, all packets towards "reachable" lle are handled by fast path without acquiring lle read lock. Additional "req_mutex" is needed because callout / arpresolve_slow() or eventhandler might keep LLE lock for signifficant amount of time, which might not be feasible for fast path locking (e.g. having rmlock as ether AFDATA or lltable own lock). Differential Revision: https://reviews.freebsd.org/D3688
2015-12-05 09:50:37 +00:00
#define LLE_REQ_INIT(lle) mtx_init(&(lle)->req_mtx, "lle req", \
NULL, MTX_DEF)
#define LLE_REQ_DESTROY(lle) mtx_destroy(&(lle)->req_mtx)
#define LLE_REQ_LOCK(lle) mtx_lock(&(lle)->req_mtx)
#define LLE_REQ_UNLOCK(lle) mtx_unlock(&(lle)->req_mtx)
2008-12-15 06:53:09 +00:00
#define LLE_IS_VALID(lle) (((lle) != NULL) && ((lle) != (void *)-1))
#define LLE_SF(_fam, _flags) (((_flags) & 0xFFFF) | ((_fam) << 16))
2008-12-15 06:53:09 +00:00
#define LLE_ADDREF(lle) do { \
LLE_WLOCK_ASSERT(lle); \
KASSERT((lle)->lle_refcnt >= 0, \
("negative refcnt %d on lle %p", \
(lle)->lle_refcnt, (lle))); \
2008-12-15 06:53:09 +00:00
(lle)->lle_refcnt++; \
} while (0)
2008-12-15 06:53:09 +00:00
#define LLE_REMREF(lle) do { \
LLE_WLOCK_ASSERT(lle); \
KASSERT((lle)->lle_refcnt > 0, \
("bogus refcnt %d on lle %p", \
(lle)->lle_refcnt, (lle))); \
2008-12-15 06:53:09 +00:00
(lle)->lle_refcnt--; \
} while (0)
#define LLE_FREE_LOCKED(lle) do { \
if ((lle)->lle_refcnt == 1) \
(lle)->lle_free(lle); \
2008-12-15 06:53:09 +00:00
else { \
LLE_REMREF(lle); \
2008-12-15 06:53:09 +00:00
LLE_WUNLOCK(lle); \
} \
/* guard against invalid refs */ \
(lle) = NULL; \
2008-12-15 06:53:09 +00:00
} while (0)
#define LLE_FREE(lle) do { \
LLE_WLOCK(lle); \
LLE_FREE_LOCKED(lle); \
2008-12-15 06:53:09 +00:00
} while (0)
typedef struct llentry *(llt_lookup_t)(struct lltable *, u_int flags,
const struct sockaddr *l3addr);
typedef struct llentry *(llt_alloc_t)(struct lltable *, u_int flags,
const struct sockaddr *l3addr);
typedef void (llt_delete_t)(struct lltable *, struct llentry *);
typedef void (llt_prefix_free_t)(struct lltable *,
const struct sockaddr *addr, const struct sockaddr *mask, u_int flags);
typedef int (llt_dump_entry_t)(struct lltable *, struct llentry *,
struct sysctl_req *);
typedef uint32_t (llt_hash_t)(const struct llentry *, uint32_t);
typedef int (llt_match_prefix_t)(const struct sockaddr *,
const struct sockaddr *, u_int, struct llentry *);
typedef void (llt_free_entry_t)(struct lltable *, struct llentry *);
typedef void (llt_fill_sa_entry_t)(const struct llentry *, struct sockaddr *);
typedef void (llt_free_tbl_t)(struct lltable *);
typedef int (llt_link_entry_t)(struct lltable *, struct llentry *);
typedef int (llt_unlink_entry_t)(struct llentry *);
typedef void (llt_mark_used_t)(struct llentry *);
typedef void (llt_post_resolved_t)(struct lltable *, struct llentry *);
typedef int (llt_foreach_cb_t)(struct lltable *, struct llentry *, void *);
typedef int (llt_foreach_entry_t)(struct lltable *, llt_foreach_cb_t *, void *);
typedef bool (llt_match_cb_t)(struct lltable *, struct llentry *, void *);
2008-12-15 06:53:09 +00:00
struct lltable {
SLIST_ENTRY(lltable) llt_link;
sa_family_t llt_af;
uint8_t llt_flags;
uint8_t llt_spare[2];
int llt_hsize;
int llt_entries;
int llt_maxentries;
struct llentries *lle_head;
2008-12-15 06:53:09 +00:00
struct ifnet *llt_ifp;
llt_lookup_t *llt_lookup;
llt_alloc_t *llt_alloc_entry;
llt_delete_t *llt_delete_entry;
llt_prefix_free_t *llt_prefix_free;
llt_dump_entry_t *llt_dump_entry;
llt_hash_t *llt_hash;
llt_match_prefix_t *llt_match_prefix;
llt_free_entry_t *llt_free_entry;
llt_foreach_entry_t *llt_foreach_entry;
llt_link_entry_t *llt_link_entry;
llt_unlink_entry_t *llt_unlink_entry;
llt_fill_sa_entry_t *llt_fill_sa_entry;
llt_free_tbl_t *llt_free_tbl;
llt_mark_used_t *llt_mark_used;
llt_post_resolved_t *llt_post_resolved;
2008-12-15 06:53:09 +00:00
};
2008-12-15 06:53:09 +00:00
MALLOC_DECLARE(M_LLTABLE);
/*
* LLtable flags
*/
#define LLT_ADDEDPROXY 0x01 /* added a proxy llentry */
2008-12-15 06:53:09 +00:00
/*
* LLentry flags
2008-12-15 06:53:09 +00:00
*/
#define LLE_DELETED 0x0001 /* entry must be deleted */
#define LLE_STATIC 0x0002 /* entry is static */
#define LLE_IFADDR 0x0004 /* entry is interface addr */
#define LLE_VALID 0x0008 /* ll_addr is valid */
#define LLE_REDIRECT 0x0010 /* installed by redirect; has host rtentry */
2008-12-15 06:53:09 +00:00
#define LLE_PUB 0x0020 /* publish entry ??? */
#define LLE_LINKED 0x0040 /* linked to lookup structure */
#define LLE_CHILD 0x0080 /* Child LLE storing different AF encap */
/* LLE request flags */
#define LLE_EXCLUSIVE 0x2000 /* return lle xlocked */
Remove LLE read lock from IPv4 fast path. LLE structure is mostly unchanged during its lifecycle. To be more specific, there are 2 things relevant for fast path lookup code: 1) link-level address change. Since r286722, these updates are performed under AFDATA WLOCK. 2) Some sort of feedback indicating that this particular entry is used so we re-send arp request to perform reachability verification instead of expiring entry. The only signal that is needed from fast path is something like binary yes/no. The latter is solved by the following changes: 1) introduce special r_skip_req field which is read lockless by fast path, but updated under (new) req_mutex mutex. If this field is non-zero, then fast path will acquire lock and set it back to 0. 2) introduce simple state machine: incomplete->reachable<->verify->deleted. Before that we implicitely had incomplete->reachable->deleted state machine, with V_arpt_keep between "reachable" and "deleted". Verification was performed in runtime 5 seconds before V_arpt_keep expire. This is changed to "change state to verify 5 seconds before V_arpt_keep, set r_skip_req to non-zero value and check it every second". If the value is zero - then send arp verification probe. These changes do not introduce any signifficant control plane overhead: typically lle callout timer would fire 1 time more each V_arpt_keep (1200s) for used lles and up to arp_maxtries (5) for dead lles. As a result, all packets towards "reachable" lle are handled by fast path without acquiring lle read lock. Additional "req_mutex" is needed because callout / arpresolve_slow() or eventhandler might keep LLE lock for signifficant amount of time, which might not be feasible for fast path locking (e.g. having rmlock as ether AFDATA or lltable own lock). Differential Revision: https://reviews.freebsd.org/D3688
2015-12-05 09:50:37 +00:00
#define LLE_UNLOCKED 0x4000 /* return lle unlocked */
Implement interface link header precomputation API. Add if_requestencap() interface method which is capable of calculating various link headers for given interface. Right now there is support for INET/INET6/ARP llheader calculation (IFENCAP_LL type request). Other types are planned to support more complex calculation (L2 multipath lagg nexthops, tunnel encap nexthops, etc..). Reshape 'struct route' to be able to pass additional data (with is length) to prepend to mbuf. These two changes permits routing code to pass pre-calculated nexthop data (like L2 header for route w/gateway) down to the stack eliminating the need for other lookups. It also brings us closer to more complex scenarios like transparently handling MPLS nexthops and tunnel interfaces. Last, but not least, it removes layering violation introduced by flowtable code (ro_lle) and simplifies handling of existing if_output consumers. ARP/ND changes: Make arp/ndp stack pre-calculate link header upon installing/updating lle record. Interface link address change are handled by re-calculating headers for all lles based on if_lladdr event. After these changes, arpresolve()/nd6_resolve() returns full pre-calculated header for supported interfaces thus simplifying if_output(). Move these lookups to separate ether_resolve_addr() function which ether returs error or fully-prepared link header. Add <arp|nd6_>resolve_addr() compat versions to return link addresses instead of pre-calculated data. BPF changes: Raw bpf writes occupied _two_ cases: AF_UNSPEC and pseudo_AF_HDRCMPLT. Despite the naming, both of there have ther header "complete". The only difference is that interface source mac has to be filled by OS for AF_UNSPEC (controlled via BIOCGHDRCMPLT). This logic has to stay inside BPF and not pollute if_output() routines. Convert BPF to pass prepend data via new 'struct route' mechanism. Note that it does not change non-optimized if_output(): ro_prepend handling is purely optional. Side note: hackish pseudo_AF_HDRCMPLT is supported for ethernet and FDDI. It is not needed for ethernet anymore. The only remaining FDDI user is dev/pdq mostly untouched since 2007. FDDI support was eliminated from OpenBSD in 2013 (sys/net/if_fddisubr.c rev 1.65). Flowtable changes: Flowtable violates layering by saving (and not correctly managing) rtes/lles. Instead of passing lle pointer, pass pointer to pre-calculated header data from that lle. Differential Revision: https://reviews.freebsd.org/D4102
2015-12-31 05:03:27 +00:00
#define LLE_ADDRONLY 0x4000 /* return lladdr instead of full header */
#define LLE_CREATE 0x8000 /* hint to avoid lle lookup */
Remove LLE read lock from IPv4 fast path. LLE structure is mostly unchanged during its lifecycle. To be more specific, there are 2 things relevant for fast path lookup code: 1) link-level address change. Since r286722, these updates are performed under AFDATA WLOCK. 2) Some sort of feedback indicating that this particular entry is used so we re-send arp request to perform reachability verification instead of expiring entry. The only signal that is needed from fast path is something like binary yes/no. The latter is solved by the following changes: 1) introduce special r_skip_req field which is read lockless by fast path, but updated under (new) req_mutex mutex. If this field is non-zero, then fast path will acquire lock and set it back to 0. 2) introduce simple state machine: incomplete->reachable<->verify->deleted. Before that we implicitely had incomplete->reachable->deleted state machine, with V_arpt_keep between "reachable" and "deleted". Verification was performed in runtime 5 seconds before V_arpt_keep expire. This is changed to "change state to verify 5 seconds before V_arpt_keep, set r_skip_req to non-zero value and check it every second". If the value is zero - then send arp verification probe. These changes do not introduce any signifficant control plane overhead: typically lle callout timer would fire 1 time more each V_arpt_keep (1200s) for used lles and up to arp_maxtries (5) for dead lles. As a result, all packets towards "reachable" lle are handled by fast path without acquiring lle read lock. Additional "req_mutex" is needed because callout / arpresolve_slow() or eventhandler might keep LLE lock for signifficant amount of time, which might not be feasible for fast path locking (e.g. having rmlock as ether AFDATA or lltable own lock). Differential Revision: https://reviews.freebsd.org/D3688
2015-12-05 09:50:37 +00:00
/* LLE flags used by fastpath code */
#define RLLE_VALID 0x0001 /* entry is valid */
#define RLLE_IFADDR LLE_IFADDR /* entry is ifaddr */
2008-12-15 06:53:09 +00:00
#define LLATBL_HASH(key, mask) \
(((((((key >> 8) ^ key) >> 8) ^ key) >> 8) ^ key) & mask)
struct lltable *lltable_allocate_htbl(uint32_t hsize);
2008-12-15 06:53:09 +00:00
void lltable_free(struct lltable *);
void lltable_link(struct lltable *llt);
void lltable_prefix_free(int, struct sockaddr *,
struct sockaddr *, u_int);
2008-12-15 06:53:09 +00:00
int lltable_sysctl_dumparp(int, struct sysctl_req *);
size_t lltable_append_entry_queue(struct llentry *,
struct mbuf *, size_t);
2008-12-15 06:53:09 +00:00
struct lltable *in_lltable_get(struct ifnet *ifp);
struct lltable *in6_lltable_get(struct ifnet *ifp);
struct lltable *lltable_get(struct ifnet *ifp, int family);
size_t llentry_free(struct llentry *);
2008-12-15 06:53:09 +00:00
/* helper functions */
size_t lltable_drop_entry_queue(struct llentry *);
2015-11-07 11:12:00 +00:00
void lltable_set_entry_addr(struct ifnet *ifp, struct llentry *lle,
Implement interface link header precomputation API. Add if_requestencap() interface method which is capable of calculating various link headers for given interface. Right now there is support for INET/INET6/ARP llheader calculation (IFENCAP_LL type request). Other types are planned to support more complex calculation (L2 multipath lagg nexthops, tunnel encap nexthops, etc..). Reshape 'struct route' to be able to pass additional data (with is length) to prepend to mbuf. These two changes permits routing code to pass pre-calculated nexthop data (like L2 header for route w/gateway) down to the stack eliminating the need for other lookups. It also brings us closer to more complex scenarios like transparently handling MPLS nexthops and tunnel interfaces. Last, but not least, it removes layering violation introduced by flowtable code (ro_lle) and simplifies handling of existing if_output consumers. ARP/ND changes: Make arp/ndp stack pre-calculate link header upon installing/updating lle record. Interface link address change are handled by re-calculating headers for all lles based on if_lladdr event. After these changes, arpresolve()/nd6_resolve() returns full pre-calculated header for supported interfaces thus simplifying if_output(). Move these lookups to separate ether_resolve_addr() function which ether returs error or fully-prepared link header. Add <arp|nd6_>resolve_addr() compat versions to return link addresses instead of pre-calculated data. BPF changes: Raw bpf writes occupied _two_ cases: AF_UNSPEC and pseudo_AF_HDRCMPLT. Despite the naming, both of there have ther header "complete". The only difference is that interface source mac has to be filled by OS for AF_UNSPEC (controlled via BIOCGHDRCMPLT). This logic has to stay inside BPF and not pollute if_output() routines. Convert BPF to pass prepend data via new 'struct route' mechanism. Note that it does not change non-optimized if_output(): ro_prepend handling is purely optional. Side note: hackish pseudo_AF_HDRCMPLT is supported for ethernet and FDDI. It is not needed for ethernet anymore. The only remaining FDDI user is dev/pdq mostly untouched since 2007. FDDI support was eliminated from OpenBSD in 2013 (sys/net/if_fddisubr.c rev 1.65). Flowtable changes: Flowtable violates layering by saving (and not correctly managing) rtes/lles. Instead of passing lle pointer, pass pointer to pre-calculated header data from that lle. Differential Revision: https://reviews.freebsd.org/D4102
2015-12-31 05:03:27 +00:00
const char *linkhdr, size_t linkhdrsize, int lladdr_off);
Remove LLE read lock from IPv6 fast path. LLE structure is mostly unchanged during its lifecycle: there are only 2 things relevant for fast path lookup code: 1) link-level address change. Since r286722, these updates are performed under AFDATA WLOCK. 2) Some sort of feedback indicating that this particular entry is used so we send NS to perform reachability verification instead of expiring entry. The only signal that is needed from fast path is something like binary yes/no. The latter is solved by the following changes: Special r_skip_req (introduced in D3688) value is used for fast path feedback. It is read lockless by fast path, but updated under req_mutex mutex. If this field is non-zero, then fast path will acquire lock and set it back to 0. After transitioning to STALE state, callout timer is armed to run each V_nd6_delay seconds to make sure that if packet was transmitted at the start of given interval, we would be able to switch to PROBE state in V_nd6_delay seconds as user expects. (in STALE state) timer is rescheduled until original V_nd6_gctimer expires keeping lle in STALE state (remaining timer value stored in lle_remtime). (in STALE state) timer is rescheduled if packet was transmitted less that V_nd6_delay seconds ago to make sure we transition to PROBE state exactly after V_n6_delay seconds. As a result, all packets towards lle in REACHABLE/STALE/PROBE states are handled by fast path without acquiring lle read lock. Differential Revision: https://reviews.freebsd.org/D3780
2015-12-13 07:39:49 +00:00
int lltable_try_set_entry_addr(struct ifnet *ifp, struct llentry *lle,
Implement interface link header precomputation API. Add if_requestencap() interface method which is capable of calculating various link headers for given interface. Right now there is support for INET/INET6/ARP llheader calculation (IFENCAP_LL type request). Other types are planned to support more complex calculation (L2 multipath lagg nexthops, tunnel encap nexthops, etc..). Reshape 'struct route' to be able to pass additional data (with is length) to prepend to mbuf. These two changes permits routing code to pass pre-calculated nexthop data (like L2 header for route w/gateway) down to the stack eliminating the need for other lookups. It also brings us closer to more complex scenarios like transparently handling MPLS nexthops and tunnel interfaces. Last, but not least, it removes layering violation introduced by flowtable code (ro_lle) and simplifies handling of existing if_output consumers. ARP/ND changes: Make arp/ndp stack pre-calculate link header upon installing/updating lle record. Interface link address change are handled by re-calculating headers for all lles based on if_lladdr event. After these changes, arpresolve()/nd6_resolve() returns full pre-calculated header for supported interfaces thus simplifying if_output(). Move these lookups to separate ether_resolve_addr() function which ether returs error or fully-prepared link header. Add <arp|nd6_>resolve_addr() compat versions to return link addresses instead of pre-calculated data. BPF changes: Raw bpf writes occupied _two_ cases: AF_UNSPEC and pseudo_AF_HDRCMPLT. Despite the naming, both of there have ther header "complete". The only difference is that interface source mac has to be filled by OS for AF_UNSPEC (controlled via BIOCGHDRCMPLT). This logic has to stay inside BPF and not pollute if_output() routines. Convert BPF to pass prepend data via new 'struct route' mechanism. Note that it does not change non-optimized if_output(): ro_prepend handling is purely optional. Side note: hackish pseudo_AF_HDRCMPLT is supported for ethernet and FDDI. It is not needed for ethernet anymore. The only remaining FDDI user is dev/pdq mostly untouched since 2007. FDDI support was eliminated from OpenBSD in 2013 (sys/net/if_fddisubr.c rev 1.65). Flowtable changes: Flowtable violates layering by saving (and not correctly managing) rtes/lles. Instead of passing lle pointer, pass pointer to pre-calculated header data from that lle. Differential Revision: https://reviews.freebsd.org/D4102
2015-12-31 05:03:27 +00:00
const char *linkhdr, size_t linkhdrsize, int lladdr_off);
Implement interface link header precomputation API. Add if_requestencap() interface method which is capable of calculating various link headers for given interface. Right now there is support for INET/INET6/ARP llheader calculation (IFENCAP_LL type request). Other types are planned to support more complex calculation (L2 multipath lagg nexthops, tunnel encap nexthops, etc..). Reshape 'struct route' to be able to pass additional data (with is length) to prepend to mbuf. These two changes permits routing code to pass pre-calculated nexthop data (like L2 header for route w/gateway) down to the stack eliminating the need for other lookups. It also brings us closer to more complex scenarios like transparently handling MPLS nexthops and tunnel interfaces. Last, but not least, it removes layering violation introduced by flowtable code (ro_lle) and simplifies handling of existing if_output consumers. ARP/ND changes: Make arp/ndp stack pre-calculate link header upon installing/updating lle record. Interface link address change are handled by re-calculating headers for all lles based on if_lladdr event. After these changes, arpresolve()/nd6_resolve() returns full pre-calculated header for supported interfaces thus simplifying if_output(). Move these lookups to separate ether_resolve_addr() function which ether returs error or fully-prepared link header. Add <arp|nd6_>resolve_addr() compat versions to return link addresses instead of pre-calculated data. BPF changes: Raw bpf writes occupied _two_ cases: AF_UNSPEC and pseudo_AF_HDRCMPLT. Despite the naming, both of there have ther header "complete". The only difference is that interface source mac has to be filled by OS for AF_UNSPEC (controlled via BIOCGHDRCMPLT). This logic has to stay inside BPF and not pollute if_output() routines. Convert BPF to pass prepend data via new 'struct route' mechanism. Note that it does not change non-optimized if_output(): ro_prepend handling is purely optional. Side note: hackish pseudo_AF_HDRCMPLT is supported for ethernet and FDDI. It is not needed for ethernet anymore. The only remaining FDDI user is dev/pdq mostly untouched since 2007. FDDI support was eliminated from OpenBSD in 2013 (sys/net/if_fddisubr.c rev 1.65). Flowtable changes: Flowtable violates layering by saving (and not correctly managing) rtes/lles. Instead of passing lle pointer, pass pointer to pre-calculated header data from that lle. Differential Revision: https://reviews.freebsd.org/D4102
2015-12-31 05:03:27 +00:00
int lltable_calc_llheader(struct ifnet *ifp, int family, char *lladdr,
char *buf, size_t *bufsize, int *lladdr_off);
void lltable_update_ifaddr(struct lltable *llt);
struct llentry *lltable_alloc_entry(struct lltable *llt, u_int flags,
const struct sockaddr *l4addr);
void lltable_free_entry(struct lltable *llt, struct llentry *lle);
int lltable_delete_addr(struct lltable *llt, u_int flags,
const struct sockaddr *l3addr);
int lltable_link_entry(struct lltable *llt, struct llentry *lle);
int lltable_unlink_entry(struct lltable *llt, struct llentry *lle);
void lltable_link_child_entry(struct llentry *parent_lle, struct llentry *child_lle);
void lltable_unlink_child_entry(struct llentry *child_lle);
void lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa);
struct ifnet *lltable_get_ifp(const struct lltable *llt);
int lltable_get_af(const struct lltable *llt);
bool lltable_acquire_wlock(struct ifnet *ifp, struct llentry *lle);
int lltable_foreach_lle(struct lltable *llt, llt_foreach_cb_t *f,
void *farg);
void lltable_delete_conditional(struct lltable *llt, llt_match_cb_t *func,
void *farg);
2008-12-15 06:53:09 +00:00
/*
* Generic link layer address lookup function.
*/
static __inline struct llentry *
lla_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr)
{
return (llt->llt_lookup(llt, flags, l3addr));
2008-12-15 06:53:09 +00:00
}
void llentry_request_feedback(struct llentry *lle);
void llentry_mark_used(struct llentry *lle);
time_t llentry_get_hittime(struct llentry *lle);
int llentry_get_upper_family(const struct llentry *lle, int default_family);
/*
* Notify the LLE code that the entry was used by datapath.
*/
static __inline void
llentry_provide_feedback(struct llentry *lle)
{
if (__predict_true(lle->r_skip_req == 0))
return;
llentry_mark_used(lle);
}
struct llentry *llentry_lookup_family(struct llentry *lle, int family);
2008-12-15 06:53:09 +00:00
int lla_rt_output(struct rt_msghdr *, struct rt_addrinfo *);
enum {
LLENTRY_RESOLVED,
LLENTRY_TIMEDOUT,
LLENTRY_DELETED,
LLENTRY_EXPIRED,
};
typedef void (*lle_event_fn)(void *, struct llentry *, int);
EVENTHANDLER_DECLARE(lle_event, lle_event_fn);
2008-12-15 06:53:09 +00:00
#endif /* _NET_IF_LLATBL_H_ */